In [25]:
# Import libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error

In [14]:
def load_and_explore_dataset(filepath):
    """Load a CSV file into a DataFrame and print an exploratory summary.

    The summary consists of the shape, per-column missing-value counts, the
    first five rows, descriptive statistics, ``DataFrame.info()`` and the
    value counts of the ``condition`` and ``transmission`` columns.

    Parameters
    ----------
    filepath : str or path-like
        Location of the CSV file; passed straight to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The dataset exactly as loaded (nothing is modified here).

    Raises
    ------
    KeyError
        If the file has no ``condition`` or ``transmission`` column.
    """
    print('=' * 60)
    print("LOAD AND EXPLORE DATASET")
    print('=' * 60)

    df = pd.read_csv(filepath)

    print('shape of the dataset:')
    print(df.shape)
    print('\nCheck for missing values:')
    print(df.isnull().sum())
    print('\nFirst five rows:')
    print(df.head())
    print('\nDescriptive Stats:')
    print(df.describe())
    print('\nDataset info:')
    # info() writes its report to stdout itself and returns None, so wrapping
    # it in print() would append a stray "None" line to the output.
    df.info()
    print('\nConditon Distribution:')
    print(df['condition'].value_counts())
    print('\nTransmission Distribution:')
    print(df['transmission'].value_counts())

    return df

In [11]:
def preprocessing_data(df):
    """Drop incomplete rows and label-encode the categorical columns.

    For each of ``make``, ``model``, ``condition`` and ``transmission`` a new
    ``<column>_encoded`` integer column is added, and the label-to-code
    mapping is printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw dataset containing the four categorical columns listed above.
        It is not modified.

    Returns
    -------
    tuple
        ``(df_processed, label_encoder)`` where ``df_processed`` is a new
        DataFrame with the encoded columns appended and ``label_encoder`` is
        a dict mapping each original column name to its fitted
        ``LabelEncoder``.
    """
    print('\n' + '=' * 60)
    print('PREPROCESSING LOAD DATA')
    print('=' * 60)

    # Copy the dropna() result itself: the original code copied df and then
    # discarded that copy, so the columns added below were written onto the
    # raw dropna() result instead of an explicit, independent frame.
    df_processed = df.dropna().copy()

    label_encoder = {}
    df_columns = ['make', 'model', 'condition', 'transmission']
    for col in df_columns:
        # NOTE(review): LabelEncoder assigns arbitrary integer codes, which a
        # linear model will treat as ordered magnitudes - confirm this is
        # intended rather than one-hot encoding.
        le = LabelEncoder()
        df_processed[col + '_encoded'] = le.fit_transform(df_processed[col])
        label_encoder[col] = le
        print(f'\n{col} encoded')
        for i, label in enumerate(le.classes_):
            print(f" {label}: {i}")
    print("preprocessed_dataset shape", df_processed.shape)

    return df_processed, label_encoder

In [17]:
def features_data(df_processed):
    """Select the model inputs and the target from the preprocessed frame.

    Parameters
    ----------
    df_processed : pandas.DataFrame
        Frame holding ``year``, the four ``*_encoded`` columns and ``price``.

    Returns
    -------
    tuple
        ``(X, y, feature_cols)``: the feature DataFrame, the target as a
        one-column DataFrame (``price``), and the list of feature names.
    """
    banner = '=' * 60
    print('\n' + banner)
    print('FEATURES DATA')
    print(banner)

    encoded_sources = ('make', 'model', 'condition', 'transmission')
    feature_cols = ['year'] + [f'{name}_encoded' for name in encoded_sources]

    X = df_processed[feature_cols]
    # Double brackets keep the target two-dimensional (a one-column frame).
    y = df_processed[['price']]

    for label, selection in (('Features', X), ('Target', y)):
        print(f'\n{label} shape', selection.shape)

    return X, y, feature_cols

In [20]:
def split_data(X, y, test_size=0.2, random_state=42):
    """Split features and target into training and testing subsets.

    Parameters
    ----------
    X : pandas.DataFrame or array-like
        Feature matrix.
    y : pandas.Series, pandas.DataFrame or array-like
        Target values aligned with ``X``.
    test_size : float, default 0.2
        Forwarded to ``train_test_split`` as ``test_size``.
    random_state : int, default 42
        Forwarded to ``train_test_split`` as ``random_state``.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    print('\n' + '=' * 60)
    print('SPLITING  DATA')
    print('=' * 60)

    # test_size / random_state are keyword-only options of train_test_split;
    # passed positionally they would be treated as additional arrays to split.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    print('\n Training set data', X_train.shape[0])
    print('Testing set data', X_test.shape[0])

    # y may be a Series or a one-column DataFrame; reduce over the raw values
    # so min/max are plain scalars that '{:.2f}' is able to format.
    train_prices = np.asarray(y_train, dtype=float)
    test_prices = np.asarray(y_test, dtype=float)
    print('\nTraining set price range{:.2f} - {:.2f}'.format(train_prices.min(), train_prices.max()))
    print('\nTesting set price range{:.2f} - {:.2f}'.format(test_prices.min(), test_prices.max()))

    return X_train, X_test, y_train, y_test

In [None]:
def scale_features(X_train, X_test):
    """Standardise the features using statistics from the training set only.

    Parameters
    ----------
    X_train : array-like
        Training features; the ``StandardScaler`` is fitted on these.
    X_test : array-like
        Testing features; transformed with the already-fitted scaler.

    Returns
    -------
    tuple
        ``(X_train_scaled, X_test_scaled, scaler)`` - the two transformed
        arrays and the fitted ``StandardScaler``.
    """
    print('\n' + '=' * 60)
    print('SCALE FEATURES  DATA')
    print('=' * 60)
    scaler = StandardScaler()
    # Fit on the training split only, then reuse the same parameters for the
    # test split.
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Was ``Print(...)``, which raised NameError before anything was returned.
    print('\nFeatures scaled successfully')
    print('Trainnig scaled set:', X_train_scaled.shape)
    print('Testing scaled set:', X_test_scaled.shape)

    return X_train_scaled, X_test_scaled, scaler

In [None]:
def train_model(X_train_scaled, y_train, feature_cols):
    """Fit a linear regression and print its coefficients and intercept.

    Parameters
    ----------
    X_train_scaled : array-like
        Scaled training features.
    y_train : array-like
        Training target; a 1-D Series/array or a one-column DataFrame.
    feature_cols : list of str
        Feature names, in the same order as the columns of
        ``X_train_scaled``; used only to label the printed coefficients.

    Returns
    -------
    sklearn.linear_model.LinearRegression
        The fitted model (previously nothing was returned, so the model was
        lost to the caller).
    """
    print('\n' + '=' * 60)
    print('TRAINING MODEL')
    print('=' * 60)

    model = LinearRegression()
    model.fit(X_train_scaled, y_train)
    print('\nModel trained successfully')
    print('\nModel coefficient')
    # With a one-column DataFrame target, coef_ is 2-D and intercept_ is a
    # 1-element array; flatten both so each value is a scalar for ':.2f'.
    coefficients = np.ravel(model.coef_)
    intercept = float(np.ravel(model.intercept_)[0])
    for features, coef in zip(feature_cols, coefficients):
        print(f" {features} : {coef:.2f}")
    print(f'\nModel intercept:{intercept:.2f}')

    return model

In [None]:
def evaluate_model(model, X_train_scaled, X_test_scaled, y_train, y_test):
    """Report R2, RMSE and MAE on both splits plus 5-fold cross-validation.

    Parameters
    ----------
    model : fitted estimator
        Must provide ``predict``; it is also passed to ``cross_val_score``.
    X_train_scaled, X_test_scaled : array-like
        Scaled feature matrices for the training and testing splits.
    y_train, y_test : array-like
        True target values for the corresponding splits.

    Returns
    -------
    dict
        Keys ``train_r2``, ``test_r2``, ``train_rmse``, ``test_rmse``,
        ``train_mae``, ``test_mae``, ``y_train_pred``, ``y_test_pred`` and
        ``cv_score`` (array of the per-fold R2 scores).
    """
    print('\n' + '=' * 60)
    print('EVALUATING  DATA')
    print('=' * 60)

    # make prediction
    y_train_pred = model.predict(X_train_scaled)
    y_test_pred = model.predict(X_test_scaled)

    # calculate metrics
    # sklearn metrics take (y_true, y_pred). R2 is not symmetric, so the
    # previously swapped arguments reported a different (wrong) score.
    train_r2 = r2_score(y_train, y_train_pred)
    test_r2 = r2_score(y_test, y_test_pred)
    train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
    test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
    train_mae = mean_absolute_error(y_train, y_train_pred)
    test_mae = mean_absolute_error(y_test, y_test_pred)

    print('\n' + '=' * 60)
    print('MODEL  PERFORMANCE')
    print('=' * 60)
    print('\nTraining set')
    print(f'R2 score: {train_r2:.2f}')
    print(f'RMSE: {train_rmse:2.2f}')
    print(f'MAE: {train_mae:2.2f}')
    print('\nTestin set')
    print(f'R2 score: {test_r2:.2f}')
    print(f'RMSE: {test_rmse:2.2f}')
    print(f'MAE: {test_mae:2.2f}')
    print('=' * 60)

    # cross validation score
    # The target has to be supplied for a supervised estimator; without it
    # every fold is fitted with y=None.
    # NOTE(review): the features were scaled before this call, so each
    # validation fold has influenced the scaler - consider a Pipeline.
    cv_score = cross_val_score(model, X_train_scaled, y_train, cv=5, scoring='r2')
    print('\nCross Validation (5-folds)')
    # cv_score is an array; format each fold, since ':.2f' cannot be applied
    # to an ndarray.
    print('R2 Score:' + ', '.join(f'{fold:.2f}' for fold in cv_score))
    print(f'Cross Validation mean: {cv_score.mean():.2f}')
    print(f"Cross Validation STD : {cv_score.std():.4f}")

    metrics = {
        "train_r2": train_r2,
        "test_r2": test_r2,
        "train_rmse": train_rmse,
        "test_rmse": test_rmse,
        "train_mae": train_mae,
        "test_mae": test_mae,
        "y_train_pred": y_train_pred,
        'y_test_pred': y_test_pred,
        "cv_score": cv_score
    }

    return metrics

In [21]:
def main(filepath='cleaned_jiji_car_dataset.csv'):
    """Run the loading, preprocessing and feature-selection stages.

    Only these three stages are executed; the splitting, scaling, training
    and evaluation functions defined in this notebook are not called here.

    Parameters
    ----------
    filepath : str or path-like, default 'cleaned_jiji_car_dataset.csv'
        CSV file to load. The default is the path that used to be hard-coded,
        so calling ``main()`` with no argument behaves as before.

    Returns
    -------
    tuple
        ``(X, y, feature_cols)`` as produced by ``features_data``, so the
        result can be reused by later cells instead of being discarded.
    """
    df = load_and_explore_dataset(filepath)
    # The fitted encoders are not needed by the stages run here.
    df_processed, label_encoder = preprocessing_data(df)
    X, y, feature_cols = features_data(df_processed)
    return X, y, feature_cols

In [8]:
# Entry point: run the pipeline when this code is executed directly rather
# than imported as a module.
if __name__ == "__main__":
    main()

LOAD AND EXPLORE DATASET
shape of the dataset:
(1755, 7)

Check for missing values:
title           0
make            0
model           0
year            0
condition       0
transmission    0
price           0
dtype: int64

First five rows:
                                               title           make  \
0                            Lexus RX 350 2009 White          Lexus   
1  Hyundai Sonata Limited w/Brown Leather 4dr Sed...        Hyundai   
2                Toyota Highlander Limited 2012 Gray         Toyota   
3  Mercedes-Benz C300 Base AWD 4Matic Sedan (2.0L...  Mercedes-Benz   
4                        Hyundai Elantra 2014 Silver        Hyundai   

                model  year     condition transmission       price  
0              RX 350  2009  Foreign used    Automatic  12850000.0  
1      Sonata Limited  2015  Foreign used    Automatic  15450000.0  
2  Highlander Limited  2012    Local used    Automatic  14500000.0  
3           C300 Base  2015    Local used    Automatic  