# Data Importing

In [1]:
import pandas as pd
import numpy as np
import category_encoders as ce
import miceforest as mf
import optuna
import lightgbm as lgb
import xgboost as xgb

from utils import *
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate, KFold
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, Lasso
from sklearn import metrics
from sklearn.impute import SimpleImputer

In [2]:
df = pd.read_csv('../data/processed/after_prep.csv')
df.head()

Unnamed: 0,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Seats,Price,Brand,Series,Type,Mileage (kmpl),Engine (CC),Power (bhp)
0,Mumbai,2010,72000,CNG,Manual,First,5.0,1.75,Maruti,Wagon,R,26.6,998.0,58.16
1,Pune,2015,41000,Diesel,Manual,First,5.0,12.5,Hyundai,Creta,1.6,19.67,1582.0,126.2
2,Chennai,2011,46000,Petrol,Manual,First,5.0,4.5,Honda,Jazz,V,18.2,1199.0,88.7
3,Chennai,2012,87000,Diesel,Manual,First,7.0,6.0,Maruti,Ertiga,VDI,20.77,1248.0,88.76
4,Coimbatore,2013,40670,Diesel,Automatic,Second,5.0,17.74,Audi,A4,New,15.2,1968.0,140.8


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6019 entries, 0 to 6018
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Location           6019 non-null   object 
 1   Year               6019 non-null   int64  
 2   Kilometers_Driven  6019 non-null   int64  
 3   Fuel_Type          6019 non-null   object 
 4   Transmission       6019 non-null   object 
 5   Owner_Type         6019 non-null   object 
 6   Seats              5976 non-null   float64
 7   Price              6019 non-null   float64
 8   Brand              6019 non-null   object 
 9   Series             6019 non-null   object 
 10  Type               6019 non-null   object 
 11  Mileage (kmpl)     5951 non-null   float64
 12  Engine (CC)        5983 non-null   float64
 13  Power (bhp)        5876 non-null   float64
dtypes: float64(5), int64(2), object(7)
memory usage: 658.5+ KB


# Preprocessing

In [4]:
# Delete outlier
df = df[~(df.Kilometers_Driven > 1e6)]
df.shape

(6018, 14)

In [19]:
# Drop missing values
df = df.dropna()
null_checker(df)

Unnamed: 0,null (sum),null (%)
Location,0,0.0
Year,0,0.0
Kilometers_Driven,0,0.0
Fuel_Type,0,0.0
Transmission,0,0.0
Owner_Type,0,0.0
Seats,0,0.0
Price,0,0.0
Brand,0,0.0
Series,0,0.0


## Train test split

In [6]:
# melakukan train test split di awal untuk mencegah data bocor ke test set saat dilakukan encoding/imputation
features = df.drop(columns=['Price'])
target = df['Price']
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.25, random_state=0)

## Encoding

In [7]:
# One hot encoding
col_to_encode = ['Location', 'Fuel_Type', 'Transmission', 'Owner_Type', 'Brand']
oh_encoder = ce.OneHotEncoder(cols=col_to_encode,
                              use_cat_names=True)
oh_encoder.fit(X_train)

# Encoding train set
X_train = oh_encoder.transform(X_train)
# Encoding test set
X_test = oh_encoder.transform(X_test)

In [8]:
# Target encoding/One hot encoding untuk feature dengan kategori yang banyak
col_to_encode = ['Series', 'Type']
target_encoder = ce.TargetEncoder(cols=col_to_encode)
target_encoder.fit(X_train, y_train)

# Encoding train set
X_train = target_encoder.transform(X_train)
# Encoding test set
X_test = target_encoder.transform(X_test)

# Modeling

## Functions

In [9]:
def get_cv_score(models, X_train, y_train):
    
    cv = KFold(n_splits=5, shuffle=True, random_state=0)
    summary = []
    for label, model in models.items():
        cv_results = cross_validate(model, X_train, y_train, cv=cv, 
                                    scoring=['r2',
                                             'neg_root_mean_squared_error',
                                             'neg_mean_absolute_error'])
        
        temp = pd.DataFrame(cv_results).copy()
        temp['Model'] = label
        summary.append(temp)
    
    summary = pd.concat(summary)
    summary = summary.groupby('Model').mean()
    
    summary.drop(columns=['fit_time', 'score_time'], inplace=True)
    summary.columns = ['CV R2', 'CV RMSE', 'CV MAE']
    summary[['CV RMSE', 'CV MAE']] = summary[['CV RMSE', 'CV MAE']] * -1
    
    return summary

In [10]:
def evaluate_model(models, X_train, X_test, y_train, y_test):

    summary = {'Model':[], 'Train R2':[], 'Train RMSE':[], 'Train MAE':[],
               'Test R2':[], 'Test RMSE':[], 'Test MAE':[]}

    for label, model in models.items():
        model.fit(X_train, y_train)

        y_train_pred = model.predict(X_train)
        y_test_pred = model.predict(X_test)

        summary['Model'].append(label)

        summary['Train R2'].append(
            metrics.r2_score(y_train, y_train_pred))
        summary['Train RMSE'].append(
            np.sqrt(metrics.mean_squared_error(y_train, y_train_pred)))
        summary['Train MAE'].append(
            metrics.mean_absolute_error(y_train, y_train_pred))

        summary['Test R2'].append(
            metrics.r2_score(y_test, y_test_pred))
        summary['Test RMSE'].append(
            np.sqrt(metrics.mean_squared_error(y_test, y_test_pred)))
        summary['Test MAE'].append(
            metrics.mean_absolute_error(y_test, y_test_pred))
    
    summary = pd.DataFrame(summary)
    summary.set_index('Model', inplace=True)

    cv_scores = get_cv_score(models, X_train, y_train)
    summary = summary.join(cv_scores)
    summary = summary[['Train R2', 'CV R2', 'Test R2',
                       'Train RMSE', 'CV RMSE', 'Test RMSE',
                       'Train MAE', 'CV MAE', 'Test MAE']]
    
    return round(summary.sort_values(by='Test RMSE'), 4)

## Base Model

In [11]:
tree_model = DecisionTreeRegressor()
rf_model = RandomForestRegressor()
xgb_model = XGBRegressor(objective='reg:squarederror')
lgb_model = LGBMRegressor()
cat_model = CatBoostRegressor(verbose=0, iterations=2000)
lr_model = LinearRegression()
lasso_model = Lasso()

models_tree = {'DecisionTreeRegressor' : tree_model,
          'RandomForestRegressor' : rf_model,
          'XGBRegressor' : xgb_model,
          'CatBoostRegressor' : cat_model,
          'LGBMRegressor' : lgb_model}

models_linear ={'LinearRegression': lr_model,
          'LassoRegression': lasso_model}

### Unscaled dataset

In [12]:
evaluate_model(models_tree, X_train, X_test, y_train, y_test)

Unnamed: 0_level_0,Train R2,CV R2,Test R2,Train RMSE,CV RMSE,Test RMSE,Train MAE,CV MAE,Test MAE
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
CatBoostRegressor,0.9936,0.9282,0.9241,0.8776,2.8746,3.2346,0.6022,1.1218,1.2277
LGBMRegressor,0.9729,0.919,0.9031,1.8035,3.0825,3.6549,0.7936,1.1914,1.3504
RandomForestRegressor,0.9882,0.9162,0.8993,1.1884,3.1446,3.726,0.4632,1.262,1.4293
XGBRegressor,0.9977,0.9134,0.8913,0.5278,3.167,3.8712,0.3761,1.2154,1.3612
DecisionTreeRegressor,1.0,0.8367,0.7932,0.0131,4.3782,5.341,0.0006,1.7404,1.8852


In [13]:
# evaluasi model memakai function
unscaled = evaluate_model(models_tree, X_train, X_test, y_train, y_test)

### Scaled dataset

In [14]:
# Scaling data
from sklearn.preprocessing import RobustScaler

scaler = RobustScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [15]:
# evaluasi model memakai function
scaled = evaluate_model(models_tree, X_train_scaled, X_test_scaled, y_train, y_test)

### Summarizing

In [16]:
unscaled['Dataset Version'] = 'dropna + all + unscaled'
scaled['Dataset Version'] = 'dropna + all + scaled'

In [17]:
dropna_all = pd.concat([unscaled, scaled], axis=0)
dropna_all

Unnamed: 0_level_0,Train R2,CV R2,Test R2,Train RMSE,CV RMSE,Test RMSE,Train MAE,CV MAE,Test MAE,Dataset Version
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
CatBoostRegressor,0.9936,0.9282,0.9241,0.8776,2.8746,3.2346,0.6022,1.1218,1.2277,dropna + all + unscaled
LGBMRegressor,0.9729,0.919,0.9031,1.8035,3.0825,3.6549,0.7936,1.1914,1.3504,dropna + all + unscaled
RandomForestRegressor,0.9888,0.914,0.8959,1.159,3.1808,3.7885,0.4676,1.264,1.4497,dropna + all + unscaled
XGBRegressor,0.9977,0.9134,0.8913,0.5278,3.167,3.8712,0.3761,1.2154,1.3612,dropna + all + unscaled
DecisionTreeRegressor,1.0,0.8548,0.7637,0.0131,4.159,5.7092,0.0006,1.7353,1.9412,dropna + all + unscaled
CatBoostRegressor,0.9936,0.9282,0.9241,0.8776,2.8746,3.2346,0.6022,1.1218,1.2277,dropna + all + scaled
LGBMRegressor,0.9731,0.919,0.9036,1.7988,3.0816,3.6471,0.7914,1.1868,1.3393,dropna + all + scaled
RandomForestRegressor,0.9865,0.9153,0.8985,1.2749,3.1569,3.7422,0.467,1.2512,1.4496,dropna + all + scaled
XGBRegressor,0.9977,0.9134,0.8913,0.5278,3.167,3.8711,0.3761,1.2155,1.361,dropna + all + scaled
DecisionTreeRegressor,1.0,0.857,0.7837,0.0131,4.1237,5.4613,0.0006,1.7187,1.9386,dropna + all + scaled


In [18]:
dropna_all.to_csv('../data/processed/summary_dropna_all.csv')