# Data Importing

In [1]:
import numpy as np
import pandas as pd
import category_encoders as ce

from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor

from utils import null_checker, evaluate_model

In [2]:
# Load the preprocessed dataset and preview its first five rows.
df = pd.read_csv(
    '../data/processed/after_prep.csv',
)
df.head(5)

Unnamed: 0,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Seats,Price,Brand,Series,Type,Mileage (kmpl),Engine (CC),Power (bhp)
0,Mumbai,2010,72000,CNG,Manual,First,5.0,1.75,Maruti,Wagon,R,26.6,998.0,58.16
1,Pune,2015,41000,Diesel,Manual,First,5.0,12.5,Hyundai,Creta,1.6,19.67,1582.0,126.2
2,Chennai,2011,46000,Petrol,Manual,First,5.0,4.5,Honda,Jazz,V,18.2,1199.0,88.7
3,Chennai,2012,87000,Diesel,Manual,First,7.0,6.0,Maruti,Ertiga,VDI,20.77,1248.0,88.76
4,Coimbatore,2013,40670,Diesel,Automatic,Second,5.0,17.74,Audi,A4,New,15.2,1968.0,140.8


In [3]:
# Print per-column dtypes and non-null counts to spot columns with missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6019 entries, 0 to 6018
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Location           6019 non-null   object 
 1   Year               6019 non-null   int64  
 2   Kilometers_Driven  6019 non-null   int64  
 3   Fuel_Type          6019 non-null   object 
 4   Transmission       6019 non-null   object 
 5   Owner_Type         6019 non-null   object 
 6   Seats              5976 non-null   float64
 7   Price              6019 non-null   float64
 8   Brand              6019 non-null   object 
 9   Series             6019 non-null   object 
 10  Type               6019 non-null   object 
 11  Mileage (kmpl)     5951 non-null   float64
 12  Engine (CC)        5983 non-null   float64
 13  Power (bhp)        5876 non-null   float64
dtypes: float64(5), int64(2), object(7)
memory usage: 658.5+ KB


# Preprocessing

In [4]:
# Remove outliers: keep every row whose Kilometers_Driven is NOT above 1e6.
# The negated ">" (rather than "<=") is kept on purpose so the filter matches
# the original comparison exactly.
df = df.loc[~df['Kilometers_Driven'].gt(1e6)]
df.shape

(6018, 14)

In [5]:
# Discard every row that has at least one missing value, then display the
# null_checker report for the cleaned frame (presumably per-column null
# counts/percentages -- see the helper in utils to confirm).
df = df.dropna(axis=0, how='any')
null_checker(df)

Unnamed: 0,null (sum),null (%)
Location,0,0.0
Year,0,0.0
Kilometers_Driven,0,0.0
Fuel_Type,0,0.0
Transmission,0,0.0
Owner_Type,0,0.0
Seats,0,0.0
Price,0,0.0
Brand,0,0.0
Series,0,0.0


## Train test split

In [6]:
# Split into train and test sets up front, before any encoder/selector is
# fitted, to prevent data leakage from the test set.
y = df['Price']
X = df.drop('Price', axis=1)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=0,
)

## Encoding

In [7]:
# Define category mapping for label encoding
mapping_owner = {
    'First': 1,
    'Second': 2,
    'Third': 3,
    'Fourth & Above': 4
}
mapping_trans = {
    'Manual': 0,
    'Automatic': 1,
}


def _encode_ordinal(frame, column_mappings):
    """Return a new frame with each listed column replaced by its mapped code.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame containing every column named in ``column_mappings``.
    column_mappings : dict[str, dict]
        Column name -> {category: code} mapping.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``frame`` itself is not modified, which avoids
        SettingWithCopyWarning on the frames produced by the split.

    Raises
    ------
    ValueError
        If any value has no entry in its mapping. ``Series.map`` would turn
        such values into NaN silently; this also catches an accidental
        re-run of the cell on already-encoded columns.
    """
    encoded = frame.assign(
        **{col: frame[col].map(mapping) for col, mapping in column_mappings.items()}
    )
    unmapped = encoded[list(column_mappings)].isna().sum()
    if unmapped.any():
        raise ValueError(
            "Unmapped categories (or columns already encoded) in: "
            f"{unmapped[unmapped > 0].to_dict()}"
        )
    return encoded


ordinal_mappings = {"Owner_Type": mapping_owner, "Transmission": mapping_trans}

# Encoding train set
X_train = _encode_ordinal(X_train, ordinal_mappings)
# Encoding test set
X_test = _encode_ordinal(X_test, ordinal_mappings)

In [8]:
# One-hot encode the low-cardinality categorical features, plus Brand.
# The encoder is fitted on the training split only and then applied to both
# splits, so the test set never influences the fitted categories.
col_to_encode = ['Location', 'Fuel_Type', 'Brand']
oh_encoder = ce.OneHotEncoder(cols=col_to_encode, use_cat_names=True).fit(X_train)

# Encode train and test with the same fitted encoder.
X_train, X_test = oh_encoder.transform(X_train), oh_encoder.transform(X_test)

In [9]:
# Target-encode the high-cardinality features, i.e. every column that is
# still object-dtype after the previous encoding steps.
col_to_encode = X_train.select_dtypes(include="object").columns
encoder = ce.TargetEncoder(cols=col_to_encode)

# Fit on the training split (features + target) only.
encoder.fit(X_train, y_train)

# Apply the fitted encoder to the train set and the test set alike.
X_train, X_test = (encoder.transform(frame) for frame in (X_train, X_test))

## Feature Selection

In [10]:
# Keep only the features that are strongly correlated with the target.
# Correlations are computed on the training split only.
CORR_THRESHOLD = 0.20  # minimum absolute correlation with Price to keep a feature

corr_price = X_train.join(y_train).corr()['Price']
index = corr_price[corr_price.abs() > CORR_THRESHOLD].index

# Remove the target by label instead of assuming it is the last entry:
# a positional slice would silently discard a real feature if 'Price' were
# ever missing from (or not last in) the selected labels.
selected_features = index.drop('Price', errors='ignore')

X_train = X_train[selected_features]
X_test = X_test[selected_features]

# Modeling

In [11]:
# Fixed seed so the randomized estimators produce the same scores on every
# run (same value as the random_state used for the train/test split).
SEED = 0

# Tree-based and boosting regressors (seeded for reproducibility).
tree_model = DecisionTreeRegressor(random_state=SEED)
rf_model = RandomForestRegressor(random_state=SEED)
xgb_model = XGBRegressor(random_state=SEED)
lgb_model = LGBMRegressor(random_state=SEED)
cat_model = CatBoostRegressor(silent=True, random_seed=SEED)

# Linear baselines with library-default hyperparameters.
lr_model = LinearRegression()
lasso_model = Lasso()
ridge_model = Ridge()

# Display name -> estimator; the whole dict is handed to evaluate_model.
models = {'DecisionTree' : tree_model,
          'RandomForest' : rf_model,
          'XGBoost' : xgb_model,
          'CatBoost' : cat_model,
          'LightGBM' : lgb_model,
          'Linear': lr_model,
          'Lasso': lasso_model,
          'Ridge': ridge_model}

### Unscaled dataset

In [12]:
# Evaluate every model in `models` on the unscaled features using the
# evaluate_model helper from utils; the result is kept as `unscaled`.
unscaled = evaluate_model(models, X_train, X_test, y_train, y_test)

### Scaled dataset

In [13]:
# Scaling data
# RobustScaler is imported in the notebook's top import cell.
# The scaler is fitted on the training split only and then applied to both
# splits, so no test-set statistics leak into the transformation.
scaler = RobustScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [14]:
# Evaluate the same models on the scaled features using the evaluate_model
# helper from utils; the result is kept as `scaled`.
scaled = evaluate_model(models, X_train_scaled, X_test_scaled, y_train, y_test)

### Summarizing

In [15]:
# Label each result table with the dataset variant it was computed on, so the
# rows stay distinguishable after the two tables are concatenated.
unscaled['Dataset Version'] = 'dropna + selected + unscaled'
scaled['Dataset Version'] = 'dropna + selected + scaled'

In [16]:
# Stack the two result tables row-wise (unscaled rows first, then scaled)
# and display the combined summary.
dropna_selected = pd.concat((unscaled, scaled), axis='index')
dropna_selected

Unnamed: 0_level_0,Fit Time,Train R2,CV R2,Test R2,Train RMSE,CV RMSE,Test RMSE,Dataset Version
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
LightGBM,0.1773,0.9561,0.9031,0.8824,2.2947,3.3883,4.0274,dropna + selected + unscaled
CatBoost,3.4547,0.9799,0.902,0.9119,1.5554,3.3884,3.4853,dropna + selected + unscaled
RandomForest,1.3959,0.9783,0.9007,0.8797,1.614,3.4205,4.0724,dropna + selected + unscaled
XGBoost,0.2003,0.9866,0.8763,0.8723,1.2692,3.8017,4.1964,dropna + selected + unscaled
DecisionTree,0.0237,0.9897,0.8458,0.7906,1.1131,4.2085,5.3736,dropna + selected + unscaled
Ridge,0.0041,0.8028,0.799,0.7537,4.8667,4.9071,5.8284,dropna + selected + unscaled
Linear,0.0112,0.8028,0.7989,0.7536,4.8666,4.9086,5.8289,dropna + selected + unscaled
Lasso,0.0068,0.7979,0.7975,0.7479,4.9266,4.9249,5.8966,dropna + selected + unscaled
CatBoost,2.99,0.9799,0.9021,0.9119,1.5554,3.3883,3.4853,dropna + selected + scaled
LightGBM,0.1845,0.9564,0.9016,0.8816,2.2874,3.413,4.0407,dropna + selected + scaled


In [17]:
# Persist the combined summary table (index included) as a CSV file.
dropna_selected.to_csv('../data/processed/summary_dropna_selected.csv')