# Data Understanding

In [1]:
# !pip install -r requirements.txt

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import utils as explor
import plotly.graph_objects as go
import plotly.express as px
import missingno as msno
import category_encoders as ce
import miceforest as mf

from sklearn.model_selection import train_test_split, cross_val_score, cross_validate, KFold
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, Lasso
import optuna
import lightgbm as lgb
import xgboost as xgb
import catboost as cb

In [4]:
sns.set(style='darkgrid', palette='muted')

In [5]:
df = pd.read_csv('../data/raw/used_car_data.csv')

In [6]:
df.head()

Unnamed: 0,Name,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,Price
0,Maruti Wagon R LXI CNG,Mumbai,2010,72000,CNG,Manual,First,26.6 km/kg,998 CC,58.16 bhp,5.0,1.75
1,Hyundai Creta 1.6 CRDi SX Option,Pune,2015,41000,Diesel,Manual,First,19.67 kmpl,1582 CC,126.2 bhp,5.0,12.5
2,Honda Jazz V,Chennai,2011,46000,Petrol,Manual,First,18.2 kmpl,1199 CC,88.7 bhp,5.0,4.5
3,Maruti Ertiga VDI,Chennai,2012,87000,Diesel,Manual,First,20.77 kmpl,1248 CC,88.76 bhp,7.0,6.0
4,Audi A4 New 2.0 TDI Multitronic,Coimbatore,2013,40670,Diesel,Automatic,Second,15.2 kmpl,1968 CC,140.8 bhp,5.0,17.74


In [6]:
df.shape

(6019, 12)

In [7]:
df[df['Mileage'].isna()]

Unnamed: 0,Name,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,Price
4446,Mahindra E Verito D4,Chennai,2016,50000,Electric,Automatic,First,,72 CC,41 bhp,5.0,13.0
4904,Toyota Prius 2009-2016 Z4,Mumbai,2011,44000,Electric,Automatic,First,,1798 CC,73 bhp,5.0,12.75


In [8]:
explor.null_checker(df)

Unnamed: 0,null (sum),null (%)
Seats,42,0.7
Engine,36,0.6
Power,36,0.6
Mileage,2,0.03
Name,0,0.0
Location,0,0.0
Year,0,0.0
Kilometers_Driven,0,0.0
Fuel_Type,0,0.0
Transmission,0,0.0


In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6019 entries, 0 to 6018
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Name               6019 non-null   object 
 1   Location           6019 non-null   object 
 2   Year               6019 non-null   int64  
 3   Kilometers_Driven  6019 non-null   int64  
 4   Fuel_Type          6019 non-null   object 
 5   Transmission       6019 non-null   object 
 6   Owner_Type         6019 non-null   object 
 7   Mileage            6017 non-null   object 
 8   Engine             5983 non-null   object 
 9   Power              5983 non-null   object 
 10  Seats              5977 non-null   float64
 11  Price              6019 non-null   float64
dtypes: float64(2), int64(2), object(8)
memory usage: 564.4+ KB


In [10]:
df['Brand'] = df['Name'].apply(lambda x: x.split(' ')[0])
df['Series'] = df['Name'].apply(lambda x: x.split(' ')[1])
df['Type'] = df['Name'].apply(lambda x: x.split(' ')[2])
df.drop(columns='Name', inplace=True)

In [11]:
df.head()

Unnamed: 0,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,Price,Brand,Series,Type
0,Mumbai,2010,72000,CNG,Manual,First,26.6 km/kg,998 CC,58.16 bhp,5.0,1.75,Maruti,Wagon,R
1,Pune,2015,41000,Diesel,Manual,First,19.67 kmpl,1582 CC,126.2 bhp,5.0,12.5,Hyundai,Creta,1.6
2,Chennai,2011,46000,Petrol,Manual,First,18.2 kmpl,1199 CC,88.7 bhp,5.0,4.5,Honda,Jazz,V
3,Chennai,2012,87000,Diesel,Manual,First,20.77 kmpl,1248 CC,88.76 bhp,7.0,6.0,Maruti,Ertiga,VDI
4,Coimbatore,2013,40670,Diesel,Automatic,Second,15.2 kmpl,1968 CC,140.8 bhp,5.0,17.74,Audi,A4,New


In [12]:
df[(df['Engine']=='72 CC')]

Unnamed: 0,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,Price,Brand,Series,Type
4446,Chennai,2016,50000,Electric,Automatic,First,,72 CC,41 bhp,5.0,13.0,Mahindra,E,Verito


In [13]:
print('Satuan pada feature Mileage:', df['Mileage'].apply(lambda x: x if pd.isna(x) else x.split(' ')[1]).unique())
print('Satuan pada feature Engine:', df['Engine'].apply(lambda x: x if pd.isna(x) else x.split(' ')[1]).unique())
print('Satuan pada feature Power:', df['Power'].apply(lambda x: x if pd.isna(x) else x.split(' ')[1]).unique())

Satuan pada feature Mileage: ['km/kg' 'kmpl' nan]
Satuan pada feature Engine: ['CC' nan]
Satuan pada feature Power: ['bhp' nan]


In [14]:
print('Invalid Value pada feature Mileage:', pd.Series([x for x in df['Mileage'] if str(x).split(' ')[0].isalpha()]).unique())
print('Invalid Value pada feature Engine:', pd.Series([x for x in df['Engine'] if str(x).split(' ')[0].isalpha()]).unique())
print('Invalid Value pada feature Power:', pd.Series([x for x in df['Power'] if str(x).split(' ')[0].isalpha()]).unique())

Invalid Value pada feature Mileage: [nan]
Invalid Value pada feature Engine: [nan]
Invalid Value pada feature Power: ['null bhp' nan]


In [15]:
df['Mileage'].apply(lambda x: x if pd.isna(x) else x.split(' ')[1]).value_counts()

kmpl     5951
km/kg      66
Name: Mileage, dtype: int64

In [16]:
df['Mileage (kmpl)'] = df['Mileage'].apply(lambda x: x if pd.isna(x) else x.split(' ')[0])
df['Engine (CC)'] = df['Engine'].apply(lambda x: x if pd.isna(x) else x.split(' ')[0])
df['Power (bhp)'] = df['Power'].apply(lambda x: x if pd.isna(x) else x.split(' ')[0])

df['Mileage (kmpl)'] = pd.to_numeric(df['Mileage (kmpl)'], errors='coerce')
df['Engine (CC)'] = pd.to_numeric(df['Engine (CC)'], errors='coerce')
df['Power (bhp)'] = pd.to_numeric(df['Power (bhp)'], errors='coerce')

df.drop(columns=['Mileage', 'Engine', 'Power'], inplace=True)

In [17]:
df.head()

Unnamed: 0,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Seats,Price,Brand,Series,Type,Mileage (kmpl),Engine (CC),Power (bhp)
0,Mumbai,2010,72000,CNG,Manual,First,5.0,1.75,Maruti,Wagon,R,26.6,998.0,58.16
1,Pune,2015,41000,Diesel,Manual,First,5.0,12.5,Hyundai,Creta,1.6,19.67,1582.0,126.2
2,Chennai,2011,46000,Petrol,Manual,First,5.0,4.5,Honda,Jazz,V,18.2,1199.0,88.7
3,Chennai,2012,87000,Diesel,Manual,First,7.0,6.0,Maruti,Ertiga,VDI,20.77,1248.0,88.76
4,Coimbatore,2013,40670,Diesel,Automatic,Second,5.0,17.74,Audi,A4,New,15.2,1968.0,140.8


In [18]:
df.describe()

Unnamed: 0,Year,Kilometers_Driven,Seats,Price,Mileage (kmpl),Engine (CC),Power (bhp)
count,6019.0,6019.0,5977.0,6019.0,6017.0,5983.0,5876.0
mean,2013.358199,58738.38,5.278735,9.479468,18.134961,1621.27645,113.25305
std,3.269742,91268.84,0.80884,11.187917,4.582289,601.355233,53.874957
min,1998.0,171.0,0.0,0.44,0.0,72.0,34.2
25%,2011.0,34000.0,5.0,3.5,15.17,1198.0,75.0
50%,2014.0,53000.0,5.0,5.64,18.15,1493.0,97.7
75%,2016.0,73000.0,5.0,9.95,21.1,1984.0,138.1
max,2019.0,6500000.0,10.0,160.0,33.54,5998.0,560.0


In [19]:
df[df['Mileage (kmpl)']==0]

Unnamed: 0,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Seats,Price,Brand,Series,Type,Mileage (kmpl),Engine (CC),Power (bhp)
14,Pune,2012,85000,Diesel,Automatic,Second,5.0,17.50,Land,Rover,Freelander,0.0,2179.0,115.0
67,Coimbatore,2019,15369,Diesel,Automatic,First,5.0,35.67,Mercedes-Benz,C-Class,Progressive,0.0,1950.0,194.0
79,Hyderabad,2005,87591,Petrol,Manual,First,5.0,1.30,Hyundai,Santro,Xing,0.0,1086.0,
194,Ahmedabad,2007,60006,Petrol,Manual,First,,2.95,Honda,City,1.5,0.0,,
229,Bangalore,2015,70436,Diesel,Manual,First,,3.60,Ford,Figo,Diesel,0.0,1498.0,99.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5647,Mumbai,2001,227000,Diesel,Manual,Fourth & Above,8.0,2.20,Toyota,Qualis,Fleet,0.0,2446.0,
5875,Ahmedabad,2019,4000,Diesel,Automatic,First,5.0,35.00,Mercedes-Benz,C-Class,Progressive,0.0,1950.0,194.0
5943,Chennai,2002,75000,Diesel,Manual,First,6.0,1.70,Mahindra,Jeep,MM,0.0,2112.0,
5972,Mumbai,2008,65000,Petrol,Manual,Second,5.0,1.39,Hyundai,Santro,Xing,0.0,1086.0,62.0


In [20]:
df[df['Seats']==0]

Unnamed: 0,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Seats,Price,Brand,Series,Type,Mileage (kmpl),Engine (CC),Power (bhp)
3999,Hyderabad,2012,125000,Petrol,Automatic,First,0.0,18.0,Audi,A4,3.2,10.5,3197.0,


In [21]:
df['Mileage (kmpl)'] = df['Mileage (kmpl)'].replace(0, np.nan)
df['Seats'] = df['Seats'].replace(0, np.nan)

In [22]:
cat_cols = [col for col in df.columns if df[col].dtypes == 'object']
df[cat_cols].nunique()

Location         11
Fuel_Type         5
Transmission      2
Owner_Type        4
Brand            31
Series          212
Type            339
dtype: int64

In [23]:
['a', 'b']

['a', 'b']

In [24]:
for col in cat_cols:
  print(col, df[col].unique(), '\n')

Location ['Mumbai' 'Pune' 'Chennai' 'Coimbatore' 'Hyderabad' 'Jaipur' 'Kochi'
 'Kolkata' 'Delhi' 'Bangalore' 'Ahmedabad'] 

Fuel_Type ['CNG' 'Diesel' 'Petrol' 'LPG' 'Electric'] 

Transmission ['Manual' 'Automatic'] 

Owner_Type ['First' 'Second' 'Fourth & Above' 'Third'] 

Brand ['Maruti' 'Hyundai' 'Honda' 'Audi' 'Nissan' 'Toyota' 'Volkswagen' 'Tata'
 'Land' 'Mitsubishi' 'Renault' 'Mercedes-Benz' 'BMW' 'Mahindra' 'Ford'
 'Porsche' 'Datsun' 'Jaguar' 'Volvo' 'Chevrolet' 'Skoda' 'Mini' 'Fiat'
 'Jeep' 'Smart' 'Ambassador' 'Isuzu' 'ISUZU' 'Force' 'Bentley'
 'Lamborghini'] 

Series ['Wagon' 'Creta' 'Jazz' 'Ertiga' 'A4' 'EON' 'Micra' 'Innova' 'Vento'
 'Indica' 'Ciaz' 'City' 'Swift' 'Rover' 'Pajero' 'Amaze' 'Duster' 'New'
 '3' 'S' 'A6' 'i20' 'Alto' 'WRV' 'Corolla' 'Ssangyong' 'Vitara' 'KUV'
 'M-Class' 'Polo' 'Nano' 'Elantra' 'Xcent' 'Thar' 'Grand' 'KWID' 'i10'
 'X-Trail' 'Zen' 'Figo' 'C-Class' 'Cayenne' 'XUV500' 'Terrano' 'Brio'
 'Fiesta' 'Santro' 'Zest' 'Ritz' '5' 'Fortuner' 'Ecosport' 'Verna

In [25]:
df['Brand'] = df['Brand'].replace('ISUZU', 'Isuzu')

In [26]:
print('Brand', df['Brand'].unique())

Brand ['Maruti' 'Hyundai' 'Honda' 'Audi' 'Nissan' 'Toyota' 'Volkswagen' 'Tata'
 'Land' 'Mitsubishi' 'Renault' 'Mercedes-Benz' 'BMW' 'Mahindra' 'Ford'
 'Porsche' 'Datsun' 'Jaguar' 'Volvo' 'Chevrolet' 'Skoda' 'Mini' 'Fiat'
 'Jeep' 'Smart' 'Ambassador' 'Isuzu' 'Force' 'Bentley' 'Lamborghini']


In [27]:
df['Fuel_Type'].value_counts()

Diesel      3205
Petrol      2746
CNG           56
LPG           10
Electric       2
Name: Fuel_Type, dtype: int64

In [28]:
df['Series'].nunique()

212

In [29]:
df.head()

Unnamed: 0,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Seats,Price,Brand,Series,Type,Mileage (kmpl),Engine (CC),Power (bhp)
0,Mumbai,2010,72000,CNG,Manual,First,5.0,1.75,Maruti,Wagon,R,26.6,998.0,58.16
1,Pune,2015,41000,Diesel,Manual,First,5.0,12.5,Hyundai,Creta,1.6,19.67,1582.0,126.2
2,Chennai,2011,46000,Petrol,Manual,First,5.0,4.5,Honda,Jazz,V,18.2,1199.0,88.7
3,Chennai,2012,87000,Diesel,Manual,First,7.0,6.0,Maruti,Ertiga,VDI,20.77,1248.0,88.76
4,Coimbatore,2013,40670,Diesel,Automatic,Second,5.0,17.74,Audi,A4,New,15.2,1968.0,140.8


In [30]:
df.describe()

Unnamed: 0,Year,Kilometers_Driven,Seats,Price,Mileage (kmpl),Engine (CC),Power (bhp)
count,6019.0,6019.0,5976.0,6019.0,5949.0,5983.0,5876.0
mean,2013.358199,58738.38,5.279618,9.479468,18.342252,1621.27645,113.25305
std,3.269742,91268.84,0.806019,11.187917,4.175475,601.355233,53.874957
min,1998.0,171.0,2.0,0.44,6.4,72.0,34.2
25%,2011.0,34000.0,5.0,3.5,15.3,1198.0,75.0
50%,2014.0,53000.0,5.0,5.64,18.2,1493.0,97.7
75%,2016.0,73000.0,5.0,9.95,21.1,1984.0,138.1
max,2019.0,6500000.0,10.0,160.0,33.54,5998.0,560.0


In [31]:
df.describe(include=['object']) 

Unnamed: 0,Location,Fuel_Type,Transmission,Owner_Type,Brand,Series,Type
count,6019,6019,6019,6019,6019,6019,6019.0
unique,11,5,2,4,30,212,339.0
top,Mumbai,Diesel,Manual,First,Maruti,Swift,1.5
freq,790,3205,4299,4929,1211,353,286.0


In [32]:
explor.null_checker(df)

Unnamed: 0,null (sum),null (%)
Power (bhp),143,2.38
Mileage (kmpl),70,1.16
Seats,43,0.71
Engine (CC),36,0.6
Location,0,0.0
Year,0,0.0
Kilometers_Driven,0,0.0
Fuel_Type,0,0.0
Transmission,0,0.0
Owner_Type,0,0.0


In [33]:
df.loc[df['Fuel_Type']=='Electric', 'Mileage (kmpl)'] = df.loc[df['Fuel_Type']=='Electric', 'Mileage (kmpl)'].replace(np.nan, 0)

In [34]:
df.loc[df['Fuel_Type']=='Electric']

Unnamed: 0,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Seats,Price,Brand,Series,Type,Mileage (kmpl),Engine (CC),Power (bhp)
4446,Chennai,2016,50000,Electric,Automatic,First,5.0,13.0,Mahindra,E,Verito,0.0,72.0,41.0
4904,Mumbai,2011,44000,Electric,Automatic,First,5.0,12.75,Toyota,Prius,2009-2016,0.0,1798.0,73.0


# Preprocessing

In [35]:
# Delete outlier
df = df[~(df.Kilometers_Driven > 1e6)]
df.shape

(6018, 14)

In [36]:
# Drop missing values
df= df.dropna()
explor.null_checker(df)

Unnamed: 0,null (sum),null (%)
Location,0,0.0
Year,0,0.0
Kilometers_Driven,0,0.0
Fuel_Type,0,0.0
Transmission,0,0.0
Owner_Type,0,0.0
Seats,0,0.0
Price,0,0.0
Brand,0,0.0
Series,0,0.0


## Train test split

In [37]:
# melakukan train test split di awal untuk mencegah data bocor ke test set saat dilakukan encoding/imputation
features = df.drop(columns=['Price'])
target = df['Price']
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.25, random_state=0)

## Encoding

In [38]:
encodes = ['Location','Fuel_Type','Transmission','Owner_Type', 'Brand']
encoder = ce.OneHotEncoder(cols=encodes,
                          use_cat_names=True)
encoder.fit(X_train)

# encoding train set
X_train = encoder.transform(X_train)

# encoding test set
X_test = encoder.transform(X_test)

  elif pd.api.types.is_categorical(cols):


In [39]:
# Target encoding/One hot encoding untuk feature dengan kategori yang banyak
encodes = ['Series','Type']
target_encodes = ce.TargetEncoder(cols= encodes)
target_encodes.fit(X_train,y_train)

# Encoding train set
X_train = target_encodes.transform(X_train)

# Encoding test set
X_test = target_encodes.transform(X_test)

  elif pd.api.types.is_categorical(cols):


## Feature Selection

In [40]:
# Memfilter feature dengan korelasi tinggi
corr_price = X_train.join(y_train).corr()['Price']
index = corr_price[(corr_price < -0.20) | (corr_price > 0.20)].index

X_train_selected = X_train[index[:-1]]
X_test_selected = X_test[index[:-1]]

In [42]:
X_train.shape

(4383, 58)

In [41]:
X_train_selected.shape

(4383, 16)

# Modeling

## Functions

In [41]:
def get_cv_score(models, X_train, y_train):
    
    cv = KFold(n_splits=5, shuffle=True, random_state=0)
    summary = []
    for label, model in models.items():
        cv_results = cross_validate(model, X_train, y_train, cv=cv, 
                                    scoring=['r2',
                                             'neg_root_mean_squared_error',
                                             'neg_mean_absolute_error'])
        
        temp = pd.DataFrame(cv_results).copy()
        temp['Model'] = label
        summary.append(temp)
    
    summary = pd.concat(summary)
    summary = summary.groupby('Model').mean()
    
    summary.drop(columns=['fit_time', 'score_time'], inplace=True)
    summary.columns = ['CV R2', 'CV RMSE', 'CV MAE']
    summary[['CV RMSE', 'CV MAE']] = summary[['CV RMSE', 'CV MAE']] * -1
    
    return summary

In [42]:
def evaluate_model(models, X_train, X_test, y_train, y_test):

    summary = {'Model':[], 'Train R2':[], 'Train RMSE':[], 'Train MAE':[],
               'Test R2':[], 'Test RMSE':[], 'Test MAE':[]}

    for label, model in models.items():
        model.fit(X_train, y_train)

        y_train_pred = model.predict(X_train)
        y_test_pred = model.predict(X_test)

        summary['Model'].append(label)

        summary['Train R2'].append(
            metrics.r2_score(y_train, y_train_pred))
        summary['Train RMSE'].append(
            np.sqrt(metrics.mean_squared_error(y_train, y_train_pred)))
        summary['Train MAE'].append(
            metrics.mean_absolute_error(y_train, y_train_pred))

        summary['Test R2'].append(
            metrics.r2_score(y_test, y_test_pred))
        summary['Test RMSE'].append(
            np.sqrt(metrics.mean_squared_error(y_test, y_test_pred)))
        summary['Test MAE'].append(
            metrics.mean_absolute_error(y_test, y_test_pred))
    
    summary = pd.DataFrame(summary)
    summary.set_index('Model', inplace=True)

    cv_scores = get_cv_score(models, X_train, y_train)
    summary = summary.join(cv_scores)
    summary = summary[['Train R2', 'CV R2', 'Test R2',
                       'Train RMSE', 'CV RMSE', 'Test RMSE',
                       'Train MAE', 'CV MAE', 'Test MAE']]
    
    return round(summary.sort_values(by='Test RMSE'), 4)

## Base Model

In [43]:
tree_model = DecisionTreeRegressor()
rf_model = RandomForestRegressor()
xgb_model = XGBRegressor(objective='reg:squarederror')
lgb_model = LGBMRegressor()
cat_model = CatBoostRegressor(silent=True)
lr_model = LinearRegression()
lasso_model = Lasso()

models = {'DecisionTreeRegressor' : tree_model,
          'RandomForestRegressor' : rf_model,
          'XGBRegressor' : xgb_model,
          'CatBoostRegressor' : cat_model,
          'LGBMRegressor' : lgb_model,
          'LinearRegression': lr_model,
          'LassoRegression': lasso_model}

### Unscaled dataset

In [44]:
# evaluasi model memakai function
unscaled = evaluate_model(models, X_train_selected, X_test_selected, y_train, y_test)

### Scaled dataset

Dengan adanya pencilan, StandardScaler tidak menjamin skala fitur yang seimbang, karena pengaruh pencilan saat menghitung rata-rata empiris dan deviasi standar. Hal ini menyebabkan penyusutan kisaran nilai fitur.

In [45]:
# Scaling data with standard scaller
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(X_train_selected)
X_train_selected_scaled = scaler.transform(X_train_selected)
X_test_selected_scaled = scaler.transform(X_test_selected)

In [46]:
# evaluasi model memakai function
evaluate_model(models, X_train_selected_scaled, X_test_selected_scaled, y_train, y_test)

Unnamed: 0_level_0,Train R2,CV R2,Test R2,Train RMSE,CV RMSE,Test RMSE,Train MAE,CV MAE,Test MAE
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
CatBoostRegressor,0.9799,0.9002,0.9064,1.5551,3.4138,3.5924,0.9705,1.4074,1.4859
RandomForestRegressor,0.9783,0.8982,0.883,1.6138,3.4551,4.0162,0.7738,1.4642,1.5982
LGBMRegressor,0.956,0.9014,0.8812,2.2978,3.4137,4.0483,1.0976,1.43,1.5562
XGBRegressor,0.9866,0.8763,0.8723,1.2692,3.8021,4.1963,0.7536,1.4897,1.5929
DecisionTreeRegressor,0.9897,0.8446,0.7547,1.1131,4.2376,5.8168,0.4923,1.7182,1.9454
LinearRegression,0.8028,0.7989,0.7536,4.8666,4.9086,5.8289,2.6504,2.6724,2.9158
LassoRegression,0.7817,0.7813,0.7229,5.1203,5.1169,6.1814,2.4851,2.496,2.7646


In [47]:
# Scaling data with MinMaxScaler
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
scaler.fit(X_train_selected)
X_train_selected_scaled_m = scaler.transform(X_train_selected)
X_test_selected_scaled_m = scaler.transform(X_test_selected)

In [48]:
# evaluasi model memakai function
evaluate_model(models, X_train_selected_scaled_m, X_test_selected_scaled_m, y_train, y_test)

Unnamed: 0_level_0,Train R2,CV R2,Test R2,Train RMSE,CV RMSE,Test RMSE,Train MAE,CV MAE,Test MAE
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
CatBoostRegressor,0.9799,0.9002,0.9064,1.5551,3.4138,3.5925,0.9705,1.4073,1.486
RandomForestRegressor,0.9789,0.8964,0.8827,1.591,3.4813,4.0223,0.7792,1.4661,1.5893
LGBMRegressor,0.9561,0.9031,0.8824,2.2947,3.3883,4.0274,1.0968,1.4315,1.5524
XGBRegressor,0.9866,0.8763,0.873,1.2692,3.8016,4.1852,0.7536,1.4895,1.5862
DecisionTreeRegressor,0.9897,0.8407,0.7687,1.1131,4.2764,5.6477,0.4923,1.7162,1.9146
LinearRegression,0.8028,0.7989,0.7536,4.8666,4.9086,5.8289,2.6504,2.6724,2.9158
LassoRegression,0.3274,0.3291,0.3315,8.9873,8.9643,9.6016,5.1494,5.1544,5.3816


RobustScaler mengurangi median kolom dan membaginya dengan rentang interkuartil.

In [49]:
# Scaling data with RobustScaler
from sklearn.preprocessing import RobustScaler

scaler = RobustScaler()
scaler.fit(X_train_selected)
X_train_selected_scaled_r = scaler.transform(X_train_selected)
X_test_selected_scaled_r = scaler.transform(X_test_selected)

In [50]:
# evaluasi model memakai function
scaled = evaluate_model(models, X_train_selected_scaled_r, X_test_selected_scaled_r, y_train, y_test)

### Summarizing

In [51]:
unscaled['Dataset Version'] = 'dropna + selected + unscaled'
scaled['Dataset Version'] = 'dropna + selected + scaled'

dropna_selected = pd.concat([unscaled, scaled], axis=0)
dropna_selected

Unnamed: 0_level_0,Train R2,CV R2,Test R2,Train RMSE,CV RMSE,Test RMSE,Train MAE,CV MAE,Test MAE,Dataset Version
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
CatBoostRegressor,0.9799,0.9002,0.9064,1.5551,3.4138,3.5924,0.9705,1.4074,1.4859,dropna + selected + unscaled
LGBMRegressor,0.9561,0.9031,0.8824,2.2947,3.3883,4.0274,1.0968,1.4315,1.5524,dropna + selected + unscaled
RandomForestRegressor,0.9773,0.8985,0.8797,1.6523,3.4462,4.0725,0.782,1.4635,1.6051,dropna + selected + unscaled
XGBRegressor,0.9866,0.8763,0.8723,1.2692,3.8017,4.1964,0.7536,1.4895,1.5928,dropna + selected + unscaled
DecisionTreeRegressor,0.9897,0.8461,0.7729,1.1131,4.2118,5.5967,0.4923,1.7214,1.8947,dropna + selected + unscaled
LinearRegression,0.8028,0.7989,0.7536,4.8666,4.9086,5.8289,2.6504,2.6724,2.9158,dropna + selected + unscaled
LassoRegression,0.7979,0.7975,0.7479,4.9266,4.9249,5.8966,2.605,2.6155,2.854,dropna + selected + unscaled
CatBoostRegressor,0.9799,0.9002,0.9064,1.5551,3.4138,3.5924,0.9705,1.4074,1.4859,dropna + selected + scaled
RandomForestRegressor,0.9775,0.8952,0.885,1.6435,3.5026,3.9817,0.781,1.4694,1.5869,dropna + selected + scaled
LGBMRegressor,0.9564,0.9016,0.8816,2.2874,3.413,4.0407,1.0958,1.4355,1.5599,dropna + selected + scaled


In [52]:
dropna_selected.to_csv('dropna_selected.csv')