In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, MinMaxScaler, StandardScaler, OrdinalEncoder, MultiC, 

from jcopml.pipeline import num_pipe, cat_pipe
from jcopml.utils import save_model, load_model
from jcopml.plot import plot_missing_value
from jcopml.feature_importance import mean_score_decrease

In [2]:
df = pd.read_csv('carprice.csv',index_col='ID')
df.head()

Unnamed: 0_level_0,Name,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage_kmpl,Engine_CC,Power_bhp,Seats,Price
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,Hyundai Creta 1.6 CRDi SX Option,Pune,2015,41000,Diesel,Manual,First,19.67,1582.0,126.2,5.0,12.5
2,Honda Jazz V,Chennai,2011,46000,Petrol,Manual,First,18.2,1199.0,88.7,5.0,4.5
3,Maruti Ertiga VDI,Chennai,2012,87000,Diesel,Manual,First,20.77,1248.0,88.76,7.0,6.0
4,Audi A4 New 2.0 TDI Multitronic,Coimbatore,2013,40670,Diesel,Automatic,Second,15.2,1968.0,140.8,5.0,17.74
6,Nissan Micra Diesel XV,Jaipur,2013,86999,Diesel,Manual,First,23.08,1461.0,63.1,5.0,3.5


In [3]:
print("Jumlah Rows           : ", df.shape[0])
print("Jumlah Cols           : ", df.shape[1])
print("Jumlah Missing Values : ", df.isnull().sum().sum())

Jumlah Rows           :  5953
Jumlah Cols           :  12
Jumlah Missing Values :  222


## Cek Missing Values 

In [4]:
df.isnull().sum()

Name                   0
Location               0
Year                   0
Kilometers_Driven      0
Fuel_Type              0
Transmission           0
Owner_Type             0
Mileage_kmpl           2
Engine_CC             36
Power_bhp            142
Seats                 42
Price                  0
dtype: int64

# Create Simple model 

### Simple feature selection 

In [5]:
## Hapus kolom name
df.drop(columns='Name', inplace=True)
df.head()

Unnamed: 0_level_0,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage_kmpl,Engine_CC,Power_bhp,Seats,Price
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,Pune,2015,41000,Diesel,Manual,First,19.67,1582.0,126.2,5.0,12.5
2,Chennai,2011,46000,Petrol,Manual,First,18.2,1199.0,88.7,5.0,4.5
3,Chennai,2012,87000,Diesel,Manual,First,20.77,1248.0,88.76,7.0,6.0
4,Coimbatore,2013,40670,Diesel,Automatic,Second,15.2,1968.0,140.8,5.0,17.74
6,Jaipur,2013,86999,Diesel,Manual,First,23.08,1461.0,63.1,5.0,3.5


### Split Data

In [6]:
X = df.drop(columns='Price')
y = df.Price
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=42)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((4762, 10), (1191, 10), (4762,), (1191,))

In [7]:
X['Location'].unique()

array(['Pune', 'Chennai', 'Coimbatore', 'Jaipur', 'Mumbai', 'Kochi',
       'Kolkata', 'Delhi', 'Bangalore', 'Hyderabad', 'Ahmedabad'],
      dtype=object)

# Train Model

### Create numerical and categorical pipeline

In [8]:
num_pipeline = Pipeline([
    ('inputer', SimpleImputer(strategy='mean')),
    ('scaling', StandardScaler())
])

cat_pipeline = Pipeline([
    ('inputer', SimpleImputer(strategy='most_frequent')),
    ('encode', OrdinalEncoder())
])

In [9]:
df.columns

Index(['Location', 'Year', 'Kilometers_Driven', 'Fuel_Type', 'Transmission',
       'Owner_Type', 'Mileage_kmpl', 'Engine_CC', 'Power_bhp', 'Seats',
       'Price'],
      dtype='object')

In [10]:
df.head()

Unnamed: 0_level_0,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage_kmpl,Engine_CC,Power_bhp,Seats,Price
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,Pune,2015,41000,Diesel,Manual,First,19.67,1582.0,126.2,5.0,12.5
2,Chennai,2011,46000,Petrol,Manual,First,18.2,1199.0,88.7,5.0,4.5
3,Chennai,2012,87000,Diesel,Manual,First,20.77,1248.0,88.76,7.0,6.0
4,Coimbatore,2013,40670,Diesel,Automatic,Second,15.2,1968.0,140.8,5.0,17.74
6,Jaipur,2013,86999,Diesel,Manual,First,23.08,1461.0,63.1,5.0,3.5


### Tranform 

In [11]:
prepocessor = ColumnTransformer([
    ('numeric', num_pipeline, ["Year","Kilometers_Driven","Mileage_kmpl","Engine_CC","Power_bhp","Seats"]),
    ('categoric', cat_pipeline, ["Location","Fuel_Type","Transmission","Owner_Type"])
])

In [12]:
from sklearn.svm import SVR

In [13]:
pipeline = Pipeline([
    ('prep', prepocessor),
    ('algo', SVR(max_iter=500))
])

### Tuning

In [14]:
parameter = {
    'algo__C' : np.logspace(-3,3, 7),
    'algo__gamma' : np.logspace(-3,3, 7)
}

In [15]:
from sklearn.model_selection import GridSearchCV

In [16]:
model = GridSearchCV(pipeline, param_grid=parameter, cv=3, n_jobs=-1, verbose=2)
model.fit(X_train, y_train)

print(model.best_params_)
print(model.score(X_train, y_train), model.best_score_, model.score(X_test, y_test))

Fitting 3 folds for each of 49 candidates, totalling 147 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    6.7s
[Parallel(n_jobs=-1)]: Done 147 out of 147 | elapsed:   17.5s finished


{'algo__C': 10.0, 'algo__gamma': 0.1}
0.8264008636262723 0.7959754570321597 0.7842068185853249


In [17]:
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV
from jcopml.tuning import grid_search_params as gsp

In [18]:
preprocessor = ColumnTransformer([
    ('numeric', num_pipe(scaling='robust'), ["Year","Kilometers_Driven","Mileage_kmpl","Engine_CC","Power_bhp","Seats"]),
    ('categoric', cat_pipe(encoder='ordinal'), ["Location","Fuel_Type","Transmission","Owner_Type"]),
])

pipeline = Pipeline([
    ('prep', preprocessor),
    ('algo', SVR(max_iter=500))
])

model = GridSearchCV(pipeline, gsp.svm_params, cv=3, n_jobs=-1, verbose=1)
model.fit(X_train, y_train)

print(model.best_params_)
print(model.score(X_train, y_train), model.best_score_, model.score(X_test, y_test))

Fitting 3 folds for each of 49 candidates, totalling 147 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    3.4s
[Parallel(n_jobs=-1)]: Done 147 out of 147 | elapsed:   14.0s finished


{'algo__C': 10.0, 'algo__gamma': 0.1}
0.8444644134097161 0.799031574222111 0.8054506309837969


In [19]:
df.corr()['Price']

Year                 0.306539
Kilometers_Driven   -0.011548
Mileage_kmpl        -0.304747
Engine_CC            0.657220
Power_bhp            0.772143
Seats                0.049978
Price                1.000000
Name: Price, dtype: float64

In [20]:
df_baru = df.copy(deep=True)

In [21]:
df.Year = pd.cut(df.Year, bins=[0, 2000,2010,2015,2100], labels=["Sebelum 2000", "2000-2010", "2010-2015", "2015-Sekarang"])
df.head()

Unnamed: 0_level_0,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage_kmpl,Engine_CC,Power_bhp,Seats,Price
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,Pune,2010-2015,41000,Diesel,Manual,First,19.67,1582.0,126.2,5.0,12.5
2,Chennai,2010-2015,46000,Petrol,Manual,First,18.2,1199.0,88.7,5.0,4.5
3,Chennai,2010-2015,87000,Diesel,Manual,First,20.77,1248.0,88.76,7.0,6.0
4,Coimbatore,2010-2015,40670,Diesel,Automatic,Second,15.2,1968.0,140.8,5.0,17.74
6,Jaipur,2010-2015,86999,Diesel,Manual,First,23.08,1461.0,63.1,5.0,3.5


In [22]:
df.Seats = pd.cut(x=df.Seats, bins=2, labels=["Kurang dari 5", "lebih dari 5"])
df.head()

Unnamed: 0_level_0,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage_kmpl,Engine_CC,Power_bhp,Seats,Price
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,Pune,2010-2015,41000,Diesel,Manual,First,19.67,1582.0,126.2,Kurang dari 5,12.5
2,Chennai,2010-2015,46000,Petrol,Manual,First,18.2,1199.0,88.7,Kurang dari 5,4.5
3,Chennai,2010-2015,87000,Diesel,Manual,First,20.77,1248.0,88.76,lebih dari 5,6.0
4,Coimbatore,2010-2015,40670,Diesel,Automatic,Second,15.2,1968.0,140.8,Kurang dari 5,17.74
6,Jaipur,2010-2015,86999,Diesel,Manual,First,23.08,1461.0,63.1,Kurang dari 5,3.5


In [23]:
df.Kilometers_Driven.value_counts()

60000    82
45000    70
65000    68
50000    61
55000    58
         ..
54540     1
70920     1
75014     1
32005     1
83969     1
Name: Kilometers_Driven, Length: 3062, dtype: int64

In [24]:
df.Kilometers_Driven = pd.cut(df.Kilometers_Driven, bins=[0, 40000,100000], labels=["Kurang dari 40000", "Lebih dari 40000"])
df.head()

Unnamed: 0_level_0,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage_kmpl,Engine_CC,Power_bhp,Seats,Price
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,Pune,2010-2015,Lebih dari 40000,Diesel,Manual,First,19.67,1582.0,126.2,Kurang dari 5,12.5
2,Chennai,2010-2015,Lebih dari 40000,Petrol,Manual,First,18.2,1199.0,88.7,Kurang dari 5,4.5
3,Chennai,2010-2015,Lebih dari 40000,Diesel,Manual,First,20.77,1248.0,88.76,lebih dari 5,6.0
4,Coimbatore,2010-2015,Lebih dari 40000,Diesel,Automatic,Second,15.2,1968.0,140.8,Kurang dari 5,17.74
6,Jaipur,2010-2015,Lebih dari 40000,Diesel,Manual,First,23.08,1461.0,63.1,Kurang dari 5,3.5


In [25]:
df.Kilometers_Driven.value_counts()

Lebih dari 40000     3469
Kurang dari 40000    1977
Name: Kilometers_Driven, dtype: int64

In [26]:
X = df.drop(columns='Price')
y = df.Price
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=42)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((4762, 10), (1191, 10), (4762,), (1191,))

In [27]:
num_feature = ["Mileage_kmpl","Engine_CC","Power_bhp"]
cat_feature = X_train.drop(columns=num_feature).columns
cat_feature

Index(['Location', 'Year', 'Kilometers_Driven', 'Fuel_Type', 'Transmission',
       'Owner_Type', 'Seats'],
      dtype='object')

In [28]:
preprocessor = ColumnTransformer([
    ('numeric', num_pipe(scaling="standard"), num_feature),
    ('categoric', cat_pipe(encoder='onehot'), cat_feature),
])

pipeline = Pipeline([
    ('prep', preprocessor),
    ('algo', SVR(max_iter=500))
])

model = GridSearchCV(pipeline, parameter, cv=3, n_jobs=-1, verbose=1)
model.fit(X_train, y_train)

print(model.best_params_)
print(model.score(X_train, y_train), model.best_score_, model.score(X_test, y_test))

Fitting 3 folds for each of 49 candidates, totalling 147 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    4.9s
[Parallel(n_jobs=-1)]: Done 147 out of 147 | elapsed:   19.2s finished


{'algo__C': 10.0, 'algo__gamma': 0.1}
0.7591520085084921 0.8067100187803312 0.7232890213585577


In [29]:
## Hasil Feature Enginer Masah Jadi Jelek

In [41]:
preprocessor.named_transformers_

{'numeric': Pipeline(memory=None,
          steps=[('imputer',
                  SimpleImputer(add_indicator=False, copy=True, fill_value=None,
                                missing_values=nan, strategy='median',
                                verbose=0)),
                 ('scaler',
                  StandardScaler(copy=True, with_mean=True, with_std=True))],
          verbose=False), 'categoric': Pipeline(memory=None,
          steps=[('imputer',
                  SimpleImputer(add_indicator=False, copy=True, fill_value=None,
                                missing_values=nan, strategy='most_frequent',
                                verbose=0)),
                 ('onehot',
                  OneHotEncoder(categories='auto', drop=None,
                                dtype=<class 'numpy.float64'>,
                                handle_unknown='ignore', sparse=True))],
          verbose=False)}

In [38]:
X_train

Unnamed: 0_level_0,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage_kmpl,Engine_CC,Power_bhp,Seats
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
4380,Coimbatore,2015-Sekarang,Lebih dari 40000,Petrol,Manual,First,20.73,1373.0,91.1,Kurang dari 5
2676,Kochi,2015-Sekarang,Lebih dari 40000,Diesel,Manual,First,19.67,1582.0,126.2,Kurang dari 5
647,Jaipur,2000-2010,,Petrol,Manual,First,0.00,1086.0,,Kurang dari 5
5966,Hyderabad,2015-Sekarang,Kurang dari 40000,Petrol,Manual,First,18.90,998.0,67.1,Kurang dari 5
2198,Ahmedabad,2010-2015,Lebih dari 40000,Diesel,Manual,Second,23.50,1498.0,90.0,Kurang dari 5
...,...,...,...,...,...,...,...,...,...,...
3819,Jaipur,2010-2015,Lebih dari 40000,Petrol,Automatic,First,13.40,1997.0,135.1,Kurang dari 5
5248,Ahmedabad,2010-2015,Lebih dari 40000,Diesel,Manual,First,22.90,1248.0,74.0,Kurang dari 5
5283,Mumbai,2010-2015,Kurang dari 40000,Petrol,Manual,First,16.20,1199.0,74.0,Kurang dari 5
5447,Kochi,2015-Sekarang,Kurang dari 40000,Diesel,Automatic,First,17.90,2143.0,136.0,Kurang dari 5


In [50]:
onehot_feature = ["Kilometers_Driven","Transmission","Seats"]
ordinal_frature = ["Location","Year","Fuel_Type","Owner_Type"]

In [51]:
preprocessor = ColumnTransformer([
    ('numeric', num_pipe(scaling="standard"), num_feature),
    ('onehot', cat_pipe(encoder='onehot'), onehot_feature),
    ('ordinal', cat_pipe(encoder='ordinal'), ordinal_frature)
])

pipeline = Pipeline([
    ('prep', preprocessor),
    ('algo', SVR(max_iter=500))
])

model = GridSearchCV(pipeline, parameter, cv=3, n_jobs=-1, verbose=1)
model.fit(X_train, y_train)

print(model.best_params_)
print(model.score(X_train, y_train), model.best_score_, model.score(X_test, y_test))

Fitting 3 folds for each of 49 candidates, totalling 147 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    3.7s
[Parallel(n_jobs=-1)]: Done 147 out of 147 | elapsed:   15.4s finished


{'algo__C': 10.0, 'algo__gamma': 0.1}
0.7904504355733017 0.7738223915258011 0.7529696139760607
