In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
#Import model library
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
#Import scikit-learn metrics module for accuracy calculation
from sklearn import metrics
from sklearn.metrics import confusion_matrix, f1_score, classification_report
#Import library for plotting
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the raw used-car listings into the working DataFrame.
df = pd.read_csv(filepath_or_buffer='used_cars.csv')

### Data Exploration

In [None]:
# Preview the first five rows.
df.head(n=5)

In [None]:
# (number of records, number of columns)
(len(df.index), len(df.columns))

In [None]:
# Summary statistics of the numeric columns (default quartiles spelled out).
df.describe(percentiles=[0.25, 0.5, 0.75])

In [None]:
# Data type of each column.
df.dtypes

In [None]:
# Names of the numeric (and boolean) columns, obtained through the public
# dtype selector instead of the private DataFrame._get_numeric_data helper.
df.select_dtypes(include=['number', 'bool']).columns

In [None]:
# Correlation matrix of the numeric columns, drawn as an annotated heatmap.
# Non-numeric columns are filtered out explicitly: pandas >= 2.0 no longer
# drops them silently inside DataFrame.corr() and raises on string data.
corrmat = df.select_dtypes(include=['number', 'bool']).corr()
f, ax = plt.subplots(figsize=(12, 9))
# Draw on the axes created above rather than relying on the implicit
# "current axes".
sns.heatmap(corrmat, annot=True, square=True, ax=ax)

In [None]:
# Missing-value summary: absolute count and fraction of nulls per column,
# both sorted from most to least missing.
null_mask = df.isnull()
total = null_mask.sum().sort_values(ascending=False)
percent = (null_mask.sum() / null_mask.count()).sort_values(ascending=False)
missing_data = pd.concat(objs=[total, percent], keys=['Total', 'Percent'], axis=1)
missing_data

In [None]:
# Check for outliers with one boxplot per numeric column.
# The figure size is passed to plt.subplots directly: a separate
# plt.figure(figsize=...) call only creates an extra, empty figure that the
# subplot axes never use.
box_columns = ["price", "odometer", "year", "lat", "long"]
f, axes = plt.subplots(1, len(box_columns), figsize=(15, 5))
for axis, column in zip(axes, box_columns):
    sns.boxplot(y=df[column], ax=axis)
# Widen the horizontal gap so the y-axis labels do not overlap.
plt.subplots_adjust(wspace=1)

#### Number of occurrences per category

In [None]:
# Frequency of each 'condition' value (missing values excluded).
df['condition'].value_counts(dropna=True)

In [None]:
# Bar chart of listing counts per condition.
sns.countplot(data=df, x='condition')

In [None]:
# Frequency of each 'fuel' value (missing values excluded).
df['fuel'].value_counts(dropna=True)

In [None]:
# Bar chart of listing counts per fuel type.
sns.countplot(data=df, x='fuel')

In [None]:
# Bar chart of listing counts per cylinders value.
sns.countplot(data=df, x='cylinders')

In [None]:
# Bar chart of listing counts per vehicle type.
sns.countplot(data=df, x='type')

In [None]:
# Bar chart of listing counts per title status.
sns.countplot(data=df, x='title_status')

In [None]:
# Bar chart of listing counts per transmission.
sns.countplot(data=df, x='transmission')

In [None]:
# Bar chart of listing counts per drive value.
sns.countplot(data=df, x='drive')

In [None]:
# Bar chart of listing counts per size.
sns.countplot(data=df, x='size')

In [None]:
# Bar chart of listing counts per state.
sns.countplot(data=df, x='state')

In [None]:
# Bar chart of listing counts per region.
sns.countplot(data=df, x='region')

In [None]:
# Bar chart of listing counts per paint colour.
sns.countplot(data=df, x='paint_color')

#### Check data distribution

In [None]:
# Histogram (10 bins, no KDE) of the non-missing 'year' values.
sns.distplot(df['year'].dropna(), kde=False, bins=10);

In [None]:
# Histogram (10 bins, no KDE) of the non-missing 'odometer' values.
sns.distplot(df['odometer'].dropna(), kde=False, bins=10);

In [None]:
# Histogram (10 bins, no KDE) of the non-missing 'price' values.
sns.distplot(df['price'].dropna(), kde=False, bins=10);

In [None]:
# Relationship between odometer and price.
# sns.pairplot builds its own figure, so no plt.figure() call is made here;
# a separately created 100x100-inch figure would stay empty and only cost
# memory and rendering time.
df_nona = df[df['odometer'].notnull()]
g = sns.pairplot(df_nona[['price', 'odometer']])

In [None]:
# Relationship between price and year.
# sns.pairplot builds its own figure, so no plt.figure() call is made here;
# a separately created 100x100-inch figure would stay empty and only cost
# memory and rendering time.
df_nona = df[df['year'].notnull()]
g = sns.pairplot(df_nona[['price', 'year']])

### Data Preparation

In [3]:
# 'county' holds no values and the 'Unnamed: 0' column is not needed,
# so both are removed.
df = df.drop(['Unnamed: 0', 'county'], axis=1)

In [4]:
# Column labels remaining in the working frame.
df.columns

Index(['id', 'url', 'region', 'region_url', 'price', 'year', 'manufacturer',
       'model', 'condition', 'cylinders', 'fuel', 'odometer', 'title_status',
       'transmission', 'vin', 'drive', 'size', 'type', 'paint_color',
       'image_url', 'description', 'state', 'lat', 'long'],
      dtype='object')

#### Dealing with Missing Value

In [5]:
# Fill missing 'model' values with the most frequent model.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the selected Series: that chained in-place form
# may act on a temporary copy (it warns in pandas 2.x and leaves df
# unchanged under copy-on-write).
most_common_model = df['model'].value_counts().idxmax()
df['model'] = df['model'].fillna(most_common_model)

In [6]:
# Treat every 0 in the frame as a missing value. np.nan is used because the
# np.NaN alias was removed in NumPy 2.0.
df = df.replace(0, np.nan)

# Fill missing numeric values with their column mean. The filled columns are
# assigned back rather than using chained fillna(inplace=True), which may
# operate on a temporary copy and leave df unchanged.
mean_filled_cols = ['odometer', 'price', 'year', 'lat', 'long']
df[mean_filled_cols] = df[mean_filled_cols].fillna(df[mean_filled_cols].mean())

In [7]:
# Fill missing 'cylinders' values with the 'other' category.
# Assigned back instead of chained fillna(inplace=True), which may act on a
# temporary copy and leave df unchanged.
df['cylinders'] = df['cylinders'].fillna('other')

In [8]:
# Number of missing values left in 'cylinders'.
df['cylinders'].isna().sum()

0

In [9]:
# Fill missing 'fuel' values with the 'other' category.
# Assigned back instead of chained fillna(inplace=True), which may act on a
# temporary copy and leave df unchanged.
df['fuel'] = df['fuel'].fillna('other')

In [10]:
# Number of missing values left in 'fuel'.
df['fuel'].isna().sum()

0

In [11]:
# Fill missing 'title_status' values with the most frequent status.
# Assigned back instead of chained fillna(inplace=True), which may act on a
# temporary copy and leave df unchanged.
most_common_status = df['title_status'].value_counts().idxmax()
df['title_status'] = df['title_status'].fillna(most_common_status)

In [12]:
# Number of missing values left in 'title_status'.
df['title_status'].isna().sum()

0

In [13]:
# Fill missing 'transmission' values with the 'other' category.
# Assigned back instead of chained fillna(inplace=True), which may act on a
# temporary copy and leave df unchanged.
df['transmission'] = df['transmission'].fillna('other')

In [14]:
# Number of missing values left in 'transmission'.
df['transmission'].isna().sum()

0

In [15]:
# Fill missing 'drive' values with the 'other' category.
# Assigned back instead of chained fillna(inplace=True), which may act on a
# temporary copy and leave df unchanged.
df['drive'] = df['drive'].fillna('other')

In [16]:
# Number of missing values left in 'drive'.
df['drive'].isna().sum()

0

In [17]:
# Fill missing 'type' values with the 'other' category.
# Assigned back instead of chained fillna(inplace=True), which may act on a
# temporary copy and leave df unchanged.
df['type'] = df['type'].fillna('other')

In [18]:
# Number of missing values left in 'type'.
df['type'].isna().sum()

0

In [19]:
# Remove the columns that are not carried forward into the prepared dataset.
unused_columns = ['vin', 'paint_color', 'size', 'id', 'url', 'region_url', 'image_url', 'description']
df = df.drop(unused_columns, axis=1)

In [20]:
# Discard every row that still has a missing value in any column.
df.dropna(axis=0, how='any', inplace=True)

##### Dealing with Outliers

In [21]:
from scipy import stats

# Keep only the rows whose odometer, price, lat and long values all have an
# absolute z-score below 3.
outlier_columns = ['odometer', 'price', 'lat', 'long']
z = np.abs(stats.zscore(df[outlier_columns]))
within_three_sigma = (z < 3).all(axis=1)
df = df[within_three_sigma]

In [22]:
# (rows, columns) remaining after outlier removal.
(len(df.index), len(df.columns))

(10414, 16)

### Feature Engineering

#### Binning

In [23]:
# Bin 'cylinders' into small / medium / large (anything else -> 'other')
# to reduce class imbalance.
cylinder_bins = {
    '6 cylinders': 'medium',
    '4 cylinders': 'small',
    '8 cylinders': 'large',
}
cylinders_conditions = [df['cylinders'].str.contains(pattern) for pattern in cylinder_bins]
choices = list(cylinder_bins.values())
df['cylinders'] = np.select(cylinders_conditions, choices, default='other')

In [24]:
# Bin 'fuel' into gas / diesel (anything else -> 'other') to reduce class
# imbalance.
fuel_choices = ['gas', 'diesel']
fuel_conditions = [df['fuel'].str.contains(fuel_name) for fuel_name in fuel_choices]
df['fuel'] = np.select(fuel_conditions, fuel_choices, default='other')

In [25]:
# Bin 'condition' into two groups ('excellent' / 'good') to reduce class
# imbalance; values matching none of the keywords become 'other'.
# Keywords are tested in this order, and the first match wins.
condition_groups = {
    'excellent': 'excellent',
    'good': 'good',
    'like new': 'excellent',
    'fair': 'good',
    'salvage': 'good',
    'new': 'excellent',
}
conditions = [df['condition'].str.contains(keyword) for keyword in condition_groups]
choices = list(condition_groups.values())
df['condition'] = np.select(conditions, choices, default='other')

In [26]:
# Bin 'title_status': 'clean' -> excellent, 'rebuilt' -> good,
# everything else -> fair, to reduce class imbalance.
status_bins = {'clean': 'excellent', 'rebuilt': 'good'}
status_conditions = [df['title_status'].str.contains(keyword) for keyword in status_bins]
choices = list(status_bins.values())
df['title_status'] = np.select(status_conditions, choices, default='fair')

#### Encoding categorical data

In [27]:
# Replace each of these categorical columns with integer codes, using a
# fresh LabelEncoder per column.
label_encoded_columns = ['cylinders', 'title_status', 'manufacturer', 'model', 'region']
for column_name in label_encoded_columns:
    labelencoder = LabelEncoder()
    df[column_name] = labelencoder.fit_transform(df[column_name])

In [28]:
# One-hot encode the remaining categorical columns and append the indicator
# columns to the frame (the original columns are kept).
one_hot_columns = ['drive', 'fuel', 'state', 'transmission', 'type']
enc = pd.get_dummies(df[one_hot_columns])
df = df.join(enc)
df.head()

Unnamed: 0,region,price,year,manufacturer,model,condition,cylinders,fuel,odometer,title_status,...,type_coupe,type_hatchback,type_mini-van,type_offroad,type_other,type_pickup,type_sedan,type_truck,type_van,type_wagon
0,11,17899.0,2012.0,36,1395,excellent,3,gas,63500.0,0,...,0,1,0,0,0,0,0,0,0,0
1,11,87528.597156,2016.0,10,1096,excellent,2,gas,10.0,0,...,0,0,0,0,1,0,0,0,0,0
2,11,46463.0,2015.0,11,2184,excellent,2,gas,7554.0,0,...,0,0,0,0,1,0,0,0,0,0
3,11,87528.597156,2016.0,10,1096,excellent,2,gas,10.0,0,...,0,0,0,0,1,0,0,0,0,0
22,11,9500.0,2001.0,10,1033,good,0,diesel,99333.548629,0,...,0,0,0,0,1,0,0,0,0,0


##### Scaling

In [29]:
# Rescale the numeric and label-encoded columns to the [0, 1] range.
scaled_columns = ['price', 'year', 'odometer', 'cylinders', 'title_status',
                  'manufacturer', 'model', 'region', 'lat', 'long']
sc = MinMaxScaler()
df[scaled_columns] = sc.fit_transform(df[scaled_columns])
df.head()

Unnamed: 0,region,price,year,manufacturer,model,condition,cylinders,fuel,odometer,title_status,...,type_coupe,type_hatchback,type_mini-van,type_offroad,type_other,type_pickup,type_sedan,type_truck,type_van,type_wagon
0,0.733333,0.132579,0.92233,0.972973,0.49857,excellent,1.0,gas,0.181426,0.0,...,0,1,0,0,0,0,0,0,0,0
1,0.733333,0.648357,0.961165,0.27027,0.391708,excellent,0.666667,gas,2.6e-05,0.0,...,0,0,0,0,1,0,0,0,0,0
2,0.733333,0.344166,0.951456,0.297297,0.780558,excellent,0.666667,gas,0.02158,0.0,...,0,0,0,0,1,0,0,0,0,0
3,0.733333,0.648357,0.961165,0.27027,0.391708,excellent,0.666667,gas,2.6e-05,0.0,...,0,0,0,0,1,0,0,0,0,0
22,0.733333,0.070363,0.815534,0.27027,0.369192,good,0.0,diesel,0.283808,0.0,...,0,0,0,0,1,0,0,0,0,0


In [None]:
# Dataset version 1: missing-value handling, binning, encoding and scaling only (no outlier removal)
# df.to_csv('used_car_ver1.csv')

In [30]:
# Dataset version 2: missing-value handling, outlier removal, binning, encoding and scaling
# df.to_csv('used_car_ver2.csv')

### Experiment with dataset version 1

In [None]:
# Load dataset version 1.
# Assumes the file was produced by DataFrame.to_csv with its default
# index=True, so the first column holds the saved row index. Reading it back
# as the index keeps it from becoming an 'Unnamed: 0' column that would
# otherwise end up among the model features.
df1 = pd.read_csv('used_car_ver1.csv', index_col=0)

#### Feature Selection

In [None]:
# Target: the binned 'condition' label.
y = df1['condition']
# Features: every column except the target and the columns listed here.
excluded_columns = ['drive', 'state', 'fuel', 'transmission', 'type',
                    'condition', 'title_status', 'manufacturer', 'model']
X = df1.drop(excluded_columns, axis=1)

In [None]:
# y = df['transmission']
# X = df.drop(columns = ['drive', 'state','fuel', 'transmission', 'type', 'condition', 'title_status', 'transmission_automatic', 'transmission_manual', 'transmission_other', 'price', 'state_ut', 'state_va', 'state_vt', 'state_wa',  'odometer' ])

#### Split the dataset into training and test sets

In [None]:
# Hold out 25% of the rows for testing. random_state is pinned so that the
# split, and every score computed from it, is the same on each run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

In [None]:
# #Create a svm Classifier
# svmc = SVC(kernel='linear') # Linear Kernel

# #Train the model using the training sets
# svmc.fit(X_train, y_train)

# #Predict the response for test dataset
# svm_pred = svmc.predict(X_test)

# # Model Accuracy
# print("Accuracy: ", metrics.accuracy_score(y_test,svm_pred))
# print("F1 Score: ",f1_score(y_test, svm_pred, average='macro'))

In [None]:
print('Random Forest Classifier\n')

# Create the Random Forest classifier and train it on the training set.
# random_state is pinned so reruns build the same forest.
rfc = RandomForestClassifier(n_estimators=20, random_state=0).fit(X_train, y_train)

# Predict the response for the test set.
rfc_pred = rfc.predict(X_test)

# Compute the confusion matrix for this model's own predictions. Without
# this line `results` is either undefined at this point or holds the matrix
# of a different model from an earlier run.
results = confusion_matrix(y_test, rfc_pred)
print('Confusion Matrix :')
print(results)
print('Report : ')
print(classification_report(y_test, rfc_pred))
print('Accuracy Score : ',metrics.accuracy_score(y_test, rfc_pred))
print('F1 Score: ',f1_score(y_test, rfc_pred, average='macro'))

In [None]:
print('XGB Classifier\n')

# Recent XGBoost releases require integer class labels 0..n_classes-1 and
# reject string labels, so the target is encoded for training and the
# predictions are decoded back to the original class names for reporting.
xgb_label_encoder = LabelEncoder().fit(y)

# Create the XGB classifier.
model = XGBClassifier()

# Train the model on the training set (encoded labels).
model.fit(X_train, xgb_label_encoder.transform(y_train))

# Predict the response for the test set, decoded to the original labels.
xgb_pred = xgb_label_encoder.inverse_transform(model.predict(X_test))

results = confusion_matrix(y_test, xgb_pred)
print('Confusion Matrix :')
print(results)
print('Report : ')
print(classification_report(y_test, xgb_pred))
print('Accuracy Score : ',metrics.accuracy_score(y_test, xgb_pred))
print('F1 Score: ',f1_score(y_test, xgb_pred, average='macro'))

In [None]:
# n = [3,4,5,7,9]
# eval_score = []
# for i in n:
#     #Create KNN Classifier with neighbours = 4
#     knn = KNeighborsClassifier(n_neighbors=i)
#     #Train the model using the training sets
#     knn.fit(X_train, y_train)
#     #Predict the response for test dataset
#     knn_pred = knn.predict(X_test)

#     print("Accuracy: ",metrics.accuracy_score(y_test, knn_pred))
#     print("F1 Score: ",f1_score(y_test, knn_pred, average='macro'))
    


In [None]:
# Search for the best max_depth of the decision tree.
# NOTE(review): candidates are scored on the test split, so the scores
# reported for the selected depth are optimistic.
depth = [3,7,11,15,17,20]
eval_score = []
for d in depth:
    # Train a tree of depth d; random_state is pinned so the search is
    # repeatable.
    dec = DecisionTreeClassifier(max_depth=d, random_state=0).fit(X_train, y_train)
    # Predict the response for the test set.
    dec_pred = dec.predict(X_test)

    dec_acc = metrics.accuracy_score(y_test, dec_pred)
    dec_f1 = f1_score(y_test, dec_pred, average='macro')
    eval_score.append([dec_acc, dec_f1, d])

# Lists compare element by element, so max() picks the entry with the
# highest accuracy, breaking ties by F1 score and then by depth.
best_dtc = max(eval_score)

In [None]:
print('Decision Tree\n')
# Train the decision tree with the selected max_depth (third element of
# best_dtc); random_state is pinned so reruns build the same tree.
dec = DecisionTreeClassifier(max_depth=best_dtc[2], random_state=0).fit(X_train, y_train)
# Predict the response for the test set.
dec_pred = dec.predict(X_test)

# Compute the confusion matrix for the decision tree itself; printing
# `results` without this would show the matrix left over from another model.
results = confusion_matrix(y_test, dec_pred)
print('Confusion Matrix :')
print(results)
print('Report : ')
print(classification_report(y_test, dec_pred))
print('Accuracy Score : ',metrics.accuracy_score(y_test, dec_pred))
print('F1 Score: ',f1_score(y_test, dec_pred, average='macro'))

### Experiment with dataset version 2

In [31]:
# Load dataset version 2.
# Assumes the file was produced by DataFrame.to_csv with its default
# index=True, so the first column holds the saved row index. Reading it back
# as the index keeps it from becoming an 'Unnamed: 0' column that would
# otherwise end up among the model features.
df2 = pd.read_csv('used_car_ver2.csv', index_col=0)

In [32]:
# Target: the binned 'condition' label.
y = df2['condition']
# Features: every column except the target and the columns listed here.
excluded_columns = ['drive', 'state', 'fuel', 'transmission', 'type',
                    'condition', 'title_status', 'manufacturer', 'model']
X = df2.drop(excluded_columns, axis=1)

In [33]:
# Hold out 30% of the rows for testing. random_state is pinned so that the
# split, and every score computed from it, is the same on each run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

In [34]:
# #Create a svm Classifier
# svm = SVC(kernel='linear') # Linear Kernel

# #Train the model using the training sets
# svm.fit(X_train, y_train)

# #Predict the response for test dataset
# svm_pred = svm.predict(X_test)

# # Model Accuracy
# print("Accuracy: ", metrics.accuracy_score(y_test,svm_pred))
# print("F1 Score: ",f1_score(y_test, svm_pred, average='macro'))

In [None]:
# Gradient boosting classifier. GradientBoostingClassifier comes from
# sklearn.ensemble and must be imported in the notebook's import cell.
gb = GradientBoostingClassifier(n_estimators=20, learning_rate=0.22, max_features=3, max_depth=8, random_state=0)
gb.fit(X_train, y_train)
gb_pred = gb.predict(X_test)

# Training accuracy is taken from the fitted estimator's score(); the
# prediction array has no score() method.
print("Accuracy score (training): {0:.3f}".format(gb.score(X_train, y_train)))
print("Accuracy score (validation): {0:.3f}".format(metrics.accuracy_score(y_test, gb_pred)))
print("F1 Score: ",f1_score(y_test, gb_pred, average='macro'))

In [None]:
# Confusion matrix for the gradient boosting predictions. At this point
# xgb_pred is either undefined or left over from a different train/test
# split, so it cannot be compared against the current y_test.
confusion_matrix(y_test, gb_pred)

In [None]:
# Create the Random Forest classifier and train it on the training set.
# random_state is pinned so reruns build the same forest and report the
# same scores.
rfc = RandomForestClassifier(n_estimators=15, random_state=0).fit(X_train, y_train)

# Predict the response for the test set.
rfc_pred = rfc.predict(X_test)

print("Accuracy: ",metrics.accuracy_score(y_test, rfc_pred))
print("F1 Score: ",f1_score(y_test, rfc_pred, average='macro'))

In [None]:
# Recent XGBoost releases require integer class labels 0..n_classes-1 and
# reject string labels, so the target is encoded for training and the
# predictions are decoded back to the original class names for reporting.
xgb_label_encoder = LabelEncoder().fit(y)

# Create the XGB classifier.
model = XGBClassifier()

# Train the model on the training set (encoded labels).
model.fit(X_train, xgb_label_encoder.transform(y_train))

# Predict the response for the test set, decoded to the original labels.
xgb_pred = xgb_label_encoder.inverse_transform(model.predict(X_test))

print("Accuracy: ",metrics.accuracy_score(y_test, xgb_pred))
print("F1 Score: ",f1_score(y_test, xgb_pred, average='macro'))

In [None]:
# Confusion matrix, accuracy and per-class report for the XGBoost predictions.
results = confusion_matrix(y_test, xgb_pred)
xgb_accuracy = metrics.accuracy_score(y_test, xgb_pred)
xgb_report = classification_report(y_test, xgb_pred)

print('Confusion Matrix :')
print(results)
print('Accuracy Score :', xgb_accuracy)
print('Report : ')
print(xgb_report)

In [None]:
# Fit and score a k-nearest-neighbours classifier for each candidate k.
n = [3, 4, 5, 7, 9]
for k in n:
    # Build the classifier with k neighbours and train it on the training set.
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)

    # Predict the response for the test set and score it.
    knn_pred = knn.predict(X_test)
    knn_accuracy = metrics.accuracy_score(y_test, knn_pred)
    knn_f1 = f1_score(y_test, knn_pred, average='macro')

    print("Accuracy: ", knn_accuracy)
    print("F1 Score: ", knn_f1)

In [None]:
# Fit and score a decision tree for each candidate max_depth.
depth = [3,5,7,15,17,21]
for i in depth:
    # Train a tree of depth i; random_state is pinned so reruns report the
    # same scores.
    dec = DecisionTreeClassifier(max_depth=i, random_state=0).fit(X_train, y_train)
    # Predict the response for the test set.
    dec_pred = dec.predict(X_test)

    print("Accuracy: ",metrics.accuracy_score(y_test, dec_pred))
    print("F1 Score: ",f1_score(y_test, dec_pred, average='macro'))