In [None]:
import numpy as np
import pandas as pd
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
#Import model library
from sklearn import svm
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
#Import scikit-learn metrics module for accuracy calculation
from sklearn import metrics
from sklearn.metrics import f1_score
#Import library for plotting
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the raw used-car listings into the working DataFrame.
df = pd.read_csv(filepath_or_buffer='used_cars.csv')

### Data Exploration

In [None]:
# Preview the first rows of the dataset
df.head()

In [None]:
# Number of records (rows) and columns
df.shape

In [None]:
# Summary statistics of the dataset
df.describe()

In [None]:
# Data type of each column
df.dtypes

In [None]:
# Names of the numeric columns, obtained through the public select_dtypes API
# rather than the private DataFrame._get_numeric_data helper.
df.select_dtypes(include='number').columns

In [None]:
# Correlation matrix of the numeric columns, shown as an annotated heatmap.
# numeric_only=True is needed on pandas >= 2.0, where DataFrame.corr() no longer
# drops non-numeric columns silently and raises on string data instead.
corrmat = df.corr(numeric_only=True)
f, ax = plt.subplots(figsize=(12, 9))
# Draw on the axes created above; the trailing ';' hides the Axes repr.
sns.heatmap(corrmat, annot=True, square=True, ax=ax);

In [None]:
# Missing values per column: absolute count and share of rows, largest first.
null_mask = df.isnull()
null_counts = null_mask.sum()

total = null_counts.sort_values(ascending=False)
percent = (null_counts / null_mask.count()).sort_values(ascending=False)

# Side-by-side table with one row per column of df
missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing_data

In [None]:
# Check outliers with one boxplot per numeric column.
# The figure size is passed to plt.subplots directly: a separate plt.figure(...)
# call only creates an additional empty figure and does not size the subplots.
boxplot_columns = ["price", "odometer", "year", "lat", "long"]
f, axes = plt.subplots(1, len(boxplot_columns), figsize=(15, 5))
for box_ax, box_column in zip(axes, boxplot_columns):
    sns.boxplot(y=df[box_column], ax=box_ax)
# Extra horizontal spacing so the y-axis labels of neighbouring plots do not overlap
plt.subplots_adjust(wspace=1)

#### Number of occurrences per category for selected columns

In [None]:
# Frequency of each `condition` value
df['condition'].value_counts()

In [None]:
# Bar chart of the number of rows per `condition` value
sns.countplot(x = 'condition', data = df)

In [None]:
# Frequency of each `fuel` value
df['fuel'].value_counts()

In [None]:
# Bar chart of the number of rows per `fuel` value
sns.countplot(x = 'fuel', data = df)

In [None]:
# Bar chart of the number of rows per `cylinders` value
sns.countplot(x = 'cylinders', data = df)

In [None]:
# Bar chart of the number of rows per `type` value
sns.countplot(x = 'type', data = df)

In [None]:
# Bar chart of the number of rows per `title_status` value
sns.countplot(x = 'title_status', data = df)

In [None]:
# Bar chart of the number of rows per `transmission` value
sns.countplot(x = 'transmission', data = df)

In [None]:
# Bar chart of the number of rows per `drive` value
sns.countplot(x = 'drive', data = df)

In [None]:
# Bar chart of the number of rows per `size` value
sns.countplot(x = 'size', data = df)

In [None]:
# Bar chart of the number of rows per `state` value
sns.countplot(x = 'state', data = df)

In [None]:
# Bar chart of the number of rows per `region` value
sns.countplot(x = 'region', data = df)

In [None]:
# Bar chart of the number of rows per `paint_color` value
sns.countplot(x = 'paint_color', data = df)

#### Check data distribution

In [None]:
sns.distplot(df[df['year'].notnull()]['year'], kde=False, bins=10);

In [None]:
sns.distplot(df[df['odometer'].notnull()]['odometer'], kde=False, bins=10);

In [None]:
sns.distplot(df[df['price'].notnull()]['price'], kde=False, bins=10);

In [None]:
# Relationship between odometer and price (rows with a missing odometer excluded).
# sns.pairplot builds its own figure, so no plt.figure(...) call is made here;
# a preceding plt.figure(figsize=(100, 100)) only allocates a huge blank canvas.
df_nona = df[df['odometer'].notnull()]
g = sns.pairplot(df_nona[['price', 'odometer']])

In [None]:
# Relationship between price and year (rows with a missing year excluded).
# sns.pairplot builds its own figure, so no plt.figure(...) call is made here;
# a preceding plt.figure(figsize=(100, 100)) only allocates a huge blank canvas.
df_nona = df[df['year'].notnull()]
g = sns.pairplot(df_nona[['price', 'year']])

### Data Preparation

In [None]:
# Remove the leftover 'Unnamed: 0' column and 'county'
# (per the original note, 'county' holds no values — not verified in this cell).
columns_to_drop = ['Unnamed: 0', 'county']
df = df.drop(columns_to_drop, axis=1)

In [None]:
# List the current column names
df.columns

#### Dealing with Missing Values

In [None]:
# cdf['cylinders']=cdf['cylinders'].str.replace('cylinders','')

In [None]:
# Fill missing `model` values with the most frequent model.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the selected column: under pandas Copy-on-Write the
# in-place call only modifies a temporary object and leaves df unchanged.
most_common_model = df['model'].value_counts().idxmax()
df['model'] = df['model'].fillna(most_common_model)

In [None]:
# Treat every 0 in the frame as a missing value.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
df = df.replace(0, np.nan)

# Fill missing numeric values with the column median.
# Assigning the filled columns back (instead of a chained fillna(inplace=True))
# keeps this working under pandas Copy-on-Write.
median_fill_columns = ['odometer', 'price', 'year', 'lat', 'long']
df[median_fill_columns] = df[median_fill_columns].fillna(df[median_fill_columns].median())

In [None]:
#fill missing values with 'other'

df['cylinders'].fillna('other', inplace=True)

In [None]:
df['cylinders'].isnull().sum()

In [None]:
#fill missing values with 'other'

df['fuel'].fillna('other', inplace=True)

In [None]:
df['fuel'].isnull().sum()

In [None]:
#Max fill function for title_status
df['title_status'].fillna(df['title_status'].value_counts().idxmax(), inplace=True)

In [None]:
df['title_status'].isnull().sum()

In [None]:
#fill missing values with 'other'

df['transmission'].fillna('other', inplace=True)

In [None]:
df['transmission'].isnull().sum()

In [None]:
#fill missing values with 'other'

df['drive'].fillna('other', inplace=True)

In [None]:
df['drive'].isnull().sum()

In [None]:
#fill missing values with 'other'

df['type'].fillna('other', inplace=True)

In [None]:
df['type'].isnull().sum()

In [None]:
df = df.drop(columns = ['vin', 'paint_color','size','id','url','region_url','image_url','description'])

In [None]:
df.dropna(inplace=True)

##### Dealing with Outliers

In [None]:
from scipy import stats

z = np.abs(stats.zscore(df[['odometer','price']]))
df= df[(z < 3).all(axis=1)]

In [None]:
df.shape

### Feature Engineering

#### Binning

In [None]:
# Collapse `cylinders` into size groups to reduce class imbalance.
# Each pair is (substring to look for, group label); anything else becomes 'other'.
cylinder_bins = [
    ('6 cylinders', 'medium'),
    ('4 cylinders', 'small'),
    ('8 cylinders', 'large'),
]
cylinders_conditions = [df['cylinders'].str.contains(pattern) for pattern, _ in cylinder_bins]
choices = [group for _, group in cylinder_bins]
df['cylinders'] = np.select(cylinders_conditions, choices, default='other')

In [None]:
# Collapse `fuel` into 'gas', 'diesel' and 'other' to reduce class imbalance.
# Each pair is (substring to look for, group label).
fuel_bins = [
    ('gas', 'gas'),
    ('diesel', 'diesel'),
]
fuel_conditions = [df['fuel'].str.contains(pattern) for pattern, _ in fuel_bins]
fuel_choices = [group for _, group in fuel_bins]
df['fuel'] = np.select(fuel_conditions, fuel_choices, default='other')

In [None]:
# Collapse `condition` into 'excellent', 'good' and 'other' to reduce class imbalance.
# np.select takes the first matching entry, so 'like new' is listed before 'new'.
condition_bins = [
    ('excellent', 'excellent'),
    ('good', 'good'),
    ('like new', 'excellent'),
    ('fair', 'good'),
    ('salvage', 'good'),
    ('new', 'excellent'),
]
conditions = [df['condition'].str.contains(pattern) for pattern, _ in condition_bins]
choices = [group for _, group in condition_bins]
df['condition'] = np.select(conditions, choices, default='other')

In [None]:
# Collapse `title_status` into three groups to reduce class imbalance:
# 'clean' -> 'excellent', 'rebuilt' -> 'good', everything else -> 'fair'.
title_status_bins = [
    ('clean', 'excellent'),
    ('rebuilt', 'good'),
]
status_conditions = [df['title_status'].str.contains(pattern) for pattern, _ in title_status_bins]
choices = [group for _, group in title_status_bins]
df['title_status'] = np.select(status_conditions, choices, default='fair')

In [None]:
# df = df[['condition', 'cylinders', 'drive', 'odometer', 'fuel', 'year', 'state', 'price']]

#### Encoding categorical data

In [None]:
# Integer-encode the listed categorical columns, using a fresh encoder per column.
for encoded_column in ['cylinders', 'title_status', 'manufacturer', 'model', 'region']:
    labelencoder = LabelEncoder()
    df[encoded_column] = labelencoder.fit_transform(df[encoded_column])

In [None]:
# One-hot encode the nominal columns and append the indicator columns to df.
# The original columns are kept alongside the new ones.
one_hot_columns = ['drive', 'fuel', 'state', 'transmission', 'type']
enc = pd.get_dummies(df[one_hot_columns])
df = df.join(enc)
df.head()

##### Scaling

In [None]:
# Min-max scale the numeric and label-encoded columns in place of their raw values.
scaled_columns = [
    'price', 'year', 'odometer', 'cylinders', 'title_status',
    'manufacturer', 'model', 'region', 'lat', 'long',
]
sc = MinMaxScaler()
df[scaled_columns] = sc.fit_transform(df[scaled_columns])
df.head()

In [None]:
#dataset with just missing value, scaling, encoding
# df.to_csv('used_car_ver1.csv')

In [None]:
# Save the dataset after missing-value handling, outlier removal, binning,
# encoding and scaling.
# index=False: otherwise the row index is written as an extra unnamed column,
# which comes back as 'Unnamed: 0' on read_csv and ends up among the features.
df.to_csv('used_car_ver2.csv', index=False)

### Experiment with dataset version 1

In [None]:
# Load dataset version 1. NOTE: the cell that writes this file is commented out
# in this notebook, so the file must already exist from an earlier run.
df1 = pd.read_csv('used_car_ver1.csv')
# If the file was saved with its row index, read_csv exposes it as an
# 'Unnamed: 0' column; drop it so it is not used as a feature.
df1 = df1.loc[:, ~df1.columns.str.startswith('Unnamed')]

#### Feature Selection

In [None]:
# Target: `condition`. Features: every column except the target and the
# columns listed below.
excluded_columns = [
    'drive', 'state', 'fuel', 'transmission', 'type',
    'condition', 'title_status', 'manufacturer', 'model',
]
y = df1['condition']
X = df1.drop(excluded_columns, axis=1)

In [None]:
# y = df['transmission']
# X = df.drop(columns = ['drive', 'state','fuel', 'transmission', 'type', 'condition', 'title_status', 'transmission_automatic', 'transmission_manual', 'transmission_other', 'price', 'state_ut', 'state_va', 'state_vt', 'state_wa',  'odometer' ])

#### Split dataset into training and test sets

In [None]:
# Hold out 25% of the rows for testing. A fixed random_state makes the split,
# and therefore every score computed from it, reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [None]:
# Create a linear-kernel SVM classifier.
# The estimator gets its own name: rebinding `svm` would shadow the imported
# sklearn.svm module and break any later svm.SVC(...) call.
svm_clf = SVC(kernel='linear')

# Train the model using the training set
svm_clf.fit(X_train, y_train)

# Predict the response for the test dataset with the classifier fitted above
svm_pred = svm_clf.predict(X_test)

# Model accuracy and macro-averaged F1 score
print("Accuracy: ", metrics.accuracy_score(y_test,svm_pred))
print("F1 Score: ",f1_score(y_test, svm_pred, average='macro'))

In [None]:
# Create a random forest with 20 trees and train it on the training set.
# random_state is fixed so that the reported scores are reproducible.
rfc = RandomForestClassifier(n_estimators=20, random_state=42).fit(X_train, y_train)

# Predict the response for the test dataset
rfc_pred = rfc.predict(X_test)

# Model accuracy and macro-averaged F1 score
print("Accuracy: ",metrics.accuracy_score(y_test, rfc_pred))
print("F1 Score: ",f1_score(y_test, rfc_pred, average='macro'))

In [None]:
# Current XGBoost releases require integer class labels 0..n_classes-1 and
# reject string targets, so the target is encoded for training and the
# predictions are decoded back to the original labels afterwards.
xgb_label_encoder = LabelEncoder()
y_train_encoded = xgb_label_encoder.fit_transform(y_train)

# Create the XGB classifier
model = XGBClassifier()

# Train the model using the training set
model.fit(X_train, y_train_encoded)

# Predict the response for the test dataset, in the original label space
xgb_pred = xgb_label_encoder.inverse_transform(model.predict(X_test))

# Model accuracy and macro-averaged F1 score
print("Accuracy: ",metrics.accuracy_score(y_test, xgb_pred))
print("F1 Score: ",f1_score(y_test, xgb_pred, average='macro'))

In [None]:
# Evaluate a KNN classifier for several neighbourhood sizes.
n = [3,4,5,7,9]
for i in n:
    # Create the KNN classifier with the current number of neighbours
    # (the loop variable is used; `n4` was an undefined name)
    knn = KNeighborsClassifier(n_neighbors=i)
    # Train the model using the training set
    knn.fit(X_train, y_train)
    # Predict the response for the test dataset
    knn_pred = knn.predict(X_test)

    # Label the scores so each pair can be attributed to its k
    print("n_neighbors: ", i)
    print("Accuracy: ",metrics.accuracy_score(y_test, knn_pred))
    print("F1 Score: ",f1_score(y_test, knn_pred, average='macro'))

In [None]:
# Fit and score one decision tree per maximum depth.
depth = [3,5,7,11,15,17]
for i in depth:
    # Build the tree with the current depth limit, then train it
    dec = DecisionTreeClassifier(max_depth=i)
    dec.fit(X_train, y_train)
    # Predictions for the held-out test rows
    dec_pred = dec.predict(X_test)

    dec_accuracy = metrics.accuracy_score(y_test, dec_pred)
    print("Accuracy: ", dec_accuracy)
    dec_f1 = f1_score(y_test, dec_pred, average='macro')
    print("F1 Score: ", dec_f1)

In [None]:
# Load dataset version 2. The file name matches the one written by
# df.to_csv('used_car_ver2.csv') earlier in the notebook
# ('used_cars_ver2.csv' is never created).
df2 = pd.read_csv('used_car_ver2.csv')
# If the file was saved with its row index, read_csv exposes it as an
# 'Unnamed: 0' column; drop it so it is not used as a feature.
df2 = df2.loc[:, ~df2.columns.str.startswith('Unnamed')]

In [None]:
# Target: `condition`. Features: every column except the target and the
# columns listed below.
excluded_columns = [
    'drive', 'state', 'fuel', 'transmission', 'type',
    'condition', 'title_status', 'manufacturer', 'model',
]
y = df2['condition']
X = df2.drop(excluded_columns, axis=1)

In [None]:
# Hold out 25% of the rows for testing. A fixed random_state makes the split,
# and therefore every score computed from it, reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [None]:
# Create a linear-kernel SVM classifier.
# The estimator gets its own name: rebinding `svm` would shadow the imported
# sklearn.svm module, and calling svm.SVC on an already-rebound `svm` fails.
svm_clf = SVC(kernel='linear')

# Train the model using the training set
svm_clf.fit(X_train, y_train)

# Predict the response for the test dataset with the classifier fitted above
svm_pred = svm_clf.predict(X_test)

# Model accuracy and macro-averaged F1 score
print("Accuracy: ", metrics.accuracy_score(y_test,svm_pred))
print("F1 Score: ",f1_score(y_test, svm_pred, average='macro'))

In [None]:
# Create a random forest with 20 trees and train it on the training set.
# random_state is fixed so that the reported scores are reproducible.
rfc = RandomForestClassifier(n_estimators=20, random_state=42).fit(X_train, y_train)

# Predict the response for the test dataset
rfc_pred = rfc.predict(X_test)

# Model accuracy and macro-averaged F1 score
print("Accuracy: ",metrics.accuracy_score(y_test, rfc_pred))
print("F1 Score: ",f1_score(y_test, rfc_pred, average='macro'))

In [None]:
# Current XGBoost releases require integer class labels 0..n_classes-1 and
# reject string targets, so the target is encoded for training and the
# predictions are decoded back to the original labels afterwards.
xgb_label_encoder = LabelEncoder()
y_train_encoded = xgb_label_encoder.fit_transform(y_train)

# Create the XGB classifier
model = XGBClassifier()

# Train the model using the training set
model.fit(X_train, y_train_encoded)

# Predict the response for the test dataset, in the original label space
xgb_pred = xgb_label_encoder.inverse_transform(model.predict(X_test))

# Model accuracy and macro-averaged F1 score
print("Accuracy: ",metrics.accuracy_score(y_test, xgb_pred))
print("F1 Score: ",f1_score(y_test, xgb_pred, average='macro'))

In [None]:
# Evaluate a KNN classifier for several neighbourhood sizes.
n = [3,4,5,7,9]
for i in n:
    # Create the KNN classifier with the current number of neighbours
    # (the loop variable is used; `n4` was an undefined name)
    knn = KNeighborsClassifier(n_neighbors=i)
    # Train the model using the training set
    knn.fit(X_train, y_train)
    # Predict the response for the test dataset
    knn_pred = knn.predict(X_test)

    # Label the scores so each pair can be attributed to its k
    print("n_neighbors: ", i)
    print("Accuracy: ",metrics.accuracy_score(y_test, knn_pred))
    print("F1 Score: ",f1_score(y_test, knn_pred, average='macro'))

In [None]:
# Fit and score one decision tree per maximum depth.
depth = [3,5,7,11,15,17]
for i in depth:
    # Build the tree with the current depth limit, then train it
    dec = DecisionTreeClassifier(max_depth=i)
    dec.fit(X_train, y_train)
    # Predictions for the held-out test rows
    dec_pred = dec.predict(X_test)

    dec_accuracy = metrics.accuracy_score(y_test, dec_pred)
    print("Accuracy: ", dec_accuracy)
    dec_f1 = f1_score(y_test, dec_pred, average='macro')
    print("F1 Score: ", dec_f1)