In [7]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import CategoricalNB
from sklearn import metrics
from sklearn.model_selection import GridSearchCV

In [None]:
# Read the car-evaluation table from the working directory.
# NOTE(review): later cells index columns by name ('buy_price', ..., 'class'),
# so the CSV presumably carries a header row -- confirm.
data = pd.read_csv("car.data")

# Summary statistics for every column, whatever its dtype.
summary = data.describe(include="all")
summary

In [9]:
# Label-encode every column of `data` in place, one dedicated encoder per column.
#
# Mappings recorded by the original author for this dataset:
#   buy_price / maint_price : ['vhigh', 'high', 'med', 'low']     -> [3, 0, 2, 1]
#   doors                   : ['2', '3', '4', '5more']            -> [0, 1, 2, 3]
#   persons                 : ['2', '4', 'more']                  -> [0, 1, 2]
#   lug_boot                : ['small', 'med', 'big']             -> [2, 1, 0]
#   safety                  : ['low', 'med', 'high']              -> [1, 2, 0]
#   class                   : ['unacc', 'acc', 'vgood', 'good']   -> [2, 0, 3, 1]
#
# NOTE: this cell overwrites the columns of `data`; re-running it without first
# reloading the CSV would refit the encoders on the integer codes instead of the
# original labels.
categorical_columns = [
    'buy_price',
    'maint_price',
    'doors',
    'persons',
    'lug_boot',
    'safety',
    'class',
]

# Keep each fitted encoder so any column can be decoded later; a single shared
# encoder would only remember the last column it was fitted on.
encoders = {}
for column in categorical_columns:
    encoder = preprocessing.LabelEncoder()
    data[column] = encoder.fit_transform(list(data[column]))
    encoders[column] = encoder

# `proc` is the encoder used later to turn predicted class codes back into
# labels, so bind it explicitly to the target column's encoder.
proc = encoders['class']


In [10]:
# Column the models are trained to predict.
predict = 'class'

# Target vector, then the feature matrix (every column except the target).
y = data[predict]
X = data.drop(columns=[predict])

In [11]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split repeatable.
# NOTE(review): the split is not stratified -- consider stratify=y if the classes
# turn out to be imbalanced.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)

In [None]:
# Train three baseline classifiers on the same training split.
KNN = KNeighborsClassifier()
KNN.fit(X_train, y_train)

# Seed the forest so its metrics are identical on every Restart & Run All;
# an unseeded forest bootstraps and picks split features differently each run.
RF = RandomForestClassifier(random_state=0)
RF.fit(X_train, y_train)

NB = CategoricalNB()
NB.fit(X_train, y_train)

# Predict the held-out rows with each model.
y_pred = KNN.predict(X_test)
y_pred1 = RF.predict(X_test)
y_pred2 = NB.predict(X_test)

# Compare the models on the test split: first every confusion matrix,
# then every classification report (same order and text as before).
predictions = {"KNN": y_pred, "RF": y_pred1, "NB": y_pred2}
for report in (metrics.confusion_matrix, metrics.classification_report):
    for label, y_hat in predictions.items():
        print(f"{label} : \n", report(y_test, y_hat))

In [28]:
# Search space for tuning the random forest.

# Tree counts to try: 10, 20, ..., 100.
n_estimators = list(range(10, 101, 10))

# Candidate numbers of features examined at each split.
max_features = [2, 3]

# Candidate maximum tree depths.
max_depth = [3, 5, 7]

In [None]:
# Exhaustive grid search over the random-forest settings defined above,
# scored with 3-fold cross-validation on the training split only.
# The forest is seeded so the selected parameters (and the score reported
# afterwards) are reproducible from one run to the next.
RF = RandomForestClassifier(random_state=0)
param_grid = {
    'n_estimators': n_estimators,
    'max_features': max_features,
    'max_depth': max_depth,
}
# n_jobs=-1 uses every available core; verbose=10 logs progress per fit.
gs = GridSearchCV(RF, param_grid, cv=3, n_jobs=-1, verbose=10)
gs.fit(X_train, y_train)

In [None]:
# Take the winning configuration and the model refitted with it from the search.
RF = gs.best_estimator_
best_param = gs.best_params_
print(best_param)

# Score of the tuned forest on the held-out test split.
test_score = RF.score(X_test, y_test)
print(test_score)

In [None]:
# Build a single already-encoded sample to classify.
# The values are integer codes, not raw labels; they must follow the same
# encoding that was applied to the training columns.
# The dict gets its own name so the training DataFrame `data` is not
# overwritten, which would break any earlier cell that is re-run afterwards.
new_car = {
    'buy_price': 1,
    'maint_price': 0,
    'doors': 3,
    'persons': 1,
    'lug_boot': 0,
    'safety': 2,
}

# One-row frame whose columns are the dict keys in insertion order.
df = pd.DataFrame(new_car, index=[0])
df

In [None]:
# Classify the new sample with the current forest, then map the predicted
# integer code back to its label via `proc`.
# NOTE(review): this relies on `proc` having been fitted on the 'class' column -- confirm.
prediction = RF.predict(df)
predicted_label = proc.inverse_transform(prediction)
print(predicted_label)