In [1]:
import pandas as pd
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

In [2]:

def change_to_numerical(colName,df):
    a = df[colName]
    s = set(a)
    size =  len(s)
    l = []
    d = dict()
    count = 0
    for val in s:
        d[val] = count
        count += 1
    # df[colName] = df[colName].apply(d)
    df[colName] = df[colName].map(d)
    return df


In [3]:
filePath = "../../Data/Dataset/"
df = pd.read_csv(filePath+'car_evaluation.csv')

In [4]:
df.describe()

Unnamed: 0,COST,MAINTENANCE,DOORS,PERSONS,LUGGAGE,SAFETY,ACCEPTABILITY
count,1728,1728,1728,1728,1728,1728,1728
unique,4,4,4,3,3,3,4
top,vhigh,vhigh,2,2,small,low,unacc
freq,432,432,432,576,576,576,1210


In [5]:
df.head()

Unnamed: 0,COST,MAINTENANCE,DOORS,PERSONS,LUGGAGE,SAFETY,ACCEPTABILITY
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc


In [6]:
df['ACCEPTABILITY'].unique()

array(['unacc', 'acc', 'vgood', 'good'], dtype=object)

In [7]:
df['COST'].unique()

array(['vhigh', 'high', 'med', 'low'], dtype=object)

In [8]:
df['MAINTENANCE'].unique()

array(['vhigh', 'high', 'med', 'low'], dtype=object)

In [9]:
df['LUGGAGE'].unique()

array(['small', 'med', 'big'], dtype=object)

In [10]:
df['SAFETY'].unique()

array(['low', 'med', 'high'], dtype=object)

Here Acceptability is the output class where as others are input. we have all 6 categorical features
Among these as we can see that different categorical features have different categories. Cost & Maintainance can be 
categorized as vhigh to low, so we can treat them as 4,3,2,1 and Luggage and safety can have categories as 3,2,1. The reason
we can mark them numerically is that they are relative to each other and their value ranges from high to low.

In [11]:
df['DOORS'].unique()

array(['2', '3', '4', '5more'], dtype=object)

In [12]:
df['PERSONS'].unique()

array(['2', '4', 'more'], dtype=object)

In [13]:
df['COST'].unique()

array(['vhigh', 'high', 'med', 'low'], dtype=object)

In [14]:
def cat_to_numeric_1(x):
    if x == 'vhigh':
        return 4
    elif x == 'high':
        return 3
    elif x == 'med':
        return 2
    elif x == 'low':
        return 1
    else:
        return 0

df['COST'] = df['COST'].apply(cat_to_numeric_1)
df['MAINTENANCE'] = df['MAINTENANCE'].apply(cat_to_numeric_1)

In [15]:
def cat_to_numeric_2(x):
    if x == 'big':
        return 3
    elif x == 'med':
        return 2
    elif x == 'small':
        return 1
    else:
        return 0
df['LUGGAGE'] = df['LUGGAGE'].apply(cat_to_numeric_1)

In [16]:
def cat_to_numeric_3(x):
    if x == 'high':
        return 3
    elif x == 'med':
        return 2
    elif x == 'low':
        return 1
    else:
        return 0
df['SAFETY'] = df['SAFETY'].apply(cat_to_numeric_1)

In [17]:
def cat_to_numeric_4(x):
    if x== '5more' or x=='more':
        return 6
    else:
        return x
df['DOORS'] = df['DOORS'].apply(cat_to_numeric_4)
df['PERSONS'] = df['PERSONS'].apply(cat_to_numeric_4)

In [18]:
train, test = train_test_split(df, test_size=0.2)

train_x = train[train.columns[train.columns!='ACCEPTABILITY'] ]
train_y = train["ACCEPTABILITY"]

test_x = test[test.columns[test.columns!='ACCEPTABILITY'] ]
test_y = test["ACCEPTABILITY"]

In [19]:
dst = DecisionTreeClassifier()

In [20]:
dst.fit(train_x,train_y)

In [22]:
y_pred = dst.predict(test_x)

In [23]:
accuracy_score(test_y,y_pred)

0.8121387283236994

In [26]:
confusion_matrix(test_y,y_pred)

array([[ 58,   6,  12,   6],
       [  5,   5,   0,   6],
       [ 18,   1, 212,   1],
       [  5,   4,   1,   6]], dtype=int64)

In [25]:
print(classification_report(test_y,y_pred))

              precision    recall  f1-score   support

         acc       0.67      0.71      0.69        82
        good       0.31      0.31      0.31        16
       unacc       0.94      0.91      0.93       232
       vgood       0.32      0.38      0.34        16

    accuracy                           0.81       346
   macro avg       0.56      0.58      0.57       346
weighted avg       0.82      0.81      0.82       346



In [27]:
# Let us apply now different classification algorithms

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from datetime import datetime

In [29]:
models = {'Logistic Regression':LogisticRegression(), 
         'Support Vector Classifier':SVC(),
          'Decision Tree':DecisionTreeClassifier(),
         'Random Forest':RandomForestClassifier(),
          'Neural Network':MLPClassifier(),
         'Stochastic Gradient Descent':SGDClassifier(), 
          'KNN':KNeighborsClassifier()}

for model, algorithm in models.items():
    start_time = datetime.now() 
    pipe = Pipeline([('model', algorithm)])
    pipe.fit(train_x ,train_y)
    end_time = datetime.now() 
    prediction = pipe.predict(test_x)
    print("\n \n ======= For {} ============".format(model))
    print('Accuracy Score : {} '.format(accuracy_score(prediction,test_y)))
    print('Confusion Matrix \n\n  ',confusion_matrix(prediction,test_y))
    print('\n Classification Report \n ')
    print(classification_report(prediction,test_y))
    time_difference = (end_time - start_time).total_seconds() * 10**3
    print("Execution time of program is: ", time_difference, "ms")

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



 
Accuracy Score : 0.7861271676300579 
Confusion Matrix 

   [[ 48   5  14   5]
 [  3   4   6   0]
 [ 31   1 209   0]
 [  0   6   3  11]]

 Classification Report 
 
              precision    recall  f1-score   support

         acc       0.59      0.67      0.62        72
        good       0.25      0.31      0.28        13
       unacc       0.90      0.87      0.88       241
       vgood       0.69      0.55      0.61        20

    accuracy                           0.79       346
   macro avg       0.61      0.60      0.60       346
weighted avg       0.80      0.79      0.79       346

Execution time of program is:  85.647 ms

 
Accuracy Score : 0.8554913294797688 
Confusion Matrix 

   [[ 68   6  17   7]
 [  4   7   1   2]
 [ 10   0 214   0]
 [  0   3   0   7]]

 Classification Report 
 
              precision    recall  f1-score   support

         acc       0.83      0.69      0.76        98
        good       0.44      0.50      0.47        14
       unacc       0.92      

