In [1]:
import pandas as pd
import numpy as np
from sklearn import model_selection
from sklearn import svm
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

In [2]:
#read data in
df = pd.read_csv('data_cars.csv',header=None)
for i in range(len(df.columns)):
    df[i] = df[i].astype('category')
df.head()

Unnamed: 0,0,1,2,3,4,5,6
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc


In [3]:
#map catgories to values
map0 = dict( list(zip( df[0].cat.categories, list(range( len(df[0].cat.categories ))))))
#print map0
map1 = dict( list(zip( df[1].cat.categories, list(range( len(df[1].cat.categories ))))))
map2 = dict( list(zip( df[2].cat.categories, list(range( len(df[2].cat.categories ))))))
map3 = dict( list(zip( df[3].cat.categories, list(range( len(df[3].cat.categories ))))))
map4 = dict( list(zip( df[4].cat.categories, list(range( len(df[4].cat.categories ))))))
map5 = dict( list(zip( df[5].cat.categories, list(range( len(df[5].cat.categories ))))))
map6 = dict( list(zip( df[6].cat.categories, list(range( len(df[6].cat.categories ))))))

cat_cols = df.select_dtypes(['category']).columns
df[cat_cols] = df[cat_cols].apply(lambda x: x.cat.codes)

df = df.iloc[np.random.permutation(len(df))]
print(df.head())

      0  1  2  3  4  5  6
1596  1  2  3  0  1  1  2
1610  1  2  3  1  0  0  3
1384  1  3  3  0  0  2  2
924   2  3  2  0  0  1  2
694   0  2  1  2  2  2  2


In [4]:
df_f1 = pd.DataFrame(columns=['method']+sorted(map6, key=map6.get))
df_precision = pd.DataFrame(columns=['method']+sorted(map6, key=map6.get))
df_recall = pd.DataFrame(columns=['method']+sorted(map6, key=map6.get))
def CalcMeasures(method,y_pred,y_true,df_f1=df_f1
                 ,df_precision=df_precision,df_recall=df_recall):

    df_f1.loc[len(df_f1)] = [method]+list(f1_score(y_pred,y_true,average=None))
    df_precision.loc[len(df_precision)] = [method]+list(precision_score(y_pred,y_true,average=None))
    df_recall.loc[len(df_recall)] = [method]+list(recall_score(y_pred,y_true,average=None))
    
X= df[df.columns[:-1]].values
Y = df[df.columns[-1]].values

In [5]:
cv = 10
method = 'linear support vector machine'
clf = svm.SVC(kernel='linear',C=50)
y_pred = model_selection.cross_val_predict(clf, X,Y, cv=cv)
CalcMeasures(method,y_pred,Y)

method = 'rbf support vector machine'
clf = svm.SVC(kernel='rbf',C=50)
y_pred = model_selection.cross_val_predict(clf, X,Y, cv=cv)
CalcMeasures(method,y_pred,Y)

method = 'poly support vector machine'
clf = svm.SVC(kernel='poly',C=50)
y_pred = model_selection.cross_val_predict(clf, X,Y, cv=cv)
CalcMeasures(method,y_pred,Y)

method = 'decision tree'
clf = DecisionTreeClassifier(random_state=0)
y_pred = model_selection.cross_val_predict(clf, X,Y, cv=cv)
CalcMeasures(method,y_pred,Y)

method = 'random forest'
clf = RandomForestClassifier(n_estimators=50,random_state=0,max_features=None)
y_pred = model_selection.cross_val_predict(clf, X,Y, cv=cv)
CalcMeasures(method,y_pred,Y)

method = 'naive bayes'
clf = MultinomialNB()
y_pred = model_selection.cross_val_predict(clf, X,Y, cv=cv)
CalcMeasures(method,y_pred,Y)

method = 'logistic regression'
clf = LogisticRegression()
y_pred = model_selection.cross_val_predict(clf, X,Y, cv=cv)
CalcMeasures(method,y_pred,Y)

method = 'k nearest neighbours'
clf = KNeighborsClassifier(weights='distance',n_neighbors=5)
y_pred = model_selection.cross_val_predict(clf, X,Y, cv=cv)
CalcMeasures(method,y_pred,Y)

  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)


In [6]:
df_f1

Unnamed: 0,method,acc,good,unacc,vgood
0,linear support vector machine,0.26378,0.0,0.847903,0.0
1,rbf support vector machine,0.996109,1.0,0.999173,0.992248
2,poly support vector machine,0.786755,0.837209,0.935181,0.823529
3,decision tree,0.963731,0.888889,0.992562,0.96124
4,random forest,0.970169,0.937931,0.993773,0.977099
5,naive bayes,0.040302,0.0,0.825299,0.0
6,logistic regression,0.273476,0.0,0.821574,0.027397
7,k nearest neighbours,0.796813,0.4375,0.954202,0.645833


In [7]:
df_precision

Unnamed: 0,method,acc,good,unacc,vgood
0,linear support vector machine,0.174479,0.0,0.98595,0.0
1,rbf support vector machine,1.0,1.0,0.998347,0.984615
2,poly support vector machine,0.773438,0.782609,0.947934,0.753846
3,decision tree,0.96875,0.869565,0.992562,0.953846
4,random forest,0.973958,0.985507,0.989256,0.984615
5,naive bayes,0.020833,0.0,0.997521,0.0
6,logistic regression,0.216146,0.0,0.919008,0.015385
7,k nearest neighbours,0.78125,0.304348,0.990083,0.476923


In [8]:
df_recall

Unnamed: 0,method,acc,good,unacc,vgood
0,linear support vector machine,0.540323,0.0,0.743766,0.0
1,rbf support vector machine,0.992248,1.0,1.0,1.0
2,poly support vector machine,0.800539,0.9,0.922767,0.907407
3,decision tree,0.958763,0.909091,0.992562,0.96875
4,random forest,0.966408,0.894737,0.998332,0.969697
5,naive bayes,0.615385,0.0,0.70379,0.0
6,logistic regression,0.372197,0.0,0.742819,0.125
7,k nearest neighbours,0.813008,0.777778,0.92083,1.0


In [9]:
labels_counts=df[6].value_counts()
pd.Series(map6).map(labels_counts)

acc       384
good       69
unacc    1210
vgood      65
dtype: int64