## Initialization

In [1]:
import numpy as np
import pandas as pd

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.preprocessing import LabelEncoder

In [3]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [5]:
model_accuracies = {'KNN':1, 'LogReg':1, 'DT':1, 'RF':1, 'NB':1, 'LinearSVC':1, 'KernelSVC':1}

## Importing the Data

In [6]:
df = pd.read_csv('balance-scale.data', header = None)
df.shape

(625, 5)

In [7]:
df.head()

Unnamed: 0,0,1,2,3,4
0,B,1,1,1,1
1,R,1,1,1,2
2,R,1,1,1,3
3,R,1,1,1,4
4,R,1,1,1,5


In [8]:
df.columns = ['Class', 'LW', 'LD', 'RW', 'RD']

In [9]:
df.head()

Unnamed: 0,Class,LW,LD,RW,RD
0,B,1,1,1,1
1,R,1,1,1,2
2,R,1,1,1,3
3,R,1,1,1,4
4,R,1,1,1,5


## Preprocess the Data

In [10]:
le = LabelEncoder()

In [11]:
encoded_Class = le.fit(df['Class'])

In [12]:
df['e_Class'] = df['Class'].map(lambda x : encoded_Class.transform([x]))

In [13]:
df['e_Class'] = df['e_Class'].map(lambda x : x[0])

In [14]:
df.head()

Unnamed: 0,Class,LW,LD,RW,RD,e_Class
0,B,1,1,1,1,0
1,R,1,1,1,2,2
2,R,1,1,1,3,2
3,R,1,1,1,4,2
4,R,1,1,1,5,2


## Creating X and Y

In [21]:
Y = df['e_Class']
Y.shape

(625,)

In [16]:
Y.head()

0    0
1    2
2    2
3    2
4    2
Name: e_Class, dtype: int64

In [22]:
X = df[['LW', 'LD', 'RW', 'RD']]
X.shape

(625, 4)

In [20]:
X.head()

Unnamed: 0,LW,LD,RW,RD
0,1,1,1,1
1,1,1,1,2
2,1,1,1,3
3,1,1,1,4
4,1,1,1,5


## Create Train and Test data

In [23]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.20, random_state = 0)

In [24]:
X_train.shape

(500, 4)

In [25]:
X_test.shape

(125, 4)

In [26]:
Y_train.shape

(500,)

In [27]:
Y_test.shape

(125,)

In [28]:
pd.DataFrame(pd.DataFrame(Y_train)['e_Class'].value_counts())

Unnamed: 0,e_Class
2,237
1,218
0,45


In [29]:
pd.DataFrame(pd.DataFrame(Y_test)['e_Class'].value_counts())

Unnamed: 0,e_Class
1,70
2,51
0,4


## Decision Tree Classifier

In [33]:
clf_dt = DecisionTreeClassifier(criterion = 'entropy')

In [34]:
clf_dt.fit(X_train, Y_train)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')

In [35]:
Y_pred_dt = clf_dt.predict(X_test)

In [36]:
cm_dt = confusion_matrix(Y_pred_dt, Y_test)
cm_dt

array([[ 0, 11,  8],
       [ 1, 55,  0],
       [ 3,  4, 43]])

## Random Forest Classifier

In [37]:
clf_rf = RandomForestClassifier(n_estimators = 10, criterion = 'entropy')

In [38]:
clf_rf.fit(X_train, Y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [39]:
Y_pred_rf = clf_rf.predict(X_test)

In [40]:
cm_rf = confusion_matrix(Y_pred_rf, Y_test)
cm_rf

array([[ 0, 10,  5],
       [ 0, 60,  1],
       [ 4,  0, 45]])

## Naive Bayes Classifier

In [41]:
clf_nb = GaussianNB()

In [42]:
clf_nb.fit(X_train, Y_train)

GaussianNB(priors=None)

In [43]:
Y_pred_nb = clf_nb.predict(X_test)

In [44]:
cm_nb = confusion_matrix(Y_pred_nb, Y_test)
cm_nb

array([[ 0,  0,  0],
       [ 0, 67,  1],
       [ 4,  3, 50]])

## KNN Classifier

In [45]:
clf_knn = KNeighborsClassifier(n_neighbors = 5)

In [46]:
clf_knn.fit(X_train, Y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

In [47]:
Y_pred_knn = clf_knn.predict(X_test)

In [48]:
cm_knn = confusion_matrix(Y_pred_knn, Y_test)
cm_knn

array([[ 0,  8,  8],
       [ 1, 61,  2],
       [ 3,  1, 41]])

## Logistic Regression

In [49]:
clf_lr = LogisticRegression()

In [50]:
clf_lr.fit(X_train, Y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [51]:
Y_pred_lr = clf_lr.predict(X_test)

In [52]:
cm_lr = confusion_matrix(Y_pred_lr, Y_test)
cm_lr

array([[ 0,  0,  0],
       [ 1, 65,  3],
       [ 3,  5, 48]])

## SVC Linear

In [53]:
clf_lsvc = SVC(kernel = "linear")

In [54]:
clf_lsvc.fit(X_train, Y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [55]:
Y_pred_lsvc = clf_lsvc.predict(X_test)

In [56]:
cm_lsvc = confusion_matrix(Y_pred_lsvc, Y_test)
cm_lsvc

array([[ 3,  5,  2],
       [ 0, 64,  1],
       [ 1,  1, 48]])

## SVC Kernel

In [57]:
clf_ksvc = SVC(kernel = "rbf")

In [58]:
clf_ksvc.fit(X_train, Y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [59]:
Y_pred_ksvc = clf_ksvc.predict(X_test)

In [60]:
cm_ksvc = confusion_matrix(Y_pred_ksvc, Y_test)
cm_ksvc

array([[ 0,  1,  1],
       [ 0, 68,  1],
       [ 4,  1, 49]])

## Accuracy of Various Classifiers

In [61]:
model_accuracies

{'DT': 1,
 'KNN': 1,
 'KernelSVC': 1,
 'LinearSVC': 1,
 'LogReg': 1,
 'NB': 1,
 'RF': 1}

In [62]:
model_accuracies['DT'] = accuracy_score(Y_pred_dt, Y_test)
model_accuracies['KNN'] = accuracy_score(Y_pred_knn, Y_test)
model_accuracies['KernelSVC'] = accuracy_score(Y_pred_ksvc, Y_test)
model_accuracies['LinearSVC'] = accuracy_score(Y_pred_lsvc, Y_test)
model_accuracies['LogReg'] = accuracy_score(Y_pred_lr, Y_test)
model_accuracies['NB'] = accuracy_score(Y_pred_nb, Y_test)
model_accuracies['RF'] = accuracy_score(Y_pred_rf, Y_test)
model_accuracies

{'DT': 0.78400000000000003,
 'KNN': 0.81599999999999995,
 'KernelSVC': 0.93600000000000005,
 'LinearSVC': 0.92000000000000004,
 'LogReg': 0.90400000000000003,
 'NB': 0.93600000000000005,
 'RF': 0.83999999999999997}