## Initialization

In [1]:
import pandas as pd
import numpy as np

In [2]:
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

In [3]:
model_accuracies = {'LogReg':1, 'DT':1, 'RF':1, 'LinearSVC':1, 'KernelSVC':1, 'NB':1, 'KNN':1}

## Importing the data

In [19]:
dataset = pd.read_csv('Dataset.csv', delimiter=';')
dataset.shape

(165633, 19)

In [20]:
dataset.head()

Unnamed: 0,user,gender,age,how_tall_in_meters,weight,body_mass_index,x1,y1,z1,x2,y2,z2,x3,y3,z3,x4,y4,z4,class
0,debora,Woman,46,1.62,75,28.6,-3,92,-63,-23,18,-19,5,104,-92,-150,-103,-147,sitting
1,debora,Woman,46,1.62,75,28.6,-3,94,-64,-21,18,-18,-14,104,-90,-149,-104,-145,sitting
2,debora,Woman,46,1.62,75,28.6,-1,97,-61,-12,20,-15,-13,104,-90,-151,-104,-144,sitting
3,debora,Woman,46,1.62,75,28.6,-2,96,-57,-15,21,-16,-13,104,-89,-153,-103,-142,sitting
4,debora,Woman,46,1.62,75,28.6,-1,96,-61,-13,20,-15,-13,104,-89,-153,-104,-143,sitting


## Create X and Y

In [21]:
X = dataset.iloc[:, 1:18].values
Y = dataset.iloc[:, 18].values

In [22]:
X.shape

(165633, 17)

In [23]:
Y.shape

(165633,)

In [24]:
X

array([['Woman', 46, 1.62, ..., -150, -103, -147],
       ['Woman', 46, 1.62, ..., -149, -104, -145],
       ['Woman', 46, 1.62, ..., -151, -104, -144],
       ..., 
       ['Man', 75, 1.67, ..., -185, -80, -153],
       ['Man', 75, 1.67, ..., -185, -84, -156],
       ['Man', 75, 1.67, ..., -210, -88, -148]], dtype=object)

In [25]:
Y

array(['sitting', 'sitting', 'sitting', ..., 'walking', 'walking',
       'walking'], dtype=object)

## Preprocess the Data

In [26]:
le_Y = LabelEncoder()

In [27]:
Y = le_Y.fit_transform(Y)

In [28]:
Y

array([0, 0, 0, ..., 4, 4, 4], dtype=int64)

In [29]:
le_X = LabelEncoder()

In [30]:
X[:, 0] = le_X.fit_transform(X[:, 0])

In [31]:
X

array([[1, 46, 1.62, ..., -150, -103, -147],
       [1, 46, 1.62, ..., -149, -104, -145],
       [1, 46, 1.62, ..., -151, -104, -144],
       ..., 
       [0, 75, 1.67, ..., -185, -80, -153],
       [0, 75, 1.67, ..., -185, -84, -156],
       [0, 75, 1.67, ..., -210, -88, -148]], dtype=object)

In [32]:
sc_X = StandardScaler()

In [33]:
X = sc_X.fit_transform(X)



In [34]:
X

array([[ 0.79616611,  0.58668263, -0.37319438, ...,  0.46047725,
        -0.51955844,  0.95687879],
       [ 0.79616611,  0.58668263, -0.37319438, ...,  0.48657926,
        -0.56963719,  1.10815332],
       [ 0.79616611,  0.58668263, -0.37319438, ...,  0.43437524,
        -0.56963719,  1.18379059],
       ..., 
       [-1.25601929,  2.78631009,  0.57341349, ..., -0.45309309,
         0.63225278,  0.5030552 ],
       [-1.25601929,  2.78631009,  0.57341349, ..., -0.45309309,
         0.43193779,  0.2761434 ],
       [-1.25601929,  2.78631009,  0.57341349, ..., -1.10564333,
         0.23162279,  0.88124153]])

## Create Train and Test Data

In [35]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, random_state = 4)

In [36]:
X_train.shape

(132506, 17)

In [37]:
X_test.shape

(33127, 17)

In [38]:
Y_train.shape

(132506,)

In [39]:
Y_test.shape

(33127,)

In [40]:
pd.DataFrame(pd.DataFrame(Y_train)[0].value_counts())

Unnamed: 0,0
0,40443
2,37922
4,34751
3,9971
1,9419


In [41]:
pd.DataFrame(pd.DataFrame(Y_test)[0].value_counts())

Unnamed: 0,0
0,10188
2,9448
4,8639
3,2444
1,2408


## DecisionTree

In [42]:
clf_dt = DecisionTreeClassifier(criterion = 'entropy')

In [43]:
clf_dt.fit(X_train, Y_train)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [44]:
Y_pred_dt = clf_dt.predict(X_test)

In [45]:
confusion_matrix(Y_pred_dt, Y_test)

array([[10178,     3,     0,     8,     1],
       [    2,  2312,     9,    67,    31],
       [    0,    12,  9348,    19,    75],
       [    5,    55,    30,  2328,    29],
       [    3,    26,    61,    22,  8503]], dtype=int64)

## Random Forest

In [71]:
clf_rf = RandomForestClassifier(n_estimators = 100, criterion = 'entropy')

In [72]:
clf_rf.fit(X_train, Y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [73]:
Y_pred_rf = clf_rf.predict(X_test)

In [74]:
confusion_matrix(Y_pred_rf, Y_test)

array([[10187,     0,     0,     1,     0],
       [    1,  2386,     0,    23,     5],
       [    0,     6,  9421,     6,     6],
       [    0,     9,     6,  2407,     1],
       [    0,     7,    21,     7,  8627]], dtype=int64)

## Naive Bayes

In [50]:
clf_nb = GaussianNB()

In [51]:
clf_nb.fit(X_train, Y_train)

GaussianNB(priors=None)

In [52]:
Y_pred_nb = clf_nb.predict(X_test)

In [53]:
confusion_matrix(Y_pred_nb, Y_test)

array([[9215,  217,   69,  223,  150],
       [  84, 1382,  119,  503,  352],
       [ 823,  651, 8730, 1031, 1786],
       [  64,   31,   37,  328,  509],
       [   2,  127,  493,  359, 5842]], dtype=int64)

## KNN

In [54]:
clf_knn = KNeighborsClassifier(n_neighbors = 5)

In [55]:
clf_knn.fit(X_train, Y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

In [56]:
Y_pred_knn = clf_knn.predict(X_test)

In [57]:
confusion_matrix(Y_pred_knn, Y_test)

array([[10179,     1,     1,     6,     1],
       [    1,  2372,     3,    48,    24],
       [    0,    15,  9419,    28,    89],
       [    6,    17,    18,  2358,    19],
       [    2,     3,     7,     4,  8506]], dtype=int64)

## Logistic Regression

In [58]:
clf_lr = LogisticRegression()

In [59]:
clf_lr.fit(X_train, Y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [60]:
Y_pred_lr = clf_lr.predict(X_test)

In [61]:
confusion_matrix(Y_pred_lr, Y_test)

array([[10140,   187,     0,   134,    14],
       [   13,  1267,    19,   355,    79],
       [    0,   574,  8134,   535,  1737],
       [   35,   246,    16,  1181,   150],
       [    0,   134,  1279,   239,  6659]], dtype=int64)

## Linear SVC

In [62]:
clf_lsvc = SVC(kernel = 'linear')

In [63]:
clf_lsvc.fit(X_train, Y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [64]:
Y_pred_lsvc = clf_lsvc.predict(X_test)

In [65]:
confusion_matrix(Y_pred_lsvc, Y_test)

array([[10133,    63,     0,    42,     3],
       [   10,  1867,    82,   251,   384],
       [    0,   196,  7939,   405,  1421],
       [   45,   206,    21,  1599,   106],
       [    0,    76,  1406,   147,  6725]], dtype=int64)

## Kernel SVC

In [66]:
clf_ksvc = SVC(kernel = 'rbf')

In [67]:
clf_ksvc.fit(X_train, Y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [68]:
Y_pred_ksvc = clf_ksvc.predict(X_test)

In [69]:
confusion_matrix(Y_pred_ksvc, Y_test)

array([[10175,     5,     1,    11,     0],
       [    0,  2336,     3,    59,    24],
       [    0,    37,  9386,    51,   206],
       [   12,    21,    26,  2312,    17],
       [    1,     9,    32,    11,  8392]], dtype=int64)

## Accuracy of Various Models

In [75]:
model_accuracies['DT'] = accuracy_score(Y_pred_dt, Y_test)
model_accuracies['KNN'] = accuracy_score(Y_pred_knn, Y_test)
model_accuracies['KernelSVC'] = accuracy_score(Y_pred_ksvc, Y_test)
model_accuracies['LinearSVC'] = accuracy_score(Y_pred_lsvc, Y_test)
model_accuracies['LogReg'] = accuracy_score(Y_pred_lr, Y_test)
model_accuracies['NB'] = accuracy_score(Y_pred_nb, Y_test)
model_accuracies['RF'] = accuracy_score(Y_pred_rf, Y_test)
model_accuracies

{'DT': 0.98617441965768105,
 'KNN': 0.99115525100371293,
 'KernelSVC': 0.98412171340598309,
 'LinearSVC': 0.85317112929030703,
 'LogReg': 0.8265463217315181,
 'NB': 0.76967428381682612,
 'RF': 0.9970115011923808}