In [2]:
import pandas as pd
import numpy as np
import pathlib
import matplotlib.pyplot as plt
import sklearn
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
%matplotlib inline

In [3]:
cwd = pathlib.Path().cwd()
data_path = cwd/"dataset"/"final_train.csv"
pd.set_option("display.max_rows", None, "display.max_columns", None)
data = pd.read_csv(data_path, index_col=0)

In [4]:
# drop rows
data_d = data.dropna()

In [5]:
def data_to_train_valid(data):
    y = data.loc[:, "Activity"].copy()
    X = data.drop(["Activity", "subject", "void()"], axis=1)
    #simple split
    X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=0)
    return X_train, X_valid, y_train, y_valid

In [6]:
def encode_y(y_train, y_valid):
    label_encoder = LabelEncoder()
    y_train = label_encoder.fit_transform(y_train)
    y_valid = label_encoder.transform(y_valid)
    return y_train, y_valid, label_encoder

## Split data, encode y labels

In [7]:
X_train, X_valid, y_train, y_valid = data_to_train_valid(data)

In [8]:
y_train, y_valid, label_encoder = encode_y(y_train, y_valid)

In [9]:
numerical_columns = list(X_train.columns.values)

In [10]:
numerical_transformer = SimpleImputer(strategy='median')
scaler = StandardScaler()
preproc = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_columns)
       
    ])

# SVC

In [11]:
svc = SVC(C=1)
svc_pipe = Pipeline(steps=[('preproc', preproc),
                            ('scaler', scaler),
                            ('model', svc)]
                     )

# Grid search

In [15]:
params_grid = [{'model__kernel': ['rbf'], 'model__gamma': [1e-3, 1e-4],
                     'model__C': [1, 10, 100, 1000]},
                    {'model__kernel': ['linear'], 'model__C': [1, 10, 100, 1000]}]

In [17]:
svc_grid = GridSearchCV(svc_pipe, params_grid, cv=5, verbose=2, n_jobs=-1)

In [18]:
svc_pipe.get_params().keys()

dict_keys(['memory', 'steps', 'verbose', 'preproc', 'scaler', 'model', 'preproc__n_jobs', 'preproc__remainder', 'preproc__sparse_threshold', 'preproc__transformer_weights', 'preproc__transformers', 'preproc__verbose', 'preproc__num', 'preproc__num__add_indicator', 'preproc__num__copy', 'preproc__num__fill_value', 'preproc__num__missing_values', 'preproc__num__strategy', 'preproc__num__verbose', 'scaler__copy', 'scaler__with_mean', 'scaler__with_std', 'model__C', 'model__break_ties', 'model__cache_size', 'model__class_weight', 'model__coef0', 'model__decision_function_shape', 'model__degree', 'model__gamma', 'model__kernel', 'model__max_iter', 'model__probability', 'model__random_state', 'model__shrinking', 'model__tol', 'model__verbose'])

In [19]:
svc_grid.fit(X_train, y_train)

Fitting 5 folds for each of 12 candidates, totalling 60 fits


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('preproc',
                                        ColumnTransformer(transformers=[('num',
                                                                         SimpleImputer(strategy='median'),
                                                                         ['angle(X,gravityMean)',
                                                                          'angle(Y,gravityMean)',
                                                                          'angle(Z,gravityMean)',
                                                                          'angle(tBodyAccJerkMean),gravityMean)',
                                                                          'angle(tBodyAccMean,gravity)',
                                                                          'angle(tBodyGyroJerkMean,gravityMean)',
                                                                          'angle(tBodyGyroMean,gravityMean)',
      

In [20]:
svc_grid.best_score_

0.9890619088405501

In [21]:
svc_grid.best_params_

{'model__C': 100, 'model__gamma': 0.001, 'model__kernel': 'rbf'}

In [22]:
best_svc = svc_grid.best_estimator_

In [23]:
y_pred_svc = best_svc.predict(X_valid)

In [24]:
accuracy_score(y_pred_svc, y_valid)

0.9873760144274121

In [25]:
print(label_encoder.classes_)
print(label_encoder.transform(label_encoder.classes_))

['LAYING' 'SITTING' 'STANDING' 'WALKING' 'WALKING_DOWNSTAIRS'
 'WALKING_UPSTAIRS']
[0 1 2 3 4 5]


rows reality, columns predictions, for example (indexing from 0)row=1, col=2 : 21 is how many times sitting was predicted as standing

In [26]:
confusion_matrix(y_pred, y_valid)

NameError: name 'y_pred' is not defined

svc deals greatly with all walkings 

# Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
log_reg = LogisticRegression()
log_reg_pipe = Pipeline(steps=[('preproc', preproc),
                            ('scaler', scaler),
                            ('model', log_reg)]
                     )

In [None]:
log_reg_pipe.get_params().keys()

In [None]:
params_grid = {'model__C':[0.01, 0.03, 0.1, 0.3, 1, 10, 30], 'model__penalty':['l1', 'l2']}

In [None]:
log_reg_grid = GridSearchCV(log_reg_pipe, params_grid, cv=5, verbose=2, n_jobs=-1)

In [None]:
log_reg_grid.fit(X_train, y_train)

In [None]:
log_reg_best = log_reg_grid.best_estimator_

In [None]:
log_reg_grid.best_params_

In [None]:
log_reg_grid.best_score_

In [None]:
y_pred_log_reg = log_reg_grid.predict(X_valid)
accuracy_score(y_pred, y_valid)

In [None]:
print(label_encoder.classes_)
print(label_encoder.transform(label_encoder.classes_))

In [None]:
confusion_matrix(y_pred_log_reg, y_valid)

log_reg deals greatly with all walkings 

# Random forest classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier()
rfc_pipe = Pipeline(steps=[('preproc', preproc),
                          ('scaler', scaler),
                          ('model', rfc)])

In [None]:
rfc_pipe.get_params().keys()

In [None]:
params_grid = {"model__n_estimators":[100,150, 500, 750, 1000],
             "model__max_depth": [1, 5, 10, 25],
             "model__max_features": ["auto", "log2"],
             "model__criterion":["gini", "entropy"]}

In [None]:
rfc_grid = GridSearchCV(rfc_pipe, params_grid, cv=5, verbose=3)

# Random forest to distiguish from sitting and standing

In [None]:
y_train_sit = y_train == 1