In [1]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import time
import sklearn

In [2]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Binarizer
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn import preprocessing

In [3]:
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn import svm
from sklearn import tree

In [4]:
def model_acc_time(X_train, X_test, y_train, y_test, model):
    # creates a model based on training data and returns the accuracy on the held out test set and the time it took to train the model
    # data should be pre processed already
    
    # records the time and accuracy for each model, as well as saving the model itself
    total_time = []
    accuracy = []
    models = []

    # manual 5 fold on test data to select the best model
    for i in range(5):
        # splitting the training data to train and evaluate the model
        kfX_train, kfX_test, kfy_train, kfy_test = train_test_split(
    X_train, y_train, test_size=0.2,shuffle=True)

        # creates a new unfitted model with the inputted parameters
        curModel = sklearn.base.clone(model)


        # times the training of the model and calcualtes the accuracy on the validation set
        t1 = time.time()
        curModel.fit(kfX_train,kfy_train)
        accuracy.append((curModel.predict(kfX_test)==np.array(kfy_test)[0]).mean())
        t2 = time.time()
        models.append(curModel)
        total_time.append(t2-t1)

    # selects the model with the highest accuracy on the validation set and calcuates the accuracy on the held out set
    best_index = accuracy.index(max(accuracy))
    t = total_time[best_index]
    most_acc_model = models[best_index]
    acc = (most_acc_model.predict(X_test)==np.array(y_test)[0]).mean()*100

    return "Accuracy: {0:.2f}% || Time to Train: {1:.3f} seconds".format(acc, t)

In [7]:
phoneData = pd.read_csv("data/phone_activity.csv")
phoneData

Unnamed: 0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,feature_10,...,feature_553,feature_554,feature_555,feature_556,feature_557,feature_558,feature_559,feature_560,feature_561,activity
0,0.289,-0.0203,-0.1330,-0.995,-0.9830,-0.914,-0.995,-0.983,-0.924,-0.93500,...,-0.2990,-0.710,-0.1130,0.03040,-0.465,-0.0184,-0.841,0.180,-0.0586,5
1,0.278,-0.0164,-0.1240,-0.998,-0.9750,-0.960,-0.999,-0.975,-0.958,-0.94300,...,-0.5950,-0.861,0.0535,-0.00743,-0.733,0.7040,-0.845,0.180,-0.0543,5
2,0.280,-0.0195,-0.1130,-0.995,-0.9670,-0.979,-0.997,-0.964,-0.977,-0.93900,...,-0.3910,-0.760,-0.1190,0.17800,0.101,0.8090,-0.849,0.181,-0.0491,5
3,0.279,-0.0262,-0.1230,-0.996,-0.9830,-0.991,-0.997,-0.983,-0.989,-0.93900,...,-0.1170,-0.483,-0.0368,-0.01290,0.640,-0.4850,-0.849,0.182,-0.0477,5
4,0.277,-0.0166,-0.1150,-0.998,-0.9810,-0.990,-0.998,-0.980,-0.990,-0.94200,...,-0.3510,-0.699,0.1230,0.12300,0.694,-0.6160,-0.848,0.185,-0.0439,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10294,0.310,-0.0534,-0.0991,-0.288,-0.1410,-0.215,-0.356,-0.149,-0.232,0.18500,...,-0.3760,-0.751,-0.3370,0.34600,0.885,-0.6990,-0.652,0.275,0.1850,2
10295,0.363,-0.0392,-0.1060,-0.305,0.0281,-0.196,-0.374,-0.030,-0.270,0.18500,...,-0.3200,-0.700,-0.7370,-0.37300,-0.657,0.3230,-0.655,0.274,0.1820,2
10296,0.350,0.0301,-0.1160,-0.330,-0.0421,-0.250,-0.388,-0.133,-0.347,0.00747,...,-0.1190,-0.467,-0.1820,0.08860,0.697,0.3630,-0.655,0.274,0.1810,2
10297,0.238,0.0185,-0.0965,-0.323,-0.2300,-0.208,-0.392,-0.280,-0.289,0.00747,...,-0.2050,-0.618,0.4450,-0.81900,0.929,-0.0084,-0.660,0.265,0.1880,2


In [8]:
phoneData.activity.value_counts(normalize=True)

6    0.188756
5    0.185067
4    0.172541
1    0.167201
2    0.149917
3    0.136518
Name: activity, dtype: float64

In [9]:
phoneData.corr()

Unnamed: 0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,feature_10,...,feature_553,feature_554,feature_555,feature_556,feature_557,feature_558,feature_559,feature_560,feature_561,activity
feature_1,1.000000,0.128072,-0.230328,0.004609,-0.016769,-0.036030,0.010310,-0.017470,-0.038760,0.046960,...,-0.006087,-0.002169,-0.553002,0.015760,0.036136,0.034320,-0.041054,0.034039,0.030661,-0.004497
feature_2,0.128072,1.000000,-0.029915,-0.046341,-0.046985,-0.054130,-0.045239,-0.047667,-0.055507,-0.039436,...,-0.000669,-0.002855,0.077742,-0.027572,0.013229,0.077609,-0.007518,-0.005606,-0.016230,0.037329
feature_3,-0.230328,-0.029915,1.000000,-0.024208,-0.023758,-0.015666,-0.022899,-0.022989,-0.009615,-0.040246,...,0.023110,0.023186,0.055028,-0.042151,-0.066246,-0.030737,0.003152,-0.012893,-0.028325,0.043543
feature_4,0.004609,-0.046341,-0.024208,1.000000,0.922528,0.861910,0.998661,0.916087,0.856496,0.981227,...,0.165541,0.135161,-0.034337,-0.017084,0.027459,-0.027118,-0.374103,0.449439,0.393059,-0.729099
feature_5,-0.016769,-0.046985,-0.023758,0.922528,1.000000,0.888255,0.918562,0.997510,0.887053,0.911141,...,0.220249,0.191141,-0.020811,-0.006602,0.001901,-0.015783,-0.381385,0.506122,0.425500,-0.816075
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
feature_558,0.034320,0.077609,-0.030737,-0.027118,-0.015783,-0.012185,-0.027106,-0.013406,-0.012783,-0.030903,...,-0.013063,-0.011829,-0.027495,0.023605,-0.112461,1.000000,0.024815,-0.004576,-0.012552,0.014390
feature_559,-0.041054,-0.007518,0.003152,-0.374103,-0.381385,-0.353274,-0.371169,-0.378016,-0.355844,-0.384243,...,-0.085205,-0.081957,0.008042,0.017521,-0.007100,0.024815,1.000000,-0.748247,-0.635230,0.613556
feature_560,0.034039,-0.005606,-0.012893,0.449439,0.506122,0.459099,0.444943,0.507964,0.460362,0.458845,...,0.087657,0.078005,0.003101,-0.007239,-0.006456,-0.004576,-0.748247,1.000000,0.545612,-0.605585
feature_561,0.030661,-0.016230,-0.028325,0.393059,0.425500,0.483420,0.389469,0.424472,0.480028,0.402871,...,0.058737,0.054001,-0.003234,-0.013146,-0.015371,-0.012552,-0.635230,0.545612,1.000000,-0.534107


In [11]:
#Split Data into training and testing
x = phoneData.drop("activity", axis=1)
y = phoneData[["activity"]]
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

In [12]:
#Standardize Columns
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.fit_transform(X_test)

In [19]:
# DT classifier, using entropy and a max depth of 5
DT = tree.DecisionTreeClassifier(criterion="entropy",max_depth=5)

# using the function above, we can determine the accuracy and training time of a deciosion tree model
model_acc_time(X_train, X_test, y_train, y_test, DT)

'Accuracy: 18.69% || Time to Train: 1.219 seconds'

In [20]:
# MLP Classifier 
MLP = MLPClassifier(max_iter=300, hidden_layer_sizes=(10,10), solver="sgd")

# calculating the accuracy and training time of an MLP Classifier
model_acc_time(X_train, X_test, y_train, y_test, MLP)

'Accuracy: 17.77% || Time to Train: 6.614 seconds'

In [21]:
# Random Forest
RF = RandomForestClassifier(n_estimators=200, max_depth=4,criterion="entropy")

# calculating the accuracy and training time of the random forest Classifier
model_acc_time(X_train, X_test, y_train, y_test, RF)

'Accuracy: 18.54% || Time to Train: 6.470 seconds'

In [22]:
# Extra Trees Classifier
ET = ExtraTreesClassifier(n_estimators=100, max_depth=3,criterion="entropy")

# calculating the accuracy and training time of the extra trees Classifier
model_acc_time(X_train, X_test, y_train, y_test, ET)

'Accuracy: 19.81% || Time to Train: 0.577 seconds'

In [23]:
# SGD Classifier
SGD = SGDClassifier(loss="hinge", max_iter=500, )

# calculating the accuracy and training time of the SGD Classifier
model_acc_time(X_train, X_test, y_train, y_test, SGD)

'Accuracy: 17.77% || Time to Train: 0.833 seconds'

In [24]:
# Gaussian Naive Bayes
GNB = GaussianNB()
# calculating the accuracy and training time of the Gaussian Naive Bayes
model_acc_time(X_train, X_test, y_train, y_test, GNB)

'Accuracy: 14.22% || Time to Train: 0.072 seconds'

In [25]:
# SVM
SVM = svm.SVC(kernel="sigmoid")

# calculating the accuracy and training time of the SVM
model_acc_time(X_train, X_test, y_train, y_test, SVM)


'Accuracy: 17.09% || Time to Train: 6.346 seconds'