# Explorative data analysis of real world data on multiple sclerosis

This script extends the first script of that analysed the dataset. In this script here, I will train a model to address the following research questions:

- Based on the test results, can I classify subjects into diagnosed and control group?
- Which of the used methods is best suited for this classification problem?
- Which metric is most important to predict the participant group?
---

Importing required libraries:

In [2]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import datetime
from datetime import date
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier


from sklearn.metrics import confusion_matrix
sns.set_palette("RdBu_r", 7) # this sets the color palette
import warnings
from sklearn.exceptions import DataConversionWarning
warnings.filterwarnings(action='ignore', category=DataConversionWarning)
pd.options.mode.chained_assignment = None  # default='warn'
pd.set_option('display.max_columns', None) # display all columns

In [3]:
# get dataframe from previous script
%store -r df

In [4]:
df.head()

Unnamed: 0,floodlightOpenId,participantCreatedOn,participantIsControl,participantCountryOfResidence,participantSex,participantBirthYear,participantWeightLbs,participantHeightCms,testName,testCode,testMetricName,testMetricCode,testStartedAt,testEndedAt,testResultMetricId,testResultMetricCreatedOn,testResultMetricValue,Delta_test_register,Weekday_testResultCreatedOn,Holiday_testResultCreatedOn,Delta_Weeks_test_register,Delta_Dayss_test_register
0,FL10038084,2018-04-23 00:12:03+00:00,True,US,male,1966,159.0,178.0,Daily Questions,daily_questions,Mood Response,mood_response,2018-04-23 00:22:55+00:00,2018-04-23 00:22:55+00:00,115524,2018-04-23 00:22:57+00:00,4.0,0 days 00:10:54,1,False,0,1
1,FL10038084,2018-04-23 00:12:03+00:00,True,US,male,1966,159.0,178.0,Daily Questions,daily_questions,Mood Response,mood_response,2018-04-25 04:04:31+00:00,2018-04-25 04:04:31+00:00,116040,2018-04-25 04:04:32+00:00,5.0,2 days 03:52:29,3,False,0,3
4,FL10038084,2018-04-23 00:12:03+00:00,True,US,male,1966,159.0,178.0,IPS,ips,Correct Responses,correct_responses,2018-04-25 04:06:35+00:00,2018-04-25 04:06:51+00:00,116043,2018-04-25 04:06:51+00:00,39.0,2 days 03:54:48,3,False,0,3
5,FL10038084,2018-04-23 00:12:03+00:00,True,US,male,1966,159.0,178.0,IPS,ips,Response Time Average,response_time_avg,2018-04-25 04:06:35+00:00,2018-04-25 04:06:51+00:00,116044,2018-04-25 04:06:51+00:00,2.31,2 days 03:54:48,3,False,0,3
6,FL10038084,2018-04-23 00:12:03+00:00,True,US,male,1966,159.0,178.0,Pinching,pinching,Successful Pinches,successful_pinches,2018-04-25 04:06:56+00:00,2018-04-25 04:07:27+00:00,116045,2018-04-25 04:07:27+00:00,27.0,2 days 03:55:24,3,False,0,3


In [5]:
df_model = df

Dropping unnecessary and Timestamp columns:

In [6]:
df_model = df_model.drop(columns=['floodlightOpenId','participantCreatedOn','testCode','testMetricCode','participantIsControl','testStartedAt','testEndedAt','testResultMetricCreatedOn','Delta_test_register'])

Defining the features and dependent variable:

In [7]:
X = df_model
y = df['participantIsControl']
X.head()

Unnamed: 0,participantCountryOfResidence,participantSex,participantBirthYear,participantWeightLbs,participantHeightCms,testName,testMetricName,testResultMetricId,testResultMetricValue,Weekday_testResultCreatedOn,Holiday_testResultCreatedOn,Delta_Weeks_test_register,Delta_Dayss_test_register
0,US,male,1966,159.0,178.0,Daily Questions,Mood Response,115524,4.0,1,False,0,1
1,US,male,1966,159.0,178.0,Daily Questions,Mood Response,116040,5.0,3,False,0,3
4,US,male,1966,159.0,178.0,IPS,Correct Responses,116043,39.0,3,False,0,3
5,US,male,1966,159.0,178.0,IPS,Response Time Average,116044,2.31,3,False,0,3
6,US,male,1966,159.0,178.0,Pinching,Successful Pinches,116045,27.0,3,False,0,3


One hot encoding categorial variables:

In [8]:
#labelencoder = LabelEncoder()
#X['participantCountryOfResidence'] = labelencoder.fit_transform(X['participantCountryOfResidence'])
#onehotencoder = OneHotEncoder()
dummy = pd.get_dummies(X['participantCountryOfResidence'],prefix='Country', columns = ['participantCountryOfResidence'], drop_first=True)
dummy2 = pd.get_dummies(X['participantSex'],prefix='Sex', columns = ['participantSex'], drop_first=True)
dummy3 = pd.get_dummies(X['testName'],prefix='testName', columns = ['testName'], drop_first=True)
dummy4 = pd.get_dummies(X['testMetricName'],prefix='testMetric', columns = ['testMetricName'], drop_first=True)
dummy5 = pd.get_dummies(X['Weekday_testResultCreatedOn'],prefix='Weekday', columns = ['Weekday_testResultCreatedOn'], drop_first=True)
dummy6 = pd.get_dummies(X['Holiday_testResultCreatedOn'],prefix='Holiday', columns = ['Holiday_testResultCreatedOn'], drop_first=True)
X = pd.concat([dummy,dummy2,dummy3,dummy4,dummy5,dummy6,X], axis=1);
X = X.drop(columns=['participantCountryOfResidence','participantSex','testName','testMetricName','Weekday_testResultCreatedOn','Holiday_testResultCreatedOn'], axis=1);
X.head()


Unnamed: 0,Country_BE,Country_BR,Country_CA,Country_CH,Country_CZ,Country_DK,Country_ES,Country_FI,Country_IT,Country_PL,Country_US,Sex_male,testName_Draw A Shape,testName_Five UTurn Test,testName_IPS,testName_Mobility,testName_Pinching,testName_Static Balance,testName_Two Minute Walk Test,testMetric_Circle Hausdorff Distance Best,testMetric_Correct Responses,testMetric_Figure 8 Hausdorff Distance Best,testMetric_Hand Used,testMetric_Life Space Daily,testMetric_Mean Hausdorff Distance Best,testMetric_Mood Response,testMetric_Number of shapes drawn correctly,testMetric_Response Time Average,testMetric_Spiral Hausdorff Distance Best,testMetric_Square Hausdorff Distance Best,testMetric_Steps,testMetric_Successful Pinches,testMetric_Sway Path,testMetric_Top to bottom Hausdorff Distance Best,testMetric_Turn Speed Average,testMetric_Turns,Weekday_2,Weekday_3,Weekday_4,Weekday_5,Weekday_6,Weekday_7,Holiday_True,participantBirthYear,participantWeightLbs,participantHeightCms,testResultMetricId,testResultMetricValue,Delta_Weeks_test_register,Delta_Dayss_test_register
0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1966,159.0,178.0,115524,4.0,0,1
1,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1966,159.0,178.0,116040,5.0,0,3
4,0,0,0,0,0,0,0,0,0,0,1,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1966,159.0,178.0,116043,39.0,0,3
5,0,0,0,0,0,0,0,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1966,159.0,178.0,116044,2.31,0,3
6,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1966,159.0,178.0,116045,27.0,0,3


Splitting the dataset into the Training set and Test set:

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25)

Feature Scaling:

In [10]:
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

## Logistic regression: 

In [10]:
classifier_LR = LogisticRegression(solver = 'saga')
classifier_LR.fit(X_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='saga',
          tol=0.0001, verbose=0, warm_start=False)

In [11]:
y_pred_LR = classifier_LR.predict(X_test)

In [12]:
cm_LR = confusion_matrix(y_test, y_pred_LR)
cm_LR

array([[37363,  1839],
       [ 5190,  5803]])

## k-NN:

In [13]:
classifier_kNN = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = 2)
classifier_kNN.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform')

In [None]:
y_pred_kNN = classifier_kNN.predict(X_test)

In [None]:
cm_kNN = confusion_matrix(y_test, y_pred_kNN)
cm_kNN

array([[37730,  1572],
       [ 3019,  7874]])

## Support Vector Machine:

In [None]:
classifier_SVM = SVC(kernel = 'sigmoid', class_weight ='balanced')
classifier_SVM.fit(X_train, y_train)

In [None]:
y_pred_SVM = classifier_SVM.predict(X_test)

In [None]:
cm_SVM = confusion_matrix(y_test, y_pred_SVM)
cm_SVM

## Naive Bayes:

In [11]:
classifier_NB = GaussianNB()
classifier_NB.fit(X_train, y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [12]:
y_pred_NB = classifier_NB.predict(X_test)

In [14]:
cm_NB = confusion_matrix(y_test, y_pred_NB)
cm_NB

array([[38888,   337],
       [ 8903,  2067]])

## Random Forests:

In [15]:
classifier_RF = RandomForestClassifier(n_estimators = 10, criterion = 'entropy')
classifier_RF.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [16]:
y_pred_RF = classifier_RF.predict(X_test)

In [17]:
cm_RF = confusion_matrix(y_test, y_pred_RF)
cm_RF

array([[39204,    21],
       [   96, 10874]])