In [2]:
import numpy as np
import pandas as pd
from sklearn.feature_selection import RFE, chi2
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
import seaborn as sns
import matplotlib.pyplot as plt


In [3]:
df = pd.read_csv("eye_movements.csv")
df.head

<bound method NDFrame.head of        lineNo  assgNo  fixcount  firstPassCnt  P1stFixation  P2stFixation  \
0           1       1         1             1             1             0   
1           2       1         1             1             1             0   
2           3       1         1             1             1             0   
3           4       1         1             1             1             0   
4           5       1         1             1             1             0   
...       ...     ...       ...           ...           ...           ...   
10931   10932     336         1             1             1             0   
10932   10933     336         1             1             1             0   
10933   10934     336         1             1             1             0   
10934   10935     336         2             1             1             0   
10935   10936     336         1             1             1             1   

       prevFixDur  firstfixDur  firstPassFixD

In [4]:
num_missing_values = df.isna().sum()
num_missing_values # No need to remove any tuples or perform data imputation since none of the data is missing

lineNo             0
assgNo             0
fixcount           0
firstPassCnt       0
P1stFixation       0
P2stFixation       0
prevFixDur         0
firstfixDur        0
firstPassFixDur    0
nextFixDur         0
firstSaccLen       0
lastSaccLen        0
prevFixPos         0
landingPos         0
leavingPos         0
totalFixDur        0
meanFixDur         0
nRegressFrom       0
regressLen         0
nextWordRegress    0
regressDur         0
pupilDiamMax       0
pupilDiamLag       0
timePrtctg         0
nWordsInTitle      0
titleNo            0
wordNo             0
label              0
dtype: int64

In [5]:
attributes = df.columns
attributes

Index(['lineNo', 'assgNo', 'fixcount', 'firstPassCnt', 'P1stFixation',
       'P2stFixation', 'prevFixDur', 'firstfixDur', 'firstPassFixDur',
       'nextFixDur', 'firstSaccLen', 'lastSaccLen', 'prevFixPos', 'landingPos',
       'leavingPos', 'totalFixDur', 'meanFixDur', 'nRegressFrom', 'regressLen',
       'nextWordRegress', 'regressDur', 'pupilDiamMax', 'pupilDiamLag',
       'timePrtctg', 'nWordsInTitle', 'titleNo', 'wordNo', 'label'],
      dtype='object')

In [6]:
num_unique = {}
for attribute in attributes:
    num_unique[attribute] = len(pd.unique(df[attribute]))
num_unique

{'lineNo': 10936,
 'assgNo': 336,
 'fixcount': 8,
 'firstPassCnt': 7,
 'P1stFixation': 2,
 'P2stFixation': 2,
 'prevFixDur': 61,
 'firstfixDur': 63,
 'firstPassFixDur': 111,
 'nextFixDur': 68,
 'firstSaccLen': 9548,
 'lastSaccLen': 9350,
 'prevFixPos': 7866,
 'landingPos': 6847,
 'leavingPos': 6900,
 'totalFixDur': 149,
 'meanFixDur': 254,
 'nRegressFrom': 6,
 'regressLen': 572,
 'nextWordRegress': 2,
 'regressDur': 381,
 'pupilDiamMax': 3810,
 'pupilDiamLag': 2517,
 'timePrtctg': 1065,
 'nWordsInTitle': 9,
 'titleNo': 10,
 'wordNo': 10,
 'label': 3}

In [7]:
df2 = df.drop(columns=['lineNo','titleNo','wordNo'], axis=1) # Contains unique values for each instance, not going to be useful
# Also contants index values instead of actual data
df2.columns

Index(['assgNo', 'fixcount', 'firstPassCnt', 'P1stFixation', 'P2stFixation',
       'prevFixDur', 'firstfixDur', 'firstPassFixDur', 'nextFixDur',
       'firstSaccLen', 'lastSaccLen', 'prevFixPos', 'landingPos', 'leavingPos',
       'totalFixDur', 'meanFixDur', 'nRegressFrom', 'regressLen',
       'nextWordRegress', 'regressDur', 'pupilDiamMax', 'pupilDiamLag',
       'timePrtctg', 'nWordsInTitle', 'label'],
      dtype='object')

In [8]:
x = df2.drop(columns=['label'], axis=1)
y = df2['label']

In [9]:
# get the pearson correlation coefficients for all features to determine which features to train on

corr = df2.corr()
class_correlation = abs(corr["label"])
relevant_features = class_correlation[class_correlation > 0.1]
relevant_features

P2stFixation       0.154854
totalFixDur        0.125466
nRegressFrom       0.212710
nextWordRegress    0.228493
regressDur         0.214826
pupilDiamMax       0.122343
timePrtctg         0.168475
label              1.000000
Name: label, dtype: float64

In [10]:
# features should be independent of each other, test to make sure they aren't highly correlated with each other
feature_strings = ["P2stFixation", "totalFixDur", "nRegressFrom", "regressDur", "nextWordRegress", "pupilDiamMax", "timePrtctg"]  
for i in feature_strings:
    for j in feature_strings:
        if i != j:
            print(abs(df[[i,j]].corr()))
            print()

              P2stFixation  totalFixDur
P2stFixation      1.000000     0.087123
totalFixDur       0.087123     1.000000

              P2stFixation  nRegressFrom
P2stFixation      1.000000      0.250799
nRegressFrom      0.250799      1.000000

              P2stFixation  regressDur
P2stFixation      1.000000    0.251875
regressDur        0.251875    1.000000

                 P2stFixation  nextWordRegress
P2stFixation         1.000000         0.131331
nextWordRegress      0.131331         1.000000

              P2stFixation  pupilDiamMax
P2stFixation      1.000000      0.031823
pupilDiamMax      0.031823      1.000000

              P2stFixation  timePrtctg
P2stFixation      1.000000    0.048122
timePrtctg        0.048122    1.000000

              totalFixDur  P2stFixation
totalFixDur      1.000000      0.087123
P2stFixation     0.087123      1.000000

              totalFixDur  nRegressFrom
totalFixDur      1.000000      0.024128
nRegressFrom     0.024128      1.000000

           

In [11]:
# drop all columns which have a low correlation
# regressDur and timePtrctg have a high correlation with several other attributes and are thus not independent, drop both as well
final_columns = ["P2stFixation", "totalFixDur", "nRegressFrom", "nextWordRegress", "pupilDiamMax"]  
df_preprocessed_correlation = df2
for i in df2.columns:
    if i not in final_columns:
        df_preprocessed_correlation = df_preprocessed_correlation.drop(columns=[i,], axis=1)
        
df_preprocessed_correlation
    


Unnamed: 0,P2stFixation,totalFixDur,nRegressFrom,nextWordRegress,pupilDiamMax
0,0,100,0,0,0.0095
1,0,278,0,0,0.0095
2,0,159,0,0,0.0370
3,0,159,0,0,0.0370
4,0,139,0,0,0.0390
...,...,...,...,...,...
10931,0,139,0,1,0.4730
10932,0,219,1,0,0.4730
10933,0,99,0,1,0.4730
10934,0,358,1,0,0.2150


In [41]:
# perform attribute selection with RFE + decision trees
# RFE (Recursive Feature Elimination) feeds the data to a model, evaluates the performance for each attribute 
# and deletes attributes which don't perform well enough
from sklearn.ensemble import RandomForestClassifier

rfe = RFE(estimator=RandomForestClassifier(n_estimators=100), n_features_to_select=6)
fit = rfe.fit(x, y)
best_features = []
print(len(fit.support_))
print(len(x.columns))
for i in range(len(fit.support_)):
    if fit.support_[i]:
        best_features.append(x.columns[i])
        
print(best_features)
df_preprocessed_rfe = df2

for i in df2.columns:
    if i not in best_features:
        df_preprocessed_rfe = df_preprocessed_rfe.drop(columns=[i,], axis=1)
        
df_preprocessed_rfe

24
24
['assgNo', 'firstSaccLen', 'lastSaccLen', 'landingPos', 'pupilDiamMax', 'pupilDiamLag', 'timePrtctg']


Unnamed: 0,assgNo,firstSaccLen,lastSaccLen,landingPos,pupilDiamMax,pupilDiamLag,timePrtctg
0,1,0.0000,382.8998,62.0081,0.0095,0.145,0.0131
1,1,87.0933,165.0068,83.0060,0.0095,0.183,0.0363
2,1,165.0068,141.1471,24.6982,0.0370,0.183,0.0208
3,1,141.1471,185.0007,8.7321,0.0370,0.183,0.0208
4,1,185.0007,221.6269,101.5788,0.0390,0.183,0.0182
...,...,...,...,...,...,...,...
10931,336,85.1469,194.6124,44.9222,0.4730,0.069,0.0119
10932,336,194.6124,224.5089,99.2472,0.4730,0.069,0.0187
10933,336,224.5089,229.9609,12.2984,0.4730,0.473,0.0084
10934,336,229.9609,0.0000,135.9136,0.2150,0.215,0.0305


In [42]:
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LassoCV

selector = SelectFromModel(LassoCV())
selector.fit(x, y)


support = selector.get_support()
best_features = x.loc[:,support].columns.tolist()
print(str(len(best_features)), 'selected features')
print(best_features)
df_preprocessed_lasso = df2

for i in df2.columns:
    if i not in best_features:
        df_preprocessed_lasso = df_preprocessed_lasso.drop(columns=[i,], axis=1)
        
df_preprocessed_lasso

10 selected features
['assgNo', 'firstfixDur', 'firstPassFixDur', 'firstSaccLen', 'lastSaccLen', 'prevFixPos', 'landingPos', 'totalFixDur', 'regressLen', 'regressDur']


Unnamed: 0,assgNo,firstfixDur,firstPassFixDur,firstSaccLen,lastSaccLen,prevFixPos,landingPos,totalFixDur,regressLen,regressDur
0,1,100,100,0.0000,382.8998,0.0000,62.0081,100,0,0
1,1,278,278,87.0933,165.0068,146.7685,83.0060,278,0,0
2,1,159,159,165.0068,141.1471,148.8624,24.6982,159,0,0
3,1,159,159,141.1471,185.0007,133.0902,8.7321,159,0,0
4,1,139,139,185.0007,221.6269,84.2140,101.5788,139,0,0
...,...,...,...,...,...,...,...,...,...,...
10931,336,139,139,85.1469,194.6124,39.1152,44.9222,139,0,99
10932,336,219,219,194.6124,224.5089,85.0941,99.2472,219,914,358
10933,336,139,139,224.5089,229.9609,218.1857,12.2984,99,0,99
10934,336,219,219,229.9609,0.0000,107.2287,135.9136,358,914,358


In [43]:
from sklearn.ensemble import RandomForestClassifier

selector = SelectFromModel(RandomForestClassifier(n_estimators=100))
selector.fit(x, y)


support = selector.get_support()
best_features = x.loc[:,support].columns.tolist()
print(str(len(best_features)), 'selected features')
print(best_features)

df_preprocessed_randomforest = df2

for i in df2.columns:
    if i not in best_features:
        df_preprocessed_randomforest = df_preprocessed_randomforest.drop(columns=[i,], axis=1)
        
df_preprocessed_randomforest

13 selected features
['assgNo', 'prevFixDur', 'nextFixDur', 'firstSaccLen', 'lastSaccLen', 'prevFixPos', 'landingPos', 'leavingPos', 'regressDur', 'pupilDiamMax', 'pupilDiamLag', 'timePrtctg', 'nWordsInTitle']


Unnamed: 0,assgNo,prevFixDur,nextFixDur,firstSaccLen,lastSaccLen,prevFixPos,landingPos,leavingPos,regressDur,pupilDiamMax,pupilDiamLag,timePrtctg,nWordsInTitle
0,1,0,99,0.0000,382.8998,0.0000,62.0081,58.8218,0,0.0095,0.145,0.0131,7
1,1,99,159,87.0933,165.0068,146.7685,83.0060,86.2279,0,0.0095,0.183,0.0363,7
2,1,278,159,165.0068,141.1471,148.8624,24.6982,20.2299,0,0.0370,0.183,0.0208,7
3,1,159,139,141.1471,185.0007,133.0902,8.7321,9.6177,0,0.0370,0.183,0.0208,7
4,1,159,239,185.0007,221.6269,84.2140,101.5788,92.6634,0,0.0390,0.183,0.0182,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...
10931,336,199,219,85.1469,194.6124,39.1152,44.9222,24.3311,99,0.4730,0.069,0.0119,7
10932,336,139,99,194.6124,224.5089,85.0941,99.2472,99.4082,358,0.4730,0.069,0.0187,7
10933,336,199,219,224.5089,229.9609,218.1857,12.2984,24.9098,99,0.4730,0.473,0.0084,7
10934,336,139,99,229.9609,0.0000,107.2287,135.9136,186.2693,358,0.2150,0.215,0.0305,7


In [44]:
from lightgbm import LGBMClassifier

lgbc = LGBMClassifier(n_estimators=500, learning_rate=0.05, num_leaves=32, colsample_bytree=0.2,
            reg_alpha=3, reg_lambda=1, min_split_gain=0.01, min_child_weight=40)

selector = SelectFromModel(lgbc)
selector.fit(x, y)


support = selector.get_support()
best_features = x.loc[:,support].columns.tolist()
print(str(len(best_features)), 'selected features')
print(best_features)

df_preprocessed_lgbm = df2

for i in df2.columns:
    if i not in best_features:
        df_preprocessed_lgbm = df_preprocessed_lgbm.drop(columns=[i,], axis=1)
        
df_preprocessed_lgbm


11 selected features
['assgNo', 'firstSaccLen', 'lastSaccLen', 'prevFixPos', 'landingPos', 'leavingPos', 'regressDur', 'pupilDiamMax', 'pupilDiamLag', 'timePrtctg', 'nWordsInTitle']


Unnamed: 0,assgNo,firstSaccLen,lastSaccLen,prevFixPos,landingPos,leavingPos,regressDur,pupilDiamMax,pupilDiamLag,timePrtctg,nWordsInTitle
0,1,0.0000,382.8998,0.0000,62.0081,58.8218,0,0.0095,0.145,0.0131,7
1,1,87.0933,165.0068,146.7685,83.0060,86.2279,0,0.0095,0.183,0.0363,7
2,1,165.0068,141.1471,148.8624,24.6982,20.2299,0,0.0370,0.183,0.0208,7
3,1,141.1471,185.0007,133.0902,8.7321,9.6177,0,0.0370,0.183,0.0208,7
4,1,185.0007,221.6269,84.2140,101.5788,92.6634,0,0.0390,0.183,0.0182,7
...,...,...,...,...,...,...,...,...,...,...,...
10931,336,85.1469,194.6124,39.1152,44.9222,24.3311,99,0.4730,0.069,0.0119,7
10932,336,194.6124,224.5089,85.0941,99.2472,99.4082,358,0.4730,0.069,0.0187,7
10933,336,224.5089,229.9609,218.1857,12.2984,24.9098,99,0.4730,0.473,0.0084,7
10934,336,229.9609,0.0000,107.2287,135.9136,186.2693,358,0.2150,0.215,0.0305,7


In [None]:
preprocessed_datasets = [df_preprocessed_correlation, df_preprocessed_rfe, df_preprocessed_lasso, df_preprocessed_randomforest, df_preprocessed_lgbm]