In [105]:
import numpy as np
import pandas as pd
from sklearn.feature_selection import RFECV, chi2
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
import seaborn as sns
import matplotlib.pyplot as plt


In [2]:
df = pd.read_csv("eye_movements.csv")
df.head

<bound method NDFrame.head of        lineNo  assgNo  fixcount  firstPassCnt  P1stFixation  P2stFixation  \
0           1       1         1             1             1             0   
1           2       1         1             1             1             0   
2           3       1         1             1             1             0   
3           4       1         1             1             1             0   
4           5       1         1             1             1             0   
...       ...     ...       ...           ...           ...           ...   
10931   10932     336         1             1             1             0   
10932   10933     336         1             1             1             0   
10933   10934     336         1             1             1             0   
10934   10935     336         2             1             1             0   
10935   10936     336         1             1             1             1   

       prevFixDur  firstfixDur  firstPassFixD

In [3]:
num_missing_values = df.isna().sum()
num_missing_values # No need to remove any tuples or perform data imputation since none of the data is missing

lineNo             0
assgNo             0
fixcount           0
firstPassCnt       0
P1stFixation       0
P2stFixation       0
prevFixDur         0
firstfixDur        0
firstPassFixDur    0
nextFixDur         0
firstSaccLen       0
lastSaccLen        0
prevFixPos         0
landingPos         0
leavingPos         0
totalFixDur        0
meanFixDur         0
nRegressFrom       0
regressLen         0
nextWordRegress    0
regressDur         0
pupilDiamMax       0
pupilDiamLag       0
timePrtctg         0
nWordsInTitle      0
titleNo            0
wordNo             0
label              0
dtype: int64

In [4]:
attributes = df.columns
attributes

Index(['lineNo', 'assgNo', 'fixcount', 'firstPassCnt', 'P1stFixation',
       'P2stFixation', 'prevFixDur', 'firstfixDur', 'firstPassFixDur',
       'nextFixDur', 'firstSaccLen', 'lastSaccLen', 'prevFixPos', 'landingPos',
       'leavingPos', 'totalFixDur', 'meanFixDur', 'nRegressFrom', 'regressLen',
       'nextWordRegress', 'regressDur', 'pupilDiamMax', 'pupilDiamLag',
       'timePrtctg', 'nWordsInTitle', 'titleNo', 'wordNo', 'label'],
      dtype='object')

In [5]:
num_unique = {}
for attribute in attributes:
    num_unique[attribute] = len(pd.unique(df[attribute]))
num_unique

{'lineNo': 10936,
 'assgNo': 336,
 'fixcount': 8,
 'firstPassCnt': 7,
 'P1stFixation': 2,
 'P2stFixation': 2,
 'prevFixDur': 61,
 'firstfixDur': 63,
 'firstPassFixDur': 111,
 'nextFixDur': 68,
 'firstSaccLen': 9548,
 'lastSaccLen': 9350,
 'prevFixPos': 7866,
 'landingPos': 6847,
 'leavingPos': 6900,
 'totalFixDur': 149,
 'meanFixDur': 254,
 'nRegressFrom': 6,
 'regressLen': 572,
 'nextWordRegress': 2,
 'regressDur': 381,
 'pupilDiamMax': 3810,
 'pupilDiamLag': 2517,
 'timePrtctg': 1065,
 'nWordsInTitle': 9,
 'titleNo': 10,
 'wordNo': 10,
 'label': 3}

In [6]:
df2 = df.drop(columns=['lineNo','titleNo','wordNo'], axis=1) # Contains unique values for each instance, not going to be useful
# Also contants index values instead of actual data
df2.columns

Index(['assgNo', 'fixcount', 'firstPassCnt', 'P1stFixation', 'P2stFixation',
       'prevFixDur', 'firstfixDur', 'firstPassFixDur', 'nextFixDur',
       'firstSaccLen', 'lastSaccLen', 'prevFixPos', 'landingPos', 'leavingPos',
       'totalFixDur', 'meanFixDur', 'nRegressFrom', 'regressLen',
       'nextWordRegress', 'regressDur', 'pupilDiamMax', 'pupilDiamLag',
       'timePrtctg', 'nWordsInTitle', 'label'],
      dtype='object')

In [7]:
x_full = df2.drop(columns=['label'], axis=1)
y_full = df2['label']
x, x_test, y, y_test = train_test_split(x_full, y_full, test_size=0.2, stratify=y_full)


In [8]:
# get the pearson correlation coefficients for all features to determine which features to train on

corr = df2.corr()
class_correlation = abs(corr["label"])
relevant_features = class_correlation[class_correlation > 0.1]
relevant_features

P2stFixation       0.154854
totalFixDur        0.125466
nRegressFrom       0.212710
nextWordRegress    0.228493
regressDur         0.214826
pupilDiamMax       0.122343
timePrtctg         0.168475
label              1.000000
Name: label, dtype: float64

In [9]:
# features should be independent of each other, test to make sure they aren't highly correlated with each other
feature_strings = ["P2stFixation", "totalFixDur", "nRegressFrom", "regressDur", "nextWordRegress", "pupilDiamMax", "timePrtctg"]  
for i in feature_strings:
    for j in feature_strings:
        if i != j:
            print(abs(df[[i,j]].corr()))
            print()

              P2stFixation  totalFixDur
P2stFixation      1.000000     0.087123
totalFixDur       0.087123     1.000000

              P2stFixation  nRegressFrom
P2stFixation      1.000000      0.250799
nRegressFrom      0.250799      1.000000

              P2stFixation  regressDur
P2stFixation      1.000000    0.251875
regressDur        0.251875    1.000000

                 P2stFixation  nextWordRegress
P2stFixation         1.000000         0.131331
nextWordRegress      0.131331         1.000000

              P2stFixation  pupilDiamMax
P2stFixation      1.000000      0.031823
pupilDiamMax      0.031823      1.000000

              P2stFixation  timePrtctg
P2stFixation      1.000000    0.048122
timePrtctg        0.048122    1.000000

              totalFixDur  P2stFixation
totalFixDur      1.000000      0.087123
P2stFixation     0.087123      1.000000

              totalFixDur  nRegressFrom
totalFixDur      1.000000      0.024128
nRegressFrom     0.024128      1.000000

           

In [10]:
# drop all columns which have a low correlation
# regressDur and timePtrctg have a high correlation with several other attributes and are thus not independent, drop both as well
final_columns = ["P2stFixation", "totalFixDur", "nRegressFrom", "nextWordRegress", "pupilDiamMax"]  
df_preprocessed_correlation = x
for i in x.columns:
    if i not in final_columns:
        df_preprocessed_correlation = df_preprocessed_correlation.drop(columns=[i,], axis=1)
        
df_preprocessed_correlation
    


Unnamed: 0,P2stFixation,totalFixDur,nRegressFrom,nextWordRegress,pupilDiamMax
5060,1,99,1,0,0.2008
5367,0,179,0,0,0.0427
5751,0,199,0,0,0.2010
6149,0,338,0,0,0.0326
9996,0,119,3,1,0.3070
...,...,...,...,...,...
1764,1,119,0,0,0.0369
6017,0,199,0,0,0.0886
4553,0,338,0,0,-0.0328
7634,1,119,0,1,0.1766


In [106]:
# perform attribute selection with RFE + random forests
# RFE (Recursive Feature Elimination) feeds the data to a model, evaluates the performance for each attribute 
# and deletes attributes which don't perform well enough
from sklearn.ensemble import RandomForestClassifier

rfe = RFECV(estimator=RandomForestClassifier(n_estimators=100))
fit = rfe.fit(x, y)
best_features = []
print(len(fit.support_))
print(len(x.columns))
for i in range(len(fit.support_)):
    if fit.support_[i]:
        best_features.append(x.columns[i])
        
print(best_features)
df_preprocessed_rfe = x

for i in x.columns:
    if i not in best_features:
        df_preprocessed_rfe = df_preprocessed_rfe.drop(columns=[i,], axis=1)
        
df_preprocessed_rfe

24
24
['assgNo', 'firstPassCnt', 'P1stFixation', 'P2stFixation', 'prevFixDur', 'firstfixDur', 'firstPassFixDur', 'nextFixDur', 'firstSaccLen', 'lastSaccLen', 'prevFixPos', 'landingPos', 'leavingPos', 'totalFixDur', 'meanFixDur', 'nRegressFrom', 'regressLen', 'nextWordRegress', 'regressDur', 'pupilDiamMax', 'pupilDiamLag', 'timePrtctg', 'nWordsInTitle']


Unnamed: 0,assgNo,firstPassCnt,P1stFixation,P2stFixation,prevFixDur,firstfixDur,firstPassFixDur,nextFixDur,firstSaccLen,lastSaccLen,...,totalFixDur,meanFixDur,nRegressFrom,regressLen,nextWordRegress,regressDur,pupilDiamMax,pupilDiamLag,timePrtctg,nWordsInTitle
5060,154,1,0,1,219,99,99,278,141.7956,378.2317,...,99,99.0,1,278,0,139,0.2008,0.1788,0.0115,4
5367,162,1,1,0,179,179,179,139,375.5163,313.0994,...,179,179.0,0,0,0,0,0.0427,0.1767,0.0224,7
5751,170,1,1,0,278,199,199,318,198.5674,0.0000,...,199,199.0,0,0,0,199,0.2010,0.2010,0.0566,7
6149,181,2,1,0,219,179,338,219,172.8381,136.4441,...,338,169.0,0,0,0,0,0.0326,0.2251,0.0396,3
9996,306,1,1,0,199,139,139,219,1460.9867,191.0007,...,119,119.0,3,7312,1,2444,0.3070,0.2405,0.0080,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1764,55,1,0,1,219,119,119,80,187.2385,508.4958,...,119,119.0,0,0,0,0,0.0369,0.2439,0.0200,3
6017,178,1,1,0,219,199,199,139,312.1158,133.0883,...,199,199.0,0,0,0,0,0.0886,0.1576,0.0148,5
4553,142,2,1,0,99,199,338,219,521.8058,95.1223,...,338,169.0,0,0,0,0,-0.0328,0.1347,0.0243,4
7634,235,1,1,1,100,258,258,139,177.0007,551.0510,...,119,119.0,0,0,1,2006,0.1766,0.1386,0.0239,5


In [12]:
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LassoCV

selector = SelectFromModel(LassoCV())
selector.fit(x, y)


support = selector.get_support()
best_features = x.loc[:,support].columns.tolist()
print(str(len(best_features)), 'selected features')
print(best_features)
df_preprocessed_lasso = x

for i in x.columns:
    if i not in best_features:
        df_preprocessed_lasso = df_preprocessed_lasso.drop(columns=[i,], axis=1)
        
df_preprocessed_lasso

14 selected features
['assgNo', 'prevFixDur', 'firstfixDur', 'firstPassFixDur', 'nextFixDur', 'firstSaccLen', 'lastSaccLen', 'prevFixPos', 'landingPos', 'leavingPos', 'totalFixDur', 'meanFixDur', 'regressLen', 'regressDur']


Unnamed: 0,assgNo,prevFixDur,firstfixDur,firstPassFixDur,nextFixDur,firstSaccLen,lastSaccLen,prevFixPos,landingPos,leavingPos,totalFixDur,meanFixDur,regressLen,regressDur
5060,154,219,99,99,278,141.7956,378.2317,105.0762,43.1422,37.0000,99,99.0,278,139
5367,162,179,179,179,139,375.5163,313.0994,312.0064,68.5164,71.0440,179,179.0,0,0
5751,170,278,199,199,318,198.5674,0.0000,159.0786,41.8390,42.7931,199,199.0,0,199
6149,181,219,179,338,219,172.8381,136.4441,105.3233,64.1327,37.8847,338,169.0,0,0
9996,306,199,139,139,219,1460.9867,191.0007,176.0710,1331.2506,1332.7172,119,119.0,7312,2444
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1764,55,219,119,119,80,187.2385,508.4958,120.5985,70.3847,70.6629,119,119.0,0,0
6017,178,219,199,199,139,312.1158,133.0883,319.5074,6.5192,6.5765,199,199.0,0,0
4553,142,99,199,338,219,521.8058,95.1223,581.1730,54.7449,58.1055,338,169.0,0,0
7634,235,100,258,258,139,177.0007,551.0510,268.7452,86.7078,82.7179,119,119.0,0,2006


In [112]:
# perform attribute selection with RFECV + random forests
# RFE (Recursive Feature Elimination) feeds the data to a model, evaluates the performance for each attribute 
# and deletes attributes which don't perform well enough
from sklearn.ensemble import GradientBoostingClassifier

rfe = RFECV(estimator=GradientBoostingClassifier(n_estimators=100), min_features_to_select=6, step=2)
fit = rfe.fit(x, y)
best_features = []
print(len(fit.support_))
print(len(x.columns))
for i in range(len(fit.support_)):
    if fit.support_[i]:
        best_features.append(x.columns[i])
        
print(best_features)
df_preprocessed_gb = x

for i in x.columns:
    if i not in best_features:
        df_preprocessed_gb = df_preprocessed_gb.drop(columns=[i,], axis=1)
        
df_preprocessed_gb

KeyboardInterrupt: 

In [14]:
from lightgbm import LGBMClassifier

lgbc = LGBMClassifier(n_estimators=500, learning_rate=0.05, num_leaves=32, colsample_bytree=0.2,
            reg_alpha=3, reg_lambda=1, min_split_gain=0.01, min_child_weight=40)

selector = SelectFromModel(lgbc)
selector.fit(x, y)


support = selector.get_support()
best_features = x.loc[:,support].columns.tolist()
print(str(len(best_features)), 'selected features')
print(best_features)

df_preprocessed_lgbm = x

for i in x.columns:
    if i not in best_features:
        df_preprocessed_lgbm = df_preprocessed_lgbm.drop(columns=[i,], axis=1)
        
df_preprocessed_lgbm


10 selected features
['assgNo', 'firstSaccLen', 'lastSaccLen', 'prevFixPos', 'landingPos', 'leavingPos', 'pupilDiamMax', 'pupilDiamLag', 'timePrtctg', 'nWordsInTitle']


Unnamed: 0,assgNo,firstSaccLen,lastSaccLen,prevFixPos,landingPos,leavingPos,pupilDiamMax,pupilDiamLag,timePrtctg,nWordsInTitle
5060,154,141.7956,378.2317,105.0762,43.1422,37.0000,0.2008,0.1788,0.0115,4
5367,162,375.5163,313.0994,312.0064,68.5164,71.0440,0.0427,0.1767,0.0224,7
5751,170,198.5674,0.0000,159.0786,41.8390,42.7931,0.2010,0.2010,0.0566,7
6149,181,172.8381,136.4441,105.3233,64.1327,37.8847,0.0326,0.2251,0.0396,3
9996,306,1460.9867,191.0007,176.0710,1331.2506,1332.7172,0.3070,0.2405,0.0080,7
...,...,...,...,...,...,...,...,...,...,...
1764,55,187.2385,508.4958,120.5985,70.3847,70.6629,0.0369,0.2439,0.0200,3
6017,178,312.1158,133.0883,319.5074,6.5192,6.5765,0.0886,0.1576,0.0148,5
4553,142,521.8058,95.1223,581.1730,54.7449,58.1055,-0.0328,0.1347,0.0243,4
7634,235,177.0007,551.0510,268.7452,86.7078,82.7179,0.1766,0.1386,0.0239,5


In [116]:
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import *

preprocessed_datasets = [x, df_preprocessed_correlation, df_preprocessed_rfe, df_preprocessed_lasso, df_preprocessed_gb, df_preprocessed_lgbm]
dataset_labels = ["control, no preprocessing", "Pearson correlation coefficient", "RFECV w/ RandomForests", "LassoCV ", "RFECV w/ GradientBoost", "LightGBM Selection"]
ensemble_models = [RandomForestClassifier(), 
    ExtraTreesClassifier(), 
    BaggingClassifier(), 
    IsolationForest(), 
    AdaBoostClassifier(), 
    GradientBoostingClassifier()
]

for model in ensemble_models:
    count = 0

    for dataset in preprocessed_datasets:
        x_test_processed = x_test

        for i in x_test.columns:
            if i not in list(dataset.columns):
                x_test_processed = x_test_processed.drop(columns=[i,], axis=1)
        
        model.fit(dataset, y)
        predicted = model.predict(x_test_processed)

        print("Classifier: " + type(model).__name__ + "; Feature Selection:", dataset_labels[count], "[" + str(dataset.shape[1]) + " features] ->", accuracy_score(y_test, predicted))

        count += 1
        
    print()
    

Classifier: RandomForestClassifier; Feature Selection: control, no preprocessing [24 features] -> 0.6284277879341865
Classifier: RandomForestClassifier; Feature Selection: Pearson correlation coefficient [5 features] -> 0.4186471663619744
Classifier: RandomForestClassifier; Feature Selection: RFECV w/ RandomForests [23 features] -> 0.6371115173674589
Classifier: RandomForestClassifier; Feature Selection: LassoCV  [14 features] -> 0.5361060329067642
Classifier: RandomForestClassifier; Feature Selection: RFECV w/ GradientBoost [14 features] -> 0.6106032906764168
Classifier: RandomForestClassifier; Feature Selection: LightGBM Selection [10 features] -> 0.5621572212065814

Classifier: ExtraTreesClassifier; Feature Selection: control, no preprocessing [24 features] -> 0.656764168190128
Classifier: ExtraTreesClassifier; Feature Selection: Pearson correlation coefficient [5 features] -> 0.409963436928702
Classifier: ExtraTreesClassifier; Feature Selection: RFECV w/ RandomForests [23 features]