In [10]:
import numpy as np
import pandas as pd
from sklearn.feature_selection import RFECV, chi2
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
import seaborn as sns
import matplotlib.pyplot as plt


In [11]:
df = pd.read_csv("eye_movements.csv")
df.head

<bound method NDFrame.head of        lineNo  assgNo  fixcount  firstPassCnt  P1stFixation  P2stFixation  \
0           1       1         1             1             1             0   
1           2       1         1             1             1             0   
2           3       1         1             1             1             0   
3           4       1         1             1             1             0   
4           5       1         1             1             1             0   
...       ...     ...       ...           ...           ...           ...   
10931   10932     336         1             1             1             0   
10932   10933     336         1             1             1             0   
10933   10934     336         1             1             1             0   
10934   10935     336         2             1             1             0   
10935   10936     336         1             1             1             1   

       prevFixDur  firstfixDur  firstPassFixD

In [12]:
num_missing_values = df.isna().sum()
num_missing_values # No need to remove any tuples or perform data imputation since none of the data is missing

lineNo             0
assgNo             0
fixcount           0
firstPassCnt       0
P1stFixation       0
P2stFixation       0
prevFixDur         0
firstfixDur        0
firstPassFixDur    0
nextFixDur         0
firstSaccLen       0
lastSaccLen        0
prevFixPos         0
landingPos         0
leavingPos         0
totalFixDur        0
meanFixDur         0
nRegressFrom       0
regressLen         0
nextWordRegress    0
regressDur         0
pupilDiamMax       0
pupilDiamLag       0
timePrtctg         0
nWordsInTitle      0
titleNo            0
wordNo             0
label              0
dtype: int64

In [13]:
attributes = df.columns
attributes

Index(['lineNo', 'assgNo', 'fixcount', 'firstPassCnt', 'P1stFixation',
       'P2stFixation', 'prevFixDur', 'firstfixDur', 'firstPassFixDur',
       'nextFixDur', 'firstSaccLen', 'lastSaccLen', 'prevFixPos', 'landingPos',
       'leavingPos', 'totalFixDur', 'meanFixDur', 'nRegressFrom', 'regressLen',
       'nextWordRegress', 'regressDur', 'pupilDiamMax', 'pupilDiamLag',
       'timePrtctg', 'nWordsInTitle', 'titleNo', 'wordNo', 'label'],
      dtype='object')

In [14]:
num_unique = {}
for attribute in attributes:
    num_unique[attribute] = len(pd.unique(df[attribute]))
num_unique

{'lineNo': 10936,
 'assgNo': 336,
 'fixcount': 8,
 'firstPassCnt': 7,
 'P1stFixation': 2,
 'P2stFixation': 2,
 'prevFixDur': 61,
 'firstfixDur': 63,
 'firstPassFixDur': 111,
 'nextFixDur': 68,
 'firstSaccLen': 9548,
 'lastSaccLen': 9350,
 'prevFixPos': 7866,
 'landingPos': 6847,
 'leavingPos': 6900,
 'totalFixDur': 149,
 'meanFixDur': 254,
 'nRegressFrom': 6,
 'regressLen': 572,
 'nextWordRegress': 2,
 'regressDur': 381,
 'pupilDiamMax': 3810,
 'pupilDiamLag': 2517,
 'timePrtctg': 1065,
 'nWordsInTitle': 9,
 'titleNo': 10,
 'wordNo': 10,
 'label': 3}

In [15]:
df2 = df.drop(columns=['lineNo','titleNo','wordNo'], axis=1) # Contains unique values for each instance, not going to be useful
# Also contains index values instead of actual data

# Write preprocessed data to csv

df2.to_csv("preprocessed_data.csv")
df2.columns

Index(['assgNo', 'fixcount', 'firstPassCnt', 'P1stFixation', 'P2stFixation',
       'prevFixDur', 'firstfixDur', 'firstPassFixDur', 'nextFixDur',
       'firstSaccLen', 'lastSaccLen', 'prevFixPos', 'landingPos', 'leavingPos',
       'totalFixDur', 'meanFixDur', 'nRegressFrom', 'regressLen',
       'nextWordRegress', 'regressDur', 'pupilDiamMax', 'pupilDiamLag',
       'timePrtctg', 'nWordsInTitle', 'label'],
      dtype='object')

In [16]:
x_full = df2.drop(columns=['label'], axis=1)
y_full = df2['label']
x, x_test, y, y_test = train_test_split(x_full, y_full, test_size=0.2, stratify=y_full)

# Write training and testing data to csv

x.to_csv("x_train.csv")
x_test.to_csv("x_test.csv")
y.to_csv("y_train.csv")
y_test.to_csv("y_test.csv")



In [17]:
# get the pearson correlation coefficients for all features to determine which features to train on

corr = df2.corr()
class_correlation = abs(corr["label"])
relevant_features = class_correlation[class_correlation > 0.1]
relevant_features

P2stFixation       0.154854
totalFixDur        0.125466
nRegressFrom       0.212710
nextWordRegress    0.228493
regressDur         0.214826
pupilDiamMax       0.122343
timePrtctg         0.168475
label              1.000000
Name: label, dtype: float64

In [18]:
# features should be independent of each other, test to make sure they aren't highly correlated with each other
feature_strings = ["P2stFixation", "totalFixDur", "nRegressFrom", "regressDur", "nextWordRegress", "pupilDiamMax", "timePrtctg"]  
for i in feature_strings:
    for j in feature_strings:
        if i != j:
            print(abs(df[[i,j]].corr()))
            print()

              P2stFixation  totalFixDur
P2stFixation      1.000000     0.087123
totalFixDur       0.087123     1.000000

              P2stFixation  nRegressFrom
P2stFixation      1.000000      0.250799
nRegressFrom      0.250799      1.000000

              P2stFixation  regressDur
P2stFixation      1.000000    0.251875
regressDur        0.251875    1.000000

                 P2stFixation  nextWordRegress
P2stFixation         1.000000         0.131331
nextWordRegress      0.131331         1.000000

              P2stFixation  pupilDiamMax
P2stFixation      1.000000      0.031823
pupilDiamMax      0.031823      1.000000

              P2stFixation  timePrtctg
P2stFixation      1.000000    0.048122
timePrtctg        0.048122    1.000000

              totalFixDur  P2stFixation
totalFixDur      1.000000      0.087123
P2stFixation     0.087123      1.000000

              totalFixDur  nRegressFrom
totalFixDur      1.000000      0.024128
nRegressFrom     0.024128      1.000000

           

In [19]:
# drop all columns which have a low correlation
# regressDur and timePtrctg have a high correlation with several other attributes and are thus not independent, drop both as well
final_columns = ["P2stFixation", "totalFixDur", "nRegressFrom", "nextWordRegress", "pupilDiamMax"]  
df_preprocessed_correlation = x
for i in x.columns:
    if i not in final_columns:
        df_preprocessed_correlation = df_preprocessed_correlation.drop(columns=[i,], axis=1)
        
df_preprocessed_correlation
    


Unnamed: 0,P2stFixation,totalFixDur,nRegressFrom,nextWordRegress,pupilDiamMax
10317,0,119,0,0,0.2313
784,1,219,2,1,0.0906
4675,0,298,0,0,0.0797
6495,0,318,0,0,-0.0956
4575,1,139,0,0,0.0487
...,...,...,...,...,...
9840,0,100,0,0,0.2325
2768,0,219,1,0,1.8407
2715,1,359,0,0,1.5980
6147,0,378,0,0,0.0081


In [20]:
# perform attribute selection with RFE + random forests
# RFE (Recursive Feature Elimination) feeds the data to a model, evaluates the performance for each attribute 
# and deletes attributes which don't perform well enough
from sklearn.ensemble import RandomForestClassifier

rfe = RFECV(estimator=RandomForestClassifier(n_estimators=100))
fit = rfe.fit(x, y)
best_features = []
print(len(fit.support_))
print(len(x.columns))
for i in range(len(fit.support_)):
    if fit.support_[i]:
        best_features.append(x.columns[i])
        
print(best_features)
df_preprocessed_rfe = x

for i in x.columns:
    if i not in best_features:
        df_preprocessed_rfe = df_preprocessed_rfe.drop(columns=[i,], axis=1)
        
df_preprocessed_rfe

24
24
['assgNo', 'P1stFixation', 'P2stFixation', 'prevFixDur', 'firstfixDur', 'firstPassFixDur', 'nextFixDur', 'firstSaccLen', 'lastSaccLen', 'prevFixPos', 'landingPos', 'leavingPos', 'totalFixDur', 'meanFixDur', 'nRegressFrom', 'regressLen', 'nextWordRegress', 'regressDur', 'pupilDiamMax', 'pupilDiamLag', 'timePrtctg', 'nWordsInTitle']


Unnamed: 0,assgNo,P1stFixation,P2stFixation,prevFixDur,firstfixDur,firstPassFixDur,nextFixDur,firstSaccLen,lastSaccLen,prevFixPos,...,totalFixDur,meanFixDur,nRegressFrom,regressLen,nextWordRegress,regressDur,pupilDiamMax,pupilDiamLag,timePrtctg,nWordsInTitle
10317,315,1,0,119,119,119,139,306.4841,156.5799,359.3884,...,119,119.0,0,0,0,0,0.2313,0.2023,0.0158,4
784,23,1,1,179,139,675,537,418.9645,218.4176,407.7413,...,219,219.0,2,2961,1,159,0.0906,0.2406,0.0338,3
4675,144,1,0,139,298,298,219,667.4328,142.0317,732.6459,...,298,298.0,0,0,0,0,0.0797,0.1437,0.0326,8
6495,192,1,0,99,179,318,298,153.1837,92.7793,153.4959,...,318,159.0,0,0,0,0,-0.0956,0.2284,0.0843,4
4575,142,1,1,179,139,139,139,561.0893,182.6540,602.2242,...,139,139.0,0,0,0,0,0.0487,0.1072,0.0100,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9840,303,1,0,119,100,100,139,602.8766,1548.5591,517.0039,...,100,100.0,0,0,0,0,0.2325,0.3280,0.0196,7
2768,84,1,0,160,219,219,319,128.0625,215.0471,67.0075,...,219,219.0,1,757,0,0,1.8407,1.7177,0.0531,4
2715,82,0,1,199,100,100,319,326.5061,256.5239,388.2396,...,359,359.0,0,0,0,718,1.5980,2.5820,0.1169,7
6147,181,1,0,80,378,378,219,132.3414,239.2749,88.0909,...,378,378.0,0,0,0,0,0.0081,0.0926,0.0443,4


In [21]:
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LassoCV

selector = SelectFromModel(LassoCV())
selector.fit(x, y)


support = selector.get_support()
best_features = x.loc[:,support].columns.tolist()
print(str(len(best_features)), 'selected features')
print(best_features)
df_preprocessed_lasso = x

for i in x.columns:
    if i not in best_features:
        df_preprocessed_lasso = df_preprocessed_lasso.drop(columns=[i,], axis=1)
        
df_preprocessed_lasso

12 selected features
['assgNo', 'firstfixDur', 'firstPassFixDur', 'nextFixDur', 'firstSaccLen', 'lastSaccLen', 'prevFixPos', 'landingPos', 'leavingPos', 'totalFixDur', 'meanFixDur', 'regressDur']


Unnamed: 0,assgNo,firstfixDur,firstPassFixDur,nextFixDur,firstSaccLen,lastSaccLen,prevFixPos,landingPos,leavingPos,totalFixDur,meanFixDur,regressDur
10317,315,119,119,139,306.4841,156.5799,359.3884,60.2536,71.3320,119,119.0,0
784,23,139,675,537,418.9645,218.4176,407.7413,18.6078,10.0125,219,219.0,159
4675,144,298,298,219,667.4328,142.0317,732.6459,63.0496,51.8483,298,298.0,0
6495,192,179,318,298,153.1837,92.7793,153.4959,99.6255,55.0840,318,159.0,0
4575,142,139,139,139,561.0893,182.6540,602.2242,39.2747,40.1995,139,139.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
9840,303,100,100,139,602.8766,1548.5591,517.0039,92.6620,78.9953,100,100.0,0
2768,84,219,219,319,128.0625,215.0471,67.0075,63.3877,55.0818,219,219.0,0
2715,82,100,100,319,326.5061,256.5239,388.2396,63.0635,60.3842,359,359.0,718
6147,181,378,378,219,132.3414,239.2749,88.0909,47.2784,23.1355,378,378.0,0


In [22]:
# perform attribute selection with RFECV + random forests
# RFE (Recursive Feature Elimination) feeds the data to a model, evaluates the performance for each attribute 
# and deletes attributes which don't perform well enough
from sklearn.ensemble import GradientBoostingClassifier

rfe = RFECV(estimator=GradientBoostingClassifier(n_estimators=100), min_features_to_select=6, step=2)
fit = rfe.fit(x, y)
best_features = []
print(len(fit.support_))
print(len(x.columns))
for i in range(len(fit.support_)):
    if fit.support_[i]:
        best_features.append(x.columns[i])
        
print(best_features)
df_preprocessed_gb = x

for i in x.columns:
    if i not in best_features:
        df_preprocessed_gb = df_preprocessed_gb.drop(columns=[i,], axis=1)
        
df_preprocessed_gb

24
24
['assgNo', 'firstPassCnt', 'P2stFixation', 'prevFixDur', 'firstfixDur', 'firstPassFixDur', 'nextFixDur', 'firstSaccLen', 'lastSaccLen', 'prevFixPos', 'landingPos', 'leavingPos', 'totalFixDur', 'meanFixDur', 'nRegressFrom', 'regressLen', 'nextWordRegress', 'regressDur', 'pupilDiamMax', 'pupilDiamLag', 'timePrtctg', 'nWordsInTitle']


Unnamed: 0,assgNo,firstPassCnt,P2stFixation,prevFixDur,firstfixDur,firstPassFixDur,nextFixDur,firstSaccLen,lastSaccLen,prevFixPos,...,totalFixDur,meanFixDur,nRegressFrom,regressLen,nextWordRegress,regressDur,pupilDiamMax,pupilDiamLag,timePrtctg,nWordsInTitle
10317,315,1,0,119,119,119,139,306.4841,156.5799,359.3884,...,119,119.0,0,0,0,0,0.2313,0.2023,0.0158,4
784,23,3,1,179,139,675,537,418.9645,218.4176,407.7413,...,219,219.0,2,2961,1,159,0.0906,0.2406,0.0338,3
4675,144,1,0,139,298,298,219,667.4328,142.0317,732.6459,...,298,298.0,0,0,0,0,0.0797,0.1437,0.0326,8
6495,192,2,0,99,179,318,298,153.1837,92.7793,153.4959,...,318,159.0,0,0,0,0,-0.0956,0.2284,0.0843,4
4575,142,1,1,179,139,139,139,561.0893,182.6540,602.2242,...,139,139.0,0,0,0,0,0.0487,0.1072,0.0100,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9840,303,1,0,119,100,100,139,602.8766,1548.5591,517.0039,...,100,100.0,0,0,0,0,0.2325,0.3280,0.0196,7
2768,84,1,0,160,219,219,319,128.0625,215.0471,67.0075,...,219,219.0,1,757,0,0,1.8407,1.7177,0.0531,4
2715,82,1,1,199,100,100,319,326.5061,256.5239,388.2396,...,359,359.0,0,0,0,718,1.5980,2.5820,0.1169,7
6147,181,1,0,80,378,378,219,132.3414,239.2749,88.0909,...,378,378.0,0,0,0,0,0.0081,0.0926,0.0443,4


In [23]:
from lightgbm import LGBMClassifier

lgbc = LGBMClassifier(n_estimators=500, learning_rate=0.05, num_leaves=32, colsample_bytree=0.2,
            reg_alpha=3, reg_lambda=1, min_split_gain=0.01, min_child_weight=40)

selector = SelectFromModel(lgbc)
selector.fit(x, y)


support = selector.get_support()
best_features = x.loc[:,support].columns.tolist()
print(str(len(best_features)), 'selected features')
print(best_features)

df_preprocessed_lgbm = x

for i in x.columns:
    if i not in best_features:
        df_preprocessed_lgbm = df_preprocessed_lgbm.drop(columns=[i,], axis=1)
        
df_preprocessed_lgbm


10 selected features
['assgNo', 'firstSaccLen', 'lastSaccLen', 'prevFixPos', 'landingPos', 'leavingPos', 'pupilDiamMax', 'pupilDiamLag', 'timePrtctg', 'nWordsInTitle']


Unnamed: 0,assgNo,firstSaccLen,lastSaccLen,prevFixPos,landingPos,leavingPos,pupilDiamMax,pupilDiamLag,timePrtctg,nWordsInTitle
10317,315,306.4841,156.5799,359.3884,60.2536,71.3320,0.2313,0.2023,0.0158,4
784,23,418.9645,218.4176,407.7413,18.6078,10.0125,0.0906,0.2406,0.0338,3
4675,144,667.4328,142.0317,732.6459,63.0496,51.8483,0.0797,0.1437,0.0326,8
6495,192,153.1837,92.7793,153.4959,99.6255,55.0840,-0.0956,0.2284,0.0843,4
4575,142,561.0893,182.6540,602.2242,39.2747,40.1995,0.0487,0.1072,0.0100,6
...,...,...,...,...,...,...,...,...,...,...
9840,303,602.8766,1548.5591,517.0039,92.6620,78.9953,0.2325,0.3280,0.0196,7
2768,84,128.0625,215.0471,67.0075,63.3877,55.0818,1.8407,1.7177,0.0531,4
2715,82,326.5061,256.5239,388.2396,63.0635,60.3842,1.5980,2.5820,0.1169,7
6147,181,132.3414,239.2749,88.0909,47.2784,23.1355,0.0081,0.0926,0.0443,4


In [30]:
from sklearn.metrics import *
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import *

preprocessed_datasets = [df_preprocessed_correlation, df_preprocessed_rfe, df_preprocessed_lasso, df_preprocessed_gb, df_preprocessed_lgbm]
dataset_labels = ["Pearson correlation coefficient", "RFECV w/ RandomForests", "LassoCV ", "RFECV w/ GradientBoost", "LightGBM Selection"]
file_labels = ["Pearson", "RFECV_RF", "LassoCV ", "RFECV_GB", "LightGBM"]
ensemble_models = [RandomForestClassifier(), 
    ExtraTreesClassifier(), 
    BaggingClassifier(), 
    AdaBoostClassifier(), 
    GradientBoostingClassifier()
]

for model in ensemble_models:
    count = 0

    for dataset in preprocessed_datasets:
        x_test_processed = x_test

        for i in x_test.columns:
            if i not in list(dataset.columns):
                x_test_processed = x_test_processed.drop(columns=[i,], axis=1)
        
        model.fit(dataset, y)
        predicted = model.predict(x_test_processed)

        print("Classifier: " + type(model).__name__ + "; Feature Selection:", dataset_labels[count], "[" + str(dataset.shape[1]) + " features] ->", str(round(accuracy_score(y_test, predicted) * 100, 1)) + "%")

        sklearn_confusion_matrix = confusion_matrix(y_test, predicted)

        disp = ConfusionMatrixDisplay(confusion_matrix=sklearn_confusion_matrix, display_labels=["Irrelevant", "Relevant", "Correct Answer"])
        disp = disp.plot()
        plt.savefig("confusion_matrices/" + type(model).__name__ + "_" + file_labels[count] + '.png', bbox_inches='tight')
        plt.close()


        count += 1
        
    print()
    

Classifier: RandomForestClassifier; Feature Selection: Pearson correlation coefficient [5 features] -> 43.3%
Classifier: RandomForestClassifier; Feature Selection: RFECV w/ RandomForests [22 features] -> 63.0%
Classifier: RandomForestClassifier; Feature Selection: LassoCV  [12 features] -> 50.7%
Classifier: RandomForestClassifier; Feature Selection: RFECV w/ GradientBoost [22 features] -> 61.0%
Classifier: RandomForestClassifier; Feature Selection: LightGBM Selection [10 features] -> 54.6%

Classifier: ExtraTreesClassifier; Feature Selection: Pearson correlation coefficient [5 features] -> 42.8%
Classifier: ExtraTreesClassifier; Feature Selection: RFECV w/ RandomForests [22 features] -> 67.3%
Classifier: ExtraTreesClassifier; Feature Selection: LassoCV  [12 features] -> 50.4%
Classifier: ExtraTreesClassifier; Feature Selection: RFECV w/ GradientBoost [22 features] -> 65.6%
Classifier: ExtraTreesClassifier; Feature Selection: LightGBM Selection [10 features] -> 56.7%

Classifier: Baggin