In [1]:
# for manipulating the PDF
# import fitz
# for OCR using PyTesseract
import re
import os
import cv2                              # pre-processing images
import math
import json
from collections import OrderedDict
import numpy as np
import pandas as pd
import pytesseract                      # extracting text from images
import warnings
import matplotlib.pyplot as plt         # displaying output images
from sklearn.linear_model import LogisticRegression
from numpy import mean
from numpy import std
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
import itertools
warnings.filterwarnings("ignore")
from PIL import Image
from sklearn.model_selection import GridSearchCV

In [2]:
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

In [3]:
french_filepaths = []
moai_filepaths = []
worldcup_filepaths = []
french_filepaths = [os.path.join("raw_data\\french\\",f) for f in os.listdir("raw_data\\french\\") if f.endswith(".txt")]
moai_filepaths = [os.path.join("raw_data\\moai\\",f) for f in os.listdir("raw_data\\moai\\") if f.endswith(".txt")]
worldcup_filepaths = [os.path.join("raw_data\\worldcup\\",f) for f in os.listdir("raw_data\\worldcup\\") if f.endswith(".txt")]

In [4]:
french_experiments = []
for data_file in french_filepaths:
    df = pd.read_json(data_file, lines = True)
    df["values"]
    french_experiments.append(df)

In [5]:
for text_type, filepaths in zip(["french","moai","worldcup"], [french_filepaths, moai_filepaths, worldcup_filepaths]):
    # Read data
    generic_datalist = []
    for data_file in filepaths:
        with open(data_file) as f:
            lines = f.readlines()

        dataDict = {}
        for i,line in enumerate(lines):
            lineDict = json.loads(line)
            dataDict[i] = lineDict
        generic_datalist.append(dataDict)

    # filter out non-fixation data
    generic_fixation_list = []
    for experiment_dict in generic_datalist:
        trackerDict = {}
        j = 0
        for i in range(len(experiment_dict)):
            if(experiment_dict[i]['category'] == 'tracker'):
                trackerDict[j] = experiment_dict[i]
                j += 1

        fixationDict = {}
        j = 0
        for i in range(len(trackerDict)):
            try:
                if(trackerDict[i]['values']['frame']['fix'] == True):
                    fixationDict[j] = trackerDict[i]
                    j += 1
            except:
                pass
        generic_fixation_list.append(fixationDict)

    # general overview
    print("\n",text_type,": ")
    for i,experiment in enumerate(generic_fixation_list):
        print(str(i) + ":"+ str(len(experiment)), end = ", ")

    # df creation for cond
    generic_list = []
    for experiment_dict in generic_fixation_list:
        experiment_df = pd.DataFrame( columns=list('xy'))
        for i in range(len(experiment_dict)):
            x = experiment_dict[i]['values']['frame']['raw']['x']
            y = experiment_dict[i]['values']['frame']['raw']['y']   
            experiment_df = experiment_df.append({'x':x,'y':y}, ignore_index=True) 
        generic_list.append(experiment_df)

    if (text_type == "french"):
        french_df = generic_list
    elif (text_type == "moai"):
        moai_df = generic_list
    elif (text_type == "worldcup"):
        worldcup_df = generic_list


 french : 
0:841, 1:749, 2:345, 3:334, 4:322, 5:889, 6:140, 7:415, 8:246, 9:602, 10:865, 11:542, 12:308, 13:325, 14:228, 15:924, 16:660, 
 moai : 
0:656, 1:527, 2:290, 3:374, 4:422, 5:193, 6:44, 7:521, 8:271, 9:614, 10:625, 11:544, 12:362, 13:230, 14:250, 15:754, 16:499, 
 worldcup : 
0:673, 1:533, 2:261, 3:336, 4:266, 5:242, 6:91, 7:313, 8:300, 9:475, 10:537, 11:427, 12:208, 13:144, 14:274, 15:726, 16:626, 

In [6]:
french_BoW_list = []
moai_BoW_list = []
worldcup_BoW_list = []
for text_type, dataset in zip(["french","moai","worldcup"], [french_df, moai_df, worldcup_df]):
    screenshot_filepath = "raw_data\\" + text_type + "\\" + text_type + ".png"
    original_image = cv2.imread(screenshot_filepath)
    # convert the image to grayscale
    gray_image = cv2.cvtColor(original_image, cv2.COLOR_BGR2GRAY)
    # Performing OTSU threshold
    ret, threshold_image = cv2.threshold(gray_image, 0, 255, cv2.THRESH_OTSU | cv2.THRESH_BINARY_INV)

    rectangular_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (12, 12))

    # Applying dilation on the threshold image
    dilated_image = cv2.dilate(threshold_image, rectangular_kernel, iterations = 1)
    #plt.figure(figsize=(25, 15))
    #plt.imshow(dilated_image)
    #plt.show()

    # Finding contours
    contours, hierarchy = cv2.findContours(dilated_image, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)

    # Creating a copy of the image
    copied_image = original_image.copy()

    mask = np.zeros(original_image.shape, np.uint8)

    # Looping through the identified contours
    # Then rectangular part is cropped and passed on to pytesseract
    # pytesseract extracts the text inside each contours
    # Extracted text is then written into a text file
    paragraph = ""
    #print(len(contours))
    for experiment_data in dataset:
        bag_of_words = {}
        #print(experiment_data)
        for cnt in contours:
            x, y, w, h = cv2.boundingRect(cnt)
            # Cropping the text block for giving input to OCR
            cropped = copied_image[y:y + h, x:x + w]
            
            cv2.rectangle(copied_image, (x, y), (x + w, y + h), (36,255,12), 2)
            # Apply OCR on the cropped image
            text = pytesseract.image_to_string(cropped, lang='eng', config='--oem 3 --psm 1')
            text = text.lower()
            text = re.sub('[^a-z]', ' ', text)
            text = re.sub(r'\s+', '', text)

            insideCond = (experiment_data["x"] >= x) & (experiment_data["x"] < x + w) & (experiment_data["y"] >= y) & (experiment_data["y"] < y + h)
            #print(insideCond)
            boundFixations = experiment_data[insideCond]
            #print(boundFixations)
            for i in range(len(boundFixations)):
                try:
                    count = bag_of_words[str(text)]
                    count += 1
                    bag_of_words.update({str(text):count})
                except:
                    bag_of_words[str(text)] = 1
            #del bag_of_words[""]       
        if (text_type == "french"):
            french_BoW_list.append(bag_of_words)
            #print(french_BoW_list)
        elif (text_type == "moai"):
            moai_BoW_list.append(bag_of_words)
            #print(moai_BoW_list)
        elif (text_type == "worldcup"):
            worldcup_BoW_list.append(bag_of_words)
            #print(worldcup_BoW_list)
    masked = cv2.drawContours(mask, [cnt], 0, (255, 255, 255), -1)

In [16]:
from math import log
def calculate_tf_idf(bow):
    # Total number of documents in the collection
    N = len(bow)
    # Calculate the tf-idf values for each word
    tf_idf = {}
    for word, count in bow.items():
        # Calculate the term frequency (tf)
        tf = count / sum(bow.values())
      # Calculate the inverse document frequency (idf)
        # idf is defined as log(N / n), where N is the total number of documents
        # and n is the number of documents that contain the word
        n = sum(1 for doc in bow if word in doc)
        idf = log(N / n)
        # Calculate the tf-idf value
        tf_idf[word] = tf * idf
    return tf_idf

In [473]:
# New td-idf lists for all 3 classes are created using convertToAllDict() function above
train_frenchNew = [] # French td-idf list
idx = 0
for bag_of_word in french_BoW_list:
    bag_of_word_tfidf = calculate_tf_idf(bag_of_word)
    x = convertToAllDict(french_BoW_list, bag_of_word_tfidf) 
    x = sum(x)
    train_frenchNew.append([x,train_french[idx][1]] )
    idx += 1

In [497]:
train_worldcupNew = [] # Worldcup td-idf list
idx=0
for bag_of_word in worldcup_BoW_list:
    bag_of_word_tfidf=calculate_tf_idf(bag_of_word)
    x=convertToAllDict(worldcup_BoW_list,bag_of_word_tfidf)
    x=sum(x)
    train_worldcupNew.append( [x,train_worldcup[idx][1]] )
    idx+=1

In [508]:
train_moaiNew = [] # Moai td-idf list
idx=0
for bag_of_word in moai_BoW_list:
    bag_of_word_tfidf=calculate_tf_idf(bag_of_word)
    x=convertToAllDict(moai_BoW_list,bag_of_word_tfidf)
    x=sum(x)
    train_moaiNew.append( [x,train_moai[idx][1]] )
    idx+=1

In [593]:
from sklearn import preprocessing
from sklearn import utils
cv = LeaveOneOut()
lr = LogisticRegression()
svm=SVC()
clf = RandomForestClassifier(n_estimators = 100)
datasets=[train_frenchNew,train_moaiNew,train_worldcupNew]
idx=0
for dataset in datasets:
    if idx==0:
        print("French Dataset Results:")
    if idx==1:
        print("Moai Dataset Results:")
    if idx==2:
        print("World Cup Dataset Results:")
    X=dataset.loc[:, dataset.columns == 'X']
    y=dataset.loc[:, dataset.columns != 'X']
    label_encoder = preprocessing.LabelEncoder()
    X_transformed = label_encoder.fit_transform(X)
    X=X_transformed.reshape(-1,1)

    scores_lracc = cross_val_score(lr, X, y, scoring='accuracy',
                             cv=cv, n_jobs=-1)
    scores_svmacc = cross_val_score(svm, X, y, scoring='accuracy',
                             cv=cv, n_jobs=-1)
    scores_clfacc=cross_val_score(clf, X, y, scoring='accuracy',
                             cv=cv, n_jobs=-1)
    
    scores_lrf1 = cross_val_score(lr, X, y, scoring='f1_weighted',
                             cv=cv, n_jobs=-1)
    scores_svmf1 = cross_val_score(svm, X, y, scoring='f1_weighted',
                             cv=cv, n_jobs=-1)
    scores_clff1=cross_val_score(clf, X, y, scoring='f1_weighted',
                             cv=cv, n_jobs=-1)
    
    scores_lrpre= cross_val_score(lr, X, y, scoring='precision',
                             cv=cv, n_jobs=-1)
    scores_svmpre = cross_val_score(svm, X, y, scoring='precision',
                             cv=cv, n_jobs=-1)
    scores_clfpre=cross_val_score(clf, X, y, scoring='precision',
                             cv=cv, n_jobs=-1)
    print('Logistic Regression Accuracy: %.3f (%.3f)' % (mean(scores_lracc), std(scores_lracc)))
    print('Logistic Regression f1 Score: %.3f (%.3f)' % (mean(scores_lrf1), std(scores_lrf1)))
    print('Logistic Regression AUC: %.3f (%.3f)' % (mean(scores_lrpre), std(scores_lrpre)))
    
    print('Support Vector Machine Accuracy: %.3f (%.3f)' % (mean(scores_svmacc), std(scores_svmacc)))
    print('Support Vector Machine f1 Score: %.3f (%.3f)' % (mean(scores_svmf1), std(scores_svmf1)))
    print('Support Vector Machine AUC: %.3f (%.3f)' % (mean(scores_svmpre), std(scores_svmpre)))
    
    print('Random Forest Accuracy: %.3f (%.3f)' % (mean(scores_clfacc), std(scores_clfacc)))
    print('Random Forest f1 Score: %.3f (%.3f)' % (mean(scores_clff1), std(scores_clff1)))
    print('Random Forest AUC: %.3f (%.3f)' % (mean(scores_clfpre), std(scores_clfpre)))
    idx+=1
    print("\n")

French Dataset Results:
Logistic Regression Accuracy: 0.118 (0.322)
Logistic Regression f1 Score: 0.118 (0.322)
Logistic Regression AUC: nan (nan)
Support Vector Machine Accuracy: 0.176 (0.381)
Support Vector Machine f1 Score: 0.176 (0.381)
Support Vector Machine AUC: nan (nan)
Random Forest Accuracy: 0.118 (0.322)
Random Forest f1 Score: 0.118 (0.322)
Random Forest AUC: nan (nan)


Moai Dataset Results:
Logistic Regression Accuracy: 0.059 (0.235)
Logistic Regression f1 Score: 0.059 (0.235)
Logistic Regression AUC: nan (nan)
Support Vector Machine Accuracy: 0.118 (0.322)
Support Vector Machine f1 Score: 0.118 (0.322)
Support Vector Machine AUC: nan (nan)
Random Forest Accuracy: 0.294 (0.456)
Random Forest f1 Score: 0.294 (0.456)
Random Forest AUC: nan (nan)


World Cup Dataset Results:
Logistic Regression Accuracy: 0.059 (0.235)
Logistic Regression f1 Score: 0.059 (0.235)
Logistic Regression AUC: nan (nan)
Support Vector Machine Accuracy: 0.118 (0.322)
Support Vector Machine f1 Score: 

In [595]:
idx=0
for dataset in datasets:
    if idx==0:
        print("French Dataset Results:")
    if idx==1:
        print("Moai Dataset Results:")
    if idx==2:
        print("World Cup Dataset Results:")
    # defining parameter range
    param_grid = {'C': [0.1, 1, 10, 100, 1000], 
                  'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
                  'kernel': ['rbf']} 
    grid = GridSearchCV(SVC(), param_grid, refit = True, verbose = 3,cv=cv)
    X=dataset.loc[:, dataset.columns == 'X']
    y=dataset.loc[:, dataset.columns != 'X']
    label_encoder = preprocessing.LabelEncoder()
    X_transformed = label_encoder.fit_transform(X)
    X=X_transformed.reshape(-1,1)
    grid.fit(X, y)
    # print best parameter after tuning
    print(grid.best_params_)
    
    # print how our model looks after hyper-parameter tuning
    print(grid.best_estimator_)
    
    svm=SVC(C=100,gamma=1)
    scores_svm = cross_val_score(svm, X, y, scoring='accuracy',
                             cv=cv, n_jobs=-1)
    scores_svmf1 = cross_val_score(svm, X, y, scoring='f1_weighted',
                             cv=cv, n_jobs=-1)
    print('Accuracy: %.3f (%.3f)' % (mean(scores_svm), std(scores_svm)))
    print('f1: %.3f (%.3f)' % (mean(scores_svmf1), std(scores_svmf1)))

    idx+=1

French Dataset Results:
Fitting 17 folds for each of 25 candidates, totalling 425 fits
[CV 1/17] END .......C=0.1, gamma=1, kernel=rbf;, score=0.000 total time=   0.0s
[CV 2/17] END .......C=0.1, gamma=1, kernel=rbf;, score=0.000 total time=   0.0s
[CV 3/17] END .......C=0.1, gamma=1, kernel=rbf;, score=0.000 total time=   0.0s
[CV 4/17] END .......C=0.1, gamma=1, kernel=rbf;, score=1.000 total time=   0.0s
[CV 5/17] END .......C=0.1, gamma=1, kernel=rbf;, score=0.000 total time=   0.0s
[CV 6/17] END .......C=0.1, gamma=1, kernel=rbf;, score=0.000 total time=   0.0s
[CV 7/17] END .......C=0.1, gamma=1, kernel=rbf;, score=0.000 total time=   0.0s
[CV 8/17] END .......C=0.1, gamma=1, kernel=rbf;, score=0.000 total time=   0.0s
[CV 9/17] END .......C=0.1, gamma=1, kernel=rbf;, score=0.000 total time=   0.0s
[CV 10/17] END ......C=0.1, gamma=1, kernel=rbf;, score=0.000 total time=   0.0s
[CV 11/17] END ......C=0.1, gamma=1, kernel=rbf;, score=0.000 total time=   0.0s
[CV 12/17] END ......C

In [586]:
for dataset in datasets:
    if idx==0:
        print("French Dataset Results:")
    if idx==1:
        print("Moai Dataset Results:")
    if idx==2:
        print("World Cup Dataset Results:")
    n_estimators = [5,20,50,100] # number of trees in the random forest
    max_features = ['auto', 'sqrt'] # number of features in consideration at every split
    max_depth = [int(x) for x in np.linspace(10, 120, num = 12)] # maximum number of levels allowed in each decision tree
    min_samples_split = [2, 6, 10] # minimum sample number to split a node
    min_samples_leaf = [1, 3, 4] # minimum sample number that can be stored in a leaf node
    bootstrap = [True, False] # method used to sample data points
    random_grid = {'n_estimators': n_estimators,
    'max_features': max_features,
    'max_depth': max_depth,
    'min_samples_split': min_samples_split,
    'min_samples_leaf': min_samples_leaf,
    'bootstrap': bootstrap}
    from sklearn.model_selection import RandomizedSearchCV
    rf_random = RandomizedSearchCV(estimator = clf,param_distributions = random_grid,
                   n_iter = 100, cv = cv, verbose=2, random_state=35, n_jobs = -1)
    X=dataset.loc[:, dataset.columns == 'X']
    y=dataset.loc[:, dataset.columns != 'X']
    label_encoder = preprocessing.LabelEncoder()
    X_transformed = label_encoder.fit_transform(X)
    X=X_transformed.reshape(-1,1)
    rf_random.fit(X, y)
    print ('Random grid: ', random_grid, '\n')
    # print the best parameters
    print ('Best Parameters: ', rf_random.best_params_, ' \n')
   
    randmf=RandomForestClassifier(n_estimators= 50, min_samples_split= 10, min_samples_leaf= 3, max_features= "sqrt", max_depth= 80, bootstrap= True)
    scores_randmf = cross_val_score(randmf, X, y, scoring='accuracy',
                             cv=cv, n_jobs=-1)
    print('Accuracy: %.3f (%.3f)' % (mean(scores_randmf), std(scores_randmf)))
    idx+=1

Fitting 17 folds for each of 100 candidates, totalling 1700 fits
Random grid:  {'n_estimators': [5, 20, 50, 100], 'max_features': ['auto', 'sqrt'], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120], 'min_samples_split': [2, 6, 10], 'min_samples_leaf': [1, 3, 4], 'bootstrap': [True, False]} 

Best Parameters:  {'n_estimators': 5, 'min_samples_split': 2, 'min_samples_leaf': 3, 'max_features': 'sqrt', 'max_depth': 50, 'bootstrap': True}  

Accuracy: 0.118 (0.322)
Fitting 17 folds for each of 100 candidates, totalling 1700 fits
Random grid:  {'n_estimators': [5, 20, 50, 100], 'max_features': ['auto', 'sqrt'], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120], 'min_samples_split': [2, 6, 10], 'min_samples_leaf': [1, 3, 4], 'bootstrap': [True, False]} 

Best Parameters:  {'n_estimators': 5, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 70, 'bootstrap': True}  

Accuracy: 0.000 (0.000)
Fitting 17 folds for each of 100 candida