In [12]:
# for manipulating the PDF
# import fitz
# for OCR using PyTesseract
import re
import os
import cv2                              # pre-processing images
import math
import json
from collections import OrderedDict
import numpy as np
import pandas as pd
import pytesseract                      # extracting text from images
import warnings
import matplotlib.pyplot as plt         # displaying output images
import sklearn
from sklearn.feature_extraction.text import CountVectorizer

warnings.filterwarnings("ignore")
from PIL import Image

In [13]:
os.chdir('C:\\Users\\alper\\OneDrive\\Desktop\\School\\5th Semester\\AIN311\\Project') 
french_filepaths = []
moai_filepaths = []
worldcup_filepaths = []
french_filepaths = [os.path.join("data\\french\\",f) for f in os.listdir("data\\french\\") if f.endswith(".txt")]
moai_filepaths = [os.path.join("data\\moai\\",f) for f in os.listdir("data\\moai\\") if f.endswith(".txt")]
worldcup_filepaths = [os.path.join("data\\worldcup\\",f) for f in os.listdir("data\\worldcup\\") if f.endswith(".txt")]

In [14]:
french_experiments = []
for data_file in french_filepaths:
    df = pd.read_json(data_file, lines = True)
    df["values"]
    french_experiments.append(df)

In [15]:
for text_type, filepaths in zip(["french","moai","worldcup"], [french_filepaths, moai_filepaths, worldcup_filepaths]):
    # Read data
    generic_datalist = []
    for data_file in filepaths:
        with open(data_file) as f:
            lines = f.readlines()

        dataDict = {}
        for i,line in enumerate(lines):
            lineDict = json.loads(line)
            dataDict[i] = lineDict
        generic_datalist.append(dataDict)

    # filter out non-fixation data
    generic_fixation_list = []
    for experiment_dict in generic_datalist:
        trackerDict = {}
        j = 0
        for i in range(len(experiment_dict)):
            if(experiment_dict[i]['category'] == 'tracker'):
                trackerDict[j] = experiment_dict[i]
                j += 1

        fixationDict = {}
        j = 0
        for i in range(len(trackerDict)):
            try:
                if(trackerDict[i]['values']['frame']['fix'] == True):
                    fixationDict[j] = trackerDict[i]
                    j += 1
            except:
                pass
        generic_fixation_list.append(fixationDict)

    # general overview
    print("\n",text_type,": ")
    for i,experiment in enumerate(generic_fixation_list):
        print(str(i) + ":"+ str(len(experiment)), end = ", ")

    # df creation for cond
    generic_list = []
    for experiment_dict in generic_fixation_list:
        experiment_df = pd.DataFrame( columns=list('xy'))
        for i in range(len(experiment_dict)):
            x = experiment_dict[i]['values']['frame']['raw']['x']
            y = experiment_dict[i]['values']['frame']['raw']['y']   
            experiment_df = experiment_df.append({'x':x,'y':y}, ignore_index=True) 
        generic_list.append(experiment_df)

    if (text_type == "french"):
        french_df = generic_list
    elif (text_type == "moai"):
        moai_df = generic_list
    elif (text_type == "worldcup"):
        worldcup_df = generic_list


 french : 
0:841, 1:749, 2:345, 3:334, 4:322, 5:889, 6:140, 7:415, 8:246, 9:602, 10:865, 11:542, 12:308, 13:325, 14:228, 15:924, 16:660, 
 moai : 
0:656, 1:527, 2:290, 3:374, 4:422, 5:193, 6:44, 7:521, 8:271, 9:614, 10:625, 11:544, 12:362, 13:230, 14:250, 15:754, 16:499, 
 worldcup : 
0:673, 1:533, 2:261, 3:336, 4:266, 5:242, 6:91, 7:313, 8:300, 9:475, 10:537, 11:427, 12:208, 13:144, 14:274, 15:726, 16:626, 

In [16]:
french_BoW_list = []
moai_BoW_list = []
worldcup_BoW_list = []
for text_type, dataset in zip(["french","moai","worldcup"], [french_df, moai_df, worldcup_df]):
    screenshot_filepath = "data\\" + text_type + "\\" + text_type + ".png"
    original_image = cv2.imread(screenshot_filepath)
    # convert the image to grayscale
    gray_image = cv2.cvtColor(original_image, cv2.COLOR_BGR2GRAY)
    # Performing OTSU threshold
    ret, threshold_image = cv2.threshold(gray_image, 0, 255, cv2.THRESH_OTSU | cv2.THRESH_BINARY_INV)

    rectangular_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (12, 12))

    # Applying dilation on the threshold image
    dilated_image = cv2.dilate(threshold_image, rectangular_kernel, iterations = 1)
    #plt.figure(figsize=(25, 15))
    #plt.imshow(dilated_image)
    #plt.show()

    # Finding contours
    contours, hierarchy = cv2.findContours(dilated_image, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)

    # Creating a copy of the image
    copied_image = original_image.copy()

    mask = np.zeros(original_image.shape, np.uint8)

    # Looping through the identified contours
    # Then rectangular part is cropped and passed on to pytesseract
    # pytesseract extracts the text inside each contours
    # Extracted text is then written into a text file
    paragraph = ""
    #print(len(contours))
    for experiment_data in dataset:
        bag_of_words = {}
        #print(experiment_data)
        for cnt in contours:
            x, y, w, h = cv2.boundingRect(cnt)
            # Cropping the text block for giving input to OCR
            cropped = copied_image[y:y + h, x:x + w]
            
            cv2.rectangle(copied_image, (x, y), (x + w, y + h), (36,255,12), 2)
            # Apply OCR on the cropped image
            text = pytesseract.image_to_string(cropped, lang='eng', config='--oem 3 --psm 1')
            text = text.lower()
            text = re.sub('[^a-z]', ' ', text)
            text = re.sub(r'\s+', '', text)

            insideCond = (experiment_data["x"] >= x) & (experiment_data["x"] < x + w) & (experiment_data["y"] >= y) & (experiment_data["y"] < y + h)
            #print(insideCond)
            boundFixations = experiment_data[insideCond]
            #print(boundFixations)
            for i in range(len(boundFixations)):
                try:
                    count = bag_of_words[str(text)]
                    count += 1
                    bag_of_words.update({str(text):count})
                except:
                    bag_of_words[str(text)] = 1
            #del bag_of_words[""]       
        if (text_type == "french"):
            french_BoW_list.append(bag_of_words)
            #print(french_BoW_list)
        elif (text_type == "moai"):
            moai_BoW_list.append(bag_of_words)
            #print(moai_BoW_list)
        elif (text_type == "worldcup"):
            worldcup_BoW_list.append(bag_of_words)
            #print(worldcup_BoW_list)
    masked = cv2.drawContours(mask, [cnt], 0, (255, 255, 255), -1)

In [313]:
french_BoW_list[0]

{'assembly': 4,
 'national': 1,
 'convocation': 9,
 'ofthe': 3,
 'the': 14,
 'distress': 12,
 'manage': 27,
 'may': 2,
 'ancienrgime': 8,
 'widespread': 10,
 'unableto': 7,
 'proved': 5,
 'economic': 12,
 'factors': 20,
 'political': 16,
 'social': 10,
 'combination': 2,
 'are': 1,
 'be': 6,
 'agreed': 6,
 'generally': 25,
 'day': 6,
 'this': 5,
 'politics': 8,
 'institutions': 4,
 'dominate': 13,
 'created': 3,
 'values': 2,
 'suffrage': 6,
 'campaigns': 13,
 'universal': 22,
 'slavery': 7,
 'abolition': 25,
 'for': 7,
 'inspired': 5,
 'russian': 1,
 'such': 3,
 'revolts': 1}

In [123]:
french_y = [1,5,5,3,4,4,1,3,4,1,4,2,3,3,5,2,2]
moai_y = [1,1,1,4,2,1,2,1,1,1,2,3,1,1,3,1,1]
worldcup_y = [2,5,1,1,2,1,3,4,4,3,1,5,4,3,3,3,4]

In [224]:
all_BoW_list = french_BoW_list + moai_BoW_list + worldcup_BoW_list
all_y =  french_y + moai_y + worldcup_y

In [None]:
for bag_of_words in all_BoW_list:
    del bag_of_words['']

In [287]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(all_BoW_list, all_y, test_size=0.15, random_state=42, shuffle=False)

In [122]:
def convertBoWtoVector(bag_of_words):
    vector = []
    for count in bag_of_words.values():
        vector.append(count)
    return vector

In [268]:
def returnNc(category, X_train):
    total = 0
    ls = []
    for i in range(len(X_train)):
        ls.append([y_train[i],X_train[i]])
        
    for i in range(len(ls)):
        if (ls[i][0] == int(category)):
            total += 1
    return total

In [242]:
def createBagofBags(X_train,y_train):
    ls = []
    for i in range(len(X_train)):
        ls.append([y_train[i],X_train[i]])

    bagOfbags = dict([('1',{}),('2',{}),('3',{}),('4',{}),('5',{})])
    for i in range(len(ls)):
        category = str(ls[i][0])
        for word, increase in ls[i][1].items():
            try:
                count = bag_of_words[str(word)]
                count += increase
                bagOfbags[category].update({str(word):count})
            except:
                bagOfbags[category].update({str(word):increase})
    return bagOfbags

In [250]:
def getTotalUniqueWords(X_train):
    word_list = []
    for bag_of_words in X_train:
        word_list += list(bag_of_words.keys())
    vocab_set = set(word_list)
    return len(vocab_set)

In [249]:
def returnSizeofCategory(category, bagofBags):
    total = 0
    for count in bagofBags[category].values():
        total += count
    return total

In [176]:
def returnSizeofAll():
    total = 0
    for category in ['1','2','3','4','5']:
        total += returnSizeofCategory(category)
    return total

In [256]:
def NaiveBayes (bagOfexperiment, X_train, y_train):
    bagofBags = createBagofBags(X_train,y_train)
    total_unique_words = getTotalUniqueWords(X_train)
    outputDict = dict.fromkeys(bagOfbags.keys())
    N = len(X_train)
    for category in bagOfbags.keys():
        genreBag = bagOfbags[category]
        total_size_of_class = returnSizeofCategory(category, bagofBags) # count(c)
        N_c = returnNc(category, X_train)
        prior = np.log(float(N_c/N))
        for word in bagOfexperiment:
            try:
                word_count_in_class = genreBag[word] # count(w,c)
            except:
                word_count_in_class = 0
            for i in range(int(bagOfexperiment[word])):
                P_w_c = float(word_count_in_class + 1)/float(total_size_of_class + total_unique_words)
                prior += np.log(P_w_c)
        #print('Value for ',genre,': ',prior)
        outputDict[category] = prior
    max_key = max(outputDict, key=outputDict.get)
    return max_key

In [305]:
def accuracy (X_train,y_train,X_test, y_test):
    NaiveBayesResults = []
    for bag_of_words in X_test:
            NaiveBayesResults.append(NaiveBayes(bag_of_words,X_train,y_train))
    N = len(X_test)
    score = 0
    for i in range (len(X_test)):
        if(int(NaiveBayesResults[i]) == y_test[i]):
            score += 1
    return (score/N)

In [306]:
def loo_X (all_BoW_list, train_index):
    return_list = []
    for i in train_index:
        return_list.append(all_BoW_list[i])
    return return_list

In [307]:
from sklearn.model_selection import LeaveOneOut
loo = LeaveOneOut()
loo.get_n_splits(all_BoW_list)
for i, (train_index, test_index) in enumerate(loo.split(all_BoW_list)):
    print(f"Fold {i}:")
    print( accuracy(loo_X(all_BoW_list, train_index)
                    , loo_X (all_y, train_index), 
                    loo_X (all_BoW_list, test_index), loo_X (all_y, test_index)) )
    #print(f"  Train: index={train_index}")
    #print(f"  Test:  index={test_index}")

Fold 0:


IndexError: list index out of range