In [0]:
#import the libraries
import math
import random
import re
from collections import defaultdict, Counter
from nltk import ngrams
import pandas as pd
import glob as gb
import numpy as np
import matplotlib.pyplot as plt
import itertools
import more_itertools

In [0]:
#this method does the tokenization and removal of numbers and punctuations
def tokenize(text):
    """Return the lowercase word tokens of `text`.

    The text is lowercased first; every maximal run of the letters a-z is one
    token, so digits, punctuation and whitespace only act as separators.
    """
    lowered = text.lower()
    return [match.group(0) for match in re.finditer('[a-z]+', lowered)]

In [0]:
#this method handles reading the training corpus
def readCorpus(directory="imdb_movie_reviews/imdb_dataset.csv"):
  """Read the training corpus and return it as a list of lines.

  `directory` is the path of the corpus file (despite its name it is opened
  as a file, not listed as a folder). The file is decoded as UTF-8 and
  undecodable bytes are replaced instead of raising. Every returned string
  is one line of the file, line ending included.
  """
  with open(directory, 'r', encoding="utf8", errors='replace') as corpus_file:
    return list(corpus_file)

In [0]:
#load the corpus with readCorpus: one raw line of the corpus file per list element
training_data=readCorpus()

In [0]:
def KfoldSplit(traing_data, k_chunk_size=5000, seed=None):
  """Shuffle the corpus and split it into consecutive folds.

  Args:
    traing_data: list of raw corpus lines (strings). The class label is taken
      as the last character of each line before its line break. The
      (misspelled) parameter name is kept so keyword callers keep working.
    k_chunk_size: number of documents per fold; the last fold is shorter when
      the corpus size is not a multiple of it.
    seed: optional seed for a reproducible shuffle. With the default None the
      module-level `random` generator is used.

  Returns:
    A list of folds. Each fold is a list of (class, document) tuples where
    class is the label string (at most one character) and document is the
    token list produced by tokenize().

  Raises:
    ValueError: if k_chunk_size is not positive.
  """
  if k_chunk_size <= 0:
    raise ValueError("k_chunk_size must be a positive integer")
  #shuffle a copy of the argument: the caller's list keeps its order and no global is touched
  shuffled = list(traing_data)
  if seed is None:
    random.shuffle(shuffled)
  else:
    random.Random(seed).shuffle(shuffled)
  #strip the line break before taking the label so a final line without a trailing newline is labelled correctly too
  training_dataClean = [(line.rstrip('\r\n')[-1:], tokenize(line)) for line in shuffled]
  return [training_dataClean[offs:offs+k_chunk_size]
          for offs in range(0, len(training_dataClean), k_chunk_size)]

In [0]:
#shuffle the corpus and cut it into folds of 5000 documents each
splittedDataset1=KfoldSplit(training_data, k_chunk_size=5000)

In [0]:
print('the total number of folds = ',len(splittedDataset1))
print('fold 1 has  = ',len(splittedDataset1[0]))
#the documents are split into 10 folds of 5000 documents each
#thus, 10 times, 9 folds are merged for training and the remaining fold is used for testing

the total number of folds =  10
fold 1 has  =  5000


In [0]:
#get the different combinations of folds
def getFolds(splittedDataset):
  """Build the test/training pairing for every fold.

  For fold number n the test group is fold n itself and the training group
  is the concatenation of all other folds, in their original order.

  Returns:
    (testDataGroups, trainingDataGroups): two lists with one entry per fold.
  """
  fold_count = len(splittedDataset)
  held_out = [splittedDataset[idx] for idx in range(fold_count)]
  merged_training = []
  for idx in range(fold_count):
    #every fold except the held-out one, flattened into a single list
    remaining = splittedDataset[:idx] + splittedDataset[idx+1:]
    merged_training.append(list(itertools.chain.from_iterable(remaining)))
  return held_out, merged_training
# each fold serves as the test group exactly once, so every test group has just 5000 documents; for each test group, the other folds are combined into its training group

In [0]:
#one entry per fold: the held-out test fold and the merged training set built from the other folds
testDataGroups1, trainingDataGroups1 = getFolds(splittedDataset1)

In [0]:
#sanity check of the grouping: number of folds and the test/training sizes of the first fold
print('the total number of folds = ',len(testDataGroups1))
print('the total number test set per folds (fold 1) = ',len(testDataGroups1[0]))
print('the total number of training set per folds (fold 1) = ',len(trainingDataGroups1[0]))

the total number of folds =  10
the total number test set per folds (fold 1) =  5000
the total number of training set per folds (fold 1) =  45000


In [0]:
#this method encapsulates the code that was used in 1.1; it is run every time a model is to be trained
def processTheGroup(trainingDataGroups, num_classes=2, min_count=2):
  """Group one training set by class and build its word counts and vocabulary.

  Args:
    trainingDataGroups: iterable of (class, document) entries. class is
      anything int() accepts and must lie in 1..num_classes; document is a
      list of word tokens.
    num_classes: number of classes (default 2).
    min_count: minimum corpus-wide frequency a word needs to enter the
      vocabulary (default 2).

  Returns:
    (catSentList, countDictList, vocabulary) where
      catSentList[k] is the list of documents labelled k+1,
      countDictList[k] is a Counter of the word frequencies in class k+1
        (all words, including those left out of the vocabulary),
      vocabulary is a Counter of the frequencies over all classes,
        restricted to words seen at least min_count times.

  Raises:
    ValueError: if a label is not an integer in 1..num_classes.
  """
  catSentList = [[] for _ in range(num_classes)]
  countDictList = [Counter() for _ in range(num_classes)]
  for x in trainingDataGroups:
    classIndex = int(x[0]) - 1
    #reject out-of-range labels explicitly: label 0 would give index -1 and silently land in the last class
    if not 0 <= classIndex < num_classes:
      raise ValueError("class label %r is outside 1..%d" % (x[0], num_classes))
    catSentList[classIndex].append(x[1])
    #count straight from the document; no flattened copy of the whole class is needed
    countDictList[classIndex].update(x[1])

  #frequencies over the training data as a whole (all classes merged)
  sentCount = Counter()
  for classCounts in countDictList:
    sentCount.update(classCounts)
  #drop the words that are rarer than min_count
  vocabulary = Counter({word: freq for word, freq in sentCount.items() if freq >= min_count})
  return catSentList,countDictList,vocabulary

In [0]:
def collectStatistics(catSentList,vocabulary,countDictList=None):
  """Collect the document and word statistics the Naive Bayes model needs.

  Args:
    catSentList: list with one entry per class; entry k is the list of
      tokenized documents of class k+1.
    vocabulary: mapping whose keys are the vocabulary words (anything dict()
      accepts).
    countDictList: optional list with one word -> frequency mapping per
      class. When omitted the frequencies are counted from catSentList, so
      the function does not depend on a global of that name.

  Returns:
    (D, Dk, vocabularyCat) where D is the total number of documents, Dk[k]
    is the number of documents of class k+1 and vocabularyCat[k] maps every
    vocabulary word to its frequency in class k+1 (0 when unseen there).
  """
  Dk = [len(documents) for documents in catSentList] #the number of documents labelled with class k+1
  D = sum(Dk) #the total number of documents

  if countDictList is None:
    #derive the per-class word frequencies from the documents themselves
    countDictList = []
    for documents in catSentList:
      classCounts = Counter()
      for document in documents:
        classCounts.update(document)
      countDictList.append(classCounts)

  #the frequency of every vocabulary word in the documents of class k
  dictVocab = dict(vocabulary)
  vocabularyCat = [
    {word: countDictList[k].get(word, 0) for word in dictVocab}
    for k in range(len(Dk))
  ]
  return D, Dk,vocabularyCat
  

In [0]:
# d Estimate the likelihoods P(w_t|C_k) with additive smoothing as
#   P(w_t|C_k) = (alpha + n_k(w_t)) / (alpha*|V| + N_k)
def Likelihood(word,CK=1,alpha=1):
  """Smoothed likelihood of `word` given class CK.

  Reads the module-level countDictList (per-class word counts), vocabulary
  and catLength (per-class token totals), which must be set before calling.

  Args:
    word: the word token to score.
    CK: 1-based class number.
    alpha: additive smoothing constant (1 = Laplace smoothing).

  Returns:
    (alpha + count of word in class CK) / (alpha*|vocabulary| + catLength of class CK)
  """
  #the denominator must grow by alpha per vocabulary word; adding a bare |V| is only normalised for alpha == 1
  #NOTE(review): the count is looked up in countDictList, not in the vocabulary-restricted counts that catLength sums - confirm this is intended
  likelihd = (alpha + countDictList[CK-1][word])/(alpha*len(vocabulary) + catLength[CK-1])
  return likelihd

In [0]:
def Prior():
  """Estimate the class priors P(C_k) and the per-class token totals.

  Reads the module-level Dk, D and vocabularyCat.

  Returns:
    (catLength, priorsPCK): catLength[k] is the sum of the word counts in
    vocabularyCat[k]; priorsPCK[k] is Dk[k]/D.
  """
  # c Estimate the priors P(C_k), one (prior, token total) pair per class
  per_class = [(Dk[k]/D, sum(vocabularyCat[k].values())) for k in range(len(Dk))]
  class_priors = [prior for prior, _ in per_class]
  token_totals = [total for _, total in per_class]
  return token_totals, class_priors

In [0]:
# Design and implement a Python structure NB_Classifier that encapsulates the model
# parameters mentioned above, namely the priors and the likelihoods.
def NB_Classifier(wordseq):
  """Return the most probable class (1-based) for a tokenized document.

  The score of a class is the base-2 log of its prior plus the sum of the
  base-2 log likelihoods of the words in `wordseq`. The class with the
  highest score is returned; on ties the lowest class number wins.
  Reads the module-level Dk and priorsPCK and calls Likelihood().
  """
  scores = []
  for class_number in range(1, len(Dk) + 1):
    log_prior = math.log(priorsPCK[class_number - 1], 2)
    #sum of the word log-likelihoods for this class
    log_likelihood = sum(math.log(Likelihood(token, CK=class_number), 2) for token in wordseq)
    scores.append(log_prior + log_likelihood)
  best_score = max(scores)
  return scores.index(best_score) + 1

In [0]:
#k-fold cross-validation: for every fold, train on the merged other folds and test on the held-out fold
accuracyList=[]
for testFold, trainFold in zip(testDataGroups1, trainingDataGroups1):
  #these results stay module-level on purpose: Prior, Likelihood and NB_Classifier read several of them as globals
  catSentList,countDictList,vocabulary = processTheGroup(trainFold)
  D, Dk,vocabularyCat = collectStatistics(catSentList,vocabulary)
  catLength,priorsPCK = Prior()

  #true classes and token lists of the held-out documents
  classlabel = [int(x[0]) for x in testFold]
  text_data = [x[1] for x in testFold]
  #generate the predicted class per document
  predlabel = [NB_Classifier(document) for document in text_data]
  #calculate the accuracy: the share of documents whose predicted class matches the true class
  correctPred = sum(a == b for a,b in zip(classlabel, predlabel))
  accuracy = correctPred/len(classlabel)
  accuracyList.append(accuracy)
  print('Correct prediction = ', correctPred)
  print("The accuracy is = ", accuracy)

Correct prediction =  4190
The accuracy is =  0.838
Correct prediction =  4202
The accuracy is =  0.8404
Correct prediction =  4231
The accuracy is =  0.8462
Correct prediction =  4223
The accuracy is =  0.8446
Correct prediction =  4213
The accuracy is =  0.8426
Correct prediction =  4251
The accuracy is =  0.8502
Correct prediction =  4214
The accuracy is =  0.8428
Correct prediction =  4227
The accuracy is =  0.8454
Correct prediction =  4280
The accuracy is =  0.856
Correct prediction =  4240
The accuracy is =  0.848


In [0]:
#summary of the per-fold accuracies: mean and standard deviation (rounded to 4 decimals), best and worst fold
print(accuracyList)
print("The average accuracy = ", round(np.mean(accuracyList),4))
print("The standard deviation = ", round(np.std(accuracyList),4))
print("The maximum accuracy = ", np.max(accuracyList))
print("The minimum accuracy = ", np.min(accuracyList))

[0.838, 0.8404, 0.8462, 0.8446, 0.8426, 0.8502, 0.8428, 0.8454, 0.856, 0.848]
The average accuracy =  0.8454
The standard deviation =  0.0049
The maximum accuracy =  0.856
The minimum accuracy =  0.838
