In [5]:
from nltk.tokenize import RegexpTokenizer
import sys
import re
import numpy as np
import pandas as pd
import math
import random

In [6]:
tokenizer = RegexpTokenizer(r'\w+')

In [7]:
def tokenizedInput(inputFileName,tokenize=True):
    cleaned_list=[]
    with open(inputFileName) as f:
        docs = f.readlines()

    for doc in docs:
        raw=None
        if(tokenize==True):
            raw = tokenizer.tokenize(doc)
        else:
            raw=doc
        cleaned_list.append(raw)
    
    return cleaned_list


In [8]:
def tokenizedInput_WithNGrams(inputFileName,grams=1):
    cleaned_list=[]
    with open(inputFileName) as f:
        docs = f.readlines()

    for doc in docs:
        raw2=[]
        raw = tokenizer.tokenize(doc)
        leng=len(raw)
        for k in range(grams-1,leng,1):
            new_token=""
            for l in range(k-grams+1,k+1,1):
                new_token+=" "+raw[l]
            raw2.append(new_token)
        cleaned_list.append(raw2+raw)
    
    return cleaned_list

In [9]:
# x_train=tokenizedInput_WithNGrams('Dataset/Stem/toy',grams=2)
# y_train=tokenizedInput('Dataset/imdb/imdb_train_labels.txt',tokenize=False)
# y_train=list(map(int,y_train))


In [10]:
# print ((x_train[0]))

In [11]:
class NaiveBayes:
    
    def __init__(self,x,y,laplace_smoother=1):
        self.x=list(x)
        self.y=list(y)
        self.laplace_smoother=laplace_smoother
        self.distinct_x=0
        self.distinct_y=0
        self.instances=len(y)
        self.x_dict={}
        self.vocabulary={}
        self.y_dict={}
        self.revMapy={}
        self.wordsInClass=[]
        self.instancesInClass=[]
        self.weight_dict={}
        self.classweight=[]
    
    def tfidf(self):
        
        for i in range(self.distinct_y):
            self.classweight.append(0)
        
        for key,value in self.vocabulary.items():
            
            docs=0
            for i in range(self.distinct_y):
                if((key,i) in self.x_dict):
                    docs+=1
            
            for i in range(self.distinct_y):
                if((key,i) in self.x_dict):
                    temp=self.x_dict[(key,i)]*(math.log(10/docs))
                    self.weight_dict[(key,i)]=temp
                    self.classweight[i]+=temp
    
    
                
    
    def calculateParameters(self):
#         classes=np.unique(self.npy)
        counter=0
        for i in range(self.instances):
            feature_vector=self.x[i]
            feature_class=self.y[i]
            mapped_class=counter
            
            if(feature_class in self.y_dict):
                mapped_class=self.y_dict[feature_class]
            else:
                self.revMapy[counter]=feature_class
                self.y_dict[feature_class]=counter
                self.wordsInClass.append(0)
                self.instancesInClass.append(0)
                counter+=1
            
            self.instancesInClass[mapped_class]+=1
            
            for words in feature_vector:
                if(words in self.vocabulary):
                    self.vocabulary[words]+=1
                else:
                    self.vocabulary[words]=1
                
                key=(words,mapped_class)
                self.wordsInClass[mapped_class]+=1
                
                if(key in self.x_dict):
                    self.x_dict[key]+=1
                else:
                    self.x_dict[key]=1
            
        self.distinct_x=len(self.vocabulary)
        self.distinct_y=len(self.y_dict)
#         self.printParameters()

    def purge(self,threshold=7):
        
        emp_lis=[]
        
        for key,value in self.vocabulary.items():
            if(value<threshold and len(tokenizer.tokenize(key))>1):
                self.distinct_x-=1
                emp_lis.append(key)
                for labels in range(self.distinct_y):
                    new_key=(key,labels)
                    if(new_key in self.x_dict):
                        count=self.x_dict[new_key]
                        del self.x_dict[new_key]
                        self.wordsInClass[labels]-=count
        
        for words in emp_lis:
            del self.vocabulary[words]
    
    def getLogPrior(self,label):
        return math.log(float(self.instancesInClass[label])/self.instances)
    
#     def getLogProb(self,attribute,label):
        
#         occurences=0
#         key=(attribute,label)
        
#         if key in self.x_dict:
#             occurences=self.x_dict[key]
        
#         occurences+=self.laplace_smoother

#         total_occurences_in_class=self.wordsInClass[label]
#         total_occurences_in_class+=self.distinct_x*self.laplace_smoother
                    
#         return math.log(float(occurences)/total_occurences_in_class)

    def getLogProb(self,attribute,label):
        
        occurences=0
        key=(attribute,label)
        
        if key in self.weight_dict:
            occurences=self.weight_dict[key]
        
        occurences+=self.laplace_smoother

        total_occurences_in_class=self.classweight[label]
        total_occurences_in_class+=self.distinct_x*self.laplace_smoother
            
        return math.log(float(occurences)/total_occurences_in_class)
        
        
        
    def getClass(self,x):
        max_log_prob=-1e9
        label=-1
        
        for i in range(self.distinct_y):
            log_prob_x_given_y=0
            
            for attributes in x:
                log_prob_x_given_y+=self.getLogProb(attributes,i)
            
            log_prob_x=log_prob_x_given_y+self.getLogPrior(i)
            
            if(log_prob_x>max_log_prob):
                max_log_prob=log_prob_x
                label=self.revMapy[i]
            
        return label
    
    def ConfusionMatrix(self,y,predy):
        confusion = [ [0]*self.distinct_y for _ in range(self.distinct_y) ]
        
        tests=len(y)
        
        for i in range(tests):
            confusion[self.y_dict[predy[i]]][self.y_dict[y[i]]]+=1
        
        for i,ii in self.y_dict.items():
            for j,jj in self.y_dict.items():
                print("%5d"%(confusion[ii][jj]),end=' ')
            print()
        
        
    def getAccuracy(self,x,y,printConfusionMatrix=False):
        total_tests=len(y)
        passed_tests=0
        prediction_list=[]
        
        for i in range(total_tests):
            xi=x[i]
            yi=y[i]
            
#             if y[i] in self.y_dict:
#                 pass
#             else:
#                 continue
            pred_yi=self.getClass(xi)
            prediction_list.append(pred_yi)
            if(pred_yi==yi):
                passed_tests+=1
        
        if(printConfusionMatrix==True):
            self.ConfusionMatrix(y,pred_yi)
                
        return [prediction_list,(float(passed_tests))/total_tests]
    
    def getAccuracyRandomPredict(self,x,y):
        
        average_over=10
        total_accuracy=0
        
        for i in range(average_over):
            
            total_tests=len(y)
            passed_tests=0

            for i in range(total_tests):
                yi=y[i]

                pred_yi=random.randint(0,self.distinct_y-1)
                pred_yi=self.revMapy[pred_yi]

                if(pred_yi==yi):
                    passed_tests+=1

            total_accuracy+=(float(passed_tests))/total_tests
        
        return total_accuracy/average_over
    
    def getAccuracyMajorityPredictor(self,x,y):
        max_occ=-1
        majority_class=-1
        total_tests=len(y)
        passed_tests=0        
        
        for i in range(self.distinct_y):
            if(max_occ<self.instancesInClass[i]):
                max_occ=self.instancesInClass[i]
                majority_class=i
        
        if(majority_class==-1):
            return 0
        
        majority_class=self.revMapy[majority_class]
        
        for yi in y:
            if(yi==majority_class):
                passed_tests+=1
                
        return (float(passed_tests))/total_tests       

    def printParameters(self):
        print("Vocabulary Size:",self.distinct_x)
        print("Classes:",self.distinct_y)
        print("X_Y's:",len(self.x_dict))
        
        print("Mapped Classes:", self.y_dict)
        
        print("Instances In Mapped class:",end='')
        
        for counts in self.instancesInClass:
            print(counts,end=' ')
        
        print("")
        print("Words In Mapped class:",end='')
        
        for counts in self.wordsInClass:
            print(counts,end=' ')
        
        print("")
        

In [43]:
def augmentation(x_train,y_train):
    x=[]
    y=[]
    
    instance=len(y_train)
    lis1=[2,3,4]
    lis2=[7,8,9]
    
    for i in range(instance):
        x.append(x_train[i])
        y.append(y_train[i])
        
        if(y_train[i]==1):
            x.append(x_train[i])
            y.append(lis1[random.randint(0,2)])
            x.append(x_train[i])
            y.append(lis1[random.randint(0,2)])
#             x.append(x_train[i])
#             y.append(lis1[random.randint(0,2)])
        
        if(y_train[i]==10):
            x.append(x_train[i])
            y.append(lis2[random.randint(0,2)])
            x.append(x_train[i])
            y.append(lis2[random.randint(0,2)])
#             x.append(x_train[i])
#             y.append(lis2[random.randint(0,2)])
            
    return (x,y)
    

In [44]:
# x_train=tokenizedInput_WithNGrams('Dataset/Stem/imdb_train_text.txt',grams=2)
x_train=tokenizedInput('Dataset/Stem/imdb_train_text.txt')
y_train=tokenizedInput('Dataset/imdb/imdb_train_labels.txt',tokenize=False)
y_train=list(map(int,y_train))
(x_train,y_train)=augmentation(x_train,y_train)
# x_test=tokenizedInput_WithNGrams('Dataset/Stem/imdb_test_text.txt',grams=2)
x_test=tokenizedInput('Dataset/Stem/imdb_test_text.txt')
y_test=tokenizedInput('Dataset/imdb/imdb_test_labels.txt',tokenize=False)
y_test=list(map(int,y_test))

In [45]:
print(len(y_train))

44664


In [46]:
NaiveBayesClassifier=NaiveBayes(x=x_train,y=y_train)
NaiveBayesClassifier.calculateParameters()
print (len(NaiveBayesClassifier.x_dict))
print(NaiveBayesClassifier.distinct_x)
print(len(NaiveBayesClassifier.vocabulary))

199596
50815
50815


In [51]:
NaiveBayesClassifier.purge(threshold=7)
NaiveBayesClassifier.tfidf()
print(len(NaiveBayesClassifier.x_dict))
print(NaiveBayesClassifier.distinct_x)
print(len(NaiveBayesClassifier.vocabulary))

199596
50815
50815


In [48]:
[a,b]=(NaiveBayesClassifier.getAccuracy(x_test,y_test,printConfusionMatrix=False))
print (b)
# print(NaiveBayesClassifier.ConfusionMatrix(a,y_test))

0.27312


In [49]:
NaiveBayesClassifier.ConfusionMatrix(y_test,a)

 1732   312   161    82    25    54    46   180 
 1121   500   414   284   122   123   101   218 
 1229   717   820   659   276   243   172   342 
  730   602   808   974   382   314   196   296 
   76    75   160   296   670   655   440   812 
   82    63   130   267   659  1085   875  1545 
   34    25    42    61   152   289   396   955 
   18     8     6    12    21    87   118   651 


In [101]:
# NaiveBayesClassifier.ConfusionMatrix(y_test,a)

In [251]:
NaiveBayesClassifier.y_dict

{1: 7, 2: 6, 3: 4, 4: 5, 7: 2, 8: 1, 9: 3, 10: 0}

In [97]:
x_train2=tokenizedInput('Dataset/Stem/imdb_train_text.txt')
y_train2=tokenizedInput('Dataset/imdb/imdb_train_labels.txt',tokenize=False)
y_train2=list(map(int,y_train2))
(x_train2,y_train2)=augmentation(x_train2,y_train2)
x_test2=tokenizedInput('Dataset/imdb/imdb_test_text.txt')
y_test2=tokenizedInput('Dataset/imdb/imdb_test_labels.txt',tokenize=False)
y_test2=list(map(int,y_test2))

In [98]:
NaiveBayesClassifier2=NaiveBayes(x=x_train2,y=y_train2)
NaiveBayesClassifier2.calculateParameters()
# print (sum(NaiveBayesClassifier2.wordsInClass))

In [99]:
[a2,b2]=(NaiveBayesClassifier2.getAccuracy(x_test2,y_test2,printConfusionMatrix=False))
print (b2)

0.24196


In [100]:
NaiveBayesClassifier.ConfusionMatrix(y_test2,a2)

 1035   306   210   150    41    38    24    71 
  375   185   162   149    55    60    51    99 
    1     1     0     1     0     0     0     0 
    0     0     0     0     0     0     0     0 
    0     0     0     0     0     0     0     0 
    0     0     0     0     0     0     0     0 
    0     0     0     0     0     0     0     0 
 3611  1810  2169  2335  2211  2752  2269  4829 


In [33]:
# x_train3=tokenizedInput_WithNGrams('Dataset/Stem/imdb_train_text.txt',grams=3)
# y_train3=tokenizedInput('Dataset/imdb/imdb_train_labels.txt',tokenize=False)
# y_train3=list(map(int,y_train3))
# x_test3=tokenizedInput_WithNGrams('Dataset/imdb/imdb_test_text.txt',grams=3)
# y_test3=tokenizedInput('Dataset/imdb/imdb_test_labels.txt',tokenize=False)
# y_test3=list(map(int,y_test3))

In [34]:
# print (len(NaiveBayesClassifier2.x_dict))

2151085


In [57]:
x_train1=tokenizedInput_WithNGrams('Dataset/Clean1/imdb_train_text.txt',grams=1)
y_train1=tokenizedInput('Dataset/imdb/imdb_train_labels.txt',tokenize=False)
y_train1=list(map(int,y_train1))
x_test1=tokenizedInput_WithNGrams('Dataset/Clean1/imdb_test_text.txt',grams=1)
y_test1=tokenizedInput('Dataset/imdb/imdb_test_labels.txt',tokenize=False)
y_test1=list(map(int,y_test1))

In [58]:
NaiveBayesClassifier1=NaiveBayes(x=x_train1,y=y_train1)
NaiveBayesClassifier1.calculateParameters()
print (len(NaiveBayesClassifier1.x_dict))

241067


In [59]:
[a1,b1]=(NaiveBayesClassifier1.getAccuracy(x_train1,y_train1,printConfusionMatrix=False))
print (b1)

0.68476


In [60]:
NaiveBayesClassifier1.ConfusionMatrix(y_test1,a1)

 3065  1374  1548  1581   242   291   231   528 
  383   172   178   162     6     6     1     3 
  564   265   281   315     7    10     9    20 
  737   381   416   430    25    24    36    52 
   54    26    21    36   335   378   304   728 
   69    34    35    45   432   527   436   913 
   10     3     5     2   183   240   218   412 
  140    47    57    64  1077  1374  1109  2343 


In [None]:
x_train_uni_bi=tokenizedInput_WithNGrams('Dataset/Stem/imdb_train_text.txt',grams=2)
y_train_uni_bi=tokenizedInput('Dataset/imdb/imdb_train_labels.txt',tokenize=False)
y_train_uni_bi=list(map(int,y_train_uni_bi))
x_test_uni_bi=tokenizedInput_WithNGrams('Dataset/Stem/imdb_test_text.txt',grams=2)
y_test_uni_bi=tokenizedInput('Dataset/imdb/imdb_test_labels.txt',tokenize=False)
y_test_uni_bi=list(map(int,y_test1))