In [None]:
##MI Notebook for Movie Reviews and Elections Dataset
import os
import numpy as np
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import StratifiedKFold
import preprocessor as p
from nltk import PorterStemmer 
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
import string
import pandas as pd
from stw import SupervisedTermWeightingWTransformer
from numpy import array

import sys
import ast
from collections import Counter
from os import listdir
import simplejson
import math
import numpy as np

In [None]:
###To find out frequency of documents that contain a particular term in the vocabulary###
def document_frequency(pos_documents,neg_documents,vocabulary_list):
    """Count, per vocabulary term, how many documents of each class contain it.

    Parameters
    ----------
    pos_documents, neg_documents : iterable of documents
        Each document is tested with ``word in document``. Token sequences
        (lists/tuples of hashable tokens) are converted to sets once so every
        lookup is O(1); ``str``/``bytes`` documents are left untouched so the
        substring semantics of ``in`` are preserved.
    vocabulary_list : iterable
        Terms to count, in the order the results should be reported.

    Returns
    -------
    (list of int, list of int)
        ``pos_documents_freq`` and ``neg_documents_freq``, both aligned with
        ``vocabulary_list``.
    """
    def _membership_view(document):
        # Strings/bytes: `in` means substring search, which a set would break.
        if isinstance(document, (str, bytes, bytearray)):
            return document
        try:
            return set(document)
        except TypeError:
            # Unhashable tokens: fall back to the original linear scan.
            return document

    # Build the lookup structures once instead of rescanning every document
    # list for every vocabulary term.
    pos_views = [_membership_view(document) for document in pos_documents]
    neg_views = [_membership_view(document) for document in neg_documents]

    pos_documents_freq = []
    neg_documents_freq = []
    for word in vocabulary_list:
        pos_documents_freq.append(sum(1 for view in pos_views if word in view))
        neg_documents_freq.append(sum(1 for view in neg_views if word in view))

    return pos_documents_freq,neg_documents_freq

###MI for Positive Corpus###
def MI_for_positive_corpus(pos_documents,neg_documents,pos_documents_freq,neg_documents_freq):
    """Score every term against the positive class.

    For term ``i`` the score is ``log2((df_pos[i] * D) / ((df_pos[i] + df_neg[i]) * D_pos))``
    where ``D_pos = len(pos_documents)`` and ``D = D_pos + len(neg_documents)``.
    Terms whose denominator or ratio is zero get the integer score ``0``.

    Returns a list aligned with ``pos_documents_freq``.
    """
    n_pos = len(pos_documents)
    n_total = n_pos + len(neg_documents)

    mi_scores = []
    for idx, pos_df in enumerate(pos_documents_freq):
        denominator = (pos_df + neg_documents_freq[idx]) * n_pos
        # A zero denominator is treated the same way as a zero ratio.
        ratio = float(pos_df * n_total) / denominator if denominator != 0 else 0
        if ratio == 0:
            mi_scores.append(0)
            continue
        mi_scores.append(math.log(ratio, 2))

    return mi_scores

###MI for Negative Corpus###
def MI_for_negative_corpus(pos_documents,neg_documents,pos_documents_freq,neg_documents_freq):
    """Score every term against the negative class.

    For term ``i`` the score is ``log2((df_neg[i] * D) / ((df_pos[i] + df_neg[i]) * D_neg))``
    where ``D_neg = len(neg_documents)`` and ``D = len(pos_documents) + D_neg``.
    Terms whose denominator or ratio is zero get the integer score ``0``.

    Returns a list aligned with ``neg_documents_freq``.
    """
    n_neg = len(neg_documents)
    n_total = len(pos_documents) + n_neg

    mi_scores = []
    for idx, neg_df in enumerate(neg_documents_freq):
        denominator = (pos_documents_freq[idx] + neg_df) * n_neg
        # A zero denominator is treated the same way as a zero ratio.
        ratio = float(neg_df * n_total) / denominator if denominator != 0 else 0
        if ratio == 0:
            mi_scores.append(0)
            continue
        mi_scores.append(math.log(ratio, 2))

    return mi_scores

###Calculating MI###
def MI(MI_pos,MI_neg):
    """Combine the per-class scores by keeping, for each term, the larger of the two.

    The result has one entry per element of ``MI_pos``; ``MI_neg`` is indexed
    at the same positions.
    """
    return [max(pos_score, MI_neg[idx]) for idx, pos_score in enumerate(MI_pos)]

def MI_mapper(MI_result,vocabulary_list):
    """Return a dict mapping each vocabulary term to the score at the same index.

    If a term occurs more than once in ``vocabulary_list`` the score of its
    last occurrence is kept.
    """
    return {term: MI_result[idx] for idx, term in enumerate(vocabulary_list)}

In [None]:
#Elections Vocabulary
def make_Corpus(root_dir,polarity_dirs):
    """Read every file of the given directories into a list of document strings.

    Parameters
    ----------
    root_dir : str
        Unused; kept so existing calls keep working.
    polarity_dirs : iterable of str
        Directories to read, processed in the given order.

    Returns
    -------
    list of str
        One string per regular file. Each line of the file (including its
        newline) is followed by a single space, and the pieces are concatenated.
    """
    corpus = []
    for polarity_dir in polarity_dirs:
        # os.listdir returns names in arbitrary order; sorting keeps the
        # document order (and therefore unshuffled CV folds) reproducible.
        for file_name in sorted(os.listdir(polarity_dir)):
            review = os.path.join(polarity_dir, file_name)
            # Skip sub-directories (e.g. .ipynb_checkpoints) that open() cannot read.
            if not os.path.isfile(review):
                continue
            with open(review) as rev:
                corpus.append("".join(line + " " for line in rev))
    return corpus

# Load the raw Elections documents: one string per file, positives then negatives.
root_dir = 'Elections/pos/'
pos_corpus = make_Corpus(root_dir,['Elections/pos/'])
print("Positive Corpus Successful")

root_dir = 'Elections/neg/'
neg_corpus = make_Corpus(root_dir,['Elections/neg/'])
print("Negative Corpus Successful")

# Tokenize on any whitespace. split(" ") left the newline glued to the last
# word of every line ("word\n") and produced empty-string tokens, both of which
# ended up in the vocabulary; split() with no separator avoids that.
corpus = [document.split() for document in pos_corpus + neg_corpus]
pos_corpus = [document.split() for document in pos_corpus]
neg_corpus = [document.split() for document in neg_corpus]


def create_vocabulary(corpus):
    """Return the terms that occur at least 5 times across the whole corpus.

    ``corpus`` is a sequence of token collections; occurrences are counted
    over all documents together. Prints a confirmation once the list is built.
    """
    term_counts = Counter()
    for tokens in corpus:
        term_counts.update(tokens)

    frequent_terms = [term for term, count in term_counts.items() if count >= 5]
    print("Vocabulary Generated")
    return frequent_terms

# Vocabulary of the tokenized Elections corpus.
election_vocabulary_list = create_vocabulary(corpus)

# Election Corpus Results
# Per-class document frequencies -> per-class scores -> per-term maximum -> lookup dict.
pos_documents_freq, neg_documents_freq = document_frequency(
    pos_documents=pos_corpus,
    neg_documents=neg_corpus,
    vocabulary_list=election_vocabulary_list,
)
MI_pos = MI_for_positive_corpus(
    pos_documents=pos_corpus,
    neg_documents=neg_corpus,
    pos_documents_freq=pos_documents_freq,
    neg_documents_freq=neg_documents_freq,
)
MI_neg = MI_for_negative_corpus(
    pos_documents=pos_corpus,
    neg_documents=neg_corpus,
    pos_documents_freq=pos_documents_freq,
    neg_documents_freq=neg_documents_freq,
)
MI_result = MI(MI_pos=MI_pos, MI_neg=MI_neg)
d = MI_mapper(MI_result=MI_result, vocabulary_list=election_vocabulary_list)

# 10-fold stratified cross-validation of LinearSVC, MultinomialNB and
# LogisticRegression on term-presence features weighted by the scores in `d`.
# NOTE(review): `d` was computed above from the full corpus, so the term weights
# have already seen the labels of every test fold -- consider recomputing per fold.
from sklearn.metrics import f1_score

# Labels follow the layout of `corpus` (positive documents first, then negative).
# Sizes are taken from the corpora instead of being hard-coded.
labels = np.concatenate([np.ones(len(pos_corpus)), np.zeros(len(neg_corpus))])
n_documents = len(labels)

kf = StratifiedKFold(n_splits=10)

totalsvm = 0           # Correct predictions summed over all folds
totalNB = 0
totalLR = 0
totalMatSvm = np.zeros((2,2))  # Confusion matrices summed over all folds
totalMatNB = np.zeros((2,2))
totalMatLR = np.zeros((2,2))

# Out-of-fold truth and predictions, kept so F1 covers every document rather
# than only the last fold.
all_y_test, all_result1, all_result2, all_result3 = [], [], [], []

# One token set per document: O(1) membership while building feature rows.
corpus_token_sets = [set(tokens) for tokens in corpus]

for train_index, test_index in kf.split(corpus,labels):
    X_train = [corpus[i] for i in train_index]
    X_test = [corpus[i] for i in test_index]
    y_train, y_test = labels[train_index], labels[test_index]

    # Feature row: the term's score if the document contains it, else 0.0.
    MI_train = [
        [d[term] if term in corpus_token_sets[i] else 0.0 for term in election_vocabulary_list]
        for i in train_index
    ]
    print("MI Train Done")

    MI_test = [
        [d[term] if term in corpus_token_sets[i] else 0.0 for term in election_vocabulary_list]
        for i in test_index
    ]
    print("MI Test Done")

    model1 = LinearSVC()
    model2 = MultinomialNB()
    model3 = LogisticRegression()
    model1.fit(MI_train,y_train)
    model2.fit(MI_train,y_train)
    model3.fit(MI_train,y_train)
    result1 = model1.predict(MI_test)
    result2 = model2.predict(MI_test)
    result3 = model3.predict(MI_test)

    # labels=[0, 1] pins the matrix to 2x2 even if a fold lacks one class.
    totalMatSvm = totalMatSvm + confusion_matrix(y_test, result1, labels=[0, 1])
    totalMatNB = totalMatNB + confusion_matrix(y_test, result2, labels=[0, 1])
    totalMatLR = totalMatLR + confusion_matrix(y_test, result3, labels=[0, 1])
    totalsvm = totalsvm + int(np.sum(y_test == result1))
    totalNB = totalNB + int(np.sum(y_test == result2))
    totalLR = totalLR + int(np.sum(y_test == result3))

    all_y_test.append(y_test)
    all_result1.append(result1)
    all_result2.append(result2)
    all_result3.append(result3)

print("########Results########")
print("SVM: ",totalMatSvm, totalsvm/float(n_documents))
print("NB: ",totalMatNB, totalNB/float(n_documents))
print("LR: ",totalMatLR, totalLR/float(n_documents))
print()
print()
# F1 over the pooled out-of-fold predictions of all 10 folds.
y_true_all = np.concatenate(all_y_test)
print("SVM",f1_score(y_true_all, np.concatenate(all_result1), average='binary'))
print("NB",f1_score(y_true_all, np.concatenate(all_result2), average='binary'))
print("LR",f1_score(y_true_all, np.concatenate(all_result3), average='binary'))

In [None]:
#Movie
def preprocessing(line):
    """Normalize one line of text and return it as a space-joined token string.

    Steps, in order: ``p.clean`` on the raw line, lower-casing, whitespace
    tokenization, lemmatization of each token, removal of all
    ``string.punctuation`` characters, then dropping tokens that are not purely
    alphabetic or are shorter than 3 characters.
    """
    tokens = p.clean(line).lower().split()

    # Lemmatize first, then strip punctuation from the lemma.
    punctuation_table = str.maketrans("", "", string.punctuation)
    tokens = [lemmatizer.lemmatize(token).translate(punctuation_table) for token in tokens]

    kept = [token for token in tokens if token.isalpha() and len(token) > 2]
    return " ".join(kept)

def make_Corpus(root_dir,polarity_dirs):
    """Read and preprocess every file of the given directories.

    Parameters
    ----------
    root_dir : str
        Unused; kept so existing calls keep working.
    polarity_dirs : iterable of str
        Directories to read, processed in the given order.

    Returns
    -------
    list of str
        One string per regular file: every line is passed through
        ``preprocessing`` and followed by a single space, and the pieces are
        concatenated.
    """
    corpus = []
    for polarity_dir in polarity_dirs:
        # os.listdir returns names in arbitrary order; sorting keeps the
        # document order (and therefore unshuffled CV folds) reproducible.
        for file_name in sorted(os.listdir(polarity_dir)):
            review = os.path.join(polarity_dir, file_name)
            # Skip sub-directories (e.g. .ipynb_checkpoints) that open() cannot read.
            if not os.path.isfile(review):
                continue
            with open(review) as rev:
                corpus.append("".join(preprocessing(line) + " " for line in rev))
    return corpus

# Load the preprocessed Movie reviews: one string per file, positives then negatives.
root_dir = 'txt_sentoken/pos/'
pos_corpus = make_Corpus(root_dir,['txt_sentoken/pos/'])
print("Successful Positive Corpus")

root_dir = 'txt_sentoken/neg/'
neg_corpus = make_Corpus(root_dir,['txt_sentoken/neg/'])
print("Successful Negative Corpus")

# Tokenize on any whitespace. split(" ") produced empty-string tokens from the
# trailing space after every line (and from blank lines); split() with no
# separator drops them.
corpus = [document.split() for document in pos_corpus + neg_corpus]
pos_corpus = [document.split() for document in pos_corpus]
neg_corpus = [document.split() for document in neg_corpus]


def create_vocabulary(corpus):
    """Collect the corpus-wide vocabulary, keeping terms seen 5 or more times.

    Token occurrences are tallied over every document in ``corpus``; the
    surviving terms are returned as a list and a confirmation is printed.
    """
    tallies = Counter()
    for document_tokens in corpus:
        tallies.update(document_tokens)

    vocabulary_terms = []
    for term, occurrences in tallies.items():
        if occurrences >= 5:
            vocabulary_terms.append(term)

    print("Vocabulary Generated")
    return vocabulary_terms

# Vocabulary of the tokenized Movie corpus.
vocabulary_list = create_vocabulary(corpus)

#Movie Corpus Results
# Per-class document frequencies -> per-class scores -> per-term maximum -> lookup dict.
pos_documents_freq, neg_documents_freq = document_frequency(
    pos_documents=pos_corpus,
    neg_documents=neg_corpus,
    vocabulary_list=vocabulary_list,
)
MI_pos = MI_for_positive_corpus(
    pos_documents=pos_corpus,
    neg_documents=neg_corpus,
    pos_documents_freq=pos_documents_freq,
    neg_documents_freq=neg_documents_freq,
)
MI_neg = MI_for_negative_corpus(
    pos_documents=pos_corpus,
    neg_documents=neg_corpus,
    pos_documents_freq=pos_documents_freq,
    neg_documents_freq=neg_documents_freq,
)
MI_result = MI(MI_pos=MI_pos, MI_neg=MI_neg)
d = MI_mapper(MI_result=MI_result, vocabulary_list=vocabulary_list)

#Movie Corpus Results
# 10-fold stratified cross-validation of LinearSVC, MultinomialNB and
# LogisticRegression on term-presence features weighted by the scores in `d`.
# NOTE(review): `d` was computed above from the full corpus, so the term weights
# have already seen the labels of every test fold -- consider recomputing per fold.
from sklearn.metrics import f1_score

# Labels follow the layout of `corpus` (positive documents first, then negative).
# Sizes are taken from the corpora instead of being hard-coded.
labels = np.concatenate([np.ones(len(pos_corpus)), np.zeros(len(neg_corpus))])
n_documents = len(labels)

kf = StratifiedKFold(n_splits=10)

totalsvm = 0           # Correct predictions summed over all folds
totalNB = 0
totalLR = 0
totalMatSvm = np.zeros((2,2))  # Confusion matrices summed over all folds
totalMatNB = np.zeros((2,2))
totalMatLR = np.zeros((2,2))

# Out-of-fold truth and predictions, kept so F1 covers every document rather
# than only the last fold.
all_y_test, all_result1, all_result2, all_result3 = [], [], [], []

# One token set per document: O(1) membership while building feature rows.
corpus_token_sets = [set(tokens) for tokens in corpus]

for train_index, test_index in kf.split(corpus,labels):
    X_train = [corpus[i] for i in train_index]
    X_test = [corpus[i] for i in test_index]
    y_train, y_test = labels[train_index], labels[test_index]

    # Feature row: the term's score if the document contains it, else 0.0.
    MI_train = [
        [d[term] if term in corpus_token_sets[i] else 0.0 for term in vocabulary_list]
        for i in train_index
    ]
    print("MI Training done")

    MI_test = [
        [d[term] if term in corpus_token_sets[i] else 0.0 for term in vocabulary_list]
        for i in test_index
    ]
    print("MI Testing done")

    model1 = LinearSVC()
    model2 = MultinomialNB()
    model3 = LogisticRegression()
    model1.fit(MI_train,y_train)
    model2.fit(MI_train,y_train)
    model3.fit(MI_train,y_train)
    result1 = model1.predict(MI_test)
    result2 = model2.predict(MI_test)
    result3 = model3.predict(MI_test)

    # labels=[0, 1] pins the matrix to 2x2 even if a fold lacks one class.
    totalMatSvm = totalMatSvm + confusion_matrix(y_test, result1, labels=[0, 1])
    totalMatNB = totalMatNB + confusion_matrix(y_test, result2, labels=[0, 1])
    totalMatLR = totalMatLR + confusion_matrix(y_test, result3, labels=[0, 1])
    totalsvm = totalsvm + int(np.sum(y_test == result1))
    totalNB = totalNB + int(np.sum(y_test == result2))
    totalLR = totalLR + int(np.sum(y_test == result3))

    all_y_test.append(y_test)
    all_result1.append(result1)
    all_result2.append(result2)
    all_result3.append(result3)


print("########Results########")
print("SVM: ",totalMatSvm, totalsvm/float(n_documents))
print("NB: ",totalMatNB, totalNB/float(n_documents))
print("LR: ",totalMatLR, totalLR/float(n_documents))
print()
print()
# F1 over the pooled out-of-fold predictions of all 10 folds.
y_true_all = np.concatenate(all_y_test)
print("SVM",f1_score(y_true_all, np.concatenate(all_result1), average='binary'))
print("NB",f1_score(y_true_all, np.concatenate(all_result2), average='binary'))
print("LR",f1_score(y_true_all, np.concatenate(all_result3), average='binary'))