In [69]:
import numpy as np
from math import log,factorial
import re
from sklearn.cross_validation import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score,recall_score,f1_score
from itertools import combinations
from collections import Counter
import matplotlib.pyplot as plt
%matplotlib inline
import io

In [6]:
filename = "imdb_labelled.txt"
# Read the file inside a context manager so the handle is closed as soon as
# the lines are in memory, instead of being left open until garbage collection.
with io.open(filename, encoding='utf8') as labelled_file:
    r = labelled_file.readlines()
# Each line is tab-separated: the first field is the review text and the last
# field is the integer class label.
review = []
Y = []
for i in r:
    # A blank line (e.g. a trailing newline at end of file) has no label and
    # would make int() raise, so it is skipped.
    if not i.strip():
        continue
    x = i.split('\t')
    review.append(x[0])
    Y.append([int(x[-1])])
# Labels are kept as a column vector: one single-element row per review.
Y = np.array(Y)

In [32]:
def tokenize(text):
    """Split ``text`` into lower-cased word tokens.

    The text is lower-cased first, then every maximal run of ``\\w``
    characters is returned, in order of appearance, as a list of strings.
    """
    word_pattern = re.compile(r'\w+')
    return word_pattern.findall(text.lower())

In [31]:
def refine_words(reviews, max_count=25):
    """Return the tokens that occur fewer than ``max_count`` times overall.

    Parameters
    ----------
    reviews : iterable of iterables of tokens
        One token sequence per document.
    max_count : int, optional
        Exclusive upper bound on a token's total number of occurrences
        across all documents. Defaults to 25.

    Returns
    -------
    list
        The retained tokens, in the iteration order of the internal
        frequency table.
    """
    # Count over the argument, not over a module-level variable, so the
    # function works for any token lists and does not depend on kernel state.
    frequencies = Counter()
    for tokens in reviews:
        frequencies.update(tokens)
    return [word for word in frequencies if frequencies[word] < max_count]

In [36]:
def create_data(fw,tl):
    """Build a document-by-word count matrix.

    Parameters
    ----------
    fw : list
        Vocabulary; the position of a word in this list is its column.
    tl : iterable of token lists
        One token list per document.

    Returns
    -------
    numpy.ndarray
        One row per document; entry ``[d, k]`` is the number of times
        ``fw[k]`` occurs in document ``d``. Tokens that are not in ``fw``
        are ignored.
    """
    # Resolve word -> column once, instead of scanning ``fw`` with ``in`` and
    # ``.index`` for every distinct token of every document. ``setdefault``
    # keeps the first position of a repeated word, as ``list.index`` does.
    column_of = {}
    for position, word in enumerate(fw):
        column_of.setdefault(word, position)

    data = []
    for tokens in tl:
        row = [0] * len(fw)
        for word, occurrences in Counter(tokens).items():
            column = column_of.get(word)
            if column is not None:
                row[column] = occurrences
        data.append(row)
    return np.array(data)


In [3]:
def binarizer(data):
    """Replace every non-zero entry of a 2-D container with ``1.0``, in place.

    ``data`` is modified directly (no copy is made) and the same object is
    returned. Zero entries are left untouched.
    """
    for row in data:
        for column in range(len(row)):
            if row[column] != 0:
                row[column] = 1.0
    return data

In [39]:
def find_alpha(class_labels):
    """Compute the classes, their counts and their prior probabilities.

    Parameters
    ----------
    class_labels : sequence of sequences
        One entry per sample; the label is read from position 0 of each
        entry.

    Returns
    -------
    classes1 : list
        The distinct labels, in the iteration order of ``class_count1``.
    class_count1 : dict
        Label -> number of samples carrying that label.
    alpha1 : dict
        Label -> fraction of all samples carrying that label.
    """
    class_count1 = {}
    for row in class_labels:
        label = row[0]
        # Membership is tested on the dict itself rather than on ``keys()``.
        class_count1[label] = class_count1.get(label, 0) + 1

    # Return a real list: callers index into it and call ``.index`` on it,
    # which a dict key view does not support.
    classes1 = list(class_count1)

    total = len(class_labels)
    alpha1 = {}
    for label in class_count1:
        alpha1[label] = class_count1[label] * 1.0 / total
    return classes1,class_count1,alpha1

In [40]:
def segregate_data(data,labels,clabels):
    """Group the rows of ``data`` by class label.

    For every label in ``clabels``, the rows ``data[k]`` whose label
    ``labels[k][0]`` equals it are collected. Returns a dict mapping each
    label to a numpy array of its rows.
    """
    grouped = {}
    for position in range(len(clabels)):
        label = clabels[position]
        bucket = grouped.setdefault(label, [])
        bucket.extend(
            data[k] for k in range(len(labels)) if labels[k][0] == label
        )
    # Convert the collected row lists to arrays without changing the keys.
    for label in list(grouped):
        grouped[label] = np.array(grouped[label])
    return grouped

In [44]:
def find_mean_multi(data):
    """Compute per-class column sums and column means.

    ``data`` maps each class label to a 2-D array of rows. Returns a pair
    ``(sums, means)`` of dicts keyed by the same labels, holding the result
    of summing, respectively averaging, along axis 0.
    """
    column_sums = {}
    column_means = {}
    for label, rows in data.items():
        column_means[label] = np.mean(rows, axis=0)
        column_sums[label] = np.sum(rows, axis=0)
    return column_sums, column_means

In [48]:
def smoothing(msum,sdata,sv):
    """Turn per-class feature sums into smoothed per-feature probabilities.

    Parameters
    ----------
    msum : dict
        Class label -> 1-D sequence of per-feature sums.
    sdata : dict
        Class label -> array of that class's rows; only its grand total is
        used.
    sv : number
        Smoothing value added to every numerator; ``2 * sv`` is added to the
        denominator.

    Returns
    -------
    dict
        Class label -> numpy array with
        ``(msum[label][j] + sv) / (sum(sdata[label]) + 2 * sv)`` per feature.
    """
    smean = {}
    for label in msum:
        # The denominator is the same for every feature of a class, so it is
        # computed once here rather than once per feature.
        denominator = float(np.sum(sdata[label]) + 2 * sv)
        smean[label] = (np.asarray(msum[label]) + sv) / denominator
    return smean

In [8]:
def find_covariance_multi(data_split,split_mean,n):
    """Compute one covariance-style matrix per class.

    For each label in ``data_split`` the class mean ``split_mean[label]`` is
    subtracted from every row, and the product of the transposed deviations
    with the deviations is divided by ``n[label]``. Returns a dict mapping
    label to that matrix.
    """
    deviations_by_class = {}
    for label in data_split:
        deviations_by_class[label] = [
            row - split_mean[label] for row in data_split[label]
        ]

    covar = {}
    for label, rows in deviations_by_class.items():
        deviations = np.array(rows)
        covar[label] = np.dot(deviations.transpose(), deviations) / float(n[label])
    return covar

In [53]:
def training(x,y,sv):
    """Fit the classifier: return class priors and smoothed feature parameters.

    The labels ``y`` give the classes and their priors, ``x`` is split by
    class, and the per-class feature sums are smoothed with ``sv``.
    Returns ``(priors, smoothed)``; both are dicts keyed by class label.
    """
    class_labels, _counts, priors = find_alpha(y)
    rows_by_class = segregate_data(x, y, class_labels)
    feature_sums, _feature_means = find_mean_multi(rows_by_class)
    return priors, smoothing(feature_sums, rows_by_class, sv)

In [71]:
def nCr(n,r):
    """Return the binomial coefficient "n choose r" as an exact integer.

    Floor division keeps the computation in integer arithmetic. Every
    intermediate quotient is exact, so no rounding occurs. True division
    would yield a float and overflow once the factorials exceed float range.
    """
    return factorial(n) // factorial(r) // factorial(n - r)

In [82]:
def membership_naive(data,mean,prior):
    """Score one sample against every class.

    Parameters
    ----------
    data : sequence of counts
        Feature counts of the sample; their total is used as the number of
        trials.
    mean : dict
        Class label -> per-feature probabilities.
    prior : dict
        Class label -> prior probability of the class.

    Returns
    -------
    list
        One log-score per class, in the iteration order of ``mean``. Each
        score is the sum over features of the binomial log-likelihood of the
        feature count, plus the log of the class prior.
    """
    mem = []
    P = np.sum(data)
    # The log binomial coefficient depends only on the sample, not on the
    # class, so it is computed once per feature instead of once per class.
    log_comb = [log(nCr(P, count)) for count in data]
    for label in mean:
        probabilities = mean[label]
        score = 0
        for j in range(len(probabilities)):
            a = data[j] * log(probabilities[j])
            b = (P - data[j]) * log(1 - probabilities[j])
            score = score + a + b + log_comb[j]
        # The likelihood is accumulated in log space, so the prior must be
        # added as a log as well.
        mem.append(score + log(prior[label]))
    return mem


In [83]:
def determinist(data):
    """Return the position of the largest entry of ``data``.

    When the maximum occurs more than once, the first position is returned.
    """
    return max(range(len(data)), key=lambda position: data[position])

In [84]:
def find_class(data,clabels):
    """Return the class label stored at position ``data`` of ``clabels``."""
    label = clabels[data]
    return label

In [85]:
def prediction(data,mean,prior,clabels):
    """Predict a class label for every sample in ``data``.

    Each sample is scored against all classes, the position of the best
    score is taken, and that position is looked up in ``clabels``. Returns
    the list of predicted labels, one per sample.

    NOTE(review): the position comes from scores ordered by iteration over
    ``mean``, so this assumes ``clabels`` lists the classes in that same
    order -- confirm at the call sites.
    """
    predict = []
    for sample in data:
        scores = membership_naive(sample, mean, prior)
        predict.append(find_class(determinist(scores), clabels))
    return predict

In [86]:
def mean_squrae_error(pred,y):
    """Return the mean of the squared differences between ``pred`` and ``y``.

    Values are paired position by position, and the sum of squared
    differences is divided by the number of predictions.
    """
    squared_total = 0
    for guess, truth in zip(pred, y):
        squared_total = squared_total + (guess - truth) ** 2
    return squared_total / float(len(pred))

In [87]:
def find_confusion_matrix(clabels,actual,predicted):
    """Build the confusion matrix of ``predicted`` against ``actual``.

    Rows follow the order of ``clabels`` and stand for the true class
    (``actual[k][0]``); columns are the position of the predicted label in
    ``clabels``. Returns the counts as a numpy array.
    """
    rows = []
    for label in clabels:
        counts = [0] * len(clabels)
        for k in range(len(actual)):
            if actual[k][0] != label:
                continue
            # A hit and a miss both land in the predicted label's column;
            # for a hit that column is the row's own diagonal position.
            counts[clabels.index(predicted[k])] += 1
        rows.append(counts)
    return np.array(rows)

In [88]:
def find_accuracy(matrix):
    """Return the sum of the diagonal of ``matrix`` divided by the sum of all its entries."""
    diagonal_total = np.trace(matrix)
    grand_total = np.sum(matrix)
    return diagonal_total * 1.0 / grand_total

In [89]:
def find_precision(matrix):
    """Return, per class, the diagonal entry divided by its column total.

    The result is a list with one value per row of ``matrix``.
    """
    column_totals = np.sum(matrix, axis=0)
    return [
        matrix[k][k] * 1.0 / column_totals[k]
        for k in range(len(matrix))
    ]

In [90]:
def find_recall(matrix):
    """Return, per class, the diagonal entry divided by its row total.

    The result is a list with one value per row of ``matrix``.
    """
    row_totals = np.sum(matrix, axis=1)
    return [
        matrix[k][k] * 1.0 / row_totals[k]
        for k in range(len(matrix))
    ]

In [91]:
def find_fmeasure(prec,rec):
    """Return the F-measure for each (precision, recall) pair.

    Parameters
    ----------
    prec, rec : sequences of numbers
        Per-class precision and recall, paired position by position.

    Returns
    -------
    list
        ``2 * p * r / (p + r)`` per pair, or ``0.0`` where ``p + r`` is
        zero.
    """
    tmp = []
    for p, r in zip(prec, rec):
        denominator = p + r
        if denominator == 0:
            # Both precision and recall are zero: report 0.0 instead of
            # dividing by zero.
            tmp.append(0.0)
        else:
            tmp.append(2.0 * (p * r) / denominator)
    return tmp

In [92]:
def roc(clabels,acutal,predicted):
    """Return per-class ``(precision, recall)`` for a set of predictions.

    Both values are derived from the confusion matrix of ``predicted``
    against ``acutal`` over the classes in ``clabels``.
    """
    matrix = find_confusion_matrix(clabels, acutal, predicted)
    per_class_precision = find_precision(matrix)
    per_class_recall = find_recall(matrix)
    return per_class_precision, per_class_recall

In [98]:
def cross_validation(data, labels,sv,clabels, n_folds=10,MSE = False):
    cv = KFold(len(labels), n_folds,shuffle= True)
    accuracies = []
    precision_list = []
    recall_list = []
    training_MSE_list =[]
    testing_MSE_list = []
    i = 0
    for train_ind, test_ind in cv: 
        train_alpha,train_mean = training(data[train_ind], labels[train_ind],sv)
        training_MSE = mean_squrae_error(prediction(data[train_ind],train_mean,train_alpha,clabels), labels[train_ind])
        training_MSE_list.append(training_MSE)
        predict = prediction(data[test_ind],train_mean,train_alpha,clabels)
        p,r = roc(clabels,labels[test_ind],predict)
        precision_list.append(p)
        recall_list.append(r)
        testing_MSE_list.append(mean_squrae_error(predict,labels[test_ind]))
        accuracies.append(accuracy_score(labels[test_ind], predict))
        
    if MSE == True:
        for i in range(len(testing_MSE_list)):
            print 'Fold',i,'Testing Error',testing_MSE_list[i]
        print "Average Mean Square Error"
        print "Training Error \t Testing Error"
        print np.mean(training_MSE_list),"\t",np.mean(testing_MSE_list)
    else:
        for i in range(len(accuracies)):
            print 'Fold',i,'Accuracy',accuracies[i]
        print "Average Accuracy ", np.mean(accuracies)
    return precision_list,recall_list

In [94]:
def evaluation(clabels,acutal,predicted):
    confmatrix = find_confusion_matrix(clabels,acutal,predicted)
    print "Confusion Matrix"
    print confmatrix
    accuracy = find_accuracy(confmatrix)
    print "Accuracy", accuracy
    precision = find_precision(confmatrix)
    print "Precision", precision
    recall = find_recall(confmatrix)
    print "Recall", recall
    f_score =find_fmeasure(precision,recall)
    print "F_score", f_score

In [77]:
# Tokenise every review, derive the vocabulary from the tokens, then build
# the review-by-word count matrix X.
token_list = [tokenize(sentence) for sentence in review]
final_words = refine_words(token_list)
X = create_data(final_words, token_list)

In [78]:
# Derive the class list, per-class sample counts and prior probabilities
# from the label column Y, and print them.
classes,class_count,alpha = find_alpha(Y)
print "The Classses are", classes
print "The Classes Count ", class_count
print "The prior probabiliy", alpha
# Group the rows of X by their class label.
multi_X_split =segregate_data(X,Y,classes)

The Classses are [0, 1]
The Classes Count  {0: 500, 1: 500}
The prior probabiliy {0: 0.5, 1: 0.5}


In [43]:
# Sum of all feature counts over the rows whose label is 1.
np.sum(multi_X_split[1])

3859

In [79]:
# Per-class column sums and means of the count matrix.
multi_sum,multi_mean = find_mean_multi(multi_X_split)
# Smoothing value passed to smoothing() here and reused by the later
# training and cross-validation cells.
smoothing_value = 1
smoothed_mean = smoothing(multi_sum,multi_X_split,smoothing_value)
print "Smoothed Mean for mulivariate features"
# For each class, print the label and the total of its smoothed values.
for i in smoothed_mean:
    print i,np.sum(smoothed_mean[i])

Smoothed Mean for mulivariate features
0 1.90925644917
1 1.77596477596


In [80]:
# Train on the full data set and predict on that same data, so these
# predictions are made on samples the model was fitted to.
a,m = training(X,Y,smoothing_value)
predictions = prediction(X,m,a,classes)
print "Predicted Value \t True Value "
# Show predictions next to the true labels for samples 10 through 20.
for i in range(10,21):
    print predictions[i],"\t\t\t",Y[i]

Predicted Value 	 True Value 
1 			[1]
1 			[1]
1 			[1]
1 			[1]
1 			[1]
0 			[0]
1 			[1]
1 			[1]
1 			[1]
1 			[1]
1 			[1]


In [99]:
# Cross-validate with the default number of folds and print per-fold
# accuracy. The returned (precision_list, recall_list) pair is not assigned,
# so it is echoed as the cell's output.
cross_validation(X, Y,smoothing_value,classes)

Fold 0 Accuracy 0.76
Fold 1 Accuracy 0.83
Fold 2 Accuracy 0.77
Fold 3 Accuracy 0.85
Fold 4 Accuracy 0.76
Fold 5 Accuracy 0.78
Fold 6 Accuracy 0.77
Fold 7 Accuracy 0.79
Fold 8 Accuracy 0.77
Fold 9 Accuracy 0.75
Average Accuracy  0.783


([[0.82499999999999996, 0.71666666666666667],
  [0.89130434782608692, 0.77777777777777779],
  [0.80000000000000004, 0.75],
  [0.9285714285714286, 0.7931034482758621],
  [0.84090909090909094, 0.6964285714285714],
  [0.68421052631578949, 0.90697674418604646],
  [0.83720930232558144, 0.7192982456140351],
  [0.80434782608695654, 0.77777777777777779],
  [0.80392156862745101, 0.73469387755102045],
  [0.72916666666666663, 0.76923076923076927]],
 [[0.66000000000000003, 0.85999999999999999],
  [0.77358490566037741, 0.8936170212765957],
  [0.68085106382978722, 0.84905660377358494],
  [0.76470588235294112, 0.93877551020408168],
  [0.68518518518518523, 0.84782608695652173],
  [0.90697674418604646, 0.68421052631578949],
  [0.69230769230769229, 0.85416666666666663],
  [0.75510204081632648, 0.82352941176470584],
  [0.7592592592592593, 0.78260869565217395],
  [0.74468085106382975, 0.75471698113207553]])

In [96]:
# Same cross-validation, reporting mean squared error instead of accuracy.
# Folds are shuffled again on this call.
cross_validation(X, Y, smoothing_value,classes,MSE = True)

Fold 0 Testing Error [ 0.22]
Fold 1 Testing Error [ 0.17]
Fold 2 Testing Error [ 0.16]
Fold 3 Testing Error [ 0.21]
Fold 4 Testing Error [ 0.17]
Fold 5 Testing Error [ 0.26]
Fold 6 Testing Error [ 0.25]
Fold 7 Testing Error [ 0.22]
Fold 8 Testing Error [ 0.26]
Fold 9 Testing Error [ 0.18]
Average Mean Square Error
Training Error 	 Testing Error
0.0436666666667 	0.21


In [97]:
# Print the confusion matrix and derived metrics for `predictions`, which
# were made on the same data the model was trained on.
evaluation(classes,Y,predictions)

Confusion Matrix
[[473  27]
 [ 16 484]]
Accuracy 0.957
Precision [0.96728016359918201, 0.94716242661448136]
Recall [0.94599999999999995, 0.96799999999999997]
F_score [0.95652173913043481, 0.95746785361028686]
