In [4]:
import pandas as pd
import sys
df_labels = pd.read_csv("tweet-train-labels.csv", delimiter=',', header=None)
df_features = pd.read_csv("tweet-train-features.csv", delimiter=',', header=None)

In [5]:
df_features_test = pd.read_csv("tweet-test-features.csv", delimiter=',', header=None)
df_labels_test = pd.read_csv("tweet-test-labels.csv", delimiter=',', header=None)

In [6]:
vcs = df_labels[0].value_counts()

In [7]:
print("Ratio of classes\n------------------")
for vck in vcs.keys():
    print(vck, "\t", vcs[vck]/sum(vcs))

Ratio of classes
------------------
negative 	 0.6054474043715847
neutral 	 0.2234460382513661
positive 	 0.1711065573770492


In [8]:
def make_labels_num(label_list):
    result_list = []
    for index, row in label_list.iterrows():
        label = row[0]
        if label == "positive":
            result_list.append(1)
        elif label == "neutral":
            result_list.append(0)
        else:
            result_list.append(-1)
    return pd.DataFrame(result_list)

In [9]:
# Calculate accuracy percentage
def accuracy_metric(actual, predicted):
    correct = 0
    assert(len(actual) == len(predicted))
    for i in range(len(actual)):
        if actual.iloc[i] == predicted[i]:
            correct += 1
    return correct / float(len(actual)) * 100.0

In [241]:
def safe_ln(x, minval=0.0000000001):
    result = np.log(x.clip(min=minval))
    return result

import numpy as np
class MultinomialNB(object):
    def __init__(self, alpha=0):
        self.alpha = alpha
        
    def fit(self, X, y):
        # X => features
        # y => labels
        count_vocabs = X.shape[1]
        count_sample = X.shape[0]
        
        # count_vocabs = vocab length
        result_list_dict = {}
        zipped = pd.concat([X, y], axis=1)
        neg_df = zipped[zipped.iloc[:, -1] == -1].iloc[:,0:-1]
        neut_df = zipped[zipped.iloc[:, -1] == 0].iloc[:,0:-1]
        pos_df = zipped[zipped.iloc[:, -1] == 1].iloc[:,0:-1]
                
        result_neg = []
        for i in range(count_vocabs):
            #print(i, "/", count_vocabs)
            neg_df_trimmed = neg_df.iloc[:, i]
            result_neg.append(int(neg_df_trimmed.sum(axis=0)))

        result_neg_sum = sum(result_neg)
                    
        result_neut = []
        for i in range(count_vocabs):
            #print(i, "/", count_vocabs)
            neut_df_trimmed = neut_df.iloc[:, i]
            result_neut.append(int(neut_df_trimmed.sum(axis=0)))

        result_neut_sum = sum(result_neut)
         
        result_pos = []
        for i in range(count_vocabs):
            pos_df_trimmed = pos_df.iloc[:, i]
            result_pos.append(int(pos_df_trimmed.sum(axis=0)))
        
        result_pos_sum = sum(result_pos)
        
        separated = [neg_df, neut_df, pos_df]
        self.class_log_prior_ = [np.log(len(i) / count_sample) for i in separated]

        results = np.array([result_neg, result_neut, result_pos]) + self.alpha
        m = results / ( results.sum(axis=1)[np.newaxis].T + count_vocabs * self.alpha ) 

        self.feature_log_prob_ = safe_ln(m)
        return self
    
    def predict_log_proba(self, X):
        #t0 = time()
        res = []
        ctr = 1
        print("Calculating...\n")
        for index, xrow in X.iterrows():
            
            feature_log_pr = pd.DataFrame(self.feature_log_prob_)

            toSum = [[x * flp[1][index] for index, x in enumerate(xrow) if x != 0] for flp in feature_log_pr.iterrows()]

            sumRes = [sum(ts) for ts in toSum]

            totalRes = [sr + self.class_log_prior_[index] for index, sr in enumerate(sumRes)]
            if((totalRes[1] == totalRes[0] and totalRes[0] >= totalRes[2]) or (totalRes[2] == totalRes[0] and totalRes[0] >= totalRes[1])): #favoring neutral
                print("\nHERE\n")
                totalRes[1] = 1
                totalRes[0] = 0
                totalRes[2] = 0
                
            res.append(np.array(totalRes))

            print(str(ctr) + "/" + str(len(X)), end="\r", flush=True)

            sys.stdout.flush()
            ctr += 1
        return res

    def predict(self, X):
        return np.argmax(self.predict_log_proba(X), axis=1) - 1

In [251]:
def print_prediction_stats(actual, prediction):
    correct = 0
    assert(len(actual) == len(prediction))
    for i in range(len(actual)):
        if actual.iloc[i] == prediction[i]:
            correct += 1
    print("Total predictions made:\t", len(actual))
    print("Correct predictions: \t", correct)
    print("Wrong predictions:   \t", len(actual) - correct)

# Without Dirichlet prior ( First coding problem)

In [243]:
nb = MultinomialNB()
df_labels_new = make_labels_num(df_labels)
nb.fit(df_features, df_labels_new)
df_labels_test = make_labels_num(df_labels_test)

In [244]:
#for the full test data
predicted1 = nb.predict(df_features_test)
len(df_features_test)
accuracy1 = accuracy_metric(df_labels_test[0], predicted1)

Calculating...

2928/2928

In [252]:
print("Accuracy: ", accuracy1)
print_prediction_stats(df_labels_test[0], predicted1)

Accuracy:  81.86475409836066
Total predictions made:	 2928
Correct predictions: 	 2397
Wrong predictions:   	 531


# With Dirichlet prior

In [249]:
# Second question with Alpha value. MAP part
nb = MultinomialNB(alpha=1.0)
nb.fit(df_features, df_labels_new)
predicted2 = nb.predict(df_features_test)
len(df_features_test)
accuracy2 = accuracy_metric(df_labels_test[0], predicted2)

Calculating...

2928/2928

In [254]:
print("Accuracy: ", accuracy2)
print_prediction_stats(df_labels_test[0], predicted2)

Accuracy:  89.31010928961749
Total predictions made:	 2928
Correct predictions: 	 2615
Wrong predictions:   	 313
