In [2]:
# Ref: https://machinelearningmastery.com/naive-bayes-classifier-scratch-python/
from __future__ import division
from codecs import open
from collections import Counter
import numpy as np
import pandas as pd

In [11]:

# Read document
# Ref: Project 2 instruction
def read_documents(doc_file):
    docs = []
    labels = []
    with open(doc_file, encoding='utf-8') as f:
        for line in f:
            words = line.strip().split()
            #print(words)
            docs.append(words[3:])
            labels.append(words[1])
    return docs, labels

# Seperate list it by class
def separate_by_class(dataset):
    separated = {}
    for i in range(len(dataset)):
        vector = dataset[i]
        if (vector[-1] not in separated):
            separated[vector[-1]] = []
        separated[vector[-1]].append(vector)
    return separated

# Get the sum of pos column
def sum_total_pos(dataframe):
    return dataframe['pos'].sum()

# Get the sum of neg column
def sum_total_neg(dataframe):
    return dataframe['neg'].sum()

# Smoothing dataframe
def smooth_data_frame(dataframe):
    dataframe['pos'] = dataframe['pos'].add(0.5)
    dataframe['neg'] = dataframe['neg'].add(0.5)
    return dataframe

# Calculate logarithmic probability
def calculate_probability(value,total):
    return np.log(value/total)

# Calculate logarithmic probabilities for dataframe
def calculate_probabilities(dataframe):
    sum_pos = sum_total_pos(dataframe)
    print(sum_pos)
    sum_neg = sum_total_neg(dataframe)
    dataframe['pos'] =  np.log(dataframe['pos'] / sum_pos)
    dataframe['neg'] =  np.log(dataframe['neg'] / sum_neg)
    return dataframe


# Add label to docs
def add_label_to_docs(docs,labels):
    for index in range(len(docs)):
        docs[index].append(labels[index])
    return docs

def train_nb(documents, labels):
    #return the data you need to classify new instances
    new_docs = add_label_to_docs(documents,labels)
    # Separate docs by label
    train_docs = np.array(documents)
    seperate = separate_by_class(documents)
    train_docs_neg_freqs = Counter(w for doc in seperate['neg'] for w in doc)
    train_docs_pos_freqs = Counter(w for doc in seperate['pos'] for w in doc)
    df_neg = pd.DataFrame.from_dict(train_docs_neg_freqs, orient='index').reset_index()
    df_neg = df_neg.rename(columns={'index':'word', 0:'neg'})
    df_pos = pd.DataFrame.from_dict(train_docs_pos_freqs, orient='index').reset_index()
    df_pos = df_pos.rename(columns={'index':'word', 0:'pos'})
    
    # Union Join two dataframe
    # Ref: https://chrisalbon.com/python/data_wrangling/pandas_join_merge_dataframe/
    df_merge = pd.merge(df_neg,df_pos,on='word',how='outer')
    df_merge = df_merge.fillna(0)
    print('Before smoothing:\n',df_merge)
    df_merge = smooth_data_frame(df_merge)
    print('After smoothing:\n',df_merge)
    df_probablities=calculate_probabilities(df_merge).set_index('word')

    # calculate the probabilities
    num_pos_doc=labels.count('pos')
    num_neg_doc=labels.count('neg')

    before_prob_dic={'pos':np.log(num_pos_doc/(num_pos_doc+num_neg_doc)),
                    'neg':np.log(num_neg_doc/(num_pos_doc+num_neg_doc))}


    return df_probablities,before_prob_dic


def score_doc_label(document, label, params):
    '''
    # (return the log probability)
    '''
    df_probablities,before_prob_dic=params
    p_log=before_prob_dic[label] #with the probabilities from above
    for w in document:
        try:
            p_log+=df_probablities.loc[w,label] #increasing the conditional probabilities
        except:
            pass #if the word does not exist in the model, do not count it
    p=np.exp(p_log)

    return p


def classify_nb(document, params):
    '''
    (return the guess of the classifier)
    '''
    # check the score
    pos_score=score_doc_label(document,'pos',params)
    neg_score=score_doc_label(document,'neg',params)

    label='pos' if pos_score>neg_score else 'neg'
    return label


def classify_documents(docs, params):
    '''
    (return the classifier's predictions for all documents in the collection)
    '''
    guessed_labels=[]
    for document in docs:
        guessed_labels.append(classify_nb(document,params))
    return guessed_labels


def accuracy(true_labels, guessed_labels):
    '''...
    (return the accuracy)
    '''
    correct_num=0
    for t,g in zip(true_labels,guessed_labels):
        # check the correctness
        correct_num+=(t==g)
    # accuracy
    acc=correct_num/len(true_labels)
    return acc


## Task0

In [12]:
# 1.Read docs
all_docs, all_labels = read_documents('all_sentiment_shuffled.txt')


# 2.This split first 80% of data as training set, and the rest 20% will be evaluation set
split_point = int(0.8*len(all_docs))
train_docs = all_docs[:split_point]
train_labels = all_labels[:split_point]
eval_docs = all_docs[split_point:]
eval_labels = all_labels[split_point:]

## Task1

In [13]:
# Train model get a dataframe which contains probabilities after smoothing and log
df_probablities,before_prob_dic =train_nb(train_docs,train_labels)
print(df_probablities)

Before smoothing:
                            word      neg      pos
0                             i  16707.0  14814.0
1                        bought    686.0    543.0
2                          this   9608.0   9391.0
3                         album    733.0    882.0
4                       because   1183.0    897.0
5                         loved    125.0    181.0
6                           the  31914.0  33502.0
7                         title    110.0     95.0
8                          song    366.0    498.0
9                             .  30682.0  29981.0
10                           it  12338.0  11728.0
11                           's   4769.0   5309.0
12                         such    453.0    404.0
13                            a  14857.0  16184.0
14                        great    824.0   2008.0
15                            ,  26333.0  27532.0
16                          how    941.0    881.0
17                          bad    709.0    261.0
18                          can

## Task2

In [14]:
# <SOMETHING> task1's return 
params=(df_probablities,before_prob_dic)

In [15]:
# sanity check1
very_short_doc=['great']
pos_prob=score_doc_label(very_short_doc,'pos',params)
neg_prob=score_doc_label(very_short_doc,'neg',params)

print(pos_prob,neg_prob)

0.001358386003932884 0.0005433109334758503


In [16]:
# sanity check2
very_short_doc=['a','top-quality','performance']
pos_prob=score_doc_label(very_short_doc,'pos',params)
neg_prob=score_doc_label(very_short_doc,'neg',params)

print(pos_prob,neg_prob)

2.889535085317261e-06 1.4637191643382007e-06


In [17]:
# overall acc
guessed_labels=classify_documents(eval_docs,params)
overall_acc=accuracy(eval_labels,guessed_labels)
print('Overall acc is ',overall_acc)

Overall acc is  0.6953420058749475


In [18]:
#  Give a table of results showing the accuracy for each class
pos_correct,neg_correct=0,0 #the correctness from pos and neg
pos_all,neg_all=0,0 #the overall records from pos and neg

for guess,true in zip(guessed_labels,eval_labels):
    if true=='pos':   
        pos_all+=1
        if guess=='pos':
            pos_correct+=1
    elif true=='neg':   
        neg_all+=1
        if guess=='neg':
            neg_correct+=1

pos_acc=pos_correct/pos_all
neg_acc=neg_correct/neg_all
df_table=pd.DataFrame([[pos_correct,pos_all,pos_acc],
                       [neg_correct,neg_all,neg_acc]],
                     columns=['Correct','All','Accuracy'],
                     index=['positive','negative'])
df_table

Unnamed: 0,Correct,All,Accuracy
positive,529,1153,0.458803
negative,1128,1230,0.917073


Based on the observation, pos' accuracy is much less than neg's. The reason could be the positve comments from the modle is less than expected, therefore, it disturbs the modle's judment.