In [47]:
# Ref: https://machinelearningmastery.com/naive-bayes-classifier-scratch-python/
from codecs import open
from __future__ import division
from collections import Counter
import numpy as np
import pandas as pd

# Read document
# Ref: Project 2 instruction
def read_documents(doc_file):
    """Read a whitespace-tokenised corpus file, one document per line.

    For every non-blank line, the second token is taken as the label and
    the tokens from the fourth one onwards are taken as the document's
    words. The first and third tokens are ignored.

    Args:
        doc_file: Path of the UTF-8 encoded file to read.

    Returns:
        A tuple ``(docs, labels)`` where ``docs`` is a list of token lists
        and ``labels`` is the list of labels in the same order.

    Raises:
        IndexError: If a non-blank line contains fewer than two tokens.
    """
    docs = []
    labels = []
    with open(doc_file, encoding='utf-8') as f:
        for line in f:
            words = line.strip().split()
            # A blank line (e.g. a trailing newline at the end of the file)
            # has no label to read, so skip it instead of failing on words[1].
            if not words:
                continue
            # Read the label first so a malformed line cannot leave docs and
            # labels with different lengths.
            label = words[1]
            docs.append(words[3:])
            labels.append(label)
    return docs, labels

# Separate the dataset by class
def separate_by_class(dataset):
    """Group the rows of ``dataset`` by their last element.

    Args:
        dataset: Sequence of rows; the last element of each row is used
            as the grouping key.

    Returns:
        A dict mapping each key to the list of rows (the same objects, in
        their original order) that end with that key.
    """
    grouped = {}
    for row in dataset:
        # The final element of the row is its class key.
        grouped.setdefault(row[-1], []).append(row)
    return grouped

# Get the sum of pos column
def sum_total_pos(dataframe):
    """Return the sum of the values in the 'pos' column of ``dataframe``."""
    pos_column = dataframe['pos']
    return pos_column.sum()

# Get the sum of neg column
def sum_total_neg(dataframe):
    """Return the sum of the values in the 'neg' column of ``dataframe``."""
    neg_column = dataframe['neg']
    return neg_column.sum()

# Smoothing dataframe
def smooth_data_frame(dataframe, alpha=0.5):
    """Add a smoothing pseudo-count to the 'pos' and 'neg' columns.

    The columns are overwritten on the frame that is passed in, and that
    same frame is returned.

    Args:
        dataframe: Frame with numeric 'pos' and 'neg' columns.
        alpha: Pseudo-count added to every value in both columns.
            Defaults to 0.5, the value that was previously hard-coded.

    Returns:
        The same ``dataframe`` object with both columns incremented.
    """
    for column in ('pos', 'neg'):
        dataframe[column] = dataframe[column].add(alpha)
    return dataframe

# Calculate logarithmic probability
def calculate_probability(value,total):
    """Return the natural logarithm of ``value / total``."""
    ratio = value / total
    return np.log(ratio)

# Calculate logarithmic probabilities for dataframe
def calculate_probabilities(dataframe):
    """Replace the 'pos' and 'neg' columns with log relative frequencies.

    Each value is divided by the sum of its own column and the natural
    logarithm of that ratio is stored back in the column. The columns are
    overwritten on the frame that is passed in, and that same frame is
    returned.

    Args:
        dataframe: Frame with numeric 'pos' and 'neg' columns.

    Returns:
        The same ``dataframe`` object with both columns converted.
    """
    # Both totals must be taken before either column is overwritten.
    sum_pos = sum_total_pos(dataframe)
    sum_neg = sum_total_neg(dataframe)
    dataframe['pos'] = calculate_probability(dataframe['pos'], sum_pos)
    dataframe['neg'] = calculate_probability(dataframe['neg'], sum_neg)
    return dataframe

# Add label to docs
def add_label_to_docs(docs,labels):
    """Append each document's label to the end of its token list.

    The token lists inside ``docs`` are modified in place.

    Args:
        docs: List of token lists.
        labels: Labels, indexed in parallel with ``docs``.

    Returns:
        The same ``docs`` list, each inner list now ending with its label.
    """
    for position, doc in enumerate(docs):
        doc.append(labels[position])
    return docs

def train_nb(documents, labels):
    """Build a table of smoothed per-class word log-probabilities.

    Word occurrences are counted separately for the documents labelled
    'neg' and 'pos', combined into one frame with a row per distinct word,
    smoothed with ``smooth_data_frame`` and converted with
    ``calculate_probabilities``. The frame is printed before and after
    smoothing.

    Args:
        documents: List of token lists. They are not modified.
        labels: Labels ('neg' or 'pos'), indexed in parallel with
            ``documents``.

    Returns:
        A DataFrame with the columns 'word', 'neg' and 'pos', where 'neg'
        and 'pos' hold the values produced by ``calculate_probabilities``.

    Raises:
        KeyError: If no document carries the label 'neg' or 'pos'.
    """
    # Label copies of the token lists so the caller's documents stay intact
    # and training twice on the same data does not append labels twice.
    labelled_docs = add_label_to_docs([list(doc) for doc in documents], labels)
    # Separate docs by label
    separated = separate_by_class(labelled_docs)
    # doc[:-1] drops the appended label so it is not counted as a word.
    neg_freqs = Counter(w for doc in separated['neg'] for w in doc[:-1])
    pos_freqs = Counter(w for doc in separated['pos'] for w in doc[:-1])
    df_neg = pd.DataFrame.from_dict(neg_freqs, orient='index').reset_index()
    df_neg = df_neg.rename(columns={'index':'word', 0:'neg'})
    df_pos = pd.DataFrame.from_dict(pos_freqs, orient='index').reset_index()
    df_pos = df_pos.rename(columns={'index':'word', 0:'pos'})

    # Union Join two dataframe
    # Ref: https://chrisalbon.com/python/data_wrangling/pandas_join_merge_dataframe/
    df_merge = pd.merge(df_neg,df_pos,on='word',how='outer')
    # A word seen in only one class has no count in the other; treat it as 0.
    df_merge = df_merge.fillna(0)
    print('Before smoothing:\n',df_merge)
    df_merge = smooth_data_frame(df_merge)
    print('After smoothing:\n',df_merge)

    #return a dataframe with all probabilities
    return calculate_probabilities(df_merge)


# def score_doc_label(document, label, <SOMETHING>):
# ...
# (return the log probability)



# def classify_nb(document, <SOMETHING>):
# ...
# (return the guess of the classifier)


# def classify_documents(docs, <SOMETHING>):
# ...
# (return the classifier's predictions for all documents in the collection)
 

# def accuracy(true_labels, guessed_labels):
# ...
# (return the accuracy)


# Read docs
all_docs, all_labels = read_documents('all_sentiment_shuffled.txt')
# The first 5% of the documents form the training set; the remaining 95%
# are kept as the evaluation set.
split_point = int(0.05*len(all_docs))
train_docs = all_docs[:split_point]
train_labels = all_labels[:split_point]
eval_docs = all_docs[split_point:]
eval_labels = all_labels[split_point:]

# Train the model: the result is a dataframe of per-word values after
# smoothing and log
df_probablities =train_nb(train_docs,train_labels)
print(df_probablities)

# Count how many training documents carry each label
# Ref:https://docs.python.org/3.1/library/collections.html
freqs = Counter(train_labels)

#Sum total labels
#Ref: https://stackoverflow.com/questions/18593519/sum-of-all-counts-in-a-collections-counter
total_labels = sum(freqs.values())
print('Total labels:\n',total_labels)
print('negtive labels:\n',freqs['neg'])
print('postive labels:\n',freqs['pos'])
# Class priors: log of each label's share of the training labels.
# No smoothing is applied here, so the messages below do not claim any.
neg_pob = calculate_probability(freqs['neg'],total_labels)
pos_pob = calculate_probability(freqs['pos'],total_labels)
print('negative log prior probability:\n',neg_pob)
print('positive log prior probability:\n',pos_pob)





Before smoothing:
                  word     neg     pos
0                   i  1014.0   985.0
1              bought    43.0    33.0
2                this   605.0   621.0
3               album    57.0    55.0
4             because    72.0    58.0
5               loved    11.0     6.0
6                 the  2109.0  2178.0
7               title     6.0     3.0
8                song    25.0    18.0
9                   .  1896.0  1920.0
10                 it   742.0   745.0
11                 's   297.0   312.0
12               such    29.0    19.0
13                  a   963.0  1025.0
14              great    63.0   115.0
15                  ,  1741.0  1665.0
16                how    76.0    58.0
17                bad    43.0    16.0
18                can    90.0   126.0
19               rest    14.0     7.0
20                 of   857.0   898.0
21                 be   170.0   170.0
22              right    22.0    28.0
23                  ?   104.0    52.0
24               well    43.0  

In [29]:
# Example for append function
keys = ['a', 'b', 'c']
values = [[1, 2, 3, 1, 2, 1, 1],[1, 2, 3, 1, 2, 1, 1],[1, 2, 3, 1, 2, 1, 1]]
# Tag every inner list with the key found at the same position.
for position, row in enumerate(values):
    row.append(keys[position])
print(values)

[[1, 2, 3, 1, 2, 1, 1, 'a'], [1, 2, 3, 1, 2, 1, 1, 'b'], [1, 2, 3, 1, 2, 1, 1, 'c']]
