In [1]:
import re
import os
import io
import statistics
import pandas as pd
from sklearn.model_selection import KFold

In [2]:
def clean_text(text):
    text = text.lower()
    text = re.sub("[\":\?!\(\);,\*~]|\.{2,}|<br /><br />|-{2,}|\s-\s", " ", text)
    text = re.sub("([a-z])\.", "\g<1>", text)
    text = re.sub("\s+\.|\.\Z", "", text)
    return re.sub("\s{2,}", " ", text)

In [3]:
def extract_text(file_name):
    ret = []
    
    for root, dirs, files in os.walk(file_name[:-1]):    
        for name in files:
            with io.open(file_name + name, "r", encoding="utf-8") as file:
                ret.append(clean_text(file.read()))
    
    return ret

In [4]:
def create_review_data(pos_file_name, neg_file_name):
    train_text_positive = extract_text(pos_file_name)
    train_text_negative = extract_text(neg_file_name)
    
    train_pos = pd.DataFrame(data=train_text_positive, columns=["review"])
    train_pos["sentiment"] = "positive"
    
    train_neg = pd.DataFrame(data=train_text_negative, columns=["review"])
    train_neg["sentiment"] = "negative"
    
    return pd.concat([train_pos, train_neg])

### Create the word list

In [5]:
def create_word_lists(data):    
    positive_words = dict()
    negative_words = dict()
    all_words = dict()

    i = 0
    
    for review in data["review"]:
        review = review.split(" ")
        #iloc didn't want to work with the value from a for loop
        #so I adjusted it to be manual
        current = data.iloc[i]

        if current["sentiment"] == "positive":
            for word in review:
                if word in positive_words.keys():
                    positive_words[word].append(i)
                else:
                    positive_words[word] = [i]
                    
                if word in all_words.keys():
                    all_words[word].append(i)
                else:
                    all_words[word] = [i]
        else:
            for word in review:
                if word in negative_words.keys():
                    negative_words[word].append(i)
                else:
                    negative_words[word] = [i]
                    
                if word in all_words.keys():
                    all_words[word].append(i)
                else:
                    all_words[word] = [i]

        i += 1
                    
    for i in positive_words:
        positive_words[i] = len(set(positive_words[i]))
        
    for i in negative_words:
        negative_words[i] = len(set(negative_words[i]))
        
    for i in all_words:
        all_words[i] = len(set(all_words[i]))
        
    positive_words = {k: v for k, v in positive_words.items() if v > 5}
    negative_words = {k: v for k, v in negative_words.items() if v > 5}
    all_words = {k: v for k, v in all_words.items() if v > 5}
    
    return positive_words, negative_words, all_words

### Run naive bayes

In [6]:
def naive_bayes(train_data, test_data, smoothing):
    positive_words, negative_words, all_words = create_word_lists(train_data)
    
    total_documents = train_data.size
    positive_documents = train_data[train_data["sentiment"] == "positive"].size
    negative_documents = total_documents - positive_documents
    
    unique_positive = len(positive_words)
    unique_negative = len(negative_words)
    
    p_positive = positive_documents / total_documents
    p_negative = negative_documents / total_documents
    
    accuracies = []
    
    i = 0
    
    for review in test_data:
        review = review.split(" ")
        current = data.iloc[i]
        pos_num = p_positive
        neg_num = p_negative
        denom = 1
        
        for word in review:
            denom *= all_words[word] / total_documents
            
            if word not in positive_words.keys():
                positive_words[word] = 0
                
            if word not in negative_words.keys():
                negative_words[word] = 0
        
            if smoothing:
                pos_num *= (positive_words[word] + 1) / (positive_documents + unique_positive)
                neg_num *= (negative_words[word] + 1) / (negative_documents + unique_negative)
            else:
                pos_num *= positive_words[word] / positive_documents
                neg_num *= negative_words[word] / negative_documents
        
        if pos_num > neg_num:
            if current["sentiment"] == "positive":
                accuracies.append(1)
            else:
                accuracies.append(0)
        else:
            if current["sentiment"] == "negative":
                accuracies.append(1)
            else:
                accuracies.append(0)
        
        i += 1
        
    print("The accuracy is", statistics.mean(accuracies))

### Determine information about "the"

In [7]:
data = create_review_data("./train/pos/", "./train/neg/")

positive_words, negative_words, all_words = create_word_lists(data)

document_count = data.size

print("P['the'] =", all_words["the"] / document_count)
print("P['the' | Positive] =", positive_words["the"] / (document_count / 2))
print("P['the' | Negative] =", negative_words["the"] / (document_count / 2))

P['the'] = 0.49582
P['the' | Positive] = 0.4952
P['the' | Negative] = 0.49644


### 5-fold cross validation, no smoothing

In [8]:
data = data.sample(frac=1)

data = data.reset_index(drop=True)

kf = KFold(n_splits=5)

for train_index, test_index in kf.split(data):
    
    train_data = data.iloc[train_index]
    test_data = data.iloc[test_index]
    
    naive_bayes(train_data, test_data, False)

The accuracy is 0.5
The accuracy is 0.5
The accuracy is 0.5
The accuracy is 0.5
The accuracy is 0.5


### 5-fold cross validation, smoothing

In [9]:
for train_index, test_index in kf.split(data):
    
    train_data = data.iloc[train_index]
    test_data = data.iloc[test_index]
    
    naive_bayes(train_data, test_data, True)

The accuracy is 0.5
The accuracy is 0.5
The accuracy is 0.5
The accuracy is 0.5
The accuracy is 0.5


### Top positive and negative

In [10]:
positive = []
negative = []

for k, v in positive_words.items():
    probability = (positive_words[k] / (document_count / 2)) / (all_words[k] / document_count)
    positive.append((k, probability))
    
for k, v in negative_words.items():
    probability = (negative_words[k] / (document_count / 2)) / (all_words[k] / document_count)
    negative.append((k, probability))
    
positive = sorted(positive, key=lambda x: x[1], reverse=True)[:10]
negative = sorted(negative, key=lambda x: x[1], reverse=True)[:10]

print("Top 10 positive: " + str(positive))
print("Top 10 negative: " + str(negative))

Top 10 positive: [('cannavale', 2.0), ('transitional', 2.0), ('quibble', 2.0), ('first-class', 2.0), ('cynics', 2.0), ('subtler', 2.0), ('beguiling', 2.0), ('xu', 2.0), ('zhu', 2.0), ('liang', 2.0)]
Top 10 negative: [('747', 2.0), ('pataki', 2.0), ('howlers', 2.0), ('rosanna', 2.0), ('kareena', 2.0), ('saif', 2.0), ('tashan', 2.0), ('acharya', 2.0), ('recoil', 2.0), ('bhai', 2.0)]


### Test set

In [11]:
data = create_review_data("./test/pos/", "./test/neg/")

data = data.sample(frac=1)

data = data.reset_index(drop=True)

kf = KFold(n_splits=5)

for train_index, test_index in kf.split(data):
    
    train_data = data.iloc[train_index]
    test_data = data.iloc[test_index]
    
    naive_bayes(train_data, test_data, False)

The accuracy is 0.5
The accuracy is 0.5
The accuracy is 0.5
The accuracy is 0.5
The accuracy is 0.5
