# Haojin Liao 1001778275

## Data preprocess

In [2]:
import re
import os
import numpy as np
from tqdm import tqdm
import random
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

import nltk
nltk.download('punkt')
nltk.download('stopwords')


class Data_preprocess(object):
    def __init__(self):
        pass

    # regular expression
    def rm_tags(self, text):
        re_tag = re.compile(r'<[^>]+>')
        return re_tag.sub('', text)


    def read_files(self, filetype):
        path = "./aclImdb/"
        file_list = []
        pos_num = 0
        neg_num = 0
        positive_path = path + filetype+"/pos/"
        for f in os.listdir(positive_path):
            file_list += [positive_path+f]
            pos_num += 1
        negative_path = path + filetype+"/neg/"
        for f in os.listdir(negative_path):
            file_list += [negative_path+f]
            neg_num += 1
        print('read', filetype, 'files:', len(file_list))
        print('pos_num: ', pos_num)
        print('neg_num: ', neg_num)
        all_labels = ([1] * pos_num + [0] * neg_num)
        all_texts = []
        for index, fi in tqdm(enumerate(file_list)):
            with open(fi, encoding='utf8') as file_input:
                filelines = file_input.readlines()
                if len(filelines) != 0:
                    text = filelines[0]
                    # remove < > tag
                    text = self.rm_tags(text)
                    # lower case
                    text = text.lower()
                    # tokenize
                    words = word_tokenize(text)
                    # topwords
                    words = [w for w in words if w not in stopwords.words('english')]
                    # # Stemming
                    words = [PorterStemmer().stem(w) for w in words]
                    all_texts.append(words)
                else:
                    print('empty index: ', index)
                    all_texts.append([''])
#             if index == 20:
#                 break

        return all_texts, all_labels


data_preprocess = Data_preprocess()
x_train, y_train = data_preprocess.read_files('train')
x_test, y_test = data_preprocess.read_files('test')

train_index = [i for i in range(len(x_train))]
test_index = [i for i in range(len(x_test))]

random.shuffle(train_index)
random.shuffle(test_index)

x_train = np.array(x_train)
y_train = np.array(y_train)
x_test = np.array(x_test)
y_test = np.array(y_test)

x_train = x_train[train_index]
y_train = y_train[train_index]
x_test = x_test[test_index]
y_test = y_test[test_index]



[nltk_data] Downloading package punkt to /home/haojin/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/haojin/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
6it [00:00, 59.93it/s]

read train files: 25000
pos_num:  12500
neg_num:  12500


25000it [11:37, 35.82it/s]
5it [00:00, 42.39it/s]

read test files: 25000
pos_num:  12500
neg_num:  12500


15841it [07:10, 36.76it/s]

empty index:  15835


18847it [08:33, 50.65it/s]

empty index:  18839


25000it [11:21, 36.70it/s]


## Extract feature words

In [3]:
import math, collections

labels = [0, 1]

def mutual_info(N, Nij, Ni_, N_j):
    return Nij * 1.0 / N * math.log(N * (Nij + 1) * 1.0 / (Ni_ * N_j)) / math.log(2)

def label2id(label):
    for i in range(len(labels)):
        if label == labels[i]:
            return i

def id2label(i):
    if i <= 2:
        return labels[i]
    else:
        return 0
    
def doc_dict():
    return [0] * len(labels)

def count_for_cates(train_x, train_y, featureFile='./bayes_feature.txt'):
    doccount = [0] * len(labels)
    wordcount = collections.defaultdict(lambda: doc_dict())

    n = 0
    class_count = [0, 0]

    while (n < len(train_x)):

        index1 = label2id(train_y[n])

        class_count[index1] += 1
        words = train_x[n]
        for word in words:
            wordcount[word][index1] += 1
            doccount[index1] += 1
        n += 1

    # print('wordcount:', wordcount)
    print('Word count ：', len(wordcount))
    print('doc count Number of words per category:', doccount)

    print('Extract feature words')
    midict = collections.defaultdict(lambda: doc_dict())
    N = sum(doccount)
    for k, vs in wordcount.items():
        for i in range(len(vs)):
            N11 = vs[i]
            N10 = sum(vs) - N11
            N01 = doccount[i] - N11
            N00 = N - N11 - N10 - N01
            mi = mutual_info(N, N11, N10 + N11, N01 + N11) + mutual_info(N, N10, N10 + N11, N00 + N10) + mutual_info(N, N01, N01 + N11, N01 + N00) + mutual_info(
                N, N00, N00 + N10, N00 + N01)
            midict[k][i] = mi

    fwords = set()
    for i in range(len(doccount)):
        keyf = lambda x: x[1][i]
        sortedDict = sorted(midict.items(), key=keyf, reverse=True)
        for j in range(100):
            fwords.add(sortedDict[j][0])
    out = open(featureFile, 'w', encoding='utf-8', errors='ignore')
    out.write(str(doccount) + '\n')
    for fword in fwords:
        out.write(fword + '\n')
    out.close()
    return class_count

class_count = count_for_cates(x_train, y_train)



Word count ： 108646
doc count Number of words per category: [1899087, 1953045]
Extract feature words


## Train Naive_Bayes

In [4]:
def load_feature_words(featureFile):
    f = open(featureFile, encoding='utf-8', errors='ignore')
    doccounts = eval(f.readline())
    features = set()
    for line in f:
        features.add(line.strip())
    f.close()
    return doccounts, features

def train_bayes(class_count, featurefile, modelfile, x_train, y_train):
    
    doccounts, features = load_feature_words(featurefile)
    print(doccounts)
    wordcount = collections.defaultdict(lambda: doc_dict())
    tcount = [0] * len(doccounts)
    
    for index, words in enumerate(x_train):
        
        index1 = label2id(y_train[index])
        
        for word in words:
            if word in features:
                tcount[index1] += 1
                wordcount[word][index1] += 1
    print('tcount: ', tcount)
    print('wordcount: ', wordcount)
    outmodel = open(modelfile, 'w', encoding='utf-8')
    print('save model')
    for k, v in wordcount.items():
        if k == '':
            continue
        scores = [v[i] * 1.0 / len(wordcount) * (class_count[i]/sum(class_count)) for i in range(len(v))]
#         scores = [(v[i] + 1) * 1.0 / (tcount[i] + len(wordcount)) * (class_count[i]/sum(class_count)) for i in range(len(v))]
        outmodel.write(k + '\t' + str(scores) + '\n')
    outmodel.close()
    
train_bayes(class_count,'./bayes_feature.txt','./bayes_model.txt', x_train, y_train)

[1899087, 1953045]
tcount:  [362722, 330106]
wordcount:  defaultdict(<function train_bayes.<locals>.<lambda> at 0x7f18c4186560>, {'movi': [27993, 21947], ',': [131804, 144074], "n't": [19959, 13420], 'enjoy': [1466, 2797], 'love': [2751, 5962], 'year': [2551, 3765], 'superb': [99, 549], 'could': [5684, 3673], 'play': [3565, 5009], 'perfect': [367, 1338], 'great': [2637, 6351], 'well': [3833, 5833], 'noth': [2927, 1283], 'look': [5646, 4032], 'world': [1369, 2395], 'beauti': [820, 2393], 'oh': [934, 324], 'famili': [1218, 2116], 'life': [2214, 3885], 'annoy': [943, 282], 'also': [3496, 5433], 'tri': [3808, 2491], 'act': [5120, 3318], 'favorit': [297, 1101], 'reason': [1964, 1165], 'dull': [664, 141], 'anyth': [1843, 1059], 'would': [7663, 5710], '2': [1295, 667], 'today': [313, 922], '...': [7242, 4884], '?': [11342, 4743], 'bad': [7139, 1846], 'even': [7665, 5021], 'wast': [1998, 193], 'plot': [4137, 2469], 'wonder': [1309, 2282], "'m": [2878, 1878], 'amaz': [367, 1162], 'suck': [591, 

## Predict on text data

In [5]:
def load_model(modelfile):
    print('loading model')
    f = open(modelfile, encoding='utf-8', errors='ignore')
    scores = {}
    for line in f:
        word, counts = line.strip().rsplit('\t', 1)
        scores[word] = eval(counts)
    f.close()
    return scores

def predict(featurefile, modelfile, test_x, test_y):
    doccounts, features = load_feature_words(featurefile)
    docscores = [math.log(count * 1.0 / sum(doccounts)) for count in doccounts]
    scores = load_model(modelfile)
    rcount = 0
    doccount = 0
    print('Use the test set to validate the model')

    predict_y = []

    n = 0
    while (n < len(test_x)):
        words = test_x[n]
        index1 = label2id(test_y[n])
        prevalues = list(docscores)
        for word in words:
            if word in features:
                for i in range(len(prevalues)):
                    prevalues[i] += math.log(scores[word][i])
        m = max(prevalues)
        pindex = prevalues.index(m)

        predict_y.append(id2label(pindex))

        if pindex == index1:
            rcount += 1
        doccount += 1
        n += 1
    print('Test text volume: %d, Predict the correct amount of categories: %d, Naive Bayes classifier accuracy: %f' % (doccount, rcount, rcount * 1.0 / doccount))
    
predict('./bayes_feature.txt', './bayes_model.txt', x_test, y_test)

loading model
Use the test set to validate the model
Test text volume: 25000, Predict the correct amount of categories: 19387, Naive Bayes classifier accuracy: 0.775480


## Bayes model using Smoothing

In [6]:
def load_feature_words(featureFile):
    f = open(featureFile, encoding='utf-8', errors='ignore')
    doccounts = eval(f.readline())
    features = set()
    for line in f:
        features.add(line.strip())
    f.close()
    return doccounts, features

def train_bayes(class_count, featurefile, modelfile, x_train, y_train):
    
    doccounts, features = load_feature_words(featurefile)
    print(doccounts)
    print(features)
    wordcount = collections.defaultdict(lambda: doc_dict())
    tcount = [0] * len(doccounts)
    
    for index, words in enumerate(x_train):
        
        index1 = label2id(y_train[index])
        
        for word in words:
            if word in features:
                tcount[index1] += 1
                wordcount[word][index1] += 1
    print('tcount: ', tcount)
    print('wordcount: ', wordcount)
    outmodel = open(modelfile, 'w', encoding='utf-8')
    print('save model')
    for k, v in wordcount.items():
        if k == '':
            continue
        scores = [(v[i] + 1) * 1.0 / (tcount[i] + len(wordcount)) * (class_count[i]/sum(class_count)) for i in range(len(v))]
        outmodel.write(k + '\t' + str(scores) + '\n')
    outmodel.close()
    
train_bayes(class_count,'./bayes_feature.txt','./bayes_model.txt', x_train, y_train)

[1899087, 1953045]
{'highli', 'plot', 'also', 'lack', 'money', 'unless', 'worst', 'bad', 'stewart', "'m", 'terribl', 'gore', 'fantast', 'decent', 'annoy', 'disappoint', 'guy', 'fail', 'wonder', 'minut', 'young', 'even', 'pathet', 'unbeliev', 'seri', 'suck', 'lame', 'alway', 'wors', 'look', 'stupid', 'brilliant', 'play', 'excus', 'laughabl', 'anyth', '2', 'victoria', 'beauti', ',', 'poorli', 'noth', 'today', 'wast', 'delight', 'embarrass', 'crap', 'garbag', 'would', '...', 'redeem', 'oh', 'tri', 'reason', 'dull', '?', 'horribl', 'touch', 'save', 'best', 'well', 'excel', 'year', 'favorit', 'heart', 'pointless', 'dumb', 'like', 'love', 'script', 'zombi', 'aw', 'amaz', 'badli', 'perform', 'joke', 'perfect', 'could', 'instead', 'ridicul', 'avoid', 'cheap', 'suppos', 'role', 'mess', 'movi', 'enjoy', 'least', 'superb', 'act', 'famili', 'poor', "n't", 'unfunni', 'insult', 'great', 'bore', 'world', 'life', 'thing'}
tcount:  [362722, 330106]
wordcount:  defaultdict(<function train_bayes.<locals>

## Compare the smoothing one with the original one on test data

In [7]:
def load_model(modelfile):
    print('loading model')
    f = open(modelfile, encoding='utf-8', errors='ignore')
    scores = {}
    for line in f:
        word, counts = line.strip().rsplit('\t', 1)
        scores[word] = eval(counts)
    f.close()
    return scores

def predict(featurefile, modelfile, test_x, test_y):
    doccounts, features = load_feature_words(featurefile)
    docscores = [math.log(count * 1.0 / sum(doccounts)) for count in doccounts]
    scores = load_model(modelfile)
    rcount = 0
    doccount = 0
    print('Use the test set to validate the model')

    predict_y = []

    n = 0
    while (n < len(test_x)):
        words = test_x[n]
        index1 = label2id(test_y[n])
        prevalues = list(docscores)
        for word in words:
            if word in features:
                for i in range(len(prevalues)):
                    prevalues[i] += math.log(scores[word][i])
        m = max(prevalues)
        pindex = prevalues.index(m)

        predict_y.append(id2label(pindex))

        if pindex == index1:
            rcount += 1
        doccount += 1
        n += 1
    print('Test text volume: %d, Predict the correct amount of categories: %d, Naive Bayes classifier accuracy: %f' % (doccount, rcount, rcount * 1.0 / doccount))
    
predict('./bayes_feature.txt', './bayes_model.txt', x_test, y_test)

loading model
Use the test set to validate the model
Test text volume: 25000, Predict the correct amount of categories: 20269, Naive Bayes classifier accuracy: 0.810760


## Top 10 words

In [8]:
scores = load_model('./bayes_model.txt')
# print(scores)
# keyf = lambda x: x[1][i]
label_0 = sorted(scores.items(), key = lambda kv:(kv[1][0] - kv[1][1]), reverse=True)
print('top 10 words for negative class: ')
for index, item in enumerate(label_0):
    if index < 10:
        print(item[0])
label_1 = sorted(scores.items(), key = lambda kv:(kv[1][1] - kv[1][0]), reverse=True)
print('top 10 words for positive class: ')
for index, item in enumerate(label_1):
    if index < 10:
        print(item[0])

loading model
top 10 words for negative class: 
?
n't
bad
movi
worst
even
...
wast
could
noth
top 10 words for positive class: 
,
great
love
best
well
also
perform
life
play
beauti


## K-Fold Validation

In [9]:
from sklearn.model_selection import RepeatedKFold

kf = RepeatedKFold(n_splits=5, n_repeats=1, random_state=30)

for train_index, dev_index in kf.split(x_train):
    # print('train_index', train_index, 'test_index', test_index)
    train_x, train_y = x_train[train_index], y_train[train_index]
    dev_x, dev_y = x_train[dev_index], y_train[dev_index]
    
    train_bayes(class_count,'./bayes_feature.txt','./bayes_model.txt', train_x, train_y)
    predict('./bayes_feature.txt', './bayes_model.txt', dev_x, dev_y)
    

[1899087, 1953045]
{'highli', 'plot', 'also', 'lack', 'money', 'unless', 'worst', 'bad', 'stewart', "'m", 'terribl', 'gore', 'fantast', 'decent', 'annoy', 'disappoint', 'guy', 'fail', 'wonder', 'minut', 'young', 'even', 'pathet', 'unbeliev', 'seri', 'suck', 'lame', 'alway', 'wors', 'look', 'stupid', 'brilliant', 'play', 'excus', 'laughabl', 'anyth', '2', 'victoria', 'beauti', ',', 'poorli', 'noth', 'today', 'wast', 'delight', 'embarrass', 'crap', 'garbag', 'would', '...', 'redeem', 'oh', 'tri', 'reason', 'dull', '?', 'horribl', 'touch', 'save', 'best', 'well', 'excel', 'year', 'favorit', 'heart', 'pointless', 'dumb', 'like', 'love', 'script', 'zombi', 'aw', 'amaz', 'badli', 'perform', 'joke', 'perfect', 'could', 'instead', 'ridicul', 'avoid', 'cheap', 'suppos', 'role', 'mess', 'movi', 'enjoy', 'least', 'superb', 'act', 'famili', 'poor', "n't", 'unfunni', 'insult', 'great', 'bore', 'world', 'life', 'thing'}
tcount:  [289522, 263780]
wordcount:  defaultdict(<function train_bayes.<locals>