In [1]:
import numpy as np
import os
from collections import Counter
import re

In [2]:
def load_stop_words(file):
    swl = []
    with open(file, encoding='utf-8') as sw:
        swl.extend(sw.read().split('\n'))
    return np.array(swl)

In [3]:
def read_folder(path, stop_words):
    for files in (os.path.join(path, files_name) for files_name in os.listdir(path)):
        with open(files, encoding='utf-8') as txt:
            uni = np.array(list(set(re.findall(r'\w+', re.sub('<br />', ' ', txt.read().lower())))))
            yield np.setdiff1d(uni, stop_words)

In [4]:
def get_word_frequency(path, stop_words):
    files_count = len(os.listdir(path))
    i = 1
    total = Counter()
    for t in read_folder(path, stop_words):
        each = Counter(t)
        total.update(each)
        if i % (0.2 * files_count) == 0:
            print(" %d/%d " % (i, files_count), end='..')
        i += 1
    print("\nSuccess!")
    res = np.array(total.most_common())
    res[:, 1] = res[:, 1].astype('int32') / files_count
    res[:, 1] = np.log(res[:, 1].astype('float64'))  # 取对数，PI(a,b,c,d) = SIGMA(a,b,c,d) 过小的数字乘起来会趋0
    return res

In [5]:
def prefix(path, stop_words):
    with open(path, encoding='utf-8') as txt:
        uni = np.array(list(set(re.findall(r'\w+', re.sub('<br />', ' ', txt.read().lower())))))
        return np.setdiff1d(uni, stop_words)

In [17]:
def load_test(path, goal, stop_words, train_pos_dict, train_neg_dict):
    files_count = len(os.listdir(path))
    i = 1
    err_count = 0
    for files in (os.path.join(path, files_name) for files_name in os.listdir(path)):
        if i % (0.1 * files_count) == 0:
            print("%5d samples tested, acc is %f" % (i, (1 - float(err_count) / files_count) * 100))
        fixed = prefix(files, stop_words)
        pos_score = 0.0
        neg_score = 0.0
        for word in fixed:
            if word in train_pos_dict:
                pos_score += float(train_pos_dict[word])
            if word in train_neg_dict:
                neg_score += float(train_neg_dict[word])
        i += 1
        if (pos_score - neg_score) * goal >= 0:
            continue
        else:
            err_count += 1
    return (1 - float(err_count) / files_count) * 100

In [18]:
def model(train_pos_path, train_neg_path, test_pos_path, test_neg_path, stop_words_path):
    d = {'train_pos_path': train_pos_path,
         'train_neg_path': train_neg_path,
         'test_pos_path': test_pos_path,
         'test_neg_path': test_neg_path,
         's_w_path': stop_words_path}
    stop_words = load_stop_words(stop_words_path)
    print("*******Loading Positive Data*************************")
    train_pos_data = get_word_frequency(train_pos_path, stop_words)
    print("*******Loading Negative Data*************************")
    train_neg_data = get_word_frequency(train_neg_path, stop_words)
    train_pos_dict = dict(train_pos_data)
    train_neg_dict = dict(train_neg_data)
    print("*******Positive Tests:*******************************")
    d['test_pos_acc'] = load_test(test_pos_path, 1, stop_words, train_pos_dict, train_neg_dict)
    print("*******Negative Tests:*******************************")
    d['test_neg_acc'] = load_test(test_neg_path, -1, stop_words, train_pos_dict, train_neg_dict)
    return d

In [19]:
result = model('aclImdb/train/pos/',
               'aclImdb/train/neg/',
               'aclImdb/test/pos/',
               'aclImdb/test/neg/',
               'stop_words.txt')

*******Loading Positive Data*************************
 2500/12500 .. 5000/12500 .. 7500/12500 .. 10000/12500 .. 12500/12500 ..
Success!
*******Loading Negative Data*************************
 2500/12500 .. 5000/12500 .. 7500/12500 .. 10000/12500 .. 12500/12500 ..
Success!
*******Positive Tests:*******************************
 1000 samples tested, acc is 97.710000
 2000 samples tested, acc is 95.560000
 3000 samples tested, acc is 93.500000
 4000 samples tested, acc is 91.050000
 5000 samples tested, acc is 88.720000
 6000 samples tested, acc is 86.050000
 7000 samples tested, acc is 83.470000
 8000 samples tested, acc is 81.210000
 9000 samples tested, acc is 78.660000
10000 samples tested, acc is 76.030000
*******Negative Tests:*******************************
 1000 samples tested, acc is 96.680000
 2000 samples tested, acc is 93.730000
 3000 samples tested, acc is 90.710000
 4000 samples tested, acc is 87.660000
 5000 samples tested, acc is 84.730000
 6000 samples tested, acc is 81.770

In [16]:
for k in result:
    print(str(k) + " \t: " + str(result[k]))

train_pos_path 	: aclImdb/train/pos/
train_neg_path 	: aclImdb/train/neg/
test_pos_path 	: aclImdb/test/pos/
test_neg_path 	: aclImdb/test/neg/
s_w_path 	: stop_words.txt
test_pos_acc 	: 76.03
test_neg_acc 	: 68.99
