In [50]:
import ast
import os
import re
import shutil
from collections import defaultdict
from operator import itemgetter

import numpy as np

In [10]:
# Model files to load; each one is read line by line by load_model.
filenames = ['./models/part-00000.txt', './models/part-00001.txt']

In [11]:
# Tokenizer pattern: a token is a run of word characters and apostrophes,
# so a contraction such as "don't" stays a single token.
word_search_re = re.compile(r"[\w']+")

In [12]:
def load_model(filenames):
    """Parse naive Bayes model files into a word -> values lookup.

    Every non-blank line must hold two whitespace-separated Python literals:
    the word, followed by the values stored for that word (the notebook's
    model files store a dict of per-class probabilities there).

    Args:
        filenames: iterable of paths to the model files.

    Returns:
        A defaultdict mapping each word to its parsed values. Looking up a
        word that is not in any file yields an empty defaultdict(float).

    Raises:
        ValueError, SyntaxError: if a line is not made of two valid literals.
    """
    model = defaultdict(lambda: defaultdict(float))
    for filename in filenames:
        with open(filename) as inf:
            for line in inf:
                # Blank lines (e.g. a trailing newline) carry no entry.
                if not line.strip():
                    continue
                word, values = line.split(maxsplit=1)
                # literal_eval only accepts plain literals, so a tampered
                # model file cannot execute arbitrary code the way eval() would.
                model[ast.literal_eval(word)] = ast.literal_eval(values.strip())
    return model

In [13]:
# Load every file listed in `filenames` into a single word -> values lookup.
blogposts_model = load_model(filenames)

In [43]:
# Sanity check: inspect the per-class probabilities stored for the word 'yes'.
blogposts_model['yes']

{'female': 0.000630445094236531, 'male': 0.0002960331557134399}

The product of many conditional probabilities is a very small number, so we should sum their logarithms instead (work in log space) to avoid floating-point underflow. The logarithm of a number in $(0, 1)$ is negative.

Also, the logarithm is monotonically increasing: $0 < a < b \Rightarrow \log(a) < \log(b)$.

Thus, the gender with greater log probability will be the answer.

In [44]:
def nb_predict(model, document, tokenize=None, unseen_prob=1e-15):
    """Predict the gender ('male' or 'female') of a document's author.

    Each distinct token of the document contributes the log of its stored
    per-gender probability; the gender with the larger total wins.
    NOTE(review): no class-prior term is added to the scores.

    Args:
        model: mapping word -> {gender: probability}; a plain dict or the
            defaultdict built by load_model both work.
        document: text to classify.
        tokenize: callable turning the document into an iterable of tokens.
            Defaults to word_search_re.findall.
        unseen_prob: probability used when a word, or a gender for a known
            word, is missing from the model.

    Returns:
        'male' or 'female'. Ties, including a document with no tokens,
        resolve to 'male'.
    """
    if tokenize is None:
        tokenize = word_search_re.findall

    # Log-space scores start at 0 (log of 1). Insertion order makes 'male'
    # the winner of a tie, as max() returns the first maximal key.
    scores = {'male': 0.0, 'female': 0.0}
    for word in set(tokenize(document)):
        # .get() instead of model[word]: indexing a defaultdict would insert
        # an entry for every unseen word and grow the model on each call.
        word_probs = model.get(word)
        if word_probs is None:
            word_probs = {}
        for gender in scores:
            scores[gender] += np.log(word_probs.get(gender, unseen_prob))
    return max(scores, key=scores.get)

Copy a post labelled *male* from the dataset to try the prediction.

In [60]:
def test_nb(model, filename):
    """Predict the gender for each post in a file.

    Every non-blank line must hold two whitespace-separated Python literals:
    the real gender, followed by the text of the blog post.

    Args:
        model: word -> {gender: probability} lookup passed to nb_predict.
        filename: path of the file with labelled posts.

    Yields:
        (real_gender, predicted_gender) tuples, one per post.

    Raises:
        ValueError, SyntaxError: if a line is not made of two valid literals.
    """
    with open(filename) as inf:
        for line in inf:
            # Blank lines carry no post.
            if not line.strip():
                continue
            label_literal, post_literal = line.split(maxsplit=1)
            # literal_eval only accepts plain literals, so a tampered data
            # file cannot execute arbitrary code the way eval() would.
            real_gender = ast.literal_eval(label_literal)
            blog_post = ast.literal_eval(post_literal.strip())
            yield real_gender, nb_predict(model, blog_post)

Check accuracy for file with blogposts from training set.

In [69]:
# Materialise the (real, predicted) pairs for one blog-post file.
result_list = list(test_nb(blogposts_model, './blogposts/part-00010'))

In [81]:
# Count the posts whose predicted gender matches the real one.
right_answers = sum(
    1
    for real_gender, predicted_gender in result_list
    if real_gender == predicted_gender
)

print("Accuracy:", right_answers / len(result_list))

Accuracy: 0.921875


Make training and testing sets.

In [54]:
# Create the output directories for the train/test split.
# exist_ok=True tolerates a re-run of this cell, while real failures
# (e.g. missing permissions) are raised instead of being silently swallowed.
for split_dir in ('./blogs_train', './blogs_test'):
    os.makedirs(split_dir, exist_ok=True)

Training data: files whose names start with '4' or '8'.

Testing data: files whose names start with '6' or '7'.

In [58]:
data_dir = './data/data/'
train_dir = './blogs_train'
test_dir = './blogs_test'

# The first character of a file name decides which split (if any) receives it.
destination_by_prefix = {
    '4': train_dir,
    '8': train_dir,
    '6': test_dir,
    '7': test_dir,
}
for file in os.listdir(data_dir):
    destination = destination_by_prefix.get(file[0])
    if destination is not None:
        shutil.copy2(os.path.join(data_dir, file), destination)