In [335]:
'''
Multinomial Naive Bayes on /r/nba Posts
@author: Brian Lin
'''
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import re
import sys
sys.path.append('../lib/')
import Bayes_Helper as bayes
import Multinomial_Helper as multih

In [336]:
def return_csv_files(path):
    pattern = r'^.*\.csv$'
    return [f for f in os.listdir(path) if bool(re.match(pattern,f))]
    
path = '../data/backlog/'
csv_files = return_csv_files(path)
df = pd.DataFrame()
for csv in csv_files:
    csv_df = pd.read_csv(path + csv)
    df = pd.concat([df,csv_df])
df['created'] = pd.to_datetime(df.created, unit = 's')
df.index = range(0,len(df))

In [337]:
''' 
Randomly shuffle the dataframe and generates a training, cross-validation, and test set
'''
random_df = df.reindex(np.random.permutation(df.index))
training_df = random_df.iloc[0:12000]
cross_validation_df = random_df.iloc[12000:15000]
test_df = random_df.iloc[15000:]

In [338]:
cutoffs = [50, 150, 350]
Y_training = training_df.score.apply(lambda d: multih.bucketize(cutoffs,d))
Y_cross_validation = cross_validation_df.score.apply(lambda d: multih.bucketize(cutoffs,d))
Y_test = test_df.score.apply(lambda d: multih.bucketize(cutoffs,d))

In [339]:
words = (bayes.generate_words_and_scores(training_df.title, training_df.score)
                 .groupby('word').agg({'score':['mean','std','median','count']}))


In [369]:
words[('score','bucket')] = words[('score','median')].apply(lambda d: multih.bucketize(cutoffs,d))
bucketize_words = words[words[('score','count')] > 20].sort_values(by=[('score','bucket'),('score','count')], ascending=False)
top200ineachbucket = []
for i in range(len(cutoffs) + 1):
    if i == 2:
        bucket = bucketize_words[bucketize_words[('score','bucket')] == i][:100]
    else:
        bucket = bucketize_words[bucketize_words[('score','bucket')] == i][:100]
    top200ineachbucket.append(bucket)
top200ineachbucket = pd.concat(top200ineachbucket).index.values

In [370]:
n = 250
top250 = words[words['score','count'] > 20].sort_values(by=('score','median'),ascending=False)[:n]
bot250 = words[words['score','count'] > 20].sort_values(by=('score','median'),ascending=True)[:n]
topbot500 = pd.concat([top250,bot250])
topbot500_words = topbot500.index.values
all_words = words.index.values
word_vector_trained_upon= topbot500_words

In [371]:
f500words = words[words['score','count'] > 20].sort_values(by=('score','count'),ascending=False)[:500].index.values

In [372]:
def bag_of_words(titles):
    words = (bayes.generate_words_and_scores(training_df.title, training_df.score)
                 .groupby('word').agg({'score':['mean','std','median','count']}))
    words[('score','bucket')] = words[('score','median')].apply(lambda d: multih.bucketize(cutoffs,d))
    bucketize_words = words[words[('score','count')] > 20].sort_values(by=[('score','bucket'),('score','count')], ascending=False)
    top200ineachbucket = []
    for i in range(len(cutoffs) + 1):
        bucket = bucketize_words[bucketize_words[('score','bucket')] == i][:200]
        top200ineachbucket.append(bucket)
    top200ineachbucket = pd.concat(top200ineachbucket).index.values
    return top200ineachbucket

In [373]:
#bayes_weights = [.9,.8,1.2,.6]
bayes_weights = [1,1,1,1]
def pxy_table(X_data, Y_data, words, num_i):
    '''
    Fix: Y_data must be a numpy array and X_data is a dataframe
    p(word|y=i) = # of times y = i where word appears / # of time y = i
    '''
    Y_values = Y_data.values
    pxyi = [{} for i in range(num_i)]
    for index in range(len(words)):
        for i in range(len(pxyi)):
            word = words[index]
            table = pxyi[i]
            num = float(((X_data.iloc[:,index] == 1) & (Y_values == i)).sum() +1) 
            den = float((Y_values == i).sum() +2)
            table[word] = num/den
    return pxyi

def py_table(Y_data, num_i):
    py = []
    for i in range(num_i):
        py.append(float((Y_data == i).sum())/ len(Y_data))
    return py

def bayes_prob(x_hour, x_titles, pxy_table, py_table, pno):
    titles_prob = []
    for title_i in range(len(x_titles)):
        title = x_titles[title_i]
        title_prob = [1.0 for _ in range(len(py_table))]
        for word in title:
            for i in range((len(title_prob))):
                if word in pxy_table[i]:
                    title_prob[i] *= pxy_table[i][word]
                #else:
                #    title_prob[i] *= pno[i]
        hour = X_hour.iloc[title_i]
        title_prob = [(phour[i+hour*4]) * (bayes_weights[i]) * (title_prob[i]) for i in range(len(py_table))]
        titles_prob.append(title_prob)
    return titles_prob

def pno_table(Y_data, num_i):
    pno = []
    for i in range(num_i):
        pno.append(1 / float(sum(Y_data.values == i) +2))
    return pno

In [374]:
#Note: This changes the word vector we train upon
word_vector_trained_upon= top200ineachbucket


X_training = bayes.generate_feature_vector((training_df.title), word_vector_trained_upon, bayes.common_words)
X_training_df = pd.DataFrame(X_training)

# number of training examples without any features mapped
sum(X_training_df.sum(axis=1) == 0)/float(len(X_training_df))

0.094

In [375]:
pxy = pxy_table(X_training_df, Y_training, word_vector_trained_upon, len(cutoffs)+1)
py = py_table(Y_training, len(cutoffs)+1)
pno = pno_table(Y_training, len(cutoffs)+1)


X_hour = training_df.created.apply(lambda d: d.hour).rename('hour')
phour = ((pd.concat([X_hour,Y_training], axis=1).groupby(['hour','score']).size()
          / pd.concat([X_hour,Y_training], axis=1).groupby(['hour']).size())).values

In [376]:
def calculate_p_values(X_data, Y_data, bag_of_words, cutoffs):
    
    X_training = bayes.generate_feature_vector((X_data.title), bag_of_words, bayes.common_words)
    X_training_df = pd.DataFrame(X_training)
    
    pxy = pxy_table(X_training_df, Y_data, bag_of_words, len(cutoffs)+1)
    py = py_table(Y_data, len(cutoffs)+1)
    pno = pno_table(Y_data, len(cutoffs)+1)
    
    X_hour = X_data.created.apply(lambda d: d.hour).rename('hour')
    phour = ((pd.concat([X_hour,Y_data], axis=1).groupby(['hour','score']).size()
              / pd.concat([X_hour,Y_data], axis=1).groupby(['hour']).size())).values
    return pxy, py, pno, py

In [377]:
def accuracy(predicted_Y, Y):
    return sum(predicted_Y == Y)/float(len(predicted_Y))
def accuracy_on_set(X_data, Y_data):
    X_hour = training_df.created.apply(lambda d: d.hour).rename('hour')
    phour = ((pd.concat([X_hour,Y_data], axis=1).groupby(['hour','score']).size()
              / pd.concat([X_hour,Y_data], axis=1).groupby(['hour']).size())).values
    probs_Y = pd.DataFrame(bayes_prob(phour, bayes.sanitize_and_split_titles(X_data.title),pxy,py,pno))
    probs_Y.index = Y_data.index
    prediction_Y = probs_Y.apply(lambda d: d.argmax(),axis=1)
    prediction_Y.index = Y_data.index
    return accuracy(prediction_Y, Y_data)

In [388]:
#Goal is 65% Accuracy

print 'Bayesian Model w/ Hour:\n'
print 'training accuracy:', accuracy_on_set(training_df, Y_training)
print 'validation accuracy:', accuracy_on_set(cross_validation_df, Y_cross_validation)
print 'test accuracy:', accuracy_on_set(test_df, Y_test)

Bayesian Model w/ Hour:

training accuracy: 0.579333333333
validation accuracy: 0.566666666667
test accuracy: 0.562479173609


In [384]:
probs_Y = pd.DataFrame(bayes_prob(training_df, bayes.sanitize_and_split_titles(training_df.title),pxy,py,pno))
probs_Y.index = Y_training.index
prediction_Y = probs_Y.apply(lambda d: d.argmax(),axis=1)
prediction_Y.index = Y_training.index

debug_df = pd.concat([training_df.title,prediction_Y, Y_training], axis=1)
debug_df.title = bayes.sanitize_and_split_titles(training_df.title)

In [385]:
'''
Algorithm:
p(y=i|title,hour,domain) = p(title|y=i,hour,domain)*p(y=i|hour,domain)/p(title|hour,domain)

Assumptions:
1. Distribution of words, given a score bucket are the same regardless of hour and domain. 
    This affects the conditional probabilities of words given the score bucket. In other words,
    I assume that p(title|y=i,hour,domain) = p(title|y=i). 
    This assumption is made because there aren't enough training examples if we do segment 
    by hour as well as bucket and domain. I tried this and it led to overfitting on domains 
    leading to a high accuracy on the training set and low on the validation/test sets.

Note: I get this warning, but this is intentional:
        A value is trying to be set on a copy of a slice from a DataFrame.
        Try using .loc[row_indexer,col_indexer] = value instead

    This is because I am trying to set a value on a slice and .loc does not allow me to do that
'''
import warnings
warnings.filterwarnings('ignore')

top_domains = ['selfnba', 'twitter','youtube','streamable','instagram','espn','imgur', 'other']
top_domains = ['selfnba', 'twitter','youtube','streamable','other']

def strip_domain(d):
    for td in top_domains:
        if td in d:
            return td
    return 'other'

def bayes_prob_w_domain(x_titles, x_domain, x_hour, pxy_table, p_hour_domain, py, num_i):
    titles_prob = []
    for title_i in range(len(x_titles)):
        title = x_titles[title_i]
        domain = x_domain[title_i]
        hour = x_hour[title_i]
        title_prob = [1.0 for _ in range(num_i)]
        for word in title:
            for i in range((len(title_prob))):
                if word in pxy_table[i]:
                    title_prob[i] *= pxy_table[i][word]
                #else:
                #    title_prob[i] *= pno[i]
        title_prob = [(p_hour_domain[(hour,domain,i)] if (hour,domain,i) in p_hour_domain else (py[i])) 
                      * (title_prob[i]) for i in range(num_i)]
        titles_prob.append(title_prob)
    return titles_prob

def arg_max_list(d):
    max_index = -1
    max_val = 0
    for i in range(len(d)):
        if d[i] > max_val:
            max_index = i
            max_val = d[i]
    return max_index

def calculate_p(data_df, score_cutoffs, domains, word_vector):
    df = data_df.copy()

    df['score_bucket'] =  df.score.apply(lambda d: multih.bucketize(score_cutoffs,d))
    df['domain'] = df.domain.apply(lambda d: strip_domain(''.join(d.split('.'))))
    df['hour'] = df.created.apply(lambda d: d.hour).rename('hour')
    df['title_list'] = bayes.sanitize_and_split_titles(df.title)
    num_i = len(cutoffs) + 1
    
    X_training = bayes.generate_feature_vector((df.title), word_vector, bayes.common_words)
    X_training_df = pd.DataFrame(X_training)
    Y_data = df.score_bucket
    pxy = pxy_table(X_training_df, Y_data, word_vector, len(score_cutoffs)+1)
    
    #prob y parameterized by hour and domain
    phd_num = (df[['hour','domain','score_bucket']].groupby(['hour','domain','score_bucket']).size()).astype(float)
    phd_den = df[['hour','domain','score_bucket']].groupby(['hour','domain']).size()
    
    for i in phd_den.index:
        phd_num[i] = (phd_num[i]/phd_den[i])
    phourdomain = phd_num
    py = py_table(df.score_bucket, num_i)
    return pxy, phourdomain, py


# Calculates P_Values
pxy_nb, phd_nb, py_nb = calculate_p(training_df
                                    , cutoffs
                                    , top_domains
                                    , word_vector_trained_upon)

In [387]:
def get_accuracy_hd(data_df, pxy, phd, py, score_cutoffs):
    num_i = len(score_cutoffs) +1
    df = data_df.copy()
    df['score_bucket'] =  df.score.apply(lambda d: multih.bucketize(score_cutoffs,d))
    df['domain'] = df.domain.apply(lambda d: strip_domain(''.join(d.split('.'))))
    df['hour'] = df.created.apply(lambda d: d.hour).rename('hour')
    df['title_list'] = bayes.sanitize_and_split_titles(df.title)
    
    df['Y_probs'] = bayes_prob_w_domain(df.title_list.values 
                                        ,df.domain.values
                                        ,df.hour.values
                                        ,pxy
                                        ,phd
                                        ,py
                                        ,num_i)


    df['Y_predicted'] = df.Y_probs.apply(lambda d: arg_max_list(d)) 
    return sum(df['Y_predicted'] == df['score_bucket'])/ float(len(df))#,sum((df['Y_predicted'] - df['score_bucket']).abs() <= 1)/ float(len(df))

print 'Bayesian Model w/ Domain and Hour\n'
print 'training accuracy:', get_accuracy_hd(training_df, pxy_nb, phd_nb, py_nb, cutoffs)
print 'validation accuracy:', get_accuracy_hd(cross_validation_df, pxy_nb, phd_nb, py_nb, cutoffs)
print 'test accuracy:', get_accuracy_hd(test_df, pxy_nb, phd_nb, py_nb, cutoffs)

Bayesian Model w/ Domain and Hour

training accuracy: 0.603166666667
validation accuracy: 0.590333333333
test accuracy: 0.581806064645
