In [1069]:
'''
Analysis of /r/nba Posts
@author: Brian Lin
'''
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import re
import Bayes_Helper as bayes
import Multinomial_Helper as multih

In [1070]:
def return_csv_files(path):
    pattern = r'^.*\.csv$'
    return [f for f in os.listdir(path) if bool(re.match(pattern,f))]
    
path = 'data/backlog/'
csv_files = return_csv_files(path)
df = pd.DataFrame()
for csv in csv_files:
    csv_df = pd.read_csv(path + csv)
    df = pd.concat([df,csv_df])
df['created'] = pd.to_datetime(df.created, unit = 's')
df.index = range(0,len(df))

In [1071]:
''' 
Randomly shuffle the dataframe and generates a training, cross-validation, and test set
'''
random_df = df.reindex(np.random.permutation(df.index))
training_df = random_df.iloc[0:12000]
cross_validation_df = random_df.iloc[12000:15000]
test_df = random_df.iloc[15000:]

In [1072]:
cutoffs = [50, 150, 350]
Y_training = training_df.score.apply(lambda d: multih.bucketize(cutoffs,d))
Y_cross_validation = cross_validation_df.score.apply(lambda d: multih.bucketize(cutoffs,d))
Y_test = test_df.score.apply(lambda d: multih.bucketize(cutoffs,d))

In [1073]:
words = (bayes.generate_words_and_scores(training_df.title, training_df.score)
                 .groupby('word').agg({'score':['mean','std','median','count']}))


In [1074]:
words[('score','bucket')] = words[('score','median')].apply(lambda d: multih.bucketize(cutoffs,d))
bucketize_words = words[words[('score','count')] > 20].sort_values(by=[('score','bucket'),('score','count')], ascending=False)
top200ineachbucket = []
for i in range(len(cutoffs) + 1):
    if i == 2:
        bucket = bucketize_words[bucketize_words[('score','bucket')] == i][:400]
    else:
        bucket = bucketize_words[bucketize_words[('score','bucket')] == i][:200]
    top200ineachbucket.append(bucket)
top200ineachbucket = pd.concat(top200ineachbucket).index.values

In [1075]:
n = 250
top250 = words[words['score','count'] > 20].sort_values(by=('score','median'),ascending=False)[:n]
bot250 = words[words['score','count'] > 20].sort_values(by=('score','median'),ascending=True)[:n]
topbot500 = pd.concat([top250,bot250])
topbot500_words = topbot500.index.values
all_words = words.index.values
word_vector_trained_upon= topbot500_words

In [1076]:
f500words = words[words['score','count'] > 20].sort_values(by=('score','count'),ascending=False)[:500].index.values

In [None]:
def bag_of_words(titles):
    words = (bayes.generate_words_and_scores(training_df.title, training_df.score)
                 .groupby('word').agg({'score':['mean','std','median','count']}))
    words[('score','bucket')] = words[('score','median')].apply(lambda d: multih.bucketize(cutoffs,d))
    bucketize_words = words[words[('score','count')] > 20].sort_values(by=[('score','bucket'),('score','count')], ascending=False)
    top200ineachbucket = []
    for i in range(len(cutoffs) + 1):
        if i == 2:
            bucket = bucketize_words[bucketize_words[('score','bucket')] == i][:400]
        else:
            bucket = bucketize_words[bucketize_words[('score','bucket')] == i][:200]
        top200ineachbucket.append(bucket)
    top200ineachbucket = pd.concat(top200ineachbucket).index.values
    return top200ineachbucket

In [1078]:
#bayes_weights = [.9,.8,1.2,.6]
#bayes_weights = [1,1,1,1]
def pxy_table(X_data, Y_data, words, num_i):
    '''
    p(word|y=i) = # of times y = i where word appears / # of time y = i
    '''
    pxyi = [{} for i in range(num_i)]
    for index in range(len(words)):
        for i in range(len(pxyi)):
            word = words[index]
            table = pxyi[i]
            num = float(((X_data.iloc[:,index] == 1) & (Y_data == i)).sum() +1) 
            den = float((Y_data == i).sum() +2)
            table[word] = num/den
    return pxyi

def py_table(Y_data, num_i):
    py = []
    for i in range(num_i):
        py.append(float((Y_data == i).sum())/ len(Y_data))
    return py

def bayes_prob(x_hour, x_titles, pxy_table, py_table, pno):
    titles_prob = []
    for title_i in range(len(x_titles)):
        title = x_titles[title_i]
        title_prob = [1.0 for _ in range(len(py_table))]
        for word in title:
            for i in range((len(title_prob))):
                if word in pxy_table[i]:
                    title_prob[i] *= pxy_table[i][word]
                #else:
                #    title_prob[i] *= pno[i]
        hour = X_hour.iloc[title_i]
        title_prob = [(phour[i+hour*4]) * (bayes_weights[i]) * (title_prob[i]) for i in range(len(py_table))]
        titles_prob.append(title_prob)
    return titles_prob

def pno_table(Y_data, num_i):
    pno = []
    for i in range(num_i):
        pno.append(1 / float(sum(Y_data.values == i) +2))
    return pno

In [1077]:
#Note: This changes the word vector we train upon
word_vector_trained_upon= top200ineachbucket


X_training = bayes.generate_feature_vector((training_df.title), word_vector_trained_upon, bayes.common_words)
X_training_df = pd.DataFrame(X_training)

# number of training examples without any features mapped
sum(X_training_df.sum(axis=1) == 0)/float(len(X_training_df))

0.041583333333333333

In [1079]:
pxy = pxy_table(X_training_df, Y_training.values, word_vector_trained_upon, len(cutoffs)+1)
py = py_table(Y_training, len(cutoffs)+1)
pno = pno_table(Y_training, len(cutoffs)+1)


X_hour = training_df.created.apply(lambda d: d.hour).rename('hour')
phour = ((pd.concat([hour,Y_training], axis=1).groupby(['hour','score']).size()
          / pd.concat([hour,Y_training], axis=1).groupby(['hour']).size())).values

In [None]:
def calculate_p_values(X_data, Y_data, bag_of_words, cutoffs):
    
    X_training = bayes.generate_feature_vector((X_data.title), bag_of_words, bayes.common_words)
    X_training_df = pd.DataFrame(X_training)
    
    pxy = pxy_table(X_training_df, Y_data.values, bag_of_words, len(cutoffs)+1)
    py = py_table(Y_data, len(cutoffs)+1)
    pno = pno_table(Y_data, len(cutoffs)+1)
    
    X_hour = X_data.created.apply(lambda d: d.hour).rename('hour')
    phour = ((pd.concat([X_hour,Y_data], axis=1).groupby(['hour','score']).size()
              / pd.concat([X_hour,Y_data], axis=1).groupby(['hour']).size())).values
    return pxy, py, pno, py

In [None]:
def divide_by_domain(X_data,Y_data):
    

In [1080]:
def accuracy(predicted_Y, Y):
    return sum(predicted_Y == Y)/float(len(predicted_Y))
def accuracy_on_set(X_data, Y_data):
    X_hour = training_df.created.apply(lambda d: d.hour).rename('hour')
    probs_Y = pd.DataFrame(bayes_prob(X_hour, bayes.sanitize_and_split_titles(X_data.title),pxy,py,pno))
    probs_Y.index = Y_data.index
    prediction_Y = probs_Y.apply(lambda d: d.argmax(),axis=1)
    prediction_Y.index = Y_data.index
    return accuracy(prediction_Y, Y_data)

In [1081]:
#Goal is 65% Accuracy

print 'training accuracy:', accuracy_on_set(training_df, Y_training)
print 'validation accuracy:', accuracy_on_set(cross_validation_df, Y_cross_validation)
print 'test accuracy:', accuracy_on_set(test_df, Y_test)

training accuracy: 0.57875
validation accuracy: 0.526333333333
test accuracy: 0.551149616794


In [1082]:
probs_Y = pd.DataFrame(bayes_prob(training_df, bayes.sanitize_and_split_titles(training_df.title),pxy,py,pno))
probs_Y.index = Y_training.index
prediction_Y = probs_Y.apply(lambda d: d.argmax(),axis=1)
prediction_Y.index = Y_training.index

debug_df = pd.concat([training_df.title,prediction_Y, Y_training], axis=1)
debug_df.title = bayes.sanitize_and_split_titles(training_df.title)

In [1083]:
prediction_Y.value_counts()

0    7252
3    2455
2    1200
1    1093
dtype: int64

In [1084]:
prediction_Y[Y_training != prediction_Y].value_counts()/prediction_Y.value_counts()

0    0.255654
3    0.633809
2    0.755833
1    0.675206
dtype: float64

In [1085]:
Y_training[Y_training != prediction_Y].value_counts()/Y_training.value_counts()

0    0.257803
1    0.796678
2    0.756037
3    0.494944
Name: score, dtype: float64