In [1]:
## load modules
import pandas as pd
import numpy as np
from sklearn.linear_model import Ridge

In [None]:
def test_func( arg=10 ):
    """Scratch helper: derive a few values from ``arg`` and hand back the first.

    The tuple holds ``arg``, ``arg*5``, ``arg*10`` and the marker string
    ``'foo'``; only the first entry (``arg`` itself) is returned.
    """
    derived = (arg, arg * 5, arg * 10, 'foo')
    first = derived[0]
    return first

In [45]:
## preview the 2015 rows, showing only the columns from position 100 onward
## NOTE(review): `merged` must already exist in the kernel -- presumably the frame built in load_data(); confirm
## (`.ix` is not available in newer pandas: boolean row filter + positional `.iloc` column slice instead)
merged[merged.year == 2015].iloc[:, 100:].head()

Unnamed: 0,t99,year,tg,tg_adj,op,op_adj,tg_theaters,op_theaters,opdate,cldate,...,budget,revurl,critscore,critrate,audiscore,audirate,genres,id_rt,id_imdb,mpaa
1849,0.002119,2015,294510675,4068.052,147187040,36760.0,4022,4004,4/3/2015,04/21/2015,...,,http://api.rottentomatoes.com/api/public/v1.0/...,82,Certified Fresh,88,Upright,"['Mystery & Suspense', 'Action & Adventure']",771354922,2820852.0,PG-13
1850,0.020737,2015,186478482,1242.593,67877361,17653.41,3848,3845,3/13/2015,04/21/2015,...,95000000.0,http://api.rottentomatoes.com/api/public/v1.0/...,85,Certified Fresh,84,Upright,"['Science Fiction & Fantasy', 'Romance']",771270966,1661199.0,PG
1851,0.007614,2015,165960425,677.708,85171450,23360.244,3655,3646,2/13/2015,04/21/2015,...,40000000.0,http://api.rottentomatoes.com/api/public/v1.0/...,25,Rotten,44,Spilled,['Romance'],771311953,,R
1852,0.002513,2015,161709302,593.821,55365012,15205.991,3680,3641,2/6/2015,04/21/2015,...,74000000.0,http://api.rottentomatoes.com/api/public/v1.0/...,78,Certified Fresh,59,Spilled,"['Comedy', 'Animation']",771361762,2279373.0,PG
1853,0.002232,2015,142917827,1504.002,52107731,14052.786,3801,3708,3/27/2015,04/21/2015,...,135000000.0,http://api.rottentomatoes.com/api/public/v1.0/...,47,Rotten,69,Upright,['Animation'],771315639,2224026.0,PG


In [3]:
def load_data( n_topics = 100 ):
    """Load titles, LDA topic proportions and mojo data and merge them into one frame.

    Parameters
    ----------
    n_topics : int, default 100
        Selects which theta file is read:
        ``lda-data/gibbs/theta/theta-df{n_topics}.csv``.

    Returns
    -------
    pandas.DataFrame
        The ``title`` column followed by the theta columns, left-joined to the
        mojo data on ``title`` (every title/theta row is kept even when no
        mojo row matches).
    """
    ## load movie names (file has no header row; the single column is named "title")
    titles = pd.read_table('lda-data/movietitles/titlesall.csv', header=None, names=["title"])
    ## load theta df (docs x topics) for the requested number of topics
    theta = pd.read_csv('lda-data/gibbs/theta/theta-df{nt}.csv'.format(nt=n_topics))
    ## load all mojo data, 2010-2015 (this has response vars like box office, ratings, etc)
    mdata = pd.read_csv('lda-data/mojo/mojoall.csv')
    ## put movie names next to their thetas
    ## NOTE(review): rows are paired by position/index -- assumes both files list movies in the same order; confirm
    title_theta = pd.concat([titles, theta], axis=1)
    ## merge named thetas with mojo data
    merged = pd.merge(title_theta, mdata, how="left", on="title")

    ## previously the merged frame was built and then dropped (the function returned None)
    return merged

In [28]:
def prep_preds( data, max_bound=1e9, min_bound=1e4 ):
    """Select modeling rows/columns from ``data`` and run ``make_preds`` on them.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``tg``, ``critscore`` and ``audiscore``.
        Columns at positions 1..100 are used as predictors.
        NOTE(review): presumably column 0 is the title and the next 100 are the
        LDA topic proportions -- confirm against the loader.
    max_bound, min_bound : float
        Only rows with ``min_bound < tg < max_bound`` (strict) are kept.

    Returns
    -------
    Whatever ``make_preds`` returns for the selected rows; the array handed to
    it has one row per kept movie and one column per response variable.
    """
    theta_cols = data.columns[1:101]
    feature_cols = ['tg','critscore','audiscore']
    ## the filter is on `tg`, not on budget
    gross_mask = ((data.tg < max_bound) & (data.tg > min_bound))

    ## set Y and X for modeling (`.loc` replaces `.ix`, which newer pandas no longer provides)
    Y = data.loc[gross_mask, feature_cols].copy()
    X = data.loc[gross_mask, theta_cols].copy()

    N = X.shape[0]
    feats = Y.columns

    ## one slot per (movie, response); previously sized with the undefined name `total`
    preds_mat = np.empty((N, len(feats)), dtype=object)

    return make_preds( X, Y, N, feats, preds_mat)

In [46]:
## make predictions on data
def make_preds( X, Y, N, responses, preds):
    """Leave-one-out Ridge predictions for each movie and each response variable.

    For every row position ``target`` in ``range(N)`` a default ``Ridge()``
    model is fit on all other rows of ``X`` and used to predict the held-out
    row, once per column of ``Y`` named in ``responses``.  The prediction is
    compared against a baseline: the mean of the training responses.

    Parameters
    ----------
    X : pandas.DataFrame
        Predictors; only the columns at positions 0..98 are used (see below).
    Y : pandas.DataFrame
        Responses, with the same number of rows as ``X`` in the same order.
    N : int
        Number of leading rows to hold out in turn.
    responses : iterable
        Column labels of ``Y`` to predict.
    preds : indexable as ``preds[row][col]``
        Filled in place with one dict per (row, response) holding the keys
        ``actual``, ``pred``, ``mean``, ``pred_error``, ``mean_error``,
        ``pred_better`` and ``pred_error_margin``.

    Returns
    -------
    The same ``preds`` object that was passed in.
    """
    ## positional column window of X used as the design matrix
    ## NOTE(review): 0:99 leaves out the column at position 99 -- confirm that
    ## dropping the last predictor is intended
    mincol = 0
    maxcol = 99

    n_rows = X.shape[0]

    ## predict on each individual movie (using all other movies as training data)
    for target in range(N):

        trainmask = np.ones(n_rows, dtype=bool)
        trainmask[target] = False

        ## the feature split does not depend on the response, so build it once per movie
        designmat = X.iloc[trainmask, mincol:maxcol]
        holdout   = X.iloc[[target], mincol:maxcol]

        ## iterate over each response variable
        for n, response in enumerate(responses):

            response_vals = Y[response]
            trainY = response_vals.iloc[trainmask]
            actual = response_vals.iloc[target]

            clf  = Ridge()
            fit  = clf.fit(designmat, trainY)
            pred = fit.predict(holdout)

            ## baseline: average of the training responses (was max(...) by mistake)
            mean = trainY.mean()

            pred_error = np.abs(pred[0] - actual)
            mean_error = np.abs( mean   - actual)

            ## pred_error_margin is the error relative to the actual value; it is
            ## not finite when actual == 0
            preds[target][n] = {"actual":actual, "pred":pred[0], "mean":mean,
                                "pred_error":pred_error, "mean_error":mean_error,
                                "pred_better":pred_error<mean_error, "pred_error_margin":pred_error/float(actual)}

    return preds

In [22]:
## returns simple count and proportion statistics on how frequently our prediction was better than the mean
def get_better(preds, response_cols=None):
    """Count how often the model prediction beat the mean baseline, per response.

    Parameters
    ----------
    preds : numpy.ndarray (2-D, dtype=object)
        One row per movie, one column per response; each cell is a dict with
        at least the key ``'pred_better'``.
    response_cols : sequence of str, optional
        Name of the response stored in each column of ``preds``, in column
        order.  Defaults to ``('tg', 'critscore', 'audiscore')`` so the
        function no longer relies on a global of the same name.

    Returns
    -------
    dict
        ``{response: {'ct': <times prediction was better>,
                      'prop': <ct / number of rows in preds>}}``.
        ``prop`` stays 0 when ``preds`` has no rows.
    """
    if response_cols is None:
        response_cols = ('tg', 'critscore', 'audiscore')

    better = {cat: {'ct': 0, 'prop': 0} for cat in response_cols}

    for row in preds:
        for j, vals in enumerate(row):
            if vals['pred_better']:
                better[response_cols[j]]['ct'] += 1

    total = float(preds.shape[0])

    ## avoid dividing by zero on an empty prediction array
    if total > 0:
        for cat in better:
            better[cat]['prop'] = better[cat]['ct'] / total

    return better