In [None]:
#This code is for generating predictions of the opening box office of upcoming movies.  
#Logistic regressions are used for estimating the probability of each movie making over $10M, $20M, $30M, $50M, and $100M
#A linear model is used to predict the opening box office
#The goal was to generate a dataframe containing the probability of each bench mark as well as a prediction of the opening box office
#It needed to be flexible to incorporating different metrics, as well as targeting specific movie types (kids vs not vs all)
#Actual metrics used in this have been removed due to NDA stipulations

In [6]:
#Import packages and data

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
import array
from sklearn import metrics
from scipy import stats
from sklearn.metrics import mean_squared_error, r2_score,accuracy_score
from sklearn.linear_model import LogisticRegression
from math import exp

dataTrain = pd.read_csv(r'C:\Path\TrainingData.csv',encoding = "ISO-8859-1")
dataPred = pd.read_csv(r'C:\Path\PredictionData.csv',encoding = "ISO-8859-1")

In [7]:
#Define functions for performing logictic regresssions and linear predictions of opening box office


def logpreds(metric,training, forPred,segment):
    
    #Set up dataframe for training
    allDat = training[metric]
    if segment == 1:
        allDat = allDat.loc[allDat.Kids == 1]
    elif segment == 2:
        allDat = allDat.loc[allDat.Kids != 1]
    if 'wikiSumDrop' in metric:
        allDat['wikiSumDrop'] = np.log(allDat['wikiSumDrop'])

    #Drop NaN's (gets rid of movies with no opening box office yet) and train model
    cleanDat = allDat.dropna()
    colName = [col for col in cleanDat.columns if 'Over' in col]
    y = cleanDat[colName]
    X = cleanDat.drop(columns =[colName[0]])
    
    clf = LogisticRegression().fit(X, y)
    ypredTrain = clf.predict_proba(X)
    ypredTrain2 = ypredTrain[:,1]*100
    ypred2 =clf.predict(X) 
    cnf_matrix = metrics.confusion_matrix(y, ypred2)
    accuracy = (cnf_matrix[0,0]+cnf_matrix[1,1])/(cnf_matrix[0,0]+cnf_matrix[1,1]+cnf_matrix[1,0]+cnf_matrix[0,1])
    print(accuracy)

    d2 = forPred[X.columns]
    if segment == 1:
        d2 = d2.loc[d2.Kids == 1]
    elif segment == 2:
        d2 = d2.loc[d2.Kids != 1]
    #log transform metrics that are skewed/not mornally distributed
    if 'metricX' in metric:
        d2['metricX'] = np.log(d2['metricX'])
    Xnew = d2.dropna()
    yPred = clf.predict_proba(Xnew)
    boPred = yPred[:,1]*100
    
    return(boPred,ypredTrain2)

def linpreds(metric,training, forPred,segment):
    
    #Set up dataframe for training
    allDat = training[metric]
    if segment == 1:
        allDat = allDat.loc[allDat.Kids == 1]
    elif segment == 2:
        allDat = allDat.loc[allDat.Kids != 1]
    if 'metricX' in metric:
        allDat['metricX'] = np.log(allDat['metricX'])

    #Drop NaN's (gets rid of movies with no opening box office yet) and train model
    cleanDat = allDat.dropna()
    
    y = cleanDat['OPBO Adj']
    y = np.log(y)
    X = cleanDat.drop(columns ='OPBO Adj')
    
    X2 = sm.add_constant(X)
    est = sm.OLS(y,X2)
    est2 = est.fit()
    yPredTrain = est2.predict(X2)

    d2 = forPred[X.columns]
    if segment == 1:
        d2 = d2.loc[d2.Kids == 1]
    elif segment == 2:
        d2 = d2.loc[d2.Kids != 1]
    if 'metricX' in metric:
        d2['metricX'] = np.log(d2['metricX'])
    Xnew = d2.dropna()
    Xnew2 = sm.add_constant(Xnew)
    yPred = est2.predict(Xnew2)
    yPred = np.exp(yPred)
    
    print(est2.summary())
    return(yPred)

In [8]:

#Metrics used for predicting each bench mark amount 
metric10 = (['Over10','metricA', 'metricY','metricZ'])
metric20 = (['Over20','metricX', 'metricB','metricZ'])
metric30 = (['Over30','metricX', 'metricY','metricZ'])
metric50 = (['Over50','metricC', 'metricY','metricZ'])
metric100 = (['Over100','metricX', 'metricY','metricZ'])
linMetrics =(['OPBO Adj','metricX', 'metricF','metricZ']) 

#Initialize dataframes for results
percPreds = pd.DataFrame()
trainPreds = pd.DataFrame() 
linPreds = pd.DataFrame()

#Pick which audience segment to look at, 1 = over PG, 2 = kids only, 3 = all
seg = 3

X = dataTrain
Z = dataPred
#Pull titles for the current segment of interest
if seg ==1: 
    X = X.loc[X.Kids == 1]
    X = X.dropna()
    Z = Z.loc[Z.Kids == 1]
    percPreds['Title'] = Z['Movie Title']
    linPreds['Title'] = Z['Movie Title']
    trainPreds['Title'] = X['Movie Title']
elif seg == 2:
    X = X.loc[X.Kids != 1]
    X=X.dropna()
    Z = Z.loc[Z.Kids != 1]
    percPreds['Title'] = Z['Movie Title']     
    linPreds['Title'] = Z['Movie Title']
    trainPreds['Title'] = X['Movie Title']
elif seg == 3:
    percPreds['Title'] = Z['Movie Title']     
    linPreds['Title'] = Z['Movie Title']
    X = X.dropna()
    trainPreds['Title'] = X['Movie Title']
    
    
percPreds['Over 10'],trainPreds['Over 10'] = logpreds(metric10,dataTrain,dataPred,seg)
percPreds['Over 20'],trainPreds['Over 20'] = logpreds(metric20,dataTrain,dataPred,seg)
percPreds['Over 30'],trainPreds['Over 30'] = logpreds(metric30,dataTrain,dataPred,seg)
percPreds['Over 50'],trainPreds['Over 50'] = logpreds(metric50,dataTrain,dataPred,seg)
percPreds['Over 100'] ,trainPreds['Over 100']= logpreds(metric100,dataTrain,dataPred,seg)

percPreds['Predicted']= linpreds(linMetrics,dataTrain,dataPred,seg)
print(percPreds)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  del sys.path[0]
  y = column_or_1d(y, warn=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


0.9230769230769231
(52,)
0.8461538461538461
(52,)
0.9230769230769231
(52,)
0.9615384615384616
(52,)
1.0
(52,)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


                            OLS Regression Results                            
Dep. Variable:               OPBO Adj   R-squared:                       0.851
Model:                            OLS   Adj. R-squared:                  0.816
Method:                 Least Squares   F-statistic:                     24.56
Date:                Thu, 17 Oct 2019   Prob (F-statistic):           1.25e-14
Time:                        18:02:06   Log-Likelihood:                -29.360
No. Observations:                  54   AIC:                             80.72
Df Residuals:                      43   BIC:                             102.6
Df Model:                          10                                         
Covariance Type:            nonrobust                                         
                                 coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------------
const               

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
