In [1]:
# Python 2 & 3 Compatibility
from __future__ import print_function, division

# Necessary imports
import pandas as pd
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf
import patsy
import seaborn as sns
from seaborn import plt
from sklearn.linear_model import LinearRegression
from sklearn.cross_validation import train_test_split
from sklearn.linear_model import RidgeCV

%matplotlib inline



In [2]:
# Load the movie data set into a DataFrame.
# NOTE(review): this is an absolute path on one machine; the notebook will not
# run elsewhere until the CSV location is made configurable.
csv_path = '/Users/veenakumar/Desktop/Projects/Luther/Luther_Data_Manip.csv'
movies = pd.read_csv(csv_path)

In [3]:
# Remove the 'Unnamed: 0' column (presumably a row index saved into the CSV --
# confirm against the file). errors='ignore' keeps this cell safe to re-run:
# without it a second execution raises because the column is already gone.
movies = movies.drop('Unnamed: 0', axis=1, errors='ignore')

In [4]:
# Trim leading/trailing whitespace from every column label, then display the
# cleaned labels.
movies.columns = [str.strip(label) for label in movies.columns]
movies.columns

Index([u'Distributor', u'Runtime', u'Domestic_Total', u'MPAA_Rating',
       u'Production_Budget', u'Release_Date', u'Release_Yr', u'Title',
       u'Foreign', u'Genre', u'Total_Gross', u'Domestic_Percent',
       u'Foreign_Percent'],
      dtype='object')

In [5]:
# Size of the loaded table as (rows, columns).
movies.shape

(1700, 13)

In [6]:
# Pairwise correlations between the numeric columns.
# The frame also holds string columns (Distributor, Title, Genre, ...); recent
# pandas versions raise if those are passed to corr(), so restrict to numeric
# dtypes explicitly. This form works on both old and new pandas.
movies.select_dtypes(include=[np.number]).corr()

Unnamed: 0,Runtime,Domestic_Total,Production_Budget,Release_Yr,Foreign,Total_Gross,Domestic_Percent,Foreign_Percent
Runtime,1.0,0.300122,-0.054263,0.049046,0.342898,0.342236,-0.134483,0.134483
Domestic_Total,0.300122,1.0,-0.030896,-0.095849,0.818945,0.922,-0.189145,0.189145
Production_Budget,-0.054263,-0.030896,1.0,0.024699,-0.028004,-0.030309,0.022777,-0.022777
Release_Yr,0.049046,-0.095849,0.024699,1.0,0.086944,0.023249,-0.209922,0.209922
Foreign,0.342898,0.818945,-0.028004,0.086944,1.0,0.977265,-0.502945,0.502945
Total_Gross,0.342236,0.922,-0.030309,0.023249,0.977265,1.0,-0.409405,0.409405
Domestic_Percent,-0.134483,-0.189145,0.022777,-0.209922,-0.502945,-0.409405,1.0,-1.0
Foreign_Percent,0.134483,0.189145,-0.022777,0.209922,0.502945,0.409405,-1.0,1.0


In [None]:
# Keep only the rows with no missing value in any column.
df = movies.dropna(axis=0, how='any')

In [None]:
# Size of the table after dropping incomplete rows, as (rows, columns).
df.shape

In [None]:
# Scatter-plot matrix of the columns in df.
# seaborn renamed the `size` keyword to `height` (0.9) and later removed
# `size`; try the current name first and fall back for older installs, where
# the unknown `height` keyword raises TypeError before anything is drawn.
try:
    sns.pairplot(df, height=1.2, aspect=1.5)
except TypeError:
    sns.pairplot(df, size=1.2, aspect=1.5)

In [None]:
# Start the regression frame from an independent copy of df. A plain
# `dfregress = df` only aliases the same object, so the column assignments in
# the following cells would also modify df.
dfregress = df.copy()

In [None]:
# Add short aliases (y, X1..X6) for the regression variables so the model
# formulas are easier to write.
#
# Rebuild from a fresh copy of df every time: this leaves df itself untouched
# and makes the cell safe to re-run (the drop below would otherwise fail on a
# second run because the columns are already gone).
dfregress = df.copy()

# (alias, source column) pairs, in the order the aliases are appended.
regression_aliases = [
    ('y', 'Foreign'),
    ('X1', 'Distributor'),
    ('X2', 'Runtime'),
    ('X3', 'MPAA_Rating'),
    ('X4', 'Production_Budget'),
    ('X5', 'Release_Yr'),
    ('X6', 'Genre'),
]
for alias, source_column in regression_aliases:
    dfregress[alias] = df[source_column]

# Drop the target's source column, Genre, and the gross/percentage columns.
# NOTE(review): the other aliased source columns (Distributor, Runtime,
# MPAA_Rating, Production_Budget, Release_Yr) are still present after this.
dfregress = dfregress.drop(
    ['Foreign', 'Genre', 'Total_Gross', 'Domestic_Percent', 'Foreign_Percent'],
    axis=1,
)

In [None]:
# One-hot encode the categorical columns.
# NOTE(review): called without `columns=`, pd.get_dummies encodes every
# object/category column in the frame, not only the genre and distributor
# aliases; numeric columns are left as they are.
dfregress = pd.get_dummies(dfregress)

In [None]:
# Convert the release-year alias to strings.
# astype(str) replaces map(str, ...): on Python 3 map() returns a lazy iterator
# with no length, which pandas cannot assign as a column.
dfregress['X5'] = dfregress['X5'].astype(str)

In [None]:
# Remove the movie title from the regression frame.
# If this runs after pd.get_dummies, 'Title' no longer exists as a single
# column (it has been expanded into 'Title_<value>' indicators), so dropping
# only 'Title' would raise. Drop whichever form is present.
title_columns = [
    col for col in dfregress.columns
    if str(col) == 'Title' or str(col).startswith('Title_')
]
dfregress = dfregress.drop(title_columns, axis=1)

In [None]:
# Preview the first rows only; rendering the whole frame (one column per dummy
# level) bloats the notebook output.
dfregress.head()

In [None]:
# Build the right-hand side of a patsy formula from the column names.
# Each name is wrapped in Q("...") so names containing spaces or punctuation
# are accepted by patsy. The target 'y' is left out, and every term carries its
# own opening and closing quote, so the displayed string can be pasted directly
# after 'y ~ '.
Xs = list(dfregress.columns)
' + '.join('Q("{0}")'.format(name) for name in Xs if name != 'y')

In [None]:
# First model: OLS of y on X1..X5.
#
# patsy builds the target vector (y) and the feature matrix (X) from the
# formula. X6 is not listed because it was expanded into dummy variables and no
# longer exists as a single 'X6' column.
# NOTE(review): X1 and X3 are string columns too, so the same get_dummies cell
# presumably replaces them with X1_*/X3_* indicators; this formula would then
# only work if it runs before that cell -- confirm the intended order.
base_formula = 'y ~ X1 + X2 + X3 + X4 + X5'
y, X = patsy.dmatrices(base_formula, data=dfregress, return_type="dataframe")

model = sm.OLS(y, X)
fit = model.fit()
fit.summary()

In [None]:
# Fit OLS through the statsmodels formula API on Runtime (X2), Production
# Budget (X4) and the indicator columns for distributor (X1_*), MPAA rating
# (X3_*), release year (X5_*) and genre (X6_*).
#
# The formula is assembled from lists of column names instead of one very long
# literal. The original literal was broken by a line break inside the
# 'Warner Bros. (New Line)' name, which is a syntax error.

lm1_distributors = [
    'A24', 'Artisan', 'Bleecker Street', 'Broad Green Pictures', 'Buena Vista',
    'CBS Films', 'Dimension Films', 'DreamWorks', 'FilmDistrict',
    'Focus Features', 'Fox', 'Fox Atomic', 'Fox Searchlight',
    'Freestyle Releasing', 'High Top Releasing', 'IFC', 'IMAX', 'Lions Gate',
    'Lionsgate', 'Lionsgate/Summit', 'MGM', 'MGM (Weinstein)', 'Miramax',
    'National Geographic Entertainment', 'New Line', 'Newmarket',
    'Open Road Films', 'Overture Films', 'Paramount',
    'Paramount (DreamWorks)', 'Paramount Vantage', 'Picturehouse', 'Pure Flix',
    'Relativity', 'Roadside Attractions', 'Rocky Mountain Pictures',
    'Rogue Pictures', 'STX Entertainment', 'Samuel Goldwyn',
    'Sony (Revolution)', 'Sony / Columbia', 'Sony / Screen Gems',
    'Sony Classics', 'Summit Entertainment', 'TriStar', 'USA Films',
    'United Artists', 'Universal', 'Warner Bros.', 'Warner Bros. (New Line)',
    'Warner Independent', 'Weinstein / Dimension', 'Weinstein Company',
    'Yari Film Group',
]
lm1_ratings = ['G', 'PG', 'PG-13', 'R', 'Unrated']
lm1_years = list(range(2000, 2017))  # 2000 through 2016 inclusive
lm1_genres = [
    'Action', 'Action / Adventure', 'Action / Crime', 'Action Comedy',
    'Action Drama', 'Action Fantasy', 'Action Horror', 'Action Thriller',
    'Adventure', 'Adventure Comedy', 'Animation', 'Comedy', 'Comedy / Drama',
    'Comedy Thriller', 'Concert', 'Crime', 'Crime Comedy', 'Crime Drama',
    'Crime Thriller', 'Documentary', 'Drama', 'Drama / Thriller', 'Family',
    'Family Adventure', 'Family Comedy', 'Fantasy', 'Fantasy Comedy',
    'Fantasy Drama', 'Foreign', 'Foreign / Action', 'Foreign / Horror',
    'Historical Drama', 'Historical Epic', 'Horror', 'Horror Comedy',
    'Horror Thriller', 'IMAX', 'Music Drama', 'Musical', 'Period Action',
    'Period Adventure', 'Period Comedy', 'Period Drama', 'Period Horror',
    'Romance', 'Romantic Adventure', 'Romantic Comedy', 'Romantic Thriller',
    'Sci-Fi', 'Sci-Fi Action', 'Sci-Fi Adventure', 'Sci-Fi Comedy',
    'Sci-Fi Fantasy', 'Sci-Fi Horror', 'Sci-Fi Thriller', 'Sports Comedy',
    'Sports Drama', 'Thriller', 'War', 'War Drama', 'War Romance', 'Western',
    'Western Comedy',
]

# Column names in the same order as the original formula.
lm1_columns = (
    ['X2', 'X4']
    + ['X1_' + name for name in lm1_distributors]
    + ['X3_' + name for name in lm1_ratings]
    + ['X5_{0}'.format(year) for year in lm1_years]
    + ['X6_' + name for name in lm1_genres]
)

# Q("...") quotes each name so patsy accepts spaces, slashes and parentheses.
lm1_formula = 'y ~ ' + ' + '.join('Q("{0}")'.format(col) for col in lm1_columns)

# Define the model
lm1 = smf.ols(lm1_formula, data=dfregress)
# Fit the model
fit1 = lm1.fit()
# Print summary statistics of the model's performance
fit1.summary()

In [None]:
# Second formula-API model: OLS of y on a small subset of the distributor,
# rating, year and genre indicator columns.
# The formula is written as adjacent string literals, which Python joins into
# one string identical to a single-line literal.
lm2_formula = (
    'y ~ Q("X1_Bleecker Street") + Q("X1_CBS Films") + Q("X1_Dimension Films")'
    ' + Q("X1_Freestyle Releasing")+Q("X1_IFC")+Q("X1_IMAX")'
    '+Q("X1_National Geographic Entertainment")+Q("X1_Pure Flix")'
    '+Q("X1_Roadside Attractions")+Q("X1_Rocky Mountain Pictures")'
    '+Q("X1_Rogue Pictures")+Q("X1_United Artists")+Q("X1_Yari Film Group")'
    '+Q("X3_Unrated")+Q("X5_2009")+Q("X5_2010")'
    '+Q("X6_Concert")+Q("X6_Fantasy Comedy")+Q("X6_Foreign")+Q("X6_Period Horror")'
)
lm2 = smf.ols(lm2_formula, data=dfregress)
fit2 = lm2.fit()
fit2.summary()

In [None]:
# Third formula-API model: OLS of y on Runtime (X2) plus a reduced set of
# distributor, rating, year and genre indicator columns.
#
# The formula is assembled from lists of column names; this also removes the
# stray '++' that preceded the 'X6_War Romance' term in the one-line literal.

lm3_distributors = [
    'A24', 'Buena Vista', 'DreamWorks', 'FilmDistrict', 'Focus Features',
    'Fox', 'Fox Searchlight', 'Lions Gate', 'Lionsgate', 'Lionsgate/Summit',
    'MGM', 'MGM (Weinstein)', 'Miramax', 'New Line', 'Newmarket',
    'Open Road Films', 'Paramount', 'Paramount (DreamWorks)',
    'Paramount Vantage', 'Relativity', 'STX Entertainment',
    'Sony (Revolution)', 'Sony / Columbia', 'Sony / Screen Gems',
    'Summit Entertainment', 'TriStar', 'USA Films', 'Universal',
    'Warner Bros.', 'Warner Bros. (New Line)', 'Warner Independent',
    'Weinstein / Dimension',
]
lm3_ratings = ['G', 'PG', 'PG-13', 'R']
# 2000-2007 and 2013-2016; 2008-2012 are not part of this model.
lm3_years = list(range(2000, 2008)) + list(range(2013, 2017))
lm3_genres = [
    'Action', 'Action / Adventure', 'Action / Crime', 'Action Comedy',
    'Action Drama', 'Action Horror', 'Action Thriller', 'Adventure',
    'Adventure Comedy', 'Animation', 'Comedy', 'Comedy / Drama',
    'Comedy Thriller', 'Crime', 'Crime Drama', 'Drama', 'Drama / Thriller',
    'Family', 'Family Adventure', 'Family Comedy', 'Fantasy', 'Horror',
    'Horror Comedy', 'Horror Thriller', 'IMAX', 'Music Drama', 'Musical',
    'Period Adventure', 'Period Drama', 'Romantic Adventure',
    'Romantic Comedy', 'Romantic Thriller', 'Sci-Fi Action',
    'Sci-Fi Adventure', 'Sci-Fi Comedy', 'Sci-Fi Fantasy', 'Sci-Fi Horror',
    'Sports Comedy', 'Sports Drama', 'Thriller', 'War', 'War Drama',
    'War Romance', 'Western', 'Western Comedy',
]

# Column names in the same order as the original formula.
lm3_columns = (
    ['X2']
    + ['X1_' + name for name in lm3_distributors]
    + ['X3_' + name for name in lm3_ratings]
    + ['X5_{0}'.format(year) for year in lm3_years]
    + ['X6_' + name for name in lm3_genres]
)

# Q("...") quotes each name so patsy accepts spaces, slashes and parentheses.
lm3_formula = 'y ~ ' + ' + '.join('Q("{0}")'.format(col) for col in lm3_columns)

lm3 = smf.ols(lm3_formula, data=dfregress)
fit3 = lm3.fit()
fit3.summary()

In [None]:
# Build the design matrices for the reduced model (Runtime plus a subset of
# distributor, rating, year and genre indicators), split them 70/30 into
# train/test sets, and fit OLS on the training portion.
#
# The formula is assembled from lists of column names; this also removes the
# stray '++' that preceded the 'X6_War Romance' term in the one-line literal.

split_distributors = [
    'A24', 'Buena Vista', 'DreamWorks', 'FilmDistrict', 'Focus Features',
    'Fox', 'Fox Searchlight', 'Lions Gate', 'Lionsgate', 'Lionsgate/Summit',
    'MGM', 'MGM (Weinstein)', 'Miramax', 'New Line', 'Newmarket',
    'Open Road Films', 'Paramount', 'Paramount (DreamWorks)',
    'Paramount Vantage', 'Relativity', 'STX Entertainment',
    'Sony (Revolution)', 'Sony / Columbia', 'Sony / Screen Gems',
    'Summit Entertainment', 'TriStar', 'USA Films', 'Universal',
    'Warner Bros.', 'Warner Bros. (New Line)', 'Warner Independent',
    'Weinstein / Dimension',
]
split_ratings = ['G', 'PG', 'PG-13', 'R']
# 2000-2007 and 2013-2016; 2008-2012 are not part of this model.
split_years = list(range(2000, 2008)) + list(range(2013, 2017))
split_genres = [
    'Action', 'Action / Adventure', 'Action / Crime', 'Action Comedy',
    'Action Drama', 'Action Horror', 'Action Thriller', 'Adventure',
    'Adventure Comedy', 'Animation', 'Comedy', 'Comedy / Drama',
    'Comedy Thriller', 'Crime', 'Crime Drama', 'Drama', 'Drama / Thriller',
    'Family', 'Family Adventure', 'Family Comedy', 'Fantasy', 'Horror',
    'Horror Comedy', 'Horror Thriller', 'IMAX', 'Music Drama', 'Musical',
    'Period Adventure', 'Period Drama', 'Romantic Adventure',
    'Romantic Comedy', 'Romantic Thriller', 'Sci-Fi Action',
    'Sci-Fi Adventure', 'Sci-Fi Comedy', 'Sci-Fi Fantasy', 'Sci-Fi Horror',
    'Sports Comedy', 'Sports Drama', 'Thriller', 'War', 'War Drama',
    'War Romance', 'Western', 'Western Comedy',
]

# Column names in the same order as the original formula.
split_columns = (
    ['X2']
    + ['X1_' + name for name in split_distributors]
    + ['X3_' + name for name in split_ratings]
    + ['X5_{0}'.format(year) for year in split_years]
    + ['X6_' + name for name in split_genres]
)

# Q("...") quotes each name so patsy accepts spaces, slashes and parentheses.
split_formula = 'y ~ ' + ' + '.join('Q("{0}")'.format(col) for col in split_columns)

# Note: this rebinds the notebook-level y and X used by later cells.
y, X = patsy.dmatrices(split_formula, data=dfregress, return_type="dataframe")

# Fix the seed so the split, and therefore the fitted coefficients, are the
# same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

model = sm.OLS(y_train, X_train)
results = model.fit()
results.summary()

In [None]:
# Plot the residuals of the first formula-API fit (fit1) as individual points
# on a 12x8 figure.
lm1_residuals = fit1.resid
lm1_residuals.plot(style='o', figsize=(12, 8))

In [None]:
# Fit scikit-learn's LinearRegression on the full X / y matrices and report its
# score on that same data (an in-sample figure, not a held-out one).
lr1 = LinearRegression().fit(X, y)
lr1.score(X, y)