In [1]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf
import patsy

import seaborn as sns

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import RidgeCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_squared_error
#from sklearn.externals import joblib

import matplotlib.pyplot as plt
%matplotlib inline


# EDA

In [2]:
datafile = 'mojo_data.csv'
df = pd.read_csv(datafile, index_col=0)

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 996 entries, 0 to 199
Data columns (total 16 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Unnamed: 0.1            996 non-null    object 
 1   link_stub               996 non-null    object 
 2   rank                    996 non-null    object 
 3   title                   996 non-null    object 
 4   domestic_gross_x        996 non-null    object 
 5   release_year            996 non-null    int64  
 6   domestic_opening_gross  996 non-null    int64  
 7   budget                  996 non-null    int64  
 8   domestic_gross_y        996 non-null    int64  
 9   international_gross     996 non-null    int64  
 10  worldwide_gross         996 non-null    int64  
 11  runtime_minutes         975 non-null    float64
 12  rating                  867 non-null    object 
 13  release_month           996 non-null    int64  
 14  release_date            996 non-null    ob

In [4]:
df.rename( columns={'Unnamed: 0.1':'movie_title'}, inplace=True )
df.head(2)

Unnamed: 0,movie_title,link_stub,rank,title,domestic_gross_x,release_year,domestic_opening_gross,budget,domestic_gross_y,international_gross,worldwide_gross,runtime_minutes,rating,release_month,release_date,genres
0,Star Wars: Episode VII - The Force Awakens,/title/tt2488496/?ref_=bo_cso_table_1,1,Star Wars: Episode VII - The Force Awakens,"$936,662,225",2015,247966675,245000000,936662225,1131791908,2068454133,138.0,PG-13,12,2015-12-16,"['Action', 'Adventure', 'Sci-Fi']"
1,Avengers: Endgame,/title/tt4154796/?ref_=bo_cso_table_2,2,Avengers: Endgame,"$858,373,000",2019,357115007,356000000,858373000,1939427564,2797800564,181.0,PG-13,4,2019-04-24,"['Action', 'Adventure', 'Drama', 'Sci-Fi']"


In [5]:
df.index = np.arange(1, len(df) + 1)

In [6]:
df.head(2)

Unnamed: 0,movie_title,link_stub,rank,title,domestic_gross_x,release_year,domestic_opening_gross,budget,domestic_gross_y,international_gross,worldwide_gross,runtime_minutes,rating,release_month,release_date,genres
1,Star Wars: Episode VII - The Force Awakens,/title/tt2488496/?ref_=bo_cso_table_1,1,Star Wars: Episode VII - The Force Awakens,"$936,662,225",2015,247966675,245000000,936662225,1131791908,2068454133,138.0,PG-13,12,2015-12-16,"['Action', 'Adventure', 'Sci-Fi']"
2,Avengers: Endgame,/title/tt4154796/?ref_=bo_cso_table_2,2,Avengers: Endgame,"$858,373,000",2019,357115007,356000000,858373000,1939427564,2797800564,181.0,PG-13,4,2019-04-24,"['Action', 'Adventure', 'Drama', 'Sci-Fi']"


In [7]:
df.drop(['link_stub'], axis=1, inplace = True)

In [8]:
df.head(2)

Unnamed: 0,movie_title,rank,title,domestic_gross_x,release_year,domestic_opening_gross,budget,domestic_gross_y,international_gross,worldwide_gross,runtime_minutes,rating,release_month,release_date,genres
1,Star Wars: Episode VII - The Force Awakens,1,Star Wars: Episode VII - The Force Awakens,"$936,662,225",2015,247966675,245000000,936662225,1131791908,2068454133,138.0,PG-13,12,2015-12-16,"['Action', 'Adventure', 'Sci-Fi']"
2,Avengers: Endgame,2,Avengers: Endgame,"$858,373,000",2019,357115007,356000000,858373000,1939427564,2797800564,181.0,PG-13,4,2019-04-24,"['Action', 'Adventure', 'Drama', 'Sci-Fi']"


In [9]:
df.drop(['domestic_gross_x'], axis=1, inplace = True)

In [10]:
df.head(2)

Unnamed: 0,movie_title,rank,title,release_year,domestic_opening_gross,budget,domestic_gross_y,international_gross,worldwide_gross,runtime_minutes,rating,release_month,release_date,genres
1,Star Wars: Episode VII - The Force Awakens,1,Star Wars: Episode VII - The Force Awakens,2015,247966675,245000000,936662225,1131791908,2068454133,138.0,PG-13,12,2015-12-16,"['Action', 'Adventure', 'Sci-Fi']"
2,Avengers: Endgame,2,Avengers: Endgame,2019,357115007,356000000,858373000,1939427564,2797800564,181.0,PG-13,4,2019-04-24,"['Action', 'Adventure', 'Drama', 'Sci-Fi']"


In [11]:
df.drop(['title'], axis=1, inplace = True)

In [12]:
df.head(2)

Unnamed: 0,movie_title,rank,release_year,domestic_opening_gross,budget,domestic_gross_y,international_gross,worldwide_gross,runtime_minutes,rating,release_month,release_date,genres
1,Star Wars: Episode VII - The Force Awakens,1,2015,247966675,245000000,936662225,1131791908,2068454133,138.0,PG-13,12,2015-12-16,"['Action', 'Adventure', 'Sci-Fi']"
2,Avengers: Endgame,2,2019,357115007,356000000,858373000,1939427564,2797800564,181.0,PG-13,4,2019-04-24,"['Action', 'Adventure', 'Drama', 'Sci-Fi']"


In [13]:
df.rename( columns={'domestic_gross_y':'domestic_gross'}, inplace=True )

In [14]:
df.head()

Unnamed: 0,movie_title,rank,release_year,domestic_opening_gross,budget,domestic_gross,international_gross,worldwide_gross,runtime_minutes,rating,release_month,release_date,genres
1,Star Wars: Episode VII - The Force Awakens,1,2015,247966675,245000000,936662225,1131791908,2068454133,138.0,PG-13,12,2015-12-16,"['Action', 'Adventure', 'Sci-Fi']"
2,Avengers: Endgame,2,2019,357115007,356000000,858373000,1939427564,2797800564,181.0,PG-13,4,2019-04-24,"['Action', 'Adventure', 'Drama', 'Sci-Fi']"
3,Avatar,3,2009,77025481,237000000,760507625,2029931467,2790439092,162.0,PG-13,12,2009-12-16,"['Action', 'Adventure', 'Fantasy', 'Sci-Fi']"
4,Black Panther,4,2018,202003951,0,700426566,647171407,1347597973,134.0,PG-13,2,2018-02-13,"['Action', 'Adventure', 'Sci-Fi']"
5,Avengers: Infinity War,5,2018,257698183,0,678815482,1369544272,2048359754,149.0,PG-13,4,2018-04-25,"['Action', 'Adventure', 'Sci-Fi']"


In [15]:
df.rename( columns={'runtime_minutes':'runtime'}, inplace=True )

In [16]:
df.shape

(996, 13)

In [17]:
movies = df.replace('?', np.NaN).dropna().reset_index(drop=True)

In [18]:
movies.shape

(850, 13)

In [19]:
movies.drop(movies[movies['release_year'] < 2000].index, inplace = True)

In [20]:
movies.shape

(690, 13)

In [21]:
movies['budget'].max()

356000000

In [22]:
movies['budget'].min()

0

In [None]:
#dropna didnt't address the null values in budget

In [23]:
movies.drop(movies[movies['budget'] == 0].index, inplace = True)

In [24]:
movies['worldwide_gross'].max()

2797800564

In [25]:
movies['worldwide_gross'].min()

86086881

In [26]:
movies.shape

(637, 13)

In [27]:
movies.head(2)

Unnamed: 0,movie_title,rank,release_year,domestic_opening_gross,budget,domestic_gross,international_gross,worldwide_gross,runtime,rating,release_month,release_date,genres
0,Star Wars: Episode VII - The Force Awakens,1,2015,247966675,245000000,936662225,1131791908,2068454133,138.0,PG-13,12,2015-12-16,"['Action', 'Adventure', 'Sci-Fi']"
1,Avengers: Endgame,2,2019,357115007,356000000,858373000,1939427564,2797800564,181.0,PG-13,4,2019-04-24,"['Action', 'Adventure', 'Drama', 'Sci-Fi']"


In [None]:
action = ['Action', 'Sport','Adventure']
family = ['Family', 'Drama', 'Commedy', 'Romance']
documentary = ['History','Documentary','Biography']
crime_thriller = ['crime', 'thriller']
fantasy = ['fantasy', 'Sci-Fi']
animation = ['Animation']
others = ['others']
def genre(c)
    for g in movies['genres']:
        if g in action:
            return 'Action'
        if g in family:
            return 'Family'
        if g in documentary:
            return 'Documentary'
        if g in crime_thriller:
            return 'Crime'
        if g in fantasy:
            return 'Fantasy'
        if g in animation:
            return 'Animation'
        else:
            return 'Others'

In [39]:
action = ['Action', 'Sport','Adventure']
family = ['Family', 'Drama', 'Commedy', 'Romance']
documentary = ['History','Documentary','Biography']
crime_thriller = ['crime', 'thriller']
fantasy = ['fantasy', 'Sci-Fi']
animation = ['Animation']
others = ['others']
def genre(c)
    for g in movies['genres']:
        if g in action:
            movies['genre'].append(g)
        if g in family:
            movies['genre'].append(g)
        if g in documentary:
            movies['genre'].apppend(g)
        if g in crime_thriller:
            movies['genre'].append(g)
        if g in fantasy:
            movies['genre'].append(g)
        if g in animation:
            movies['genre'].append(g)
        else:
            movies['genre']+=['others']
  

SyntaxError: invalid syntax (<ipython-input-39-3f57cbbf23f3>, line 8)

In [38]:
movies.head()

Unnamed: 0,movie_title,rank,release_year,domestic_opening_gross,budget,domestic_gross,international_gross,worldwide_gross,runtime,rating,release_month,release_date,genres,genre
0,Star Wars: Episode VII - The Force Awakens,1,2015,247966675,245000000,936662225,1131791908,2068454133,138.0,PG-13,12,2015-12-16,"['Action', 'Adventure', 'Sci-Fi']",othersothersothersothersothersothersothersothe...
1,Avengers: Endgame,2,2019,357115007,356000000,858373000,1939427564,2797800564,181.0,PG-13,4,2019-04-24,"['Action', 'Adventure', 'Drama', 'Sci-Fi']",othersothersothersothersothersothersothersothe...
2,Avatar,3,2009,77025481,237000000,760507625,2029931467,2790439092,162.0,PG-13,12,2009-12-16,"['Action', 'Adventure', 'Fantasy', 'Sci-Fi']",othersothersothersothersothersothersothersothe...
6,Jurassic World,7,2015,208806270,150000000,652295625,1018130819,1670426444,124.0,PG-13,6,2015-06-10,"['Action', 'Adventure', 'Sci-Fi']",othersothersothersothersothersothersothersothe...
7,The Avengers,8,2012,207438708,220000000,623357910,895457605,1518815515,143.0,PG-13,4,2012-04-25,"['Action', 'Adventure', 'Sci-Fi']",othersothersothersothersothersothersothersothe...


# Initial assessments and visualizing using statsmodels, matplotlib and Seaborn

In [None]:
sns.heatmap(movies.corr(), cmap='seismic', annot = True, vmin = -1, vmax = 1)

In [None]:
sns.pairplot(movies, plot_kws=dict(alpha=0.1, edgecolor='none'))

For this project I would like to predict movie worldwide revenue, at the time we know the revenues from the domestic opening. I take the total domestic revenue and international revenues out from the dataset  as those data won't be available at upon domestic opening.  

In [None]:
lm = smf.ols('worldwide_gross ~ release_year + domestic_opening_gross + budget  + runtime_minutes + release_month', data = movies)
fit = lm.fit()
fit.summary()
        

Is this a good R^2 for movie industry? Notice the significant negative intercept. Is this correlation distorted by outliers such as may be movies that make huge losses?

In [None]:
min(movies['worldwide_gross'] - movies['budget'])

Among the features, release year and month yield the highest standard errors followed by runtime. However if I take those out from the feature variables, I'll be left only with purely financial features budget and the revenu from the domestic opening. Let's see by taking out those features, one at a time.

In [None]:
# Whithout release month:

lm = smf.ols('worldwide_gross ~ release_year + domestic_opening_gross + budget  + runtime_minutes', data = movies)
fit1 = lm.fit()
fit1.summary()

In [None]:
#Putting back the release month and taking out the release year:

lm = smf.ols('worldwide_gross ~ release_month + domestic_opening_gross + budget  + runtime_minutes', data = movies)
fit2 = lm.fit()
fit2.summary()

In [None]:
#Putting back the release month and taking out the running tme:

lm = smf.ols('worldwide_gross ~ release_year + domestic_opening_gross + budget  + release_month', data = movies)
fit3 = lm.fit()
fit3.summary()

Taking out the time features resulted in worst R^2 so I keep them all for further analysis. I will adding a new random variable 'rand'

In [None]:
movies['rand'] = np.randint[1:1000000]

In [None]:
plt.figure(figsize=(12,8))
plt.scatter(fit.predict(),fit.resid);
fit.resid.plot(style = 'o', figsize=(10,6));

# Regression with sklearn

In [None]:
lr = LinearRegression()
X = movies[['release_year', 'domestic_opening_gross', 'budget', 'runtime_minutes', 'release_month']]
y = movies[['worldwide_gross']]
lr.fit(X, y)


In [None]:
sns.pairplot(smaller_df, plot_kws=dict(alpha=0.1, edgecolor='none'))

In [None]:
lr.score(X, y)
print(f"R^2 = {lr.score(X,y)}")

In [None]:
adj_R2 = 1 - (1 - lr.score(X, y))*(len(y) - 1)/(len(y) - X.shape[1]-1)
print(f"Adjusted R^2 = {adj_R2}")

In [None]:
print(f"Intercept : {lr.intercept_}")

In [None]:
print(f"Other coefficients : {lr.coef_}")

In [None]:
fit.save('movie_data_sm_model.pkl')

Let's first try applying some function to the target feature:

In [None]:
movies['log_revenue'] = np.log(movies.worldwide_gross)
lsm = smf.ols('log_revenue ~ release_year + domestic_opening_gross + budget  + runtime_minutes + release_month', data = movies)
fit1 = lsm.fit()
fit1.summary()

In [None]:
movies['sq_revenue'] = np.square(movies.worldwide_gross)
lsm = smf.ols('sq_revenue ~ release_year + domestic_opening_gross + budget  + runtime_minutes + release_month', data = movies)
fit2 = lsm.fit()
fit2.summary()

In [None]:
movies['sq_budget'] = np.square(movies.budget)
lsm = smf.ols('worldwide_gross ~ + sq_budget + release_year + domestic_opening_gross + runtime_minutes', data = movies)
fit3 = lsm.fit()
fit3.summary()

In [None]:
movies['sin_budget'] = np.sin(movies.budget)
lsm = smf.ols('worldwide_gross ~ + sin_budget + release_year + domestic_opening_gross + runtime_minutes +release_month', data = movies)
fit4 = lsm.fit()
fit4.summary()

In [None]:
movies['sq_opening'] = np.square(movies.domestic_opening_gross)
lsm = smf.ols('worldwide_gross ~ sq_opening + budget + release_year + runtime_minutes + release_month', data = movies)
fit5 = lsm.fit()
fit5.summary()

So far no luck with the financial features...

In [None]:
movies['d2k_year'] = 2000/(movies.release_year)
lsm = smf.ols('worldwide_gross ~ d2k_year + domestic_opening_gross + budget + runtime_minutes + release_month', data = movies)
fit6 = lsm.fit()
fit6.summary()

In [None]:
movies['sq_month'] = np.square(movies.release_month)
lsm = smf.ols('worldwide_gross ~ sq_month + release_year + domestic_opening_gross + budget + runtime_minutes', data = movies)
fit7 = lsm.fit()
fit7.summary()

In [None]:
movies['sq_runtime'] = np.square(movies.runtime_minutes)
lsm = smf.ols('worldwide_gross ~ sq_runtime + domestic_opening_gross + budget + release_year + release_month', data = movies)
fit6 = lsm.fit()
fit6.summary()

## Fitting polynomial functions

### Generating random data

In [None]:
from IPython.core.pylabtools import figsize
figsize(5,5)
plt.style.use('fivethirtyeight')

np.random.seed(9)

def f(x):
    ''' this is a function to generate random number '''
    return np.sin(2* np.pi * x)

x_plot = np.linspace(0, 1, 100)
n_samples = 100
X = np.random.uniform(0, 1, size=n_samples)[:, np.newaxis]
y = f(X) + np.random.normal(scale=0.3, size=n_samples)[:, np.newaxis]

fig,ax = plt.subplots(1,1);
ax.plot(x_plot, f(x_plot), label='ground thruth', color='green')
ax.scatter(X, y, label='data', s=100)
ax.set_ylim(-2,2)
ax.set_xlim(0,1)
ax.set_ylabel('y')
ax.set_xlabel('x')
ax.legend();


In [None]:
def plot_approximation(est, ax, label = None):
    
    """ Plot the approximation of ``est`` on axis ``ax``."""
    
    ax.plot(x_plot, f(x_plot),label = 'ground truth', color='green')
    ax.scatter(X, y, s=100)
    ax.plot(x_plot, est.predict(x_plot[:, np.newaxis]), color='red', label=label)
    ax.set_ylim((-2,2))
    ax.set_xlim((0,1))
    ax.set_ylabel('y')
    ax.set_xlabel('x')
    ax.legend(loc='upper right', frameon=True)
    
fig, ax = plt.subplots(1,1)
degree =3

est = make_pipeline(PolynomialFeatures(degree), LinearRegression())
est.fit(X,y)
plot_approximation(est, ax, label='degree=%d'% degree)
    

In [None]:
fig, ax = plt.subplots(1,1)
degree = 9
est = make_pipeline(PolynomialFeatures(degree), LinearRegression())
est.fit(X,y)
plot_approximation(est, ax, label='degree=%d' % degree)

In [None]:
fig, ax_rows = plt.subplots(5, 2, figsize=(15, 20))
for degree in range(10):
    est = make_pipeline(PolynomialFeatures(degree), LinearRegression())
    est.fit(X, y)
    # This sets the appropriate axis for each degree (KEEP)
    ax_row_left, ax_row_right = ax_rows[degree//2]
    if degree%2 == 0:
        ax = ax_row_left
    else:
        ax = ax_row_right
    plot_approximation(est, ax, label='degree=%d' % degree)

In [None]:
np.savez('poly_data.npz', X, y)