In [1]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf
import patsy

import seaborn as sns

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import RidgeCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_squared_error
#from sklearn.externals import joblib

import matplotlib.pyplot as plt
%matplotlib inline


# EDA

In [2]:
datafile = 'mojo_data.csv'
df = pd.read_csv(datafile, index_col=0)

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 996 entries, 0 to 199
Data columns (total 16 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Unnamed: 0.1            996 non-null    object 
 1   link_stub               996 non-null    object 
 2   rank                    996 non-null    object 
 3   title                   996 non-null    object 
 4   domestic_gross_x        996 non-null    object 
 5   release_year            996 non-null    int64  
 6   domestic_opening_gross  996 non-null    int64  
 7   budget                  996 non-null    int64  
 8   domestic_gross_y        996 non-null    int64  
 9   international_gross     996 non-null    int64  
 10  worldwide_gross         996 non-null    int64  
 11  runtime_minutes         975 non-null    float64
 12  rating                  867 non-null    object 
 13  release_month           996 non-null    int64  
 14  release_date            996 non-null    ob

In [4]:
df.rename( columns={'Unnamed: 0.1':'movie_title'}, inplace=True )
df.head(2)

Unnamed: 0,movie_title,link_stub,rank,title,domestic_gross_x,release_year,domestic_opening_gross,budget,domestic_gross_y,international_gross,worldwide_gross,runtime_minutes,rating,release_month,release_date,genres
0,Star Wars: Episode VII - The Force Awakens,/title/tt2488496/?ref_=bo_cso_table_1,1,Star Wars: Episode VII - The Force Awakens,"$936,662,225",2015,247966675,245000000,936662225,1131791908,2068454133,138.0,PG-13,12,2015-12-16,"['Action', 'Adventure', 'Sci-Fi']"
1,Avengers: Endgame,/title/tt4154796/?ref_=bo_cso_table_2,2,Avengers: Endgame,"$858,373,000",2019,357115007,356000000,858373000,1939427564,2797800564,181.0,PG-13,4,2019-04-24,"['Action', 'Adventure', 'Drama', 'Sci-Fi']"


In [5]:
df.index = np.arange(1, len(df) + 1)

In [6]:
df.head(2)

Unnamed: 0,movie_title,link_stub,rank,title,domestic_gross_x,release_year,domestic_opening_gross,budget,domestic_gross_y,international_gross,worldwide_gross,runtime_minutes,rating,release_month,release_date,genres
1,Star Wars: Episode VII - The Force Awakens,/title/tt2488496/?ref_=bo_cso_table_1,1,Star Wars: Episode VII - The Force Awakens,"$936,662,225",2015,247966675,245000000,936662225,1131791908,2068454133,138.0,PG-13,12,2015-12-16,"['Action', 'Adventure', 'Sci-Fi']"
2,Avengers: Endgame,/title/tt4154796/?ref_=bo_cso_table_2,2,Avengers: Endgame,"$858,373,000",2019,357115007,356000000,858373000,1939427564,2797800564,181.0,PG-13,4,2019-04-24,"['Action', 'Adventure', 'Drama', 'Sci-Fi']"


In [7]:
df.drop(['link_stub'], axis=1, inplace = True)

In [8]:
df.head(2)

Unnamed: 0,movie_title,rank,title,domestic_gross_x,release_year,domestic_opening_gross,budget,domestic_gross_y,international_gross,worldwide_gross,runtime_minutes,rating,release_month,release_date,genres
1,Star Wars: Episode VII - The Force Awakens,1,Star Wars: Episode VII - The Force Awakens,"$936,662,225",2015,247966675,245000000,936662225,1131791908,2068454133,138.0,PG-13,12,2015-12-16,"['Action', 'Adventure', 'Sci-Fi']"
2,Avengers: Endgame,2,Avengers: Endgame,"$858,373,000",2019,357115007,356000000,858373000,1939427564,2797800564,181.0,PG-13,4,2019-04-24,"['Action', 'Adventure', 'Drama', 'Sci-Fi']"


In [9]:
df.drop(['domestic_gross_x'], axis=1, inplace = True)

In [10]:
df.head(2)

Unnamed: 0,movie_title,rank,title,release_year,domestic_opening_gross,budget,domestic_gross_y,international_gross,worldwide_gross,runtime_minutes,rating,release_month,release_date,genres
1,Star Wars: Episode VII - The Force Awakens,1,Star Wars: Episode VII - The Force Awakens,2015,247966675,245000000,936662225,1131791908,2068454133,138.0,PG-13,12,2015-12-16,"['Action', 'Adventure', 'Sci-Fi']"
2,Avengers: Endgame,2,Avengers: Endgame,2019,357115007,356000000,858373000,1939427564,2797800564,181.0,PG-13,4,2019-04-24,"['Action', 'Adventure', 'Drama', 'Sci-Fi']"


In [11]:
df.drop(['title'], axis=1, inplace = True)

In [12]:
df.head(2)

Unnamed: 0,movie_title,rank,release_year,domestic_opening_gross,budget,domestic_gross_y,international_gross,worldwide_gross,runtime_minutes,rating,release_month,release_date,genres
1,Star Wars: Episode VII - The Force Awakens,1,2015,247966675,245000000,936662225,1131791908,2068454133,138.0,PG-13,12,2015-12-16,"['Action', 'Adventure', 'Sci-Fi']"
2,Avengers: Endgame,2,2019,357115007,356000000,858373000,1939427564,2797800564,181.0,PG-13,4,2019-04-24,"['Action', 'Adventure', 'Drama', 'Sci-Fi']"


In [13]:
df.rename( columns={'domestic_gross_y':'domestic_gross'}, inplace=True )

In [14]:
df.head()

Unnamed: 0,movie_title,rank,release_year,domestic_opening_gross,budget,domestic_gross,international_gross,worldwide_gross,runtime_minutes,rating,release_month,release_date,genres
1,Star Wars: Episode VII - The Force Awakens,1,2015,247966675,245000000,936662225,1131791908,2068454133,138.0,PG-13,12,2015-12-16,"['Action', 'Adventure', 'Sci-Fi']"
2,Avengers: Endgame,2,2019,357115007,356000000,858373000,1939427564,2797800564,181.0,PG-13,4,2019-04-24,"['Action', 'Adventure', 'Drama', 'Sci-Fi']"
3,Avatar,3,2009,77025481,237000000,760507625,2029931467,2790439092,162.0,PG-13,12,2009-12-16,"['Action', 'Adventure', 'Fantasy', 'Sci-Fi']"
4,Black Panther,4,2018,202003951,0,700426566,647171407,1347597973,134.0,PG-13,2,2018-02-13,"['Action', 'Adventure', 'Sci-Fi']"
5,Avengers: Infinity War,5,2018,257698183,0,678815482,1369544272,2048359754,149.0,PG-13,4,2018-04-25,"['Action', 'Adventure', 'Sci-Fi']"


In [15]:
df.rename( columns={'runtime_minutes':'runtime'}, inplace=True )

In [16]:
df.shape

(996, 13)

In [17]:
dummy = df.replace('?', np.NaN).dropna().reset_index(drop=True)

In [18]:
dummy.shape

(850, 13)

In [19]:
dummy.drop(dummy[dummy['release_year'] < 2000].index, inplace = True)

In [20]:
dummy.shape

(690, 13)

In [21]:
dummy['budget'].max()

356000000

In [22]:
dummy['budget'].min()

0

In [23]:
#dropna didnt't address the null values in budget

In [24]:
dummy.drop(dummy[dummy['budget'] == 0].index, inplace = True)

In [25]:
dummy['worldwide_gross'].max()

2797800564

In [26]:
dummy['worldwide_gross'].min()

86086881

In [27]:
dummy.shape

(637, 13)

In [28]:
dummy.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 637 entries, 0 to 849
Data columns (total 13 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   movie_title             637 non-null    object 
 1   rank                    637 non-null    object 
 2   release_year            637 non-null    int64  
 3   domestic_opening_gross  637 non-null    int64  
 4   budget                  637 non-null    int64  
 5   domestic_gross          637 non-null    int64  
 6   international_gross     637 non-null    int64  
 7   worldwide_gross         637 non-null    int64  
 8   runtime                 637 non-null    float64
 9   rating                  637 non-null    object 
 10  release_month           637 non-null    int64  
 11  release_date            637 non-null    object 
 12  genres                  637 non-null    object 
dtypes: float64(1), int64(7), object(5)
memory usage: 69.7+ KB


In [30]:
print(dummy.genres.value_counts())

['Action', 'Adventure', 'Sci-Fi']                            46
['Comedy']                                                   29
['Comedy', 'Romance']                                        26
['Adventure', 'Animation', 'Comedy', 'Family', 'Fantasy']    21
['Action', 'Adventure', 'Sci-Fi', 'Thriller']                19
                                                             ..
['Adventure', 'Drama', 'Family', 'Fantasy']                   1
['Action', 'Adventure', 'Fantasy', 'Horror', 'War']           1
['Animation', 'Drama', 'Family', 'Fantasy']                   1
['Action', 'Drama', 'History', 'Thriller', 'War']             1
['Action', 'Adventure', 'Fantasy', 'Sci-Fi', 'War']           1
Name: genres, Length: 237, dtype: int64


In [42]:
action = 'Action', 'Sport','Adventure'
family = 'Family', 'Drama', 'Commedy', 'Romance'
documentary = 'History','Documentary','Biography'
crime_thriller = 'crime', 'thriller'
fantasy = 'fantasy', 'Sci-Fi'
animation = 'Animation'

def classify_movie_genre(g):
    for g in dummy['genres']:
        dummy['type']=[]
        if 'Animation' in g:
            dummy['type'].apend['Animation']
        if 'Action' or 'Sport'or 'Adventure'in g:
            dummy['type'].append['Action']
            
        if 'Family' or 'Drama'or 'Commedy' or 'Romance' in g:
            dummy['type'].append['Family']
        if 'History'or'Documentary'or 'Biography'in g:
            dummy['type'].append['Documentary']
        if 'crime' or'thriller' in g:
            dummy['type'].append['Crime_Thriller']
        else:
            dummy['type'].append['Others']
                
        
    return dummy['type']

In [None]:
action = 'Action', 'Sport','Adventure'
family = 'Family', 'Drama', 'Commedy', 'Romance'
documentary = 'History','Documentary','Biography'
crime_thriller = 'crime', 'thriller'
fantasy = 'fantasy', 'Sci-Fi'
animation = 'Animation'
def classify_movie_genre(g):
    for g in dummy['genres']:
        for e in g:
            if e in action: 
                movie['class']='Action'
                dummy['genre']+= movie['class']
        for e in g:
            if e in family:
                movie['class']='Family'
                dummy['genre']+= movie['class']
        for e in g:
            if e in documentary:
                movie['class']='Documentary'
                dummy['genre']+= movie['class']
        for e in g:
            if e in crime_thriller:
                movie['class']='Crime'
                dummy['genre']+= movie['class']
        for e in g:
            if e in fantasy:
                movie['class']='Fantasy'
                dummy['genre']+= movie['class']
        if e in g:
            if e in animation:
                movie['class']='Animation'
                dummy['genre']+= movie['class']
            else:
                movie['class']='Others'
                dummy['genre']+= movie['class']
        
    return dummy['genre']

In [43]:
dummy.head()

Unnamed: 0,movie_title,rank,release_year,domestic_opening_gross,budget,domestic_gross,international_gross,worldwide_gross,runtime,rating,release_month,release_date,genres,type
0,Star Wars: Episode VII - The Force Awakens,1,2015,247966675,245000000,936662225,1131791908,2068454133,138.0,PG-13,12,2015-12-16,"['Action', 'Adventure', 'Sci-Fi']",
1,Avengers: Endgame,2,2019,357115007,356000000,858373000,1939427564,2797800564,181.0,PG-13,4,2019-04-24,"['Action', 'Adventure', 'Drama', 'Sci-Fi']",
2,Avatar,3,2009,77025481,237000000,760507625,2029931467,2790439092,162.0,PG-13,12,2009-12-16,"['Action', 'Adventure', 'Fantasy', 'Sci-Fi']",
6,Jurassic World,7,2015,208806270,150000000,652295625,1018130819,1670426444,124.0,PG-13,6,2015-06-10,"['Action', 'Adventure', 'Sci-Fi']",
7,The Avengers,8,2012,207438708,220000000,623357910,895457605,1518815515,143.0,PG-13,4,2012-04-25,"['Action', 'Adventure', 'Sci-Fi']",


In [41]:
my_list = [1,2,3]
for e in len(my_list):
    print(e)

TypeError: 'int' object is not iterable