In [33]:
# Data Science Libs
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import ast

# model preprocessing & metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import confusion_matrix, classification_report, roc_curve, auc

# algorithm
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from xgboost import XGBClassifier

In [34]:
# Import the data - Movies Metadata
df = pd.read_csv("moviedata/Movie_meta_cleaned.csv")
df.head()

Unnamed: 0,belongs_to_collection,budget,genres,original_language,original_title,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,title,vote_average,vote_count
0,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",en,Toy Story,21.946943,"[{'name': 'Pixar Animation Studios', 'id': 3}]","[{'iso_3166_1': 'US', 'name': 'United States o...",10/30/1995,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Toy Story,7.7,5415.0
1,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",en,Jumanji,17.015539,"[{'name': 'TriStar Pictures', 'id': 559}, {'na...","[{'iso_3166_1': 'US', 'name': 'United States o...",12/15/1995,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Jumanji,6.9,2413.0
2,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",en,Grumpier Old Men,11.7129,"[{'name': 'Warner Bros.', 'id': 6194}, {'name'...","[{'iso_3166_1': 'US', 'name': 'United States o...",12/22/1995,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Grumpier Old Men,6.5,92.0
3,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",en,Waiting to Exhale,3.859495,[{'name': 'Twentieth Century Fox Film Corporat...,"[{'iso_3166_1': 'US', 'name': 'United States o...",12/22/1995,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Waiting to Exhale,6.1,34.0
4,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",en,Father of the Bride Part II,8.387519,"[{'name': 'Sandollar Productions', 'id': 5842}...","[{'iso_3166_1': 'US', 'name': 'United States o...",2/10/1995,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Father of the Bride Part II,5.7,173.0


In [35]:
#Function to cleaned the columns of lists of dicts to pull the names
def parse_column(x):
    try:
        name = ast.literal_eval(x)[0]['name']
    except: 
        name = None
    
    return(name)

In [36]:
#Create new columns for name only
df['genre'] = df.genres.apply(lambda x: parse_column(x))
df['production_company'] = df.production_companies.apply(lambda x: parse_column(x))
df['production_country'] = df.production_countries.apply(lambda x: parse_column(x))
df['spoken_language'] = df.spoken_languages.apply(lambda x: parse_column(x))

df.head()

Unnamed: 0,belongs_to_collection,budget,genres,original_language,original_title,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,title,vote_average,vote_count,genre,production_company,production_country,spoken_language
0,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",en,Toy Story,21.946943,"[{'name': 'Pixar Animation Studios', 'id': 3}]","[{'iso_3166_1': 'US', 'name': 'United States o...",10/30/1995,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Toy Story,7.7,5415.0,Animation,Pixar Animation Studios,United States of America,English
1,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",en,Jumanji,17.015539,"[{'name': 'TriStar Pictures', 'id': 559}, {'na...","[{'iso_3166_1': 'US', 'name': 'United States o...",12/15/1995,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Jumanji,6.9,2413.0,Adventure,TriStar Pictures,United States of America,English
2,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",en,Grumpier Old Men,11.7129,"[{'name': 'Warner Bros.', 'id': 6194}, {'name'...","[{'iso_3166_1': 'US', 'name': 'United States o...",12/22/1995,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Grumpier Old Men,6.5,92.0,Romance,Warner Bros.,United States of America,English
3,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",en,Waiting to Exhale,3.859495,[{'name': 'Twentieth Century Fox Film Corporat...,"[{'iso_3166_1': 'US', 'name': 'United States o...",12/22/1995,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Waiting to Exhale,6.1,34.0,Comedy,Twentieth Century Fox Film Corporation,United States of America,English
4,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",en,Father of the Bride Part II,8.387519,"[{'name': 'Sandollar Productions', 'id': 5842}...","[{'iso_3166_1': 'US', 'name': 'United States o...",2/10/1995,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Father of the Bride Part II,5.7,173.0,Comedy,Sandollar Productions,United States of America,English


In [37]:
#Correct the types of the following
df['budget'] = pd.to_numeric(df.budget, errors='coerce')
df['revenue'] = pd.to_numeric(df.revenue, errors='coerce')
df['release_date'] = pd.to_datetime(df.release_date, errors='coerce')


In [38]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45466 entries, 0 to 45465
Data columns (total 19 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   belongs_to_collection  4494 non-null   object        
 1   budget                 45463 non-null  float64       
 2   genres                 45466 non-null  object        
 3   original_language      45455 non-null  object        
 4   original_title         45466 non-null  object        
 5   popularity             45461 non-null  object        
 6   production_companies   45463 non-null  object        
 7   production_countries   45463 non-null  object        
 8   release_date           45376 non-null  datetime64[ns]
 9   revenue                45460 non-null  float64       
 10  runtime                45203 non-null  float64       
 11  spoken_languages       45460 non-null  object        
 12  title                  45460 non-null  object        
 13  v

In [39]:
#Drop useless columns
df = df.drop(columns=['belongs_to_collection', 'genres', 'original_language', 'original_title', 'production_companies', 'production_countries', 'spoken_languages'])
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45466 entries, 0 to 45465
Data columns (total 12 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   budget              45463 non-null  float64       
 1   popularity          45461 non-null  object        
 2   release_date        45376 non-null  datetime64[ns]
 3   revenue             45460 non-null  float64       
 4   runtime             45203 non-null  float64       
 5   title               45460 non-null  object        
 6   vote_average        45460 non-null  float64       
 7   vote_count          45460 non-null  float64       
 8   genre               43024 non-null  object        
 9   production_company  33585 non-null  object        
 10  production_country  39178 non-null  object        
 11  spoken_language     41631 non-null  object        
dtypes: datetime64[ns](1), float64(5), object(6)
memory usage: 4.2+ MB


In [31]:
#df = df.dropna().reset_index(drop=True)

In [40]:
df['release_year'] = df.release_date.apply(lambda x: x.year)
df['release_month'] = df.release_date.apply(lambda x: x.month)
df['release_dayofweek'] = df.release_date.apply(lambda x: x.dayofweek)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45466 entries, 0 to 45465
Data columns (total 15 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   budget              45463 non-null  float64       
 1   popularity          45461 non-null  object        
 2   release_date        45376 non-null  datetime64[ns]
 3   revenue             45460 non-null  float64       
 4   runtime             45203 non-null  float64       
 5   title               45460 non-null  object        
 6   vote_average        45460 non-null  float64       
 7   vote_count          45460 non-null  float64       
 8   genre               43024 non-null  object        
 9   production_company  33585 non-null  object        
 10  production_country  39178 non-null  object        
 11  spoken_language     41631 non-null  object        
 12  release_year        45376 non-null  float64       
 13  release_month       45376 non-null  float64   

In [None]:
#to produce weekend results
release_is_weekend(lambda x: x.dayofweek >3)