In [62]:
# preprocessing
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# models
from sklearn.linear_model import LassoCV
from sklearn.linear_model import RidgeCV

# evaluation metrics
from sklearn.metrics import mean_squared_error
from sklearn.metrics import explained_variance_score

In [34]:
# Load the cleaned movie table keyed by the "id" column and discard the
# free-text "title" column before inspecting the remaining dtypes.
df = (
    pd.read_csv("movie_data/cleaned_data.csv", index_col="id")
    .drop(columns="title")
)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5255 entries, 862 to 63281
Data columns (total 19 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Unnamed: 0          5255 non-null   int64  
 1   revenue             5255 non-null   float64
 2   vote_average        5255 non-null   float64
 3   budget              5255 non-null   float64
 4   runtime             5255 non-null   float64
 5   original_language   5255 non-null   object 
 6   is_franchise        5255 non-null   bool   
 7   genre               5255 non-null   object 
 8   production_company  5255 non-null   object 
 9   country             5255 non-null   object 
 10  release_month       5255 non-null   int64  
 11  release_year        5255 non-null   int64  
 12  actor_1             5255 non-null   object 
 13  actor_2             5255 non-null   object 
 14  actor_3             5255 non-null   object 
 15  Director            5255 non-null   object 
 16  Pro

In [35]:
# Convert every column from position 5 onward to the pandas "category" dtype
# so that the later one-hot encoding step treats them all as categorical.
#
# This uses DataFrame.astype with a per-column mapping instead of assigning
# through `df.iloc[:, 5:] = ...`. Positional column assignment writes into the
# existing columns where it can, and on recent pandas releases that can leave
# the original dtypes (object / bool / int64) in place, silently changing
# which columns get encoded downstream. astype returns a new frame with the
# requested dtypes, and re-running the cell is a no-op.
df = df.astype({col: "category" for col in df.columns[5:]})
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5255 entries, 862 to 63281
Data columns (total 19 columns):
 #   Column              Non-Null Count  Dtype   
---  ------              --------------  -----   
 0   Unnamed: 0          5255 non-null   int64   
 1   revenue             5255 non-null   float64 
 2   vote_average        5255 non-null   float64 
 3   budget              5255 non-null   float64 
 4   runtime             5255 non-null   float64 
 5   original_language   5255 non-null   category
 6   is_franchise        5255 non-null   category
 7   genre               5255 non-null   category
 8   production_company  5255 non-null   category
 9   country             5255 non-null   category
 10  release_month       5255 non-null   category
 11  release_year        5255 non-null   category
 12  actor_1             5255 non-null   category
 13  actor_2             5255 non-null   category
 14  actor_3             5255 non-null   category
 15  Director            5255 non-null  

In [36]:
# Remove the leftover "Unnamed: 0" column (the first column reported by
# df.info() above). Dropping it by name rather than by position keeps this
# cell idempotent: a positional `iloc[:, 1:]` slice would strip `revenue` on a
# second run. errors="ignore" makes a re-run after the column is gone a no-op.
df = df.drop(columns="Unnamed: 0", errors="ignore")
df.head()

Unnamed: 0_level_0,revenue,vote_average,budget,runtime,original_language,is_franchise,genre,production_company,country,release_month,release_year,actor_1,actor_2,actor_3,Director,Producer,Screenwriter,keyword
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
862,373554033.0,7.7,30000000.0,81.0,en,True,Animation,Pixar Animation Studios,United States of America,10,1995,Tom Hanks,Tim Allen,Don Rickles,John Lasseter,Bonnie Arnold,Joss Whedon,jealousy
8844,262797249.0,6.9,65000000.0,104.0,en,False,Adventure,TriStar Pictures,United States of America,12,1995,Robin Williams,Jonathan Hyde,Kirsten Dunst,Joe Johnston,Larry J. Franco,Jonathan Hensleigh,board game
31357,81452156.0,6.1,16000000.0,127.0,en,False,Comedy,Twentieth Century Fox Film Corporation,United States of America,12,1995,Whitney Houston,Angela Bassett,Loretta Devine,Forest Whitaker,Ronald Bass,Ronald Bass,based on novel
949,187436818.0,7.7,60000000.0,170.0,en,False,Action,Regency Enterprises,United States of America,12,1995,Al Pacino,Robert De Niro,Val Kilmer,Michael Mann,Art Linson,Michael Mann,robbery
9091,64350171.0,5.5,35000000.0,106.0,en,False,Action,Universal Pictures,United States of America,12,1995,Jean-Claude Van Damme,Powers Boothe,Dorian Harewood,Peter Hyams,Moshe Diamant,Karen Elise Baldwin,terrorist


In [37]:
# One-hot encode the frame with pandas' default column selection; the numeric
# columns are carried over unchanged. Preview the first five rows.
df_dummies = pd.get_dummies(data=df)
df_dummies.head(n=5)

Unnamed: 0_level_0,revenue,vote_average,budget,runtime,original_language_af,original_language_bm,original_language_ca,original_language_cn,original_language_da,original_language_de,...,keyword_writer,keyword_writing,keyword_wyoming,keyword_yacht,keyword_yakuza,keyword_yuppie,keyword_zombie,keyword_zoo,keyword_любовь,keyword_绝地奶霸
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
862,373554033.0,7.7,30000000.0,81.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8844,262797249.0,6.9,65000000.0,104.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
31357,81452156.0,6.1,16000000.0,127.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
949,187436818.0,7.7,60000000.0,170.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9091,64350171.0,5.5,35000000.0,106.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [40]:
# Target: revenue. Features: every encoded column except the target itself
# and vote_average.
y = df_dummies["revenue"]
X = df_dummies.drop(columns=["revenue", "vote_average"])

# Hold out 30% of the rows for evaluation; the seed is fixed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=29,
)

In [42]:
# Fit a Lasso regression on the training split, using 10-fold cross-validation.
# NOTE(review): the features are passed in unscaled here — confirm whether
# standardizing them before applying the L1 penalty is intended.
lasso_model = LassoCV(cv=10, random_state=29)
lasso_model = lasso_model.fit(X=X_train, y=y_train)

In [64]:
# Predict on the held-out rows and report the root mean squared error
# (same units as revenue).
y_pred = lasso_model.predict(X=X_test)
np.sqrt(mean_squared_error(y_true=y_test, y_pred=y_pred))

121811612.3846579

In [63]:
# Explained variance of the held-out predictions.
explained_variance_score(y_true=y_test, y_pred=y_pred)

0.4994107397968124