In [1]:
import pandas as pd
import numpy as np
from math import sqrt
import warnings
warnings.filterwarnings("ignore")

# sklearn
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

# model
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from xgboost.sklearn import XGBRegressor
from keras.models import Sequential
from keras.layers import Dense

In [2]:
# Load the film dataset; a comma is already read_csv's default separator.
df = pd.read_csv("../out/film.txt")

In [3]:
# Remove fully duplicated rows, keeping the first occurrence of each.
# The surviving rows keep their original index labels (no reset).
df = df.drop_duplicates(keep="first")

In [4]:
# Raw pipe-separated category strings, one entry per row of df.
gender_series = df.loc[:, 'Gender']

In [5]:
# Collect every distinct token found in the pipe-separated 'Gender' strings.
gender_set = {
    token
    for entry in gender_series
    for token in entry.split("|")
}

In [6]:
# Split each pipe-separated 'Gender' string into a list of tokens. The
# vectorised .str accessor replaces the per-row Python lambda; a single
# character separator is treated as a literal, so "|" is not a regex here.
df['gender_list'] = df['Gender'].str.split("|")

In [7]:
# Add one 0/1 indicator column per token found in 'Gender'.
# Iterating in sorted order makes the resulting column order reproducible:
# iterating the raw set follows string-hash order, which Python randomises
# per process, so the columns could be laid out differently on every
# kernel restart.
for gender in sorted(gender_set):
    df['dummy_' + gender] = df['gender_list'].apply(
        lambda tokens: 1 if gender in tokens else 0
    )

In [8]:
# Build the numeric frame by dropping the ID / Name / Year columns and the
# raw and list-valued 'Gender' columns (their dummy_* indicators remain).
non_numeric_cols = ['ID', "Name", "Year", 'Gender', 'gender_list']
df_num = df.drop(columns=non_numeric_cols)

In [9]:
# Print a summary of df_num: index range, column names, non-null counts and dtypes.
df_num.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 78 entries, 0 to 78
Data columns (total 25 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Duration           78 non-null     int64  
 1   grossBox           78 non-null     float64
 2   totalActorBox      78 non-null     float64
 3   totalDirectorBox   78 non-null     float64
 4   totalWriterBox     78 non-null     float64
 5   totalProdFirmBox   78 non-null     float64
 6   totalDistrFirmBox  78 non-null     float64
 7   dummy_Musical      78 non-null     int64  
 8   dummy_Drama        78 non-null     int64  
 9   dummy_Thriller     78 non-null     int64  
 10  dummy_Romance      78 non-null     int64  
 11  dummy_Adventure    78 non-null     int64  
 12  dummy_History      78 non-null     int64  
 13  dummy_Animation    78 non-null     int64  
 14  dummy_Action       78 non-null     int64  
 15  dummy_Crime        78 non-null     int64  
 16  dummy_Mystery      78 non-nu

In [10]:
# Positional slice of the first seven columns (Duration through
# totalDistrFirmBox in the info() listing), leaving out the dummy_* indicators.
df_num_2 = df_num.iloc[:, :7]

In [11]:
# Drop rows whose grossBox is 0, then rebuild df_num_2 from the filtered frame.
# df_num_2 was sliced from df_num before this filter ran and is the frame that
# X and y are taken from, so without the rebuild the zero-gross rows would
# still reach the model and this filter would have no effect.
df_num = df_num[df_num['grossBox'] != 0]
df_num_2 = df_num.iloc[:, 0:7]

In [12]:
# Separate the target (grossBox) from the predictors, then hold out 20% of
# the rows as a test set with a fixed seed.
target_col = "grossBox"
X = df_num_2.drop(columns=target_col)
y = df_num_2[target_col]

X_train_df, X_test_df, y_train_df, y_test_df = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [13]:
# StandardScaler is instantiated but not applied: features and target are
# passed on unscaled.
scaler = StandardScaler()

# If scaling is re-enabled, fit on the training split only and reuse that
# fit for the test split:
# X_train = scaler.fit_transform(X_train_df)
# X_test  = scaler.transform(X_test_df)

X_train, X_test = X_train_df, X_test_df
y_train, y_test = y_train_df, y_test_df

In [16]:
# RandomForestRegressor

# Hyper-parameter grid.
# - The original dict literal listed "n_estimators" twice, so the first list
#   was silently replaced by the second; only the effective list is kept.
# - min_samples_split=1 is removed: scikit-learn requires an int >= 2 (or a
#   float fraction), so no candidate using it could be fitted successfully.
rf_param_grid = {
    "n_estimators": [10, 30, 50, 100],
    "min_samples_split": [2, 3, 4, 5],
}

grid = GridSearchCV(
    RandomForestRegressor(n_jobs=-1, random_state=10),
    param_grid=rf_param_grid,
    cv=5,
)

grid.fit(X_train, y_train)
# NOTE: best_score_ is the mean cross-validated score of the best candidate
# (the regressor's default R^2), not a score on the full training set.
print("train score: ", grid.best_score_)

# best_estimator_ is the best candidate refit on all of X_train.
dtr_model = grid.best_estimator_
print("Test score: ", dtr_model.score(X_test, y_test))

# Root-mean-squared error on the held-out split.
test_mse = mean_squared_error(y_test, grid.predict(X_test))
print("RMSE: " , sqrt(test_mse))

train score:  -0.27008454241813323
Test score:  0.8214704597360097
RMSE:  123083543.46912526


In [20]:
# Two-row table: row 0 holds the rounded model predictions for the test
# split, row 1 the corresponding actual target values.
rounded_test_pred = np.round(dtr_model.predict(X_test), 0)
pd.DataFrame([rounded_test_pred, y_test])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,16404624.0,6857197.0,79604170.0,55782881.0,36107728.0,166382542.0,33177379.0,33151747.0,31279881.0,61959641.0,9155.0,159181556.0,115889407.0,51880804.0,827979100.0,67443553.0
1,9345649.0,13075926.0,48825526.0,8665577.0,193678298.0,205637183.0,10044092.0,31222161.0,85557721.0,188116796.0,193338.0,312242626.0,98203196.0,27670000.0,1236005000.0,117991727.0
