In [208]:
#import libraries
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import BaggingRegressor, RandomForestRegressor,AdaBoostRegressor, GradientBoostingRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

In [209]:
# Load the pre-cleaned dataset from a CSV located in the notebook's working
# directory (relative path) and preview its first rows.
imdb_df = pd.read_csv('cleaned_dataset.csv')
imdb_df.head()

Unnamed: 0,certificate,runtime,genre,imdb_rating,meta_score,no_of_votes,gross,director_category,star1_category,star2_category,star3_category,star4_category,decade
0,all public,142,Drama,9.3,80.0,2343110,28341469.0,average,low,low,low,low,1990's
1,all public,175,Thriller/Crime/Mystery,9.2,100.0,1620367,134966411.0,high,average,average,low,average,1970's
2,family,152,Action/Adventure,9.0,84.0,2303232,534858444.0,high,high,average,low,high,2000's
3,all public,202,Thriller/Crime/Mystery,9.0,90.0,1129952,57300000.0,high,high,average,average,average,1970's
4,all public,96,Thriller/Crime/Mystery,9.0,96.0,689845,4360000.0,high,average,low,low,low,1950's


In [210]:
# Separate the response (the IMDb rating) from the predictor columns.
target = imdb_df['imdb_rating']
features = imdb_df.drop(columns='imdb_rating')

# Partition the predictors by dtype: object/bool columns are treated as
# categorical, every remaining column as numerical.
categorical_features = features.select_dtypes(include=['object', 'bool'])
numerical_features = features.drop(columns=categorical_features.columns)

In [211]:
# Hold out 20% of the rows as the test set; the fixed seed keeps the split
# identical from one run to the next.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=0,
)

In [212]:
# One-hot encoder for the categorical predictors.
#   sparse_output=False      -> dense array, so it can be wrapped in a DataFrame below.
#   handle_unknown='ignore'  -> a level that only shows up at transform time (e.g. in
#                               the test rows) is encoded as all zeros instead of
#                               raising a ValueError.
# NOTE(review): no level is dropped (default drop=None), so the dummy columns of each
# categorical variable sum to 1 and are perfectly collinear with an intercept. That
# does not matter for the tree models but makes linear-model coefficients
# uninterpretable -- consider drop='first' if the coefficients are to be read.
ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

# Split the training set into categorical (object/bool) and numerical columns.
X_train_cat = X_train.select_dtypes(['object', 'bool'])
X_train_num = X_train.drop(columns=X_train_cat.columns)

# Learn the category levels from the training rows only and encode them.
X_train_trans_np = ohe.fit_transform(X_train_cat)

# Wrap the encoded array in a DataFrame, keeping the original row index so it
# lines up with the numerical columns.
X_train_trans_df = pd.DataFrame(X_train_trans_np, columns=ohe.get_feature_names_out(), index=X_train.index)

# Encoded categoricals first, numerical columns last.
# NOTE: this rebinds X_train, so the cell is not idempotent -- re-run the
# train/test split cell before running this one again.
X_train = pd.concat([X_train_trans_df, X_train_num], axis=1)
X_train.head()

Unnamed: 0,certificate_all public,certificate_family,certificate_over 18,genre_Action/Adventure,genre_Comedy,genre_Drama,genre_Other,genre_Thriller/Crime/Mystery,director_category_average,director_category_high,...,decade_1960's,decade_1970's,decade_1980's,decade_1990's,decade_2000's,decade_2010's,runtime,meta_score,no_of_votes,gross
692,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,103,63.0,488817,285761243.0
266,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,189,72.0,142110,70405498.0
17,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,133,83.0,918088,112000000.0
477,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,72,73.0,25229,193817.0
302,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,130,82.0,454203,165359751.0


In [213]:
# Partition the held-out predictors by dtype, mirroring the training step.
X_test_cat = X_test.select_dtypes(include=['object', 'bool'])
X_test_num = X_test.drop(columns=X_test_cat.columns)

# Encode with the encoder already fitted on the training rows (transform only,
# no refit on test data).
X_test_trans_np = ohe.transform(X_test_cat)

# Rebuild a labelled frame that shares the test rows' index.
X_test_trans_df = pd.DataFrame(
    data=X_test_trans_np,
    index=X_test.index,
    columns=ohe.get_feature_names_out(),
)

# Encoded categoricals first, numerical columns last.
# This rebinds X_test, so the cell cannot simply be run twice in a row.
X_test = pd.concat([X_test_trans_df, X_test_num], axis="columns")
X_test.head()

Unnamed: 0,certificate_all public,certificate_family,certificate_over 18,genre_Action/Adventure,genre_Comedy,genre_Drama,genre_Other,genre_Thriller/Crime/Mystery,director_category_average,director_category_high,...,decade_1960's,decade_1970's,decade_1980's,decade_1990's,decade_2000's,decade_2010's,runtime,meta_score,no_of_votes,gross
338,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,142,82.0,552493,249358727.0
142,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,95,94.0,616228,356461711.0
242,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,126,74.0,77554,2086345.0
235,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,88,72.0,50610,238507.0
468,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,97,66.0,357026,45875171.0


In [214]:
# Rescale every feature to [0, 1] using the column minima/maxima of the training
# rows only. The held-out rows are transformed with those same statistics, so
# their values may fall slightly outside [0, 1].
normalizer = MinMaxScaler()

X_train_norm = normalizer.fit_transform(X_train)
X_test_norm = normalizer.transform(X_test)

# By default the scaler returns bare NumPy arrays; restore column labels and row index.
X_train_norm = pd.DataFrame(X_train_norm, columns=X_train.columns, index=X_train.index)
X_test_norm = pd.DataFrame(X_test_norm, columns=X_test.columns, index=X_test.index)

# Only the last expression of a cell is rendered, so a single preview is kept.
X_test_norm.head()

Unnamed: 0,certificate_all public,certificate_family,certificate_over 18,genre_Action/Adventure,genre_Comedy,genre_Drama,genre_Other,genre_Thriller/Crime/Mystery,director_category_average,director_category_high,...,decade_1960's,decade_1970's,decade_1980's,decade_1990's,decade_2000's,decade_2010's,runtime,meta_score,no_of_votes,gross
338,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.421687,0.75,0.227477,0.26622
142,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.138554,0.916667,0.254974,0.380565
242,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.325301,0.638889,0.022574,0.002226
235,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.096386,0.611111,0.01095,0.000253
468,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.150602,0.527778,0.143147,0.048976


## Linear Regression

In [216]:
# Ordinary least-squares linear regression with scikit-learn's default settings.
lin_reg = LinearRegression()

In [217]:
# Fit on the scaled training features: estimates the intercept (b0) and one
# coefficient per feature column (b1..bn).
lin_reg.fit(X_train_norm, y_train)

In [218]:
# Score the linear model on the held-out rows.
pred = lin_reg.predict(X_test_norm)  # pred = model prediction, y_test = real value

# scikit-learn metrics take (y_true, y_pred) in that order.
# RMSE is computed as sqrt(MSE): the `squared=` keyword of mean_squared_error was
# deprecated in scikit-learn 1.4 and removed in 1.6.
# R2 is computed from the predictions already made (same value as lin_reg.score).
print(f"MAE {mean_absolute_error(y_test, pred): .2f}")
print(f"RMSE, {np.sqrt(mean_squared_error(y_test, pred)): .2f}")
print(f"R2 score, {r2_score(y_test, pred): .2f}")

MAE  0.16
RMSE,  0.21
R2 score,  0.51


In [219]:
# Map each feature name to its fitted coefficient for inspection.
# NOTE(review): the values within each one-hot group come out huge and almost
# identical, which looks like the effect of collinear dummy columns (the
# encoder keeps every level) -- do not interpret the magnitudes as they stand.
lin_reg_coef = dict(zip(X_train_norm.columns, lin_reg.coef_))
lin_reg_coef

{'certificate_all public': -366148383.849931,
 'certificate_family': -366148383.8514234,
 'certificate_over 18': -366148383.816413,
 'genre_Action/Adventure': -106075704646.44489,
 'genre_Comedy': -106075704646.39789,
 'genre_Drama': -106075704646.38849,
 'genre_Other': -106075704646.2754,
 'genre_Thriller/Crime/Mystery': -106075704646.39444,
 'director_category_average': -157493889312.5595,
 'director_category_high': -157493889312.58334,
 'director_category_low': -157493889312.56046,
 'star1_category_average': -113366169328.64774,
 'star1_category_high': -113366169328.69423,
 'star1_category_low': -113366169328.66234,
 'star2_category_average': 112662286052.55687,
 'star2_category_high': 112662286052.53,
 'star2_category_low': 112662286052.58669,
 'star3_category_average': 324847288757.16315,
 'star3_category_high': 324847288757.2008,
 'star3_category_low': 324847288757.2173,
 'star4_category_average': -246413639359.6195,
 'star4_category_high': -246413639359.51837,
 'star4_category_l

## Decision Tree

In [221]:
# Single regression tree limited to 10 levels. random_state is fixed (same seed
# as the train/test split) so that tie-breaking between equally good splits,
# and therefore the reported metrics, are reproducible across runs.
tree = DecisionTreeRegressor(max_depth=10, random_state=0)

In [222]:
# Grow the tree on the scaled training features.
tree.fit(X_train_norm, y_train)

In [223]:
# Inspect the scaled training matrix (pandas abbreviates the rendering of large frames).
X_train_norm

Unnamed: 0,certificate_all public,certificate_family,certificate_over 18,genre_Action/Adventure,genre_Comedy,genre_Drama,genre_Other,genre_Thriller/Crime/Mystery,director_category_average,director_category_high,...,decade_1960's,decade_1970's,decade_1980's,decade_1990's,decade_2000's,decade_2010's,runtime,meta_score,no_of_votes,gross
692,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.186747,0.486111,0.200005,0.305084
266,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.704819,0.611111,0.050426,0.075165
17,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.367470,0.763889,0.385205,0.119572
477,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.000000,0.625000,0.000000,0.000206
302,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.349398,0.750000,0.185072,0.176540
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
707,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.240964,0.819444,0.000478,0.001022
192,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.349398,0.888889,0.116055,0.030960
629,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.385542,0.833333,0.133347,0.229846
559,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.246988,0.861111,0.123955,0.024403


In [224]:
# Inspect the held-out target values the predictions are compared against.
y_test

338    7.9
142    8.1
242    8.0
235    8.0
468    7.8
      ... 
693    7.6
557    7.7
97     8.3
374    7.9
483    7.8
Name: imdb_rating, Length: 143, dtype: float64

In [225]:
# Score the decision tree on the held-out rows.
pred = tree.predict(X_test_norm)

# scikit-learn metrics take (y_true, y_pred) in that order.
# RMSE is computed as sqrt(MSE): the `squared=` keyword of mean_squared_error was
# deprecated in scikit-learn 1.4 and removed in 1.6.
# R2 is computed from the predictions already made (same value as tree.score).
print(f"MAE {mean_absolute_error(y_test, pred): .2f}")
print(f"RMSE {np.sqrt(mean_squared_error(y_test, pred)): .2f}")
print(f"R2 score {r2_score(y_test, pred): .2f}")

MAE  0.19
RMSE  0.25
R2 score  0.29


## Bagging and Pasting

In [227]:
# Show how many training rows are available before choosing max_samples.
display(X_train_norm.shape)

# Ensemble of 100 depth-20 regression trees, each fitted on 500 rows drawn from
# the training set (BaggingRegressor samples with replacement by default).
# random_state is fixed (same seed as the train/test split) so the resampling,
# and therefore the reported metrics, are reproducible across runs.
bagging_reg = BaggingRegressor(DecisionTreeRegressor(max_depth=20),
                               n_estimators=100,
                               max_samples=500,
                               random_state=0)

(570, 36)

In [228]:
# Fit the bagged ensemble on the scaled training features.
bagging_reg.fit(X_train_norm, y_train)

In [229]:
# Score the bagged ensemble on the held-out rows.
pred = bagging_reg.predict(X_test_norm)

# scikit-learn metrics take (y_true, y_pred) in that order.
# RMSE is computed as sqrt(MSE): the `squared=` keyword of mean_squared_error was
# deprecated in scikit-learn 1.4 and removed in 1.6.
# R2 is computed from the predictions already made (same value as bagging_reg.score).
print(f"MAE {mean_absolute_error(y_test, pred): .2f}")
print(f"RMSE {np.sqrt(mean_squared_error(y_test, pred)): .2f}")
print(f"R2 score {r2_score(y_test, pred): .2f}")

MAE  0.16
RMSE  0.20
R2 score  0.56


## Random Forest

In [231]:
# Random forest of 100 trees, each limited to 20 levels. random_state is fixed
# (same seed as the train/test split) so the bootstrap samples, and therefore
# the reported metrics, are reproducible across runs.
forest = RandomForestRegressor(n_estimators=100,
                               max_depth=20,
                               random_state=0)

In [232]:
# Fit the forest on the scaled training features.
forest.fit(X_train_norm, y_train)

In [233]:
# Score the random forest on the held-out rows.
pred = forest.predict(X_test_norm)

# scikit-learn metrics take (y_true, y_pred) in that order.
# RMSE is computed as sqrt(MSE): the `squared=` keyword of mean_squared_error was
# deprecated in scikit-learn 1.4 and removed in 1.6.
# R2 is computed from the predictions already made (same value as forest.score).
print(f"MAE {mean_absolute_error(y_test, pred): .2f}")
print(f"RMSE {np.sqrt(mean_squared_error(y_test, pred)): .2f}")
print(f"R2 score {r2_score(y_test, pred): .2f}")

MAE  0.16
RMSE  0.19
R2 score  0.57


## AdaBoost

In [235]:
# AdaBoost over up to 100 depth-20 regression trees. random_state is fixed (same
# seed as the train/test split) so the boosting run, and therefore the reported
# metrics, are reproducible across runs.
ada_reg = AdaBoostRegressor(DecisionTreeRegressor(max_depth=20),
                            n_estimators=100,
                            random_state=0)

In [236]:
# Fit the boosted ensemble on the scaled training features.
ada_reg.fit(X_train_norm, y_train)

In [237]:
# Score the AdaBoost ensemble on the held-out rows.
pred = ada_reg.predict(X_test_norm)

# scikit-learn metrics take (y_true, y_pred) in that order.
# RMSE is computed as sqrt(MSE): the `squared=` keyword of mean_squared_error was
# deprecated in scikit-learn 1.4 and removed in 1.6.
# R2 is computed from the predictions already made (same value as ada_reg.score).
print(f"MAE {mean_absolute_error(y_test, pred): .2f}")
print(f"RMSE {np.sqrt(mean_squared_error(y_test, pred)): .2f}")
print(f"R2 score {r2_score(y_test, pred): .2f}")

MAE  0.15
RMSE  0.20
R2 score  0.56


## Gradient Boosting

In [239]:
# Gradient boosting with 100 stages of depth-20 trees. random_state is fixed
# (same seed as the train/test split) so the fitted trees, and therefore the
# reported metrics, are reproducible across runs.
# NOTE(review): max_depth=20 is much deeper than the library default (3) for
# boosted trees and may be overfitting -- worth tuning.
gb_reg = GradientBoostingRegressor(max_depth=20,
                                   n_estimators=100,
                                   random_state=0)

In [240]:
# Fit the gradient-boosted ensemble on the scaled training features.
gb_reg.fit(X_train_norm, y_train)

In [241]:
# Score the gradient-boosted ensemble on the held-out rows.
pred = gb_reg.predict(X_test_norm)

# scikit-learn metrics take (y_true, y_pred) in that order.
# RMSE is computed as sqrt(MSE): the `squared=` keyword of mean_squared_error was
# deprecated in scikit-learn 1.4 and removed in 1.6.
# R2 is computed from the predictions already made (same value as gb_reg.score).
print(f"MAE {mean_absolute_error(y_test, pred): .2f}")
print(f"RMSE {np.sqrt(mean_squared_error(y_test, pred)): .2f}")
print(f"R2 score {r2_score(y_test, pred): .2f}")

MAE  0.19
RMSE  0.24
R2 score  0.34
