In [61]:
#import libraries
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import BaggingRegressor, RandomForestRegressor,AdaBoostRegressor, GradientBoostingRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

In [2]:
imdb_df = pd.read_csv('cleaned_dataset.csv')
imdb_df.head()

Unnamed: 0,certificate,runtime,genre,imdb_rating,meta_score,no_of_votes,gross,director_category,star1_category,star2_category,star3_category,star4_category,decade
0,all public,142,Drama,9.3,80.0,2343110,28341469.0,average,low,low,low,low,1990's
1,all public,175,Thriller/Crime/Mystery,9.2,100.0,1620367,134966411.0,high,average,average,low,average,1970's
2,family,152,Action/Adventure,9.0,84.0,2303232,534858444.0,high,high,average,low,high,2000's
3,all public,202,Thriller/Crime/Mystery,9.0,90.0,1129952,57300000.0,high,high,average,average,average,1970's
4,all public,96,Thriller/Crime/Mystery,9.0,96.0,689845,4360000.0,high,average,low,low,low,1950's


In [3]:
# Define our target and features.
target = imdb_df['imdb_rating']
features = imdb_df.drop('imdb_rating', axis=1)

# Split dataset into categorical and numerical
categorical_features = features.select_dtypes(['object', 'bool'])
numerical_features = features.drop(categorical_features, axis=1)

In [4]:
# Now we perform the division between Train and Test, we will reserve 20% of our data to Test.
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.20, random_state=0)

In [5]:
# Initiate OneHotEncoder
ohe = OneHotEncoder(sparse_output=False)

# Split training set into categorical and numerical
X_train_cat = X_train.select_dtypes(['object', 'bool'])
X_train_num = X_train.drop(X_train_cat, axis=1)

# Fit OneHotEncoder with the categorical data and transform it into numerical values
ohe.fit(X_train_cat)
X_train_trans_np = ohe.transform(X_train_cat)

# Create a dataframe using the transformed values and the original index
X_train_trans_df = pd.DataFrame(X_train_trans_np, columns=ohe.get_feature_names_out(), index=X_train.index)

# Concatenate the newly transformed train dataframe with the train numerical dataframe
X_train = pd.concat([X_train_trans_df, X_train_num], axis=1)
X_train.head()

Unnamed: 0,certificate_all public,certificate_family,certificate_over 18,genre_Action/Adventure,genre_Comedy,genre_Drama,genre_Other,genre_Thriller/Crime/Mystery,director_category_average,director_category_high,...,decade_1960's,decade_1970's,decade_1980's,decade_1990's,decade_2000's,decade_2010's,runtime,meta_score,no_of_votes,gross
692,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,103,63.0,488817,285761243.0
266,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,189,72.0,142110,70405498.0
17,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,133,83.0,918088,112000000.0
477,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,72,73.0,25229,193817.0
302,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,130,82.0,454203,165359751.0


In [6]:
# Split test set into categorical and numerical
X_test_cat = X_test.select_dtypes(['object', 'bool'])
X_test_num = X_test.drop(X_test_cat, axis=1)

# Transform the categorical data into numerical values
X_test_trans_np = ohe.transform(X_test_cat)

# Create a dataframe using the transformed values and the original index
X_test_trans_df = pd.DataFrame(X_test_trans_np, columns=ohe.get_feature_names_out(), index=X_test.index)

# Concatenate the newly transformed test dataframe with the test numerical dataframe
X_test = pd.concat([X_test_trans_df, X_test_num], axis=1)
X_test.head()

Unnamed: 0,certificate_all public,certificate_family,certificate_over 18,genre_Action/Adventure,genre_Comedy,genre_Drama,genre_Other,genre_Thriller/Crime/Mystery,director_category_average,director_category_high,...,decade_1960's,decade_1970's,decade_1980's,decade_1990's,decade_2000's,decade_2010's,runtime,meta_score,no_of_votes,gross
338,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,142,82.0,552493,249358727.0
142,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,95,94.0,616228,356461711.0
242,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,126,74.0,77554,2086345.0
235,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,88,72.0,50610,238507.0
468,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,97,66.0,357026,45875171.0


In [7]:
normalizer = StandardScaler()
normalizer.fit(X_train)

X_train_norm = normalizer.transform(X_train)
X_test_norm = normalizer.transform(X_test)

X_train_norm = pd.DataFrame(X_train_norm, columns = X_train.columns, index = X_train.index)
X_train_norm.head()

X_test_norm = pd.DataFrame(X_test_norm, columns = X_test.columns, index = X_test.index)
X_test_norm.head()

Unnamed: 0,certificate_all public,certificate_family,certificate_over 18,genre_Action/Adventure,genre_Comedy,genre_Drama,genre_Other,genre_Thriller/Crime/Mystery,director_category_average,director_category_high,...,decade_1960's,decade_1970's,decade_1980's,decade_1990's,decade_2000's,decade_2010's,runtime,meta_score,no_of_votes,gross
338,0.932123,-0.624695,-0.475191,1.704145,-0.418638,-0.772424,-0.320179,-0.383251,-0.723923,1.53393,...,-0.235702,-0.274721,-0.352506,-0.46685,1.614805,-0.581402,0.686116,0.36777,0.53417,1.55259
142,0.932123,-0.624695,-0.475191,-0.586805,-0.418638,-0.772424,3.123254,-0.383251,1.381363,-0.65192,...,-0.235702,-0.274721,-0.352506,-0.46685,-0.61927,1.719981,-1.098451,1.316674,0.716934,2.518059
242,0.932123,-0.624695,-0.475191,-0.586805,-0.418638,1.294626,-0.320179,-0.383251,1.381363,-0.65192,...,-0.235702,-0.274721,-0.352506,-0.46685,-0.61927,1.719981,0.078604,-0.264833,-0.827743,-0.676423
235,0.932123,-0.624695,-0.475191,-0.586805,-0.418638,-0.772424,-0.320179,2.609256,1.381363,-0.65192,...,-0.235702,-0.274721,-0.352506,-0.46685,1.614805,-0.581402,-1.364238,-0.422984,-0.905007,-0.69308
468,-1.07282,1.600781,-0.475191,-0.586805,2.388699,-0.772424,-0.320179,-0.383251,1.381363,-0.65192,...,-0.235702,-0.274721,2.836833,-0.46685,-0.61927,-0.581402,-1.022512,-0.897436,-0.026342,-0.281693


## KNN Regression

In [65]:
knn = KNeighborsRegressor(n_neighbors=13) # K=10

In [67]:
knn.fit(X_train_norm, y_train)

In [70]:
#We are going to evaluate our model performance with R-Squared
print(f"The R2 of the model is {knn.score(X_test_norm, y_test): .2f}")

The R2 of the model is  0.21


## Linear Regression

In [9]:
lin_reg = LinearRegression()

In [10]:
lin_reg.fit(X_train_norm, y_train) # determines the b0 and b1's values

In [11]:
pred = lin_reg.predict(X_test_norm)

print(f"MAE {mean_absolute_error(pred, y_test): .2f}") #pred = model prediction , y_test=Real value
print(f"RMSE, {mean_squared_error(pred, y_test, squared=False): .2f}")
print(f"R2 score, {lin_reg.score(X_test_norm, y_test): .2f}")

MAE  0.16
RMSE,  0.21
R2 score,  0.51


In [12]:
lin_reg_coef = {feature : coef for feature, coef in zip(X_train_norm.columns, lin_reg.coef_)}
lin_reg_coef

{'certificate_all public': -171791745.76525,
 'certificate_family': -154768066.16720518,
 'certificate_over 18': -133521303.15047899,
 'genre_Action/Adventure': 26429051385.218636,
 'genre_Comedy': 21567636299.607735,
 'genre_Drama': 29291804636.380005,
 'genre_Other': 17583504037.641365,
 'genre_Thriller/Crime/Mystery': 20233072772.891037,
 'director_category_average': 56182403932.02573,
 'director_category_high': 54111670005.454834,
 'director_category_low': 56701181365.09023,
 'star1_category_average': 48547660913.98557,
 'star1_category_high': 44434138256.615036,
 'star1_category_low': 53201243014.71217,
 'star2_category_average': 24505492209.419964,
 'star2_category_high': 13354724584.32857,
 'star2_category_low': 26359481175.55464,
 'star3_category_average': 24053709259.646652,
 'star3_category_high': 12700916783.42791,
 'star3_category_low': 26092242144.359417,
 'star4_category_average': -7554758262.068928,
 'star4_category_high': -1731440148.522738,
 'star4_category_low': -7706

## Decision Tree

In [14]:
tree = DecisionTreeRegressor(max_depth=10)

In [15]:
tree.fit(X_train_norm, y_train)

In [16]:
X_train_norm

Unnamed: 0,certificate_all public,certificate_family,certificate_over 18,genre_Action/Adventure,genre_Comedy,genre_Drama,genre_Other,genre_Thriller/Crime/Mystery,director_category_average,director_category_high,...,decade_1960's,decade_1970's,decade_1980's,decade_1990's,decade_2000's,decade_2010's,runtime,meta_score,no_of_votes,gross
692,0.932123,-0.624695,-0.475191,-0.586805,2.388699,-0.772424,-0.320179,-0.383251,1.381363,-0.65192,...,-0.235702,-0.274721,-0.352506,2.142017,-0.61927,-0.581402,-0.794695,-1.134662,0.351576,1.880737
266,-1.072820,1.600781,-0.475191,-0.586805,-0.418638,1.294626,-0.320179,-0.383251,1.381363,-0.65192,...,-0.235702,-0.274721,-0.352506,2.142017,-0.61927,-0.581402,2.470682,-0.422984,-0.642625,-0.060567
17,0.932123,-0.624695,-0.475191,-0.586805,-0.418638,1.294626,-0.320179,-0.383251,1.381363,-0.65192,...,-0.235702,3.640055,-0.352506,-0.466850,-0.61927,-0.581402,0.344390,0.446845,1.582534,0.314383
477,0.932123,-0.624695,-0.475191,-0.586805,-0.418638,-0.772424,3.123254,-0.383251,-0.723923,-0.65192,...,-0.235702,3.640055,-0.352506,-0.466850,-0.61927,-0.581402,-1.971750,-0.343908,-0.977788,-0.693483
302,-1.072820,1.600781,-0.475191,-0.586805,2.388699,-0.772424,-0.320179,-0.383251,-0.723923,-0.65192,...,-0.235702,-0.274721,-0.352506,-0.466850,-0.61927,1.719981,0.230482,0.367770,0.252318,0.795389
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
707,0.932123,-0.624695,-0.475191,-0.586805,2.388699,-0.772424,-0.320179,-0.383251,1.381363,-0.65192,...,-0.235702,3.640055,-0.352506,-0.466850,-0.61927,-0.581402,-0.452970,0.763147,-0.974611,-0.686585
192,-1.072820,1.600781,-0.475191,-0.586805,-0.418638,1.294626,-0.320179,-0.383251,1.381363,-0.65192,...,-0.235702,3.640055,-0.352506,-0.466850,-0.61927,-0.581402,0.230482,1.158523,-0.206413,-0.433813
629,-1.072820,1.600781,-0.475191,-0.586805,-0.418638,1.294626,-0.320179,-0.383251,-0.723923,-0.65192,...,-0.235702,-0.274721,-0.352506,-0.466850,-0.61927,1.719981,0.458299,0.842222,-0.091476,1.245470
559,-1.072820,-0.624695,2.104417,-0.586805,2.388699,-0.772424,-0.320179,-0.383251,1.381363,-0.65192,...,-0.235702,-0.274721,-0.352506,2.142017,-0.61927,-0.581402,-0.415000,1.000373,-0.153902,-0.489171


In [17]:
y_test

338    7.9
142    8.1
242    8.0
235    8.0
468    7.8
      ... 
693    7.6
557    7.7
97     8.3
374    7.9
483    7.8
Name: imdb_rating, Length: 143, dtype: float64

In [18]:
pred = tree.predict(X_test_norm)

print(f"MAE {mean_absolute_error(pred, y_test): .2f}")
print(f"RMSE {mean_squared_error(pred, y_test, squared=False): .2f}")
print(f"R2 score {tree.score(X_test_norm, y_test): .2f}")

MAE  0.19
RMSE  0.24
R2 score  0.31


## Bagging and Pasting

In [20]:
display(X_train_norm.shape)
bagging_reg = BaggingRegressor(DecisionTreeRegressor(max_depth=20),
                               n_estimators=100,
                               max_samples = 500)

(570, 36)

In [21]:
bagging_reg.fit(X_train_norm, y_train)

In [22]:
pred = bagging_reg.predict(X_test_norm)

print(f"MAE {mean_absolute_error(pred, y_test): .2f}")
print(f"RMSE {mean_squared_error(pred, y_test, squared=False): .2f}")
print(f"R2 score {bagging_reg.score(X_test_norm, y_test): .2f}")

MAE  0.16
RMSE  0.20
R2 score  0.56


## Random Patches

In [24]:
forest = RandomForestRegressor(n_estimators=100,
                             max_depth=20)

In [25]:
forest.fit(X_train_norm, y_train)

In [26]:
pred = forest.predict(X_test_norm)

print(f"MAE {mean_absolute_error(pred, y_test): .2f}")
print(f"RMSE {mean_squared_error(pred, y_test, squared=False): .2f}")
print(f"R2 score {forest.score(X_test_norm, y_test): .2f}")

MAE  0.15
RMSE  0.19
R2 score  0.59


## AdaBoost

In [28]:
ada_reg = AdaBoostRegressor(DecisionTreeRegressor(max_depth=20),
                            n_estimators=100)

In [29]:
ada_reg.fit(X_train_norm, y_train)

In [30]:
pred = ada_reg.predict(X_test_norm)

print(f"MAE {mean_absolute_error(pred, y_test): .2f}")
print(f"RMSE {mean_squared_error(pred, y_test, squared=False): .2f}")
print(f"R2 score {ada_reg.score(X_test_norm, y_test): .2f}")

MAE  0.15
RMSE  0.20
R2 score  0.56


## Gradient Boosting

In [32]:
gb_reg = GradientBoostingRegressor(max_depth=20,
                                   n_estimators=100)

In [33]:
gb_reg.fit(X_train_norm, y_train)

In [34]:
pred = gb_reg.predict(X_test_norm)

print(f"MAE {mean_absolute_error(pred, y_test): .2f}")
print(f"RMSE {mean_squared_error(pred, y_test, squared=False): .2f}")
print(f"R2 score {gb_reg.score(X_test_norm, y_test): .2f}")

MAE  0.19
RMSE  0.24
R2 score  0.35
