In [54]:
#import libraries
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler, StandardScaler

In [100]:
# Load the cleaned movie dataset. The path is relative, so the CSV is looked up
# in the kernel's current working directory.
imdb_df = pd.read_csv("new_cleaned.csv")

In [101]:
# Size of the loaded frame as (rows, columns).
imdb_df.shape

(714, 13)

In [102]:
# Per-column dtype and non-null count; used to spot columns that need cleaning
# or type conversion before modelling.
imdb_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 714 entries, 0 to 713
Data columns (total 13 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   released_year  714 non-null    object 
 1   certificate    714 non-null    object 
 2   runtime        714 non-null    int64  
 3   genre          714 non-null    object 
 4   imdb_rating    714 non-null    float64
 5   meta_score     714 non-null    float64
 6   director       714 non-null    object 
 7   star1          714 non-null    object 
 8   star2          714 non-null    object 
 9   star3          714 non-null    object 
 10  star4          714 non-null    object 
 11  no_of_votes    714 non-null    int64  
 12  gross          714 non-null    float64
dtypes: float64(3), int64(2), object(8)
memory usage: 72.6+ KB


In [103]:
# Preview the first rows of the raw frame.
imdb_df.head()

Unnamed: 0,released_year,certificate,runtime,genre,imdb_rating,meta_score,director,star1,star2,star3,star4,no_of_votes,gross
0,1994,A,142,Drama,9.3,80.0,Frank Darabont,Tim Robbins,Morgan Freeman,Bob Gunton,William Sadler,2343110,28341469.0
1,1972,A,175,"Crime, Drama",9.2,100.0,Francis Ford Coppola,Marlon Brando,Al Pacino,James Caan,Diane Keaton,1620367,134966411.0
2,2008,UA,152,"Action, Crime, Drama",9.0,84.0,Christopher Nolan,Christian Bale,Heath Ledger,Aaron Eckhart,Michael Caine,2303232,534858444.0
3,1974,A,202,"Crime, Drama",9.0,90.0,Francis Ford Coppola,Al Pacino,Robert De Niro,Robert Duvall,Diane Keaton,1129952,57300000.0
4,1957,U,96,"Crime, Drama",9.0,96.0,Sidney Lumet,Henry Fonda,Lee J. Cobb,Martin Balsam,John Fiedler,689845,4360000.0


In [151]:
# Distinct certificate labels present in the data.
# NOTE(review): the recorded output lists both 'UA' and 'U/A' - possibly the same
# rating spelled two ways; confirm whether they should be merged before encoding.
imdb_df.certificate.unique()

array(['A', 'UA', 'U', 'R', 'G', 'PG-13', 'PG', 'Passed', 'Approved',
       'TV-PG', 'U/A', 'GP'], dtype=object)

In [149]:
# Number of rows (films) per director, to gauge the cardinality of the column.
imdb_df.groupby('director').size()

director
Abdellatif Kechiche    1
Abhishek Kapoor        1
Adam McKay             1
Akira Kurosawa         2
Alan J. Pakula         1
                      ..
Yimou Zhang            1
Yoshiaki Kawajiri      1
Yôjirô Takita          1
Zack Snyder            2
Zoya Akhtar            1
Length: 402, dtype: int64

In [80]:
# Keep only the rows whose 'released_year' parses as a number.
# `.copy()` makes the filtered frame independent of the frame it was sliced
# from, so the column assignment below writes to a real frame and does not
# trigger pandas' SettingWithCopyWarning.
year_is_numeric = pd.to_numeric(imdb_df['released_year'], errors='coerce').notna()
imdb_df = imdb_df.loc[year_is_numeric].copy()

# Convert years to integers
imdb_df['released_year'] = imdb_df['released_year'].astype(int)

# Map a release year onto the label of the decade it belongs to
def categorize_decade(year):
    """Return the decade label for ``year``, e.g. ``1994`` -> ``"1990's"``.

    The year is floored to the nearest multiple of ten and suffixed with "'s".
    """
    decade_start = 10 * (year // 10)
    return f"{decade_start}'s"

# Derive the decade label of every film from its release year
imdb_df['decade'] = imdb_df['released_year'].apply(categorize_decade)

# Columns that are not used as model inputs: the raw year (replaced by 'decade')
# plus the genre, director and cast columns
unused_columns = [
    'released_year',
    'genre',
    'director',
    'star1',
    'star2',
    'star3',
    'star4',
]
imdb_df = imdb_df.drop(columns=unused_columns)

# Preview the reduced frame
imdb_df.head()

Unnamed: 0,certificate,runtime,imdb_rating,meta_score,no_of_votes,gross,decade
0,A,142,9.3,80.0,2343110,28341469.0,1990's
1,A,175,9.2,100.0,1620367,134966411.0,1970's
2,UA,152,9.0,84.0,2303232,534858444.0,2000's
3,A,202,9.0,90.0,1129952,57300000.0,1970's
4,U,96,9.0,96.0,689845,4360000.0,1950's


In [82]:
# Separate the column we want to predict from the columns used to predict it.
features = imdb_df.drop(columns=['imdb_rating'])
target = imdb_df['imdb_rating']

In [113]:
# One-hot encode the categorical columns and rejoin them with the numeric ones.

# Split dataset into categorical and numerical
categorical_features = features.select_dtypes(['object', 'bool'])
numerical_features = features.drop(columns=categorical_features.columns)

# Levels of each categorical column, in order of first appearance. They are
# read from `categorical_features` itself so this cell does not depend on any
# other frame being left over in the kernel.
cat = [list(categorical_features[col].unique()) for col in categorical_features]

ohe = OneHotEncoder(sparse_output=False, categories=cat)

# Fit the encoder on the categorical data and transform it into indicator columns
categorical_features_trans_np = ohe.fit_transform(categorical_features)

# Wrap the encoded array in a DataFrame that keeps the original row index, so
# it lines up row-for-row with `numerical_features` in the concat below
categorical_features_trans_df = pd.DataFrame(
    categorical_features_trans_np,
    columns=ohe.get_feature_names_out(),
    index=categorical_features.index,
)

# Concatenate the encoded columns with the numerical columns.
# NOTE: this rebinds `features`, so the cell expects the un-encoded frame as
# input; re-run the cell that defines `features` before re-running this one.
features = pd.concat([categorical_features_trans_df, numerical_features], axis=1)
features.head()

Unnamed: 0,certificate_A,certificate_Approved,certificate_G,certificate_GP,certificate_PG,certificate_PG-13,certificate_Passed,certificate_R,certificate_TV-PG,certificate_U,...,decade_1960's,decade_1970's,decade_1980's,decade_1990's,decade_2000's,decade_2010's,runtime,meta_score,no_of_votes,gross
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,142,80.0,2343110,28341469.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,175,100.0,1620367,134966411.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,152,84.0,2303232,534858444.0
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,202,90.0,1129952,57300000.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,96,96.0,689845,4360000.0


In [84]:
# Hold out 20% of the rows as a test set; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=0,
)

In [119]:
# Standardise the features. The scaler is fitted on the training split only and
# then applied unchanged to the test split, so no test-set statistics are used.
normalizer = StandardScaler()

# The scaler returns plain arrays; rebuild DataFrames with the original column
# names and row index.
X_train_norm = pd.DataFrame(
    normalizer.fit_transform(X_train),
    columns=X_train.columns,
    index=X_train.index,
)
X_test_norm = pd.DataFrame(
    normalizer.transform(X_test),
    columns=X_test.columns,
    index=X_test.index,
)

# Only the last expression of a cell is rendered, so preview once, at the end.
X_test_norm.head()

Unnamed: 0,certificate_A,certificate_Approved,certificate_G,certificate_GP,certificate_PG,certificate_PG-13,certificate_Passed,certificate_R,certificate_TV-PG,certificate_U,...,decade_1960's,decade_1970's,decade_1980's,decade_1990's,decade_2000's,decade_2010's,runtime,meta_score,no_of_votes,gross
338,-0.592208,-0.103142,-0.11931,-0.041922,-0.158682,-0.227289,-0.103142,-0.475191,-0.041922,1.736112,...,-0.235702,-0.274721,-0.352506,-0.46685,1.614805,-0.581402,0.686116,0.36777,0.53417,1.55259
142,-0.592208,-0.103142,-0.11931,-0.041922,-0.158682,-0.227289,-0.103142,-0.475191,-0.041922,1.736112,...,-0.235702,-0.274721,-0.352506,-0.46685,-0.61927,1.719981,-1.098451,1.316674,0.716934,2.518059
242,-0.592208,-0.103142,-0.11931,-0.041922,-0.158682,-0.227289,-0.103142,-0.475191,-0.041922,1.736112,...,-0.235702,-0.274721,-0.352506,-0.46685,-0.61927,1.719981,0.078604,-0.264833,-0.827743,-0.676423
235,-0.592208,-0.103142,-0.11931,-0.041922,-0.158682,-0.227289,-0.103142,-0.475191,-0.041922,1.736112,...,-0.235702,-0.274721,-0.352506,-0.46685,1.614805,-0.581402,-1.364238,-0.422984,-0.905007,-0.69308
468,-0.592208,-0.103142,-0.11931,-0.041922,-0.158682,-0.227289,-0.103142,-0.475191,-0.041922,-0.576,...,-0.235702,-0.274721,2.836833,-0.46685,-0.61927,-0.581402,-1.022512,-0.897436,-0.026342,-0.281693


##  Regression

In [714]:
# We will use a KNN regressor with the hyperparameter n_neighbors=13.

In [86]:
from sklearn.neighbors import KNeighborsRegressor

In [139]:
# Number of nearest neighbours whose targets are combined for each prediction.
N_NEIGHBORS = 13

knn = KNeighborsRegressor(n_neighbors=N_NEIGHBORS)

In [140]:
# Fit the regressor on the standardised training features and their ratings.
knn.fit(X_train_norm, y_train)

In [141]:
# Evaluate the model on the held-out test split with R-squared.
r2_test = knn.score(X_test_norm, y_test)

# Use ':.2f' rather than ': .2f': the space sign flag pads non-negative values
# with a leading blank, which printed a double space before the number.
print(f"The R2 of the model is {r2_test:.2f}")

The R2 of the model is  0.31
