In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import GradientBoostingRegressor,RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
import joblib

In [2]:
data = pd.read_csv('/kaggle/input/imdb-india-movies/IMDb Movies India.csv', sep=',', encoding='latin1' )

In [3]:
data.head()

Unnamed: 0,Name,Year,Duration,Genre,Rating,Votes,Director,Actor 1,Actor 2,Actor 3
0,,,,Drama,,,J.S. Randhawa,Manmauji,Birbal,Rajendra Bhatia
1,#Gadhvi (He thought he was Gandhi),(2019),109 min,Drama,7.0,8.0,Gaurav Bakshi,Rasika Dugal,Vivek Ghamande,Arvind Jangid
2,#Homecoming,(2021),90 min,"Drama, Musical",,,Soumyajit Majumdar,Sayani Gupta,Plabita Borthakur,Roy Angana
3,#Yaaram,(2019),110 min,"Comedy, Romance",4.4,35.0,Ovais Khan,Prateik,Ishita Raj,Siddhant Kapoor
4,...And Once Again,(2010),105 min,Drama,,,Amol Palekar,Rajat Kapoor,Rituparna Sengupta,Antara Mali


In [4]:
data.dtypes

Name         object
Year         object
Duration     object
Genre        object
Rating      float64
Votes        object
Director     object
Actor 1      object
Actor 2      object
Actor 3      object
dtype: object

In [5]:
data.isnull().sum()

Name           0
Year         528
Duration    8269
Genre       1877
Rating      7590
Votes       7589
Director     525
Actor 1     1617
Actor 2     2384
Actor 3     3144
dtype: int64

# Preprossessing

In [6]:
data['Duration'] = data['Duration'].fillna('0').astype(str)

In [7]:
data['Year'] = data['Year'].str.replace(r'[()]', '', regex=True)
data['Duration'] = data['Duration'].str.replace(r'[min]', '', regex=True)
data['Votes'] = data['Votes'].str.replace(',', '', regex=False)

In [8]:
data['Duration'] = data['Duration'].astype(int)

In [9]:
median_duration_by_genre = data.groupby('Genre')['Duration'].median()

# Replace Duration == 0 with median duration by genre inplace
for genre, median_duration in median_duration_by_genre.items():
    data.loc[(data['Duration'] == 0) & (data['Genre'] == genre), 'Duration'] = median_duration

  data.loc[(data['Duration'] == 0) & (data['Genre'] == genre), 'Duration'] = median_duration


In [10]:
median_duration_by_director = data.groupby('Director')['Duration'].median()

# Replace Duration == 0 with median duration by Director inplace
for director, median_duration in median_duration_by_director.items():
    data.loc[(data['Duration'] == 0) & (data['Director'] == director), 'Duration'] = median_duration

In [11]:
actors = ['Actor 1','Actor 2','Actor 3']
for actor in actors:
    median_duration_by_actor = data.groupby(actor)['Duration'].median()

    for act, median_duration in median_duration_by_actor.items():
        data.loc[(data['Duration'] == 0) & (data[actor] == act), 'Duration'] = median_duration

In [12]:
data=data[(data['Duration'] >= 60) & (data['Duration'] <= 180)]

In [13]:
data = data.dropna(subset=['Rating'],axis=0)
data = data.dropna(subset=['Director'],axis=0)

In [14]:
data['Genre'] = data['Genre'].str.split(', ')
data = data.explode('Genre')
data['Genre'].fillna(data['Genre'].mode()[0], inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['Genre'].fillna(data['Genre'].mode()[0], inplace=True)


In [15]:
data = data.dropna()
data['Year'] = data['Year'].astype(int)
data['Votes'] = data['Votes'].astype(int)

In [16]:
data.head()

Unnamed: 0,Name,Year,Duration,Genre,Rating,Votes,Director,Actor 1,Actor 2,Actor 3
1,#Gadhvi (He thought he was Gandhi),2019,109.0,Drama,7.0,8,Gaurav Bakshi,Rasika Dugal,Vivek Ghamande,Arvind Jangid
3,#Yaaram,2019,110.0,Comedy,4.4,35,Ovais Khan,Prateik,Ishita Raj,Siddhant Kapoor
3,#Yaaram,2019,110.0,Romance,4.4,35,Ovais Khan,Prateik,Ishita Raj,Siddhant Kapoor
5,...Aur Pyaar Ho Gaya,1997,147.0,Comedy,4.7,827,Rahul Rawail,Bobby Deol,Aishwarya Rai Bachchan,Shammi Kapoor
5,...Aur Pyaar Ho Gaya,1997,147.0,Drama,4.7,827,Rahul Rawail,Bobby Deol,Aishwarya Rai Bachchan,Shammi Kapoor


In [17]:
data.drop(['Name'], axis=1, inplace=True)

In [18]:
mean_genre = data.groupby('Genre')['Rating'].transform('mean')
data['mean_genre'] = mean_genre

mean_director = data.groupby('Director')['Rating'].transform('mean')
data['mean_director'] = mean_director

mean_a1 = data.groupby('Actor 1')['Rating'].transform('mean')
data['mean_a1'] = mean_a1

mean_a2 = data.groupby('Actor 2')['Rating'].transform('mean')
data['mean_a2'] = mean_a2

mean_a3 = data.groupby('Actor 3')['Rating'].transform('mean')
data['mean_a3'] = mean_a3

# Model Training

In [19]:
y = data['Rating']
X = data.drop(['Rating','Genre','Director', 'Actor 1','Actor 2', 'Actor 3',], axis=1)

In [20]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), X.columns) 
    ]
)

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

In [22]:
gb_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', GradientBoostingRegressor())
])

rf_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', RandomForestRegressor())
])

In [23]:
gb_param_grid = {
    'model__n_estimators': [200,250, 300],
    'model__learning_rate': [0.2, 0.3, 0.4],
    'model__max_depth': [ 7,10,13]
}

rf_param_grid = {
    'model__n_estimators': [200,250, 300],
    'model__max_depth': [5, 10, 20],
    'model__min_samples_split': [2, 5, 10]
}

# Create GridSearchCV objects
gb_grid_search = GridSearchCV(estimator=gb_pipeline, param_grid=gb_param_grid, cv=2, scoring='r2')
rf_grid_search = GridSearchCV(estimator=rf_pipeline, param_grid=rf_param_grid, cv=2, scoring='r2')

# Fit GridSearchCV
gb_grid_search.fit(X_train, y_train)
rf_grid_search.fit(X_train, y_train)

# Get the best models and parameters
best_gb_model = gb_grid_search.best_estimator_
best_rf_model = rf_grid_search.best_estimator_

print(f"Best Gradient Boosting Parameters: {gb_grid_search.best_params_}")
print(f"Best Random Forest Parameters: {rf_grid_search.best_params_}")
print('\n\n')

# Predict with the best models
gb_y_pred = best_gb_model.predict(X_test)
rf_y_pred = best_rf_model.predict(X_test)

# Evaluate the models
gb_mse = mean_squared_error(y_test, gb_y_pred)
gb_r2 = r2_score(y_test, gb_y_pred)
print(f"Optimized Gradient Boosting Regressor Mean Squared Error: {gb_mse}")
print(f"Optimized Gradient Boosting Regressor R² Score: {gb_r2}")
print('\n\n')

rf_mse = mean_squared_error(y_test, rf_y_pred)
rf_r2 = r2_score(y_test, rf_y_pred)
print(f"Optimized Random Forest Regressor Mean Squared Error: {rf_mse}")
print(f"Optimized Random Forest Regressor R² Score: {rf_r2}")

Best Gradient Boosting Parameters: {'model__learning_rate': 0.2, 'model__max_depth': 10, 'model__n_estimators': 200}
Best Random Forest Parameters: {'model__max_depth': 20, 'model__min_samples_split': 2, 'model__n_estimators': 200}



Optimized Gradient Boosting Regressor Mean Squared Error: 0.07599843713808607
Optimized Gradient Boosting Regressor R² Score: 0.9589523932771568



Optimized Random Forest Regressor Mean Squared Error: 0.10123301772292301
Optimized Random Forest Regressor R² Score: 0.9453229137948321


In [24]:
# Save the best models
joblib.dump(best_gb_model, 'best_gb_model.pkl')
joblib.dump(best_rf_model, 'best_rf_model.pkl')

['best_rf_model.pkl']