In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
movie = pd.read_csv("IMDb Movies India.csv")
print("Sample Datasets are :")
print(movie.head())

Sample Datasets are :
                                 Name    Year Duration            Genre  \
0                                         NaN      NaN            Drama   
1  #Gadhvi (He thought he was Gandhi) -2019.0  109 min            Drama   
2                         #Homecoming -2021.0   90 min   Drama, Musical   
3                             #Yaaram -2019.0  110 min  Comedy, Romance   
4                   ...And Once Again -2010.0  105 min            Drama   

   Rating Votes            Director       Actor 1             Actor 2  \
0     NaN   NaN       J.S. Randhawa      Manmauji              Birbal   
1     7.0     8       Gaurav Bakshi  Rasika Dugal      Vivek Ghamande   
2     NaN   NaN  Soumyajit Majumdar  Sayani Gupta   Plabita Borthakur   
3     4.4    35          Ovais Khan       Prateik          Ishita Raj   
4     NaN   NaN        Amol Palekar  Rajat Kapoor  Rituparna Sengupta   

           Actor 3  
0  Rajendra Bhatia  
1    Arvind Jangid  
2       Roy Angana  
3  S

In [3]:
movie.describe()

Unnamed: 0,Year,Rating
count,14981.0,7919.0
mean,-1987.012215,5.841621
std,25.416689,1.381777
min,-2022.0,1.1
25%,-2009.0,4.9
50%,-1991.0,6.0
75%,-1968.0,6.8
max,-1913.0,10.0


In [4]:
movie['Rating'].value_counts()

Rating
6.2     269
6.8     264
6.5     254
6.6     239
6.7     227
       ... 
9.6       1
1.4       1
9.7       1
10.0      1
1.1       1
Name: count, Length: 84, dtype: int64

In [5]:
print(movie.dtypes)

Name         object
Year        float64
Duration     object
Genre        object
Rating      float64
Votes        object
Director     object
Actor 1      object
Actor 2      object
Actor 3      object
dtype: object


In [6]:
print("Dataset Informations: ")
print("Number of Records : ")
print(movie.shape[0])
print("Number of features : ")
print(movie.shape[1])

Dataset Informations: 
Number of Records : 
15509
Number of features : 
10


In [7]:
print("Missing values:")
print(movie.isnull().sum())
print("Total number of missing values :")
print(movie.isnull().sum().sum())

Missing values:
Name           0
Year         528
Duration    8269
Genre       1877
Rating      7590
Votes       7589
Director     525
Actor 1     1617
Actor 2     2384
Actor 3     3144
dtype: int64
Total number of missing values :
33523


In [8]:
movie.dropna(inplace=True)
print(movie.isnull().sum().sum())
print(movie.isnull().sum())

0
Name        0
Year        0
Duration    0
Genre       0
Rating      0
Votes       0
Director    0
Actor 1     0
Actor 2     0
Actor 3     0
dtype: int64


In [9]:
movie.shape

(5659, 10)

In [10]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

In [11]:
X = movie[['Genre', 'Director', 'Actor 1', 'Actor 2', 'Actor 3']]
y = movie['Rating']

In [16]:
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
X_encoded = encoder.fit_transform(X)
scaler = StandardScaler()
X_encoded_scaled = scaler.fit_transform(X_encoded)
X_train, X_test, y_train, y_test = train_test_split(X_encoded_scaled, y, test_size=0.2, random_state=42)
model = RandomForestRegressor(n_estimators=50, random_state=42, n_jobs=-1)  
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
new_movie = pd.DataFrame({
    'Genre': ['Action'],
    'Director': ['Christopher Nolan'],
    'Actor 1': ['Christian Bale'],
    'Actor 2': ['Michael Caine'],
    'Actor 3': ['Heath Ledger']
})
new_movie_encoded = encoder.transform(new_movie)
new_movie_encoded_scaled = scaler.transform(new_movie_encoded)
predicted_rating = model.predict(new_movie_encoded_scaled)
print(f"Mean Squared Error: {mse}")
print(f"R-squared Score: {r2}")
print(f"Predicted Rating for the new movie: {predicted_rating[0]}")


Mean Squared Error: 1.6158156925795049
R-squared Score: 0.1274085395376381
Predicted Rating for the new movie: 5.111999999999998
