In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

## Load The Data

In [2]:
movies = pd.read_csv('/kaggle/input/movies-dataset/IMDb Movies India.csv',  encoding = ('ISO-8859-1'))

In [3]:
movies.head()

Unnamed: 0,Name,Year,Duration,Genre,Rating,Votes,Director,Actor 1,Actor 2,Actor 3
0,,,,Drama,,,J.S. Randhawa,Manmauji,Birbal,Rajendra Bhatia
1,#Gadhvi (He thought he was Gandhi),(2019),109 min,Drama,7.0,8.0,Gaurav Bakshi,Rasika Dugal,Vivek Ghamande,Arvind Jangid
2,#Homecoming,(2021),90 min,"Drama, Musical",,,Soumyajit Majumdar,Sayani Gupta,Plabita Borthakur,Roy Angana
3,#Yaaram,(2019),110 min,"Comedy, Romance",4.4,35.0,Ovais Khan,Prateik,Ishita Raj,Siddhant Kapoor
4,...And Once Again,(2010),105 min,Drama,,,Amol Palekar,Rajat Kapoor,Rituparna Sengupta,Antara Mali


In [4]:
# Number of Rows
movies.shape[0]

15509

### Let's check for the missing values

In [5]:
movies.isnull().any()

Name        False
Year         True
Duration     True
Genre        True
Rating       True
Votes        True
Director     True
Actor 1      True
Actor 2      True
Actor 3      True
dtype: bool

#### Looks like except for 'Name', all other features have missing values!!

In [6]:
movies.isnull().sum()

Name           0
Year         528
Duration    8269
Genre       1877
Rating      7590
Votes       7589
Director     525
Actor 1     1617
Actor 2     2384
Actor 3     3144
dtype: int64

In [7]:
# Target variable
y = movies['Rating']
y.head()

0    NaN
1    7.0
2    NaN
3    4.4
4    NaN
Name: Rating, dtype: float64

In [8]:
y = y.fillna(0)
y.shape

(15509,)

#### Director count

In [9]:
dir = movies['Director'].value_counts()
d = dir.head(10)
d

Director
Jayant Desai        58
Kanti Shah          57
Babubhai Mistry     50
Mahesh Bhatt        48
Master Bhagwan      47
Nanabhai Bhatt      46
Dhirubhai Desai     46
B.R. Ishara         44
David Dhawan        44
Mohammed Hussain    44
Name: count, dtype: int64

#### Actor 1 count

In [10]:
act_1 = movies['Actor 1'].value_counts()
a_1 = act_1.head(10)
a_1

Actor 1
Ashok Kumar           158
Dharmendra            140
Jeetendra             140
Mithun Chakraborty    133
Amitabh Bachchan      129
Rajesh Khanna         122
Dev Anand              93
Shashi Kapoor          91
Akshay Kumar           88
Sanjeev Kumar          83
Name: count, dtype: int64

#### Actor 2 count

In [11]:
act_2 = movies['Actor 2'].value_counts()
a_2 = act_2.head(10)
a_2

Actor 2
Rekha                 83
Hema Malini           72
Mithun Chakraborty    63
Dharmendra            61
Mala Sinha            48
Helen                 48
Mumtaz                45
Reena Roy             45
Jaya Prada            44
Shabana Azmi          44
Name: count, dtype: int64

#### Actor 3 count

In [12]:
act_3 = movies['Actor 3'].value_counts()
a_3 = act_3.head(1)
a_3

Actor 3
Pran    91
Name: count, dtype: int64

#### Different Genre count

In [13]:
genre_count = movies['Genre'].value_counts()
g = genre_count.head(10)
g

Genre
Drama                   2780
Action                  1289
Thriller                 779
Romance                  708
Drama, Romance           524
Comedy                   495
Action, Crime, Drama     455
Drama, Family            418
Horror                   322
Action, Drama            316
Name: count, dtype: int64

In [14]:
X = movies.drop(['Rating'], axis = 1)

In [15]:
X.head()

Unnamed: 0,Name,Year,Duration,Genre,Votes,Director,Actor 1,Actor 2,Actor 3
0,,,,Drama,,J.S. Randhawa,Manmauji,Birbal,Rajendra Bhatia
1,#Gadhvi (He thought he was Gandhi),(2019),109 min,Drama,8.0,Gaurav Bakshi,Rasika Dugal,Vivek Ghamande,Arvind Jangid
2,#Homecoming,(2021),90 min,"Drama, Musical",,Soumyajit Majumdar,Sayani Gupta,Plabita Borthakur,Roy Angana
3,#Yaaram,(2019),110 min,"Comedy, Romance",35.0,Ovais Khan,Prateik,Ishita Raj,Siddhant Kapoor
4,...And Once Again,(2010),105 min,Drama,,Amol Palekar,Rajat Kapoor,Rituparna Sengupta,Antara Mali


In [16]:
# Let's see how the model goes with dropping 'na' values
X = X.fillna(0)
X.head()

Unnamed: 0,Name,Year,Duration,Genre,Votes,Director,Actor 1,Actor 2,Actor 3
0,,0,0,Drama,0,J.S. Randhawa,Manmauji,Birbal,Rajendra Bhatia
1,#Gadhvi (He thought he was Gandhi),(2019),109 min,Drama,8,Gaurav Bakshi,Rasika Dugal,Vivek Ghamande,Arvind Jangid
2,#Homecoming,(2021),90 min,"Drama, Musical",0,Soumyajit Majumdar,Sayani Gupta,Plabita Borthakur,Roy Angana
3,#Yaaram,(2019),110 min,"Comedy, Romance",35,Ovais Khan,Prateik,Ishita Raj,Siddhant Kapoor
4,...And Once Again,(2010),105 min,Drama,0,Amol Palekar,Rajat Kapoor,Rituparna Sengupta,Antara Mali


In [17]:
X.shape[0]

15509

In [18]:
X.isnull().any()

Name        False
Year        False
Duration    False
Genre       False
Votes       False
Director    False
Actor 1     False
Actor 2     False
Actor 3     False
dtype: bool

In [19]:
actors = pd.concat([X['Actor 1'], X['Actor 2'], X['Actor 3']])

In [20]:
top_ten_actors = actors.value_counts().head(10) # On the basis of number of movies done
top_ten_actors

0                     7145
Mithun Chakraborty     241
Dharmendra             231
Ashok Kumar            227
Jeetendra              179
Amitabh Bachchan       178
Rekha                  142
Rajesh Khanna          139
Shashi Kapoor          133
Shatrughan Sinha       131
Name: count, dtype: int64

In [21]:
X = X.astype('str')

### We will do Ordinal Encoding for converting categorical data to numerical data

In [22]:
from sklearn.preprocessing import OrdinalEncoder
oe = OrdinalEncoder()

In [23]:
X = oe.fit_transform(X)

### Split the data into training and testing

In [24]:
from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.3, random_state = 42)

#### It's a regression problem , and we will use Random Forest Regressor as it is one of the most powerful model

In [25]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score

In [26]:
# Model
model = RandomForestRegressor()

In [27]:
# Fit the model
model.fit(X_train,y_train)

# Predict the model
y_pred = model.predict(X_test)

In [28]:
# Check the performance of the model
r2 = r2_score(y_test,y_pred)
r2

0.919023856681358

#### Random Forest gives a good performance!!