# CODSOFT INTERNSHIP

**TASK - 2: Movie Rating Prediction**

Author: Arkansh Tripathi

Domain: Data Science

Batch: September-2024

In [None]:
# Import Libraries for data processing and modelling

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

In [None]:
# Open the Colab file-upload widget. The returned mapping is indexed by
# filename in the next cell to obtain the uploaded file's bytes.
from google.colab import files
uploaded = files.upload()

Saving IMDb Movies India.csv to IMDb Movies India.csv


In [None]:
# Read the uploaded CSV straight from memory into a DataFrame
import io

csv_bytes = uploaded['IMDb Movies India.csv']
csv_buffer = io.BytesIO(csv_bytes)
imdb_df = pd.read_csv(csv_buffer, encoding='latin1')

In [None]:
# Dataset first look: preview the first 10 rows of the raw data
imdb_df.head(10)

Unnamed: 0,Name,Year,Duration,Genre,Rating,Votes,Director,Actor 1,Actor 2,Actor 3
0,,,,Drama,,,J.S. Randhawa,Manmauji,Birbal,Rajendra Bhatia
1,#Gadhvi (He thought he was Gandhi),-2019.0,109 min,Drama,7.0,8.0,Gaurav Bakshi,Rasika Dugal,Vivek Ghamande,Arvind Jangid
2,#Homecoming,-2021.0,90 min,"Drama, Musical",,,Soumyajit Majumdar,Sayani Gupta,Plabita Borthakur,Roy Angana
3,#Yaaram,-2019.0,110 min,"Comedy, Romance",4.4,35.0,Ovais Khan,Prateik,Ishita Raj,Siddhant Kapoor
4,...And Once Again,-2010.0,105 min,Drama,,,Amol Palekar,Rajat Kapoor,Rituparna Sengupta,Antara Mali
5,...Aur Pyaar Ho Gaya,-1997.0,147 min,"Comedy, Drama, Musical",4.7,827.0,Rahul Rawail,Bobby Deol,Aishwarya Rai Bachchan,Shammi Kapoor
6,...Yahaan,-2005.0,142 min,"Drama, Romance, War",7.4,1086.0,Shoojit Sircar,Jimmy Sheirgill,Minissha Lamba,Yashpal Sharma
7,.in for Motion,-2008.0,59 min,Documentary,,,Anirban Datta,,,
8,?: A Question Mark,-2012.0,82 min,"Horror, Mystery, Thriller",5.6,326.0,Allyson Patel,Yash Dave,Muntazir Ahmad,Kiran Bhatia
9,@Andheri,-2014.0,116 min,"Action, Crime, Thriller",4.0,11.0,Biju Bhaskar Nair,Augustine,Fathima Babu,Byon


In [None]:
# Size of the raw dataset as a (rows, columns) tuple
imdb_df.shape

(15509, 10)

# Data Cleaning

In [None]:
# Number of missing values per column (isna is the same check as isnull)
imdb_df.isna().sum()

Unnamed: 0,0
Name,0
Year,528
Duration,8269
Genre,1877
Rating,7590
Votes,7589
Director,525
Actor 1,1617
Actor 2,2384
Actor 3,3144


In [None]:
# Per-column dtype and non-null count, plus overall memory usage
imdb_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15509 entries, 0 to 15508
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Name      15509 non-null  object 
 1   Year      14981 non-null  float64
 2   Duration  7240 non-null   object 
 3   Genre     13632 non-null  object 
 4   Rating    7919 non-null   float64
 5   Votes     7920 non-null   object 
 6   Director  14984 non-null  object 
 7   Actor 1   13892 non-null  object 
 8   Actor 2   13125 non-null  object 
 9   Actor 3   12365 non-null  object 
dtypes: float64(2), object(8)
memory usage: 1.2+ MB


In [None]:
# Count rows that are exact duplicates of an earlier row.
# NOTE(review): no visible cell removes these duplicates afterwards; confirm
# whether that is intentional.
imdb_df.duplicated().sum()

6

In [None]:
# Discard every row that has at least one missing value (modifies imdb_df in place)
imdb_df.dropna(axis=0, how='any', inplace=True)

In [None]:
# Size of the dataset after rows with missing values were dropped
imdb_df.shape

(5659, 10)

In [None]:
# Re-check missing values per column after the null rows were dropped
imdb_df.isna().sum()

Unnamed: 0,0
Name,0
Year,0
Duration,0
Genre,0
Rating,0
Votes,0
Director,0
Actor 1,0
Actor 2,0
Actor 3,0


In [None]:
# Column labels currently present in the DataFrame
imdb_df.columns

Index(['Name', 'Year', 'Duration', 'Genre', 'Rating', 'Votes', 'Director',
       'Actor 1', 'Actor 2', 'Actor 3'],
      dtype='object')

# **Data Pre-Processing**



*   The data has to be processed to make it easier to derive insights from it; pre-processed data is also more suitable for fitting a model.

*   In this project, the data is processed column by column, as each column requires.

In [None]:
# Normalise 'Year' to a positive four-digit integer.
# The raw value may arrive as text such as "(2019)" or, as this notebook's
# recorded info()/head() outputs suggest, as a float such as -2019.0.
# Stripping only the brackets is a no-op on numeric input and leaves the year
# negative; extracting the four digits handles both forms and is safe to re-run.
imdb_df['Year'] = (
    imdb_df['Year']
    .astype(str)
    .str.extract(r'(\d{4})', expand=False)
    .astype(int)
)

In [None]:
# Strip the " min" unit from 'Duration' and parse what remains as a number
duration_text = imdb_df['Duration'].str.replace(' min', '')
imdb_df['Duration'] = pd.to_numeric(duration_text)

In [None]:
# Keep only the first listed genre of each movie; any missing value falls back
# to the most frequent value of the original 'Genre' column
most_common_genre = imdb_df['Genre'].mode()[0]
first_genre = imdb_df['Genre'].str.split(',').str[0]
imdb_df['Genre'] = first_genre.fillna(most_common_genre)

In [None]:
# Remove the thousands separators from 'Votes', then parse the text as a number
votes_text = imdb_df['Votes'].str.replace(',', '')
imdb_df['Votes'] = pd.to_numeric(votes_text)

In [None]:
# Verify the result of pre-processing: non-null counts and dtypes per column
imdb_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5659 entries, 1 to 15508
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Name      5659 non-null   object 
 1   Year      5659 non-null   int64  
 2   Duration  5659 non-null   int64  
 3   Genre     5659 non-null   object 
 4   Rating    5659 non-null   float64
 5   Votes     5659 non-null   int64  
 6   Director  5659 non-null   object 
 7   Actor 1   5659 non-null   object 
 8   Actor 2   5659 non-null   object 
 9   Actor 3   5659 non-null   object 
dtypes: float64(1), int64(3), object(6)
memory usage: 486.3+ KB


# **Data Visualization**


*   Visualization is used to show the relationships between the features present in the dataset.

*   In this part, we use multiple charts to examine the relationships between the components of the data that influence the result.

In [None]:
# Distribution of release years, normalised to a probability density
year = px.histogram(
    imdb_df,
    x='Year',
    nbins=30,
    histnorm='probability density',
)
year.show()

In [None]:
# Mean rating per release year, as a two-column frame ordered by year
avg_rating_by_year = imdb_df.groupby('Year', as_index=False)['Rating'].mean()
avg_rating_by_year.columns = ['Year', 'Rating']
avg_rating_by_year = avg_rating_by_year.sort_values('Year')

# Plot the yearly averages as a line chart
line_plot = px.line(
    avg_rating_by_year,
    x='Year',
    y='Rating',
    title='Average Rating by Year',
)
line_plot.show()

In [None]:
# Distribution of ratings, normalised to a probability density

rating_fig = px.histogram(imdb_df, x='Rating', histnorm='probability density', nbins=40)
rating_fig.update_layout(
    title_text='Rating Distribution',
    title_x=0.5,
    title_pad=dict(t=40),
    title_font=dict(size=24),
    xaxis_title='Rating',
    yaxis_title='Probability Density',
)
# Show the figure built above; the bare name `rating` is never defined in this cell.
rating_fig.show()

In [None]:
# Importing essential libraries for model building

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [None]:
# The movie title is not used as a predictor, so remove that column in place
imdb_df.drop(columns='Name', inplace=True)

In [None]:
# Encode each categorical column by the mean 'Rating' of its group: every row
# receives the average rating of all rows sharing its genre / director / actor,
# stored in a new *_Mean_Rating column.
# NOTE(review): the means are computed over the whole frame, before the
# train/test split below, and include each row's own rating. The held-out
# metrics are therefore likely optimistic; consider fitting these means on the
# training rows only.

mean_rating_features = {
    'Genre': 'Genre_Mean_Rating',
    'Director': 'Director_Mean_Rating',
    'Actor 1': 'Actor1_Mean_Rating',
    'Actor 2': 'Actor2_Mean_Rating',
    'Actor 3': 'Actor3_Mean_Rating',
}

# Insertion order of the mapping fixes the order of the new feature columns.
for source_col, feature_col in mean_rating_features.items():
    imdb_df[feature_col] = imdb_df.groupby(source_col)['Rating'].transform('mean')

In [None]:
# Separate the model inputs from the target: X is the frame with the raw
# columns and the target removed, y is the 'Rating' column.

non_feature_cols = [
    'Year', 'Votes', 'Duration', 'Rating', 'Genre',
    'Director', 'Actor 1', 'Actor 2', 'Actor 3',
]
X = imdb_df.drop(columns=non_feature_cols)
y = imdb_df['Rating']

In [None]:
# Split the dataset: 20% of the rows are held out for testing, and the fixed
# seed keeps the split reproducible

x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# **Model Building**

In [None]:
# Fit a linear regression on the training split, then predict ratings for the
# held-out rows

Model = LinearRegression().fit(x_train, y_train)
Model_pred = Model.predict(x_test)

In [None]:
# Evaluate the fitted linear regression on the held-out split.
# Each metric is computed by its own scorer: the absolute-error line must call
# mean_absolute_error, otherwise it just repeats the MSE value.

print('The performance evaluation of Linear Regression is below: ', '\n')
print('Mean Squared Error: ', mean_squared_error(y_test, Model_pred))
print('Mean absolute Error: ', mean_absolute_error(y_test, Model_pred))
print('R2 Score: ', r2_score(y_test, Model_pred))

The performance evaluation of Logistic Regression is below:  

Mean Squared Error:  0.442944722257985
Mean absolute Error:  0.442944722257985
R2 Score:  0.7607958730230165


# **Model Testing**

In [None]:
# Preview the first 10 rows of the feature matrix
X.head(10)

Unnamed: 0,Genre_Mean_Rating,Director_Mean_Rating,Actor1_Mean_Rating,Actor2_Mean_Rating,Actor3_Mean_Rating
1,6.248697,7.0,6.85,7.0,7.0
3,5.838423,4.4,5.42,4.4,4.45
5,5.838423,5.313333,4.788889,5.786667,5.872727
6,6.248697,7.383333,5.435,6.933333,6.5
8,4.6875,5.6,5.6,5.883333,5.6
9,5.511985,4.0,4.0,4.55,4.0
10,6.248697,6.2,5.48,5.45,5.233333
11,4.6875,6.95,6.066667,5.9,5.9
12,4.6875,4.907143,5.187179,5.97619,5.041667
13,5.838423,5.7,6.3,5.7,5.7


In [None]:
# Preview the first 10 target ratings (same row labels as X)
y.head(10)

Unnamed: 0,Rating
1,7.0
3,4.4
5,4.7
6,7.4
8,5.6
9,4.0
10,6.2
11,5.9
12,6.5
13,5.7


In [None]:
# Hand-made single-row sample, with values close to an existing record, used to
# try the model on unseen input.

data = {
    'Year': [2019],
    'Votes': [36],
    'Duration': [111],
    'Genre_Mean_Rating': [5.8],
    'Director_Mean_Rating': [4.5],
    'Actor1_Mean_Rating': [5.3],
    'Actor2_Mean_Rating': [4.5],
    'Actor3_Mean_Rating': [4.0],
}
new_data = pd.DataFrame(data)

In [None]:
# Restrict and order the sample to exactly the columns of X before predicting
new_data = new_data.loc[:, X.columns]
rating_prediction = Model.predict(new_data)

# Show the model's predicted rating for the single sample row
print('Predicted Rating:', rating_prediction[0])

Predicted Rating: 4.117013527224948
