In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

In [4]:
# Load dataset
file_path = "IMDb Movies India.csv"
movie_data = pd.read_csv(file_path, encoding="latin1")
movie_data

Unnamed: 0,Name,Year,Duration,Genre,Rating,Votes,Director,Actor 1,Actor 2,Actor 3
0,,,,Drama,,,J.S. Randhawa,Manmauji,Birbal,Rajendra Bhatia
1,#Gadhvi (He thought he was Gandhi),(2019),109 min,Drama,7.0,8,Gaurav Bakshi,Rasika Dugal,Vivek Ghamande,Arvind Jangid
2,#Homecoming,(2021),90 min,"Drama, Musical",,,Soumyajit Majumdar,Sayani Gupta,Plabita Borthakur,Roy Angana
3,#Yaaram,(2019),110 min,"Comedy, Romance",4.4,35,Ovais Khan,Prateik,Ishita Raj,Siddhant Kapoor
4,...And Once Again,(2010),105 min,Drama,,,Amol Palekar,Rajat Kapoor,Rituparna Sengupta,Antara Mali
...,...,...,...,...,...,...,...,...,...,...
15504,Zulm Ko Jala Doonga,(1988),,Action,4.6,11,Mahendra Shah,Naseeruddin Shah,Sumeet Saigal,Suparna Anand
15505,Zulmi,(1999),129 min,"Action, Drama",4.5,655,Kuku Kohli,Akshay Kumar,Twinkle Khanna,Aruna Irani
15506,Zulmi Raj,(2005),,Action,,,Kiran Thej,Sangeeta Tiwari,,
15507,Zulmi Shikari,(1988),,Action,,,,,,


In [5]:
# Data Cleaning
# Remove extra characters from 'Year' and convert to numeric
movie_data['Year'] = movie_data['Year'].str.extract(r'(\d{4})').astype(float)

# Remove 'min' from 'Duration' and convert to numeric
movie_data['Duration'] = movie_data['Duration'].str.replace('min', '').astype(float, errors='ignore')

# Convert 'Votes' to numeric, handling any non-numeric entries
movie_data['Votes'] = pd.to_numeric(movie_data['Votes'], errors='coerce')

In [6]:

# Fill missing values
movie_data['Duration'].fillna(movie_data['Duration'].median(), inplace=True)
movie_data['Rating'].fillna(movie_data['Rating'].median(), inplace=True)
movie_data['Votes'].fillna(movie_data['Votes'].median(), inplace=True)
movie_data['Year'].fillna(movie_data['Year'].mode()[0], inplace=True)

# Fill categorical columns with "Unknown"
categorical_columns = ['Genre', 'Director', 'Actor 1', 'Actor 2', 'Actor 3']
movie_data[categorical_columns] = movie_data[categorical_columns].fillna("Unknown")


In [7]:
# Feature Engineering
# Encode categorical features using Label Encoding
label_encoders = {}
for column in categorical_columns:
    le = LabelEncoder()
    movie_data[column] = le.fit_transform(movie_data[column])
    label_encoders[column] = le


In [8]:
# Split the dataset into features and target
X = movie_data.drop(columns=["Name", "Rating"])  # Drop irrelevant or target columns
y = movie_data["Rating"]

In [9]:
# Scale numerical features (optional for regression models)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [10]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)


In [11]:
# Model Training
# Use a Random Forest Regressor for prediction
model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)

# Predictions and Evaluation
y_pred = model.predict(X_test)

In [12]:
# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error (MSE):", mse)
print("R-squared (R2) Score:", r2)

Mean Squared Error (MSE): 0.6768844223081885
R-squared (R2) Score: 0.30391028644401774


In [13]:
# Function to predict ratings based on user input
def predict_movie_rating():
    print("\n--- Predict Movie Rating ---")
    # Take user input for features
    year = float(input("Enter the year of release (e.g., 2020): "))
    duration = float(input("Enter the movie duration in minutes: "))
    votes = float(input("Enter the number of votes: "))
    genre = input(f"Enter the genre (available options: {list(label_encoders['Genre'].classes_)}): ")
    director = input(f"Enter the director's name (available options: {list(label_encoders['Director'].classes_)}): ")
    actor1 = input(f"Enter Actor 1's name (available options: {list(label_encoders['Actor 1'].classes_)}): ")
    actor2 = input(f"Enter Actor 2's name (available options: {list(label_encoders['Actor 2'].classes_)}): ")
    actor3 = input(f"Enter Actor 3's name (available options: {list(label_encoders['Actor 3'].classes_)}): ")

    # Encode user inputs
    encoded_genre = label_encoders['Genre'].transform([genre])[0]
    encoded_director = label_encoders['Director'].transform([director])[0]
    encoded_actor1 = label_encoders['Actor 1'].transform([actor1])[0]
    encoded_actor2 = label_encoders['Actor 2'].transform([actor2])[0]
    encoded_actor3 = label_encoders['Actor 3'].transform([actor3])[0]

    # Create a feature array for prediction
    user_input = [[year, duration, votes, encoded_genre, encoded_director, encoded_actor1, encoded_actor2, encoded_actor3]]
    user_input_scaled = scaler.transform(user_input)

    # Predict the rating
    predicted_rating = model.predict(user_input_scaled)
    print(f"Predicted Movie Rating: {predicted_rating[0]:.2f}")

# Call the prediction function
predict_movie_rating()


--- Predict Movie Rating ---
Enter the year of release (e.g., 2020): 2020
Enter the movie duration in minutes: 230
Enter the number of votes: 23
Enter the genre (available options: ['Action', 'Action, Adventure', 'Action, Adventure, Biography', 'Action, Adventure, Comedy', 'Action, Adventure, Crime', 'Action, Adventure, Drama', 'Action, Adventure, Family', 'Action, Adventure, Fantasy', 'Action, Adventure, History', 'Action, Adventure, Horror', 'Action, Adventure, Musical', 'Action, Adventure, Mystery', 'Action, Adventure, Romance', 'Action, Adventure, Sci-Fi', 'Action, Adventure, Thriller', 'Action, Adventure, War', 'Action, Biography, Crime', 'Action, Biography, Drama', 'Action, Comedy', 'Action, Comedy, Crime', 'Action, Comedy, Drama', 'Action, Comedy, Fantasy', 'Action, Comedy, Horror', 'Action, Comedy, Musical', 'Action, Comedy, Mystery', 'Action, Comedy, Romance', 'Action, Comedy, Thriller', 'Action, Comedy, War', 'Action, Crime', 'Action, Crime, Drama', 'Action, Crime, Family', 

Enter the director's name (available options: ['A B Arjun', 'A. Bhimsingh', 'A. Habib', 'A. Jagannathan', 'A. Karim', 'A. Kodandarami Reddy', 'A. Majid', 'A. Muthu', 'A. Raja', 'A. Rashid', 'A. Salaam', 'A. Shakoor', 'A. Shamsheer', 'A. Veerappan', 'A. Venkatesh', 'A. Zahoor', 'A.A. Darpan', 'A.C. Trilogchander', 'A.G. Surma', 'A.H. Essa', 'A.K. Bir', 'A.K. Hangal', 'A.K. Mishra', 'A.L. Vijay', 'A.M. Khan', 'A.M.R. Ramesh', 'A.N. Bannerjee', 'A.P. Nagarajan', 'A.P. Subbaraav', 'A.R. Kabuli', 'A.R. Kardar', 'A.R. Murugadoss', 'A.R. Sheikh', 'A.R. Zamindar', 'A.S. Arora', 'A.S.A. Sami', 'A.T. Joy', 'A.T. Raghu', 'Aadesh Vishwakarma', 'Aadish Keluskar', 'Aakash Bhatia', 'Aakash Chaturvedi', 'Aakash Gupta', 'Aakkoo', 'Aamir Bashir', 'Aamir Khan', 'Aamreeta Gautam', 'Aanand L. Rai', 'Aanand Raut', 'Aanchal Malhotra', 'Aanuj Tewari', 'Aarif Sheikh', 'Aarthi Agarwal', 'Aarti S. Bagdi', 'Aarti Shrivastava', 'Aarun Nagar', 'Aaryaan Saxena', 'Aashish Chanana', 'Aashish Sinha', 'Aashnee', 'Aasika

Enter Actor 1's name (available options: ['A. Muthu', 'A. Nairang', 'A.K. Hangal', 'A.K. Misra', 'A.N. Ansari', 'A.R. Rahman', 'A.R.S.', 'A.S. Ravindra Babu', 'Aachi Manorama', 'Aadar Jain', 'Aadhi', 'Aadi Jain', 'Aadil Abedi', 'Aadil Chahal', 'Aaditya Pratap Singh', 'Aadya Bedi', 'Aafreen', 'Aahana Kumra', 'Aajesh Rekhi', 'Aakansha Kadre', 'Aakar Kaushik', 'Aakash', 'Aakash Dabhade', 'Aakash Pandey', 'Aakriti Bisaria', 'Aalam', 'Aaloka', 'Aamani', 'Aamir', 'Aamir Bashir', 'Aamir Khan', 'Aanchal Chauhan', 'Aanchal Dwivedi', 'Aarnaa Sharma', 'Aarti', 'Aarti Chhabria', 'Aarti S. Bagdi', 'Aarvi Aruna', 'Aaryaman Pratap Kushwaha', 'Aaryan', 'Aaryan Adhikari', 'Aashika', 'Aashish Bhatt', 'Aashish Chaudhary', 'Aashish Kaul', 'Aashit Chatterjee', 'Aasif Sheikh', 'Aatish Devgan', 'Aayush Shah', 'Aban Raza', 'Abbas', 'Abdul Rehman Kabuli', 'Abdula Kabuli', 'Abdullah', 'Abha Ranta', 'Abhay Bhargava', 'Abhay Chaudhary', 'Abhay Deol', 'Abhay Singhal', 'Abhi Bhattacharya', 'Abhijeet', 'Abhijeet K.'

Enter Actor 2's name (available options: ['A. Hussain', 'A. Hussein', 'A.H. Shore', 'A.K. Hangal', 'A.V.M. Rajan', 'Aachi Manorama', 'Aadar Jain', 'Aadil Khan', 'Aaditya Pratap Singh', 'Aahana Kumra', 'Aahuthi Prasad', 'Aakash Dahiya', 'Aakash Kumar Sehdev', 'Aamir', 'Aamir Bashir', 'Aamir Dalvi', 'Aamir Khan', 'Aamna Sharif', 'Aanchal Mehra', 'Aarav Mavi', 'Aarthi Agarwal', 'Aarti Bhattacharya', 'Aarti Chhabria', 'Aarushi Vedikha', 'Aarya Adhav', 'Aarya Dange', 'Aashi Rawal', 'Aashish Chanana', 'Aashish Chaudhary', 'Aatish Devgan', 'Aayat Shaikh', 'Aayush Sharma', 'Aayush Thakur', 'Abbhimanyyu Singh', 'Abdul Rehman Kabuli', 'Abdul Rehman Sheikh', 'Abha Dhulia', 'Abha Parmar', 'Abha Paul', 'Abhay Bhargava', 'Abhay Deol', 'Abhay Joshi', 'Abhay Mahajan', 'Abhi Bhattacharya', 'Abhijeet Bagalkote', 'Abhijeet Choudary', 'Abhijeet Sawant', 'Abhijit Bhagat', 'Abhilash Chaudhary', 'Abhilash Narain', 'Abhimanyu Dasani', 'Abhimanyu Mishra', 'Abhimanyu Sharma', 'Abhinav Goyal', 'Abhinav Paatekar'

Enter Actor 3's name (available options: ['A. Shah Shikarpuri', 'A.H. Shore', 'A.K. Agnihotri', 'A.K. Hangal', 'A.S. Gyani', 'Aabid Shamim', 'Aachi Manorama', 'Aadi', 'Aadil Sharma', 'Aaditya Singh', 'Aahana Kumra', 'Aaloka', 'Aamir Ali', 'Aamir Bashir', 'Aamir Ghani', 'Aamir Haque', 'Aamir Khan', 'Aamrik Arjun', 'Aanaahad', 'Aanchal Anand', 'Aanchal Kohli', 'Aaran', 'Aarav Mavi', 'Aardra Athalye', 'Aarnaa Sharma', 'Aarthi Agarwal', 'Aarti Chhabria', 'Aarti Puri', 'Aartii Naagpal', 'Aarun D. Baliean', 'Aaryan Chopra', 'Aaryan Maheshwari', 'Aaryan Menghji', 'Aashish Chaudhary', 'Aashish Sachdeva', 'Aasif Mandvi', 'Aasif Sheikh', 'Aastha Gautam', 'Aayub Khan', 'Aayush Sharma', 'Abbas', 'Abbas Ali', 'Abbas Khandwala', 'Abdul Rehman Kabuli', 'Abdul Sheikh', 'Abdullah Osman', 'Abhay Bhargava', 'Abhay Chandarana', 'Abhay Chopra', 'Abhay Deol', 'Abhay Joshi', 'Abhay Mahajan', 'Abhay Shukla', 'Abhi Bhattacharya', 'Abhijeet Ghadge', 'Abhijeet Lahiri', 'Abhijeet Sen', 'Abhijit Kelkar', 'Abhilash

Predicted Movie Rating: 6.37


