In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np

# Load dataset
# Candidate encodings, tried in order until one decodes the file.
# NOTE(review): 'ISO-8859-1' names the same Python codec as 'latin1', so the
# third entry should never succeed where the second failed; it is kept only
# so the list stays as it was.
encodings = ['utf-8', 'latin1', 'ISO-8859-1']

for enc in encodings:
    try:
        # Only the read itself is guarded, so unrelated errors are not masked.
        data = pd.read_csv('/content/IMDb Movies India.csv', encoding=enc)
    except UnicodeDecodeError as e:
        print(f"Failed to read the file with {enc} encoding. Error: {e}")
    else:
        print(f"Successfully read the file with {enc} encoding.")
        break
else:
    # Reached only when no encoding worked. Fail here with a clear message
    # instead of hitting a NameError on `data` further down.
    raise ValueError(
        f"Could not decode the dataset with any of these encodings: {encodings}"
    )

# Display the first few rows of the dataset
print(data.head())

# Data Preprocessing
# Keep only the necessary columns and drop rows with missing values.
# The explicit copy makes `data` an independent frame, so the column
# assignment below does not trigger pandas' SettingWithCopyWarning.
data = data[['Name', 'Genre', 'Director', 'Actor 1', 'Actor 2', 'Actor 3', 'Rating']]
data = data.dropna().copy()

# Combine actor columns into a single column
# NOTE(review): the result is one string per movie ("A, B, C"), so every
# distinct ordered trio of actors becomes its own category when encoded.
data['Actors'] = data['Actor 1'] + ', ' + data['Actor 2'] + ', ' + data['Actor 3']

# Drop the individual actor columns
data = data.drop(['Actor 1', 'Actor 2', 'Actor 3'], axis=1)

# One-hot encode categorical variables
# NOTE(review): Genre cells can hold several comma-separated genres
# (e.g. 'Drama, Musical'); each full string is encoded as a separate category
# here. Together with Director and Actors this produces a very wide feature
# matrix. Consider a multi-hot encoding if model quality matters.
data = pd.get_dummies(data, columns=['Genre', 'Director', 'Actors'], drop_first=True)

# Split data into features and labels
# 'Name' is dropped from the features; 'Rating' is the regression target.
X = data.drop(['Name', 'Rating'], axis=1)
y = data['Rating']

# Use a smaller sample of data for initial testing (optional)
# Only 10% of the rows are kept; the other 90% are discarded.
X_sample, _, y_sample, _ = train_test_split(X, y, train_size=0.1, random_state=42)

# Split the sample into training and testing sets
# 80/20 split of the sample, i.e. roughly 8% / 2% of the cleaned dataset.
X_train, X_test, y_train, y_test = train_test_split(X_sample, y_sample, test_size=0.2, random_state=42)

# Train the model with parallel processing enabled (n_jobs=-1).
# n_estimators=100 is scikit-learn's default forest size, not a reduced one.
model = RandomForestRegressor(n_estimators=100, n_jobs=-1, random_state=42)
model.fit(X_train, y_train)

# Make predictions
predictions = model.predict(X_test)

# Evaluate the model
mae = mean_absolute_error(y_test, predictions)
# RMSE is the square root of the MSE, which puts it in the same units as Rating.
rmse = np.sqrt(mean_squared_error(y_test, predictions))

print(f"Mean Absolute Error: {mae}")
print(f"Root Mean Squared Error: {rmse}")


Failed to read the file with utf-8 encoding. Error: 'utf-8' codec can't decode byte 0xe9 in position 3: invalid continuation byte
Successfully read the file with latin1 encoding.
                                 Name    Year Duration            Genre  \
0                                         NaN      NaN            Drama   
1  #Gadhvi (He thought he was Gandhi)  (2019)  109 min            Drama   
2                         #Homecoming  (2021)   90 min   Drama, Musical   
3                             #Yaaram  (2019)  110 min  Comedy, Romance   
4                   ...And Once Again  (2010)  105 min            Drama   

   Rating Votes            Director       Actor 1             Actor 2  \
0     NaN   NaN       J.S. Randhawa      Manmauji              Birbal   
1     7.0     8       Gaurav Bakshi  Rasika Dugal      Vivek Ghamande   
2     NaN   NaN  Soumyajit Majumdar  Sayani Gupta   Plabita Borthakur   
3     4.4    35          Ovais Khan       Prateik          Ishita Raj   
4    