In [5]:
import pandas as pd



file_path = "IMDb Movies India.csv" 
df = pd.read_csv(file_path, encoding='latin1')




df['Year'] = df['Year'].str.extract('(\d{4})').astype(float)



df['Duration'] = pd.to_numeric(df['Duration'].str.extract('(\d+)')[0], errors='coerce')



df['Votes'] = pd.to_numeric(df['Votes'].str.replace(',', '', regex=True), errors='coerce')



num_cols = ['Year', 'Duration', 'Votes', 'Rating']
for col in num_cols: df[col].fillna(df[col].median(), inplace=True)

cat_cols = ['Genre', 'Director', 'Actor 1', 'Actor 2', 'Actor 3'] 
for col in cat_cols: df[col].fillna(df[col].mode()[0], inplace=True)



director_avg_rating = df.groupby('Director')['Rating'].mean()
df['Director_Success_Rate'] = df['Director'].map(director_avg_rating)

genre_avg_rating = df.groupby('Genre')['Rating'].mean() 
df['Genre_Avg_Rating'] = df['Genre'].map(genre_avg_rating)



for actor_col in ['Actor 1', 'Actor 2', 'Actor 3']: actor_avg_rating = df.groupby(actor_col)['Rating'].mean() 
df[f'{actor_col}_Avg_Rating'] = df[actor_col].map(actor_avg_rating)



df.to_csv("Cleaned_IMDb_Movies_India.csv", index=False)



df.head()

Unnamed: 0,Name,Year,Duration,Genre,Rating,Votes,Director,Actor 1,Actor 2,Actor 3,Director_Success_Rate,Genre_Avg_Rating,Actor 3_Avg_Rating
0,,1991.0,131.0,Drama,6.0,55.0,J.S. Randhawa,Manmauji,Birbal,Rajendra Bhatia,5.85,6.088963,6.0
1,#Gadhvi (He thought he was Gandhi),2019.0,109.0,Drama,7.0,8.0,Gaurav Bakshi,Rasika Dugal,Vivek Ghamande,Arvind Jangid,7.0,6.088963,7.0
2,#Homecoming,2021.0,90.0,"Drama, Musical",6.0,55.0,Soumyajit Majumdar,Sayani Gupta,Plabita Borthakur,Roy Angana,6.0,6.366667,6.0
3,#Yaaram,2019.0,110.0,"Comedy, Romance",4.4,35.0,Ovais Khan,Prateik,Ishita Raj,Siddhant Kapoor,4.4,5.762143,4.45
4,...And Once Again,2010.0,105.0,Drama,6.0,55.0,Amol Palekar,Rajat Kapoor,Rituparna Sengupta,Antara Mali,6.285714,6.088963,5.6


In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Load the dataset
file_path = "IMDb Movies India.csv"
df = pd.read_csv(file_path, encoding='latin1')

# Data Cleaning
# Extract numeric values from 'Year'
df['Year'] = df['Year'].str.extract('(\d{4})').astype(float)

# Convert 'Duration' to numeric
df['Duration'] = pd.to_numeric(df['Duration'].str.extract('(\d+)')[0], errors='coerce')

# Convert 'Votes' to numeric
df['Votes'] = pd.to_numeric(df['Votes'].str.replace(',', '', regex=True), errors='coerce')

# Handle missing values
num_cols = ['Year', 'Duration', 'Votes', 'Rating']
for col in num_cols:
    df[col].fillna(df[col].median(), inplace=True)

cat_cols = ['Genre', 'Director', 'Actor 1', 'Actor 2', 'Actor 3']
for col in cat_cols:
    df[col].fillna(df[col].mode()[0], inplace=True)

# Feature Engineering
# Director Success Rate
director_avg_rating = df.groupby('Director')['Rating'].mean()
df['Director_Success_Rate'] = df['Director'].map(director_avg_rating)

# Genre-Based Average Rating
genre_avg_rating = df.groupby('Genre')['Rating'].mean()
df['Genre_Avg_Rating'] = df['Genre'].map(genre_avg_rating)

# Actor Influence (Average rating of movies featuring the actor)
for actor_col in ['Actor 1', 'Actor 2', 'Actor 3']:
    actor_avg_rating = df.groupby(actor_col)['Rating'].mean()
    df[f'{actor_col}_Avg_Rating'] = df[actor_col].map(actor_avg_rating)

# Encode categorical features
label_encoders = {}
for col in ['Genre', 'Director', 'Actor 1', 'Actor 2', 'Actor 3']:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

# Define features and target
X = df.drop(columns=['Rating', 'Name'])
y = df['Rating']

# Split dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize numerical features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Train a Random Forest Regressor model
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate model performance
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")

# Save cleaned data
df.to_csv("Cleaned_IMDb_Movies_India.csv", index=False)

# Display first few rows
df.head()




Mean Squared Error: 0.22373385428755638


Unnamed: 0,Name,Year,Duration,Genre,Rating,Votes,Director,Actor 1,Actor 2,Actor 3,Director_Success_Rate,Genre_Avg_Rating,Actor 1_Avg_Rating,Actor 2_Avg_Rating,Actor 3_Avg_Rating
0,,1991.0,131.0,299,6.0,55.0,1926,2250,800,3108,5.85,6.088963,6.0,5.625714,6.0
1,#Gadhvi (He thought he was Gandhi),2019.0,109.0,299,7.0,8.0,1548,3280,4790,527,7.0,6.088963,6.85,7.0,7.0
2,#Homecoming,2021.0,90.0,351,6.0,55.0,5123,3713,2866,3450,6.0,6.366667,6.333333,6.9,6.0
3,#Yaaram,2019.0,110.0,228,4.4,35.0,3319,2917,1504,4020,4.4,5.762143,5.42,4.4,4.45
4,...And Once Again,2010.0,105.0,299,6.0,55.0,385,3112,3462,405,6.285714,6.088963,6.833333,5.8,5.6
