In [8]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Step 1: Load the dataset
df = pd.read_csv('Movies.csv', encoding='latin-1')

# Step 2: Handle missing values

# Convert 'Year' column to numbers and fill missing values with median
df['Year'] = df['Year'].astype(str).str.extract(r'(\d+)').astype(float)
df['Year'].fillna(df['Year'].median(), inplace=True)

# Convert 'Duration' to numbers and fill missing values with median
df['Duration'] = df['Duration'].str.replace(' min', '', regex=False).astype(float)
df['Duration'].fillna(df['Duration'].median(), inplace=True)

# Fill missing 'Genre' with the most common (mode)
df['Genre'].fillna(df['Genre'].mode()[0], inplace=True)

# Fill missing 'Rating' and 'Votes' with median values
df['Rating'].fillna(df['Rating'].median(), inplace=True)
df['Votes'] = df['Votes'].astype(str).str.extract(r'(\d+)').astype(float)
df['Votes'].fillna(df['Votes'].median(), inplace=True)

# Fill missing 'Director' and 'Actors' with "unknown"
df.fillna({'Director': 'unknown', 'Actor 1': 'unknown', 'Actor 2': 'unknown', 'Actor 3': 'unknown'}, inplace=True)

# Step 3: Convert categorical data into numbers
df['Genre'] = df['Genre'].factorize()[0]
df['Director'] = df['Director'].factorize()[0]
df['Actor 1'] = df['Actor 1'].factorize()[0]
df['Actor 2'] = df['Actor 2'].factorize()[0]
df['Actor 3'] = df['Actor 3'].factorize()[0]

# Step 4: Split data into Features (X) and Target (y)
X = df[['Genre', 'Director', 'Actor 1', 'Actor 2', 'Actor 3', 'Year', 'Duration', 'Votes']]  # Features
y = df['Rating']  # Target (Movie Rating)

# Split dataset into Training (80%) and Testing (20%)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 5: Train the model
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Step 6: Make predictions
y_pred = model.predict(X_test)

# Step 7: Evaluate the model
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Display results
print(f"Mean Absolute Error: {mae}")
print(f"Mean Squared Error: {mse}")
print(f"R² Score: {r2}")


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Year'].fillna(df['Year'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Duration'].fillna(df['Duration'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which w

Mean Absolute Error: 0.45486750483558985
Mean Squared Error: 0.6918746985815603
R² Score: 0.28849468996496996
