In [None]:
# Importing the necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error

In [None]:
# Load the dataset
data_path = '/content/movies.csv'
data = pd.read_csv(data_path, encoding='latin-1')

In [None]:
# Explore the dataset
print(data.head())

                                 Name    Year Duration            Genre  \
0                                         NaN      NaN            Drama   
1  #Gadhvi (He thought he was Gandhi)  (2019)  109 min            Drama   
2                         #Homecoming  (2021)   90 min   Drama, Musical   
3                             #Yaaram  (2019)  110 min  Comedy, Romance   
4                   ...And Once Again  (2010)  105 min            Drama   

   Rating Votes            Director       Actor 1             Actor 2  \
0     NaN   NaN       J.S. Randhawa      Manmauji              Birbal   
1     7.0     8       Gaurav Bakshi  Rasika Dugal      Vivek Ghamande   
2     NaN   NaN  Soumyajit Majumdar  Sayani Gupta   Plabita Borthakur   
3     4.4    35          Ovais Khan       Prateik          Ishita Raj   
4     NaN   NaN        Amol Palekar  Rajat Kapoor  Rituparna Sengupta   

           Actor 3  
0  Rajendra Bhatia  
1    Arvind Jangid  
2       Roy Angana  
3  Siddhant Kapoor  
4    

In [None]:
print(data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15509 entries, 0 to 15508
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Name      15509 non-null  object 
 1   Year      14981 non-null  object 
 2   Duration  7240 non-null   object 
 3   Genre     13632 non-null  object 
 4   Rating    7919 non-null   float64
 5   Votes     7920 non-null   object 
 6   Director  14984 non-null  object 
 7   Actor 1   13892 non-null  object 
 8   Actor 2   13125 non-null  object 
 9   Actor 3   12365 non-null  object 
dtypes: float64(1), object(9)
memory usage: 1.2+ MB
None


In [None]:
print("Columns in the dataset:", data.columns)

Columns in the dataset: Index(['Name', 'Year', 'Duration', 'Genre', 'Rating', 'Votes', 'Director',
       'Actor 1', 'Actor 2', 'Actor 3'],
      dtype='object')


In [None]:
# Drop the irrelevant 'Name' column
data = data.drop(columns=['Name'], errors='ignore')

In [None]:
data['Year'] = pd.to_numeric(data['Year'], errors='coerce').fillna(0).astype(int)

In [None]:
# Handling categorical columns: 'Genre', 'Director', 'Actor 1', 'Actor 2', 'Actor 3'
label_encoder = LabelEncoder()

In [None]:
# Apply Label Encoding for all categorical columns
categorical_cols = ['Genre', 'Director', 'Actor 1', 'Actor 2', 'Actor 3']
for col in categorical_cols:
    data[col] = label_encoder.fit_transform(data[col])

# Checking for missing values and handle them (filling with mean for numerical columns)
# Convert all columns to numeric, coerce errors to NaN and fillna with 0
for col in data.columns:
    data[col] = pd.to_numeric(data[col], errors='coerce').fillna(0)

data.fillna(data.mean(), inplace=True)

In [None]:
print(data.head())

   Year  Duration  Genre  Rating  Votes  Director  Actor 1  Actor 2  Actor 3
0     0       0.0    299     0.0    0.0      1926     2250      800     3108
1     0       0.0    299     7.0    8.0      1548     3280     4790      527
2     0       0.0    351     0.0    0.0      5123     3713     2866     3450
3     0       0.0    228     4.4   35.0      3319     2917     1504     4020
4     0       0.0    299     0.0    0.0       385     3112     3462      405


In [None]:
# Splitting features (X) and target variable (y)
X = data.drop(columns=['Rating'])  # Features (everything except 'Rating')
y = data['Rating']  # Target variable

In [None]:
# Standardizing the features (for Support Vector Regressor)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)


In [None]:
# Defining models
linear_reg = LinearRegression()
random_forest = RandomForestRegressor(random_state=42)
svr = SVR()

In [None]:
# Train the models
linear_reg.fit(X_scaled, y)
random_forest.fit(X_scaled, y)
svr.fit(X_scaled, y)

In [None]:
# Evaluating models using cross-validation and calculating the MSE
algorithms = [('Linear Regression', linear_reg),
              ('Random Forest', random_forest),
              ('Support Vector Regressor', svr)]

for name, model in algorithms:
    mse_scores = -cross_val_score(model, X_scaled, y, cv=5, scoring='neg_mean_squared_error')
    mean_mse = mse_scores.mean()
    print(f'{name}: Mean MSE = {mean_mse:.4f}')

Linear Regression: Mean MSE = 7.6684
Random Forest: Mean MSE = 2.2168
Support Vector Regressor: Mean MSE = 5.6763


In [None]:
# Function to handle unseen labels in categorical features
def safe_label_encode(encoder, value, default=-1):
    """Safely encode a label with fallback if the label wasn't seen during training."""
    if value in encoder.classes_:
        return encoder.transform([value])[0]
    else:
        return default  # Fallback value for unseen labels

In [None]:
# Example: predicting for a new user input with potential unseen labels
user_input = {
    'Year': 2000,
    'Duration': 120,
    'Genre': safe_label_encode(label_encoder, 'Action'),  # Encode 'Action' genre using the fitted label_encoder
    'Votes': 500000,
    'Director': safe_label_encode(label_encoder, 'Christopher Nolan'),  # Safely encode 'Christopher Nolan'
    'Actor 1': safe_label_encode(label_encoder, 'Leonardo DiCaprio'),  # Safely encode 'Leonardo DiCaprio'
    'Actor 2': safe_label_encode(label_encoder, 'Joseph Gordon-Levitt'),  # Safely encode 'Joseph Gordon-Levitt'
    'Actor 3': safe_label_encode(label_encoder, 'Elliot Page')  # Safely encode 'Elliot Page'
}

In [None]:
# Convert user input to DataFrame
user_df = pd.DataFrame(user_input, index=[0])

In [None]:
# Standardize user input using the same scaler fitted on training data
user_scaled = scaler.transform(user_df)

In [None]:
# Make predictions using the trained models
linear_reg_pred = linear_reg.predict(user_scaled)
random_forest_pred = random_forest.predict(user_scaled)
svr_pred = svr.predict(user_scaled)

In [None]:
# Display predictions
print("Predicted Ratings:")
print(f'Linear Regression: {linear_reg_pred[0]:.2f}')
print(f'Random Forest: {random_forest_pred[0]:.2f}')
print(f'Support Vector Regressor: {svr_pred[0]:.2f}')

Predicted Ratings:
Linear Regression: 3075.36
Random Forest: 5.73
Support Vector Regressor: 1.78
