# ZOHO ASSIGNMENT 

In [2]:
#Here I Imported the necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [3]:
# Here I Red data file from the file location and stored it as a Data Frame
df = pd.read_excel("C:\\Users\\renga\\Desktop\\Zoho\\Rotten_Tomatoes_Movies3.xls\\Rotten_Tomatoes_Movies3.xls", engine='xlrd')

# Display the first few rows of the dataframe
df.head()


Unnamed: 0,movie_title,movie_info,critics_consensus,rating,genre,directors,writers,cast,in_theaters_date,on_streaming_date,runtime_in_minutes,studio_name,tomatometer_status,tomatometer_rating,tomatometer_count,audience_rating
0,Percy Jackson & the Olympians: The Lightning T...,A teenager discovers he's the descendant of a ...,Though it may seem like just another Harry Pot...,PG,"Action & Adventure, Comedy, Drama, Science Fic...",Chris Columbus,Craig Titley,"Logan Lerman, Brandon T. Jackson, Alexandra Da...",2010-02-12,2010-06-29,83.0,20th Century Fox,Rotten,49,144,53.0
1,Please Give,Kate has a lot on her mind. There's the ethics...,Nicole Holofcener's newest might seem slight i...,R,Comedy,Nicole Holofcener,Nicole Holofcener,"Catherine Keener, Amanda Peet, Oliver Platt, R...",2010-04-30,2010-10-19,90.0,Sony Pictures Classics,Certified Fresh,86,140,64.0
2,10,Blake Edwards' 10 stars Dudley Moore as George...,,R,"Comedy, Romance",Blake Edwards,Blake Edwards,"Dudley Moore, Bo Derek, Julie Andrews, Robert ...",1979-10-05,1997-08-27,118.0,Waner Bros.,Fresh,68,22,53.0
3,12 Angry Men (Twelve Angry Men),"A Puerto Rican youth is on trial for murder, a...",Sidney Lumet's feature debut is a superbly wri...,NR,"Classics, Drama",Sidney Lumet,Reginald Rose,"Martin Balsam, John Fiedler, Lee J. Cobb, E.G....",1957-04-13,2001-03-06,95.0,Criterion Collection,Certified Fresh,100,51,97.0
4,"20,000 Leagues Under The Sea","This 1954 Disney version of Jules Verne's 20,0...","One of Disney's finest live-action adventures,...",G,"Action & Adventure, Drama, Kids & Family",Richard Fleischer,Earl Felton,"James Mason, Kirk Douglas, Paul Lukas, Peter L...",1954-01-01,2003-05-20,127.0,Disney,Fresh,89,27,74.0


In [4]:
# Here I Check for missing columns and data
required_columns = ['movie_info', 'critics_consensus', 'rating', 'genre', 'directors',
                    'runtime_in_minutes', 'tomatometer_rating', 'audience_rating']

for col in required_columns:
    if col not in df.columns:
        raise ValueError(f"Missing required column: {col}")

In [5]:
# Here I Handle missing values
df['critics_consensus'] = df['critics_consensus'].fillna('')
df['movie_info'] = df['movie_info'].fillna('')
df['rating'] = df['rating'].fillna('Unknown')
df['genre'] = df['genre'].fillna('Unknown')
df['directors'] = df['directors'].fillna('Unknown')
df['runtime_in_minutes'] = df['runtime_in_minutes'].fillna(df['runtime_in_minutes'].mean())
df['tomatometer_rating'] = df['tomatometer_rating'].fillna(df['tomatometer_rating'].mean())
df['audience_rating'] = df['audience_rating'].fillna(df['audience_rating'].mean())

In [6]:
# Combine text columns into one for simplyfy the analysis
df['combined_text'] = df['movie_info'] + ' ' + df['critics_consensus']


In [7]:
# Here I Defined features and target
X = df[['combined_text', 'rating', 'genre', 'directors', 'runtime_in_minutes', 'tomatometer_rating']]
y = df['audience_rating']

In [8]:
# Here I Split dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Pipline step's and Random Forest Regressor Model

In [9]:
# Here I Define feature categories
text_feature = 'combined_text'
categorical_features = ['rating', 'genre', 'directors']
numerical_features = ['runtime_in_minutes', 'tomatometer_rating']


In [10]:
# Text preprocessing
text_transformer = Pipeline(steps=[
    ('tfidf', TfidfVectorizer(max_features=100))
])


In [11]:
# Categorical preprocessing
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

In [12]:
# Numerical preprocessing
numerical_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])


In [13]:
# Here I Combine preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('text', text_transformer, text_feature),
        ('cat', categorical_transformer, categorical_features),
        ('num', numerical_transformer, numerical_features)
    ]
)


In [14]:
# Here I Define the model pipeline
model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(n_estimators=100, random_state=42))
])


In [15]:
# Train the model
model_pipeline.fit(X_train, y_train)

In [16]:
# Make predictions
y_pred = model_pipeline.predict(X_test)

In [17]:
# Evaluate the model
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

In [18]:
# Print evaluation results
print(f"Mean Absolute Error: {mae}")
print(f"Mean Squared Error: {mse}")
print(f"R² Score: {r2}")

Mean Absolute Error: 11.629282934900159
Mean Squared Error: 212.32701922322758
R² Score: 0.47949885667147674


### Inference:
###        The result are obtained from evaluation are provided below
###        Mean Absolute Error : 11.629282934900159
###        Mean Squared Error  : 212.32701922322758
###        R² Score                      : 0.47949885667147674
###        The "Random Forest Regressor" provides moderate result for this regression task. but we can improve using other Algorithms like Gradient-Boosting-Regressor , Cat-Boost-Regressor.

# Gradient Boosting Regressor Model

In [19]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingRegressor

In [20]:

# Define the updated model pipeline
model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', GradientBoostingRegressor(random_state=42))
])


In [21]:
# Define parameter grid for tuning
param_grid = {
    'regressor__n_estimators': [100, 200, 300],
    'regressor__max_depth': [3, 5, 7],
    'regressor__learning_rate': [0.01, 0.1, 0.2]
}

In [16]:
# Perform GridSearchCV
grid_search = GridSearchCV(model_pipeline, param_grid, cv=3, scoring='r2')
grid_search.fit(X_train, y_train) 

In [17]:
# Best parameters and score
print("Best Parameters:", grid_search.best_params_)
print("Best R² Score:", grid_search.best_score_)

Best Parameters: {'regressor__learning_rate': 0.2, 'regressor__max_depth': 3, 'regressor__n_estimators': 300}
Best R² Score: 0.5110451205088199


In [18]:
# Evaluate on the test set
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

In [19]:
# Updated evaluation metrics
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Improved Mean Absolute Error: {mae}")
print(f"Improved Mean Squared Error: {mse}")
print(f"Improved R² Score: {r2}")

Improved Mean Absolute Error: 11.390041860514716
Improved Mean Squared Error: 203.43404775441564
Improved R² Score: 0.5012991995295757


# Inference:
###        The result are obtained from evaluation are provided below
###        Improved Mean Absolute Error : 11.390041860514716
###        Improved Mean Squared Error  : 203.43404775441564
###        Improved R² Score                      : 0.5012991995295757
###        The "Gradient-Boosting-Regressor" provides better result for this regression task. but we can improve using other Algorithms like Cat-Boost-Regressor or XGBRegressor.

# Cat Boost Regressor Model

In [43]:
from sklearn.model_selection import GridSearchCV
from catboost import CatBoostRegressor
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Define the updated model pipeline using CatBoost
model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', CatBoostRegressor(random_state=42, verbose=0))  # verbose=0 to suppress training output
])

# Define parameter grid for tuning
param_grid = {
    'regressor__iterations': [100, 200, 300],
    'regressor__depth': [3, 5, 7],
    'regressor__learning_rate': [0.01, 0.1, 0.2],
    'regressor__l2_leaf_reg': [1, 3, 5]  # L2 regularization parameter
}

# Perform GridSearchCV
grid_search = GridSearchCV(model_pipeline, param_grid, cv=3, scoring='r2')
grid_search.fit(X_train, y_train)

# Best parameters and score
print("Best Parameters:", grid_search.best_params_)
print("Best R² Score:", grid_search.best_score_)

# Evaluate on the test set
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

# Updated evaluation metrics
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Improved Mean Absolute Error: {mae}")
print(f"Improved Mean Squared Error: {mse}")
print(f"Improved R² Score: {r2}")


Best Parameters: {'regressor__depth': 3, 'regressor__iterations': 300, 'regressor__l2_leaf_reg': 1, 'regressor__learning_rate': 0.2}
Best R² Score: 0.5097807752707553
Improved Mean Absolute Error: 11.382840739174553
Improved Mean Squared Error: 202.5517131221731
Improved R² Score: 0.5034621658188437


### Inference:
###        The result are obtained from evaluation are provided below
###        Improved Mean Absolute Error : 11.382840739174553
###        Improved Mean Squared Error  : 202.5517131221731
###        Improved R² Score                      : 0.5034621658188437
###        The "Cat-Boost-Regressor" provides better result then above algorithm. but we can improve using other Algorithms like "XGBRegressor".

# XGBRegressor

In [40]:
from sklearn.model_selection import GridSearchCV
from xgboost import XGBRegressor
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Define the updated model pipeline using XGBoost
model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', XGBRegressor(random_state=42))
])

# Define parameter grid for tuning
param_grid = {
    'regressor__n_estimators': [100, 200, 300],
    'regressor__max_depth': [3, 5, 7],
    'regressor__learning_rate': [0.01, 0.1, 0.2]
}

# Perform GridSearchCV
grid_search = GridSearchCV(model_pipeline, param_grid, cv=3, scoring='r2')
grid_search.fit(X_train, y_train)

# Best parameters and score
print("Best Parameters:", grid_search.best_params_)
print("Best R² Score:", grid_search.best_score_)

# Evaluate on the test set
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

# Updated evaluation metrics
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Improved Mean Absolute Error: {mae}")
print(f"Improved Mean Squared Error: {mse}")
print(f"Improved R² Score: {r2}")

Best Parameters: {'regressor__learning_rate': 0.2, 'regressor__max_depth': 3, 'regressor__n_estimators': 300}
Best R² Score: 0.5116163530731451
Improved Mean Absolute Error: 11.301120398975018
Improved Mean Squared Error: 200.62886383492696
Improved R² Score: 0.508175862907992


### For predicting the "audience_rating," I experimented with multiple algorithms, including Random Forest Regressor, Gradient Boosting Regressor, CatBoost Regressor, and XGBRegressor. Among these, the XGBRegressor delivered the best results, demonstrating superior performance metrics.

### The XGBRegressor was particularly advantageous due to its ability to handle missing values, regularization techniques (L1 and L2), and its efficient use of computing resources. Furthermore, its robust handling of both small and large datasets and built-in support for parallel processing allowed for faster training and better optimization of model parameters.

### Additionally, I utilized a Deep Neural Network for prediction; however, it did not yield the desired results due to the small dataset size, which limited its effectiveness.

### The final and optimal results produced by the XGBRegressor are as follows:

### Mean Absolute Error (MAE): 11.30
### Mean Squared Error (MSE): 200.63
### R² Score: 0.5082
### These metrics reflect a notable improvement and affirm the effectiveness of the XGBRegressor for this task.