## IMPORTING THE LIBRARIES

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## IMPORTING DATA

In [2]:
# Read the raw movie records from the CSV file, decoding it as Latin-1.
data = pd.read_csv(
    'Movie dataset.csv',
    encoding='latin-1',
)
# Plain-text preview of the first five rows.
print(data.head(n=5))

                                 Name    Year Duration            Genre  \
0                                         NaN      NaN            Drama   
1  #Gadhvi (He thought he was Gandhi)  (2019)  109 min            Drama   
2                         #Homecoming  (2021)   90 min   Drama, Musical   
3                             #Yaaram  (2019)  110 min  Comedy, Romance   
4                   ...And Once Again  (2010)  105 min            Drama   

   Rating Votes            Director       Actor 1             Actor 2  \
0     NaN   NaN       J.S. Randhawa      Manmauji              Birbal   
1     7.0     8       Gaurav Bakshi  Rasika Dugal      Vivek Ghamande   
2     NaN   NaN  Soumyajit Majumdar  Sayani Gupta   Plabita Borthakur   
3     4.4    35          Ovais Khan       Prateik          Ishita Raj   
4     NaN   NaN        Amol Palekar  Rajat Kapoor  Rituparna Sengupta   

           Actor 3  
0  Rajendra Bhatia  
1    Arvind Jangid  
2       Roy Angana  
3  Siddhant Kapoor  
4    

CREATING DATAFRAME

In [3]:
# Work on an independent copy of the loaded table. `data` is already a
# DataFrame, and wrapping it in pd.DataFrame(...) does not copy by default, so
# the previous `df` shared its underlying values with `data`.
df = data.copy()
print(f'Data :\n{df.head()}')

Data :
                                 Name    Year Duration            Genre  \
0                                         NaN      NaN            Drama   
1  #Gadhvi (He thought he was Gandhi)  (2019)  109 min            Drama   
2                         #Homecoming  (2021)   90 min   Drama, Musical   
3                             #Yaaram  (2019)  110 min  Comedy, Romance   
4                   ...And Once Again  (2010)  105 min            Drama   

   Rating Votes            Director       Actor 1             Actor 2  \
0     NaN   NaN       J.S. Randhawa      Manmauji              Birbal   
1     7.0     8       Gaurav Bakshi  Rasika Dugal      Vivek Ghamande   
2     NaN   NaN  Soumyajit Majumdar  Sayani Gupta   Plabita Borthakur   
3     4.4    35          Ovais Khan       Prateik          Ishita Raj   
4     NaN   NaN        Amol Palekar  Rajat Kapoor  Rituparna Sengupta   

           Actor 3  
0  Rajendra Bhatia  
1    Arvind Jangid  
2       Roy Angana  
3  Siddhant Kapoor 

## DATAFRAME INFORMATION

DATAFRAME INFORMATION

In [4]:
# Summarise the frame: index range, per-column non-null counts, dtypes, memory.
print('Dataframe Info :\n')
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must be called directly -- wrapping it in print() emitted a stray "None".
df.info()
print(f'DataFrame shape: {df.shape}')

Dataframe Info :

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15509 entries, 0 to 15508
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Name      15509 non-null  object 
 1   Year      14981 non-null  object 
 2   Duration  7240 non-null   object 
 3   Genre     13632 non-null  object 
 4   Rating    7919 non-null   float64
 5   Votes     7920 non-null   object 
 6   Director  14984 non-null  object 
 7   Actor 1   13892 non-null  object 
 8   Actor 2   13125 non-null  object 
 9   Actor 3   12365 non-null  object 
dtypes: float64(1), object(9)
memory usage: 1.2+ MB
None
DataFrame shape: (15509, 10)


COLUMN NAME

In [5]:
# Collect the column labels into a plain Python list and show them.
column_name = list(df.columns)
print('Column Names: {}'.format(column_name))

Column Names: ['Name', 'Year', 'Duration', 'Genre', 'Rating', 'Votes', 'Director', 'Actor 1', 'Actor 2', 'Actor 3']


COLUMN DATATYPE

In [6]:
# Series mapping each column label to its dtype.
dtype_info = df.dtypes
# Header line followed by the Series' text representation.
print('Data Types of each column:', dtype_info, sep='\n')

Data Types of each column:
Name         object
Year         object
Duration     object
Genre        object
Rating      float64
Votes        object
Director     object
Actor 1      object
Actor 2      object
Actor 3      object
dtype: object


## DATA CLEANING AND MANIPULATION

CHECK NULL VALUES

In [7]:
# Count the missing entries in every column.
null_counts = df.isna().sum()
print('Null Values in each column:', null_counts, sep='\n')

Null Values in each column:
Name           0
Year         528
Duration    8269
Genre       1877
Rating      7590
Votes       7589
Director     525
Actor 1     1617
Actor 2     2384
Actor 3     3144
dtype: int64


DROPPING NULL VALUES

In [8]:
# Discard every row that is missing a value in any of the listed columns.
df = df.dropna(
    how='any',
    subset=[
        'Genre', 'Director', 'Actor 1', 'Actor 2', 'Actor 3',
        'Year', 'Duration', 'Rating',
    ],
)
# Confirm nothing is missing any more and report the remaining size.
remaining_nulls = df.isna().sum()
print(f'null values:\n{remaining_nulls}')
print(f'Shape after dropping null values: {df.shape}')

null values:
Name        0
Year        0
Duration    0
Genre       0
Rating      0
Votes       0
Director    0
Actor 1     0
Actor 2     0
Actor 3     0
dtype: int64
Shape after dropping null values: (5659, 10)


DATAFRAME ORGANIZING

In [9]:
# Trim surrounding whitespace in every object-dtype column; columns of any
# other dtype are passed through unchanged.
df = df.apply(lambda col: col if col.dtype != "object" else col.str.strip())

# Rebuild 'Year' and 'Duration' as numeric columns:
#   Year     -> the first run of four digits found in the text
#   Duration -> the first run of digits found in the text
# Values with no match (or that fail numeric conversion) become NaN.
df = df.assign(
    Year=pd.to_numeric(
        df['Year'].astype(str).str.extract(r'(\d{4})', expand=False),
        errors='coerce',
    ),
    Duration=pd.to_numeric(
        df['Duration'].astype(str).str.extract(r'(\d+)', expand=False),
        errors='coerce',
    ),
)
print(f'Cleaned DataFrame head:\n{df.head()}')

Cleaned DataFrame head:
                                 Name  Year  Duration  \
1  #Gadhvi (He thought he was Gandhi)  2019       109   
3                             #Yaaram  2019       110   
5                ...Aur Pyaar Ho Gaya  1997       147   
6                           ...Yahaan  2005       142   
8                  ?: A Question Mark  2012        82   

                       Genre  Rating  Votes        Director          Actor 1  \
1                      Drama     7.0      8   Gaurav Bakshi     Rasika Dugal   
3            Comedy, Romance     4.4     35      Ovais Khan          Prateik   
5     Comedy, Drama, Musical     4.7    827    Rahul Rawail       Bobby Deol   
6        Drama, Romance, War     7.4  1,086  Shoojit Sircar  Jimmy Sheirgill   
8  Horror, Mystery, Thriller     5.6    326   Allyson Patel        Yash Dave   

                  Actor 2          Actor 3  
1          Vivek Ghamande    Arvind Jangid  
3              Ishita Raj  Siddhant Kapoor  
5  Aishwarya Rai 

## MOVIE RATING PREDICTION MODEL

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

# Columns used to predict the rating, and the column being predicted.
features = ["Genre", "Director", "Year", "Duration", "Actor 1", "Actor 2", "Actor 3"]
target = "Rating"

# Narrow the frame to the modelling columns. `df` is reassigned on purpose so
# that anything running after this cell sees the same frame as before.
df = df[features + [target]]

# Split the features by how they are fed to the model.
categorical = ["Genre", "Director", "Actor 1", "Actor 2", "Actor 3"]
numerical = ["Year", "Duration"]

# The two groups must cover exactly the feature list; catch drift early.
assert set(categorical + numerical) == set(features), "feature groups out of sync"

# Train-test split (80/20, fixed seed for reproducibility)
X = df[features]
y = df[target]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Preprocessing: one-hot encode the categorical columns and pass the numeric
# ones through untouched. handle_unknown="ignore" lets prediction proceed when
# a category was not present in the training split.
preprocess = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical),
        # Use the `numerical` list instead of repeating its contents here.
        ("num", "passthrough", numerical),
    ]
)

# Preprocessing + Random Forest regressor in a single pipeline, so the encoder
# is fitted on the training split only.
model = Pipeline([
    ("preprocess", preprocess),
    ("rf", RandomForestRegressor(
        n_estimators=300,
        max_depth=None,
        min_samples_split=2,
        min_samples_leaf=1,
        random_state=42
    ))
])

# Train model
model.fit(X_train, y_train)

# Evaluate model on the held-out split
y_pred = model.predict(X_test)
print("\n--- Random Forest Model Evaluation ---")

model_mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", model_mse)

# MAE is in rating points, which is easier to interpret than the squared error.
model_mae = mean_absolute_error(y_test, y_pred)
print("Mean Absolute Error:", model_mae)

model_r2 = r2_score(y_test, y_pred)
print("R^2 Score:", model_r2)


# Predict example movie
sample = pd.DataFrame({
    "Genre": ["Action"],
    "Director": ["Anurag Kashyap"],
    "Actor 1": ["Actor A"],
    "Actor 2": ["Actor B"],
    "Actor 3": ["Actor C"],
    "Year": [2020],
    "Duration": [150]
})

# Hand the columns over in the same order the model was trained on.
print("\nPredicted Rating for sample movie:", model.predict(sample[features])[0])




--- Random Forest Model Evaluation ---
Mean Squared Error: 1.5168843092854338
R^2 Score: 0.18083460825982156

Predicted Rating for sample movie: 6.605999999999995
