In [28]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score

In [2]:
df = pd.read_csv('/IMDb Movies India.csv', encoding='latin-1')
df.head()

Unnamed: 0,Name,Year,Duration,Genre,Rating,Votes,Director,Actor 1,Actor 2,Actor 3
0,,,,Drama,,,J.S. Randhawa,Manmauji,Birbal,Rajendra Bhatia
1,#Gadhvi (He thought he was Gandhi),(2019),109 min,Drama,7.0,8.0,Gaurav Bakshi,Rasika Dugal,Vivek Ghamande,Arvind Jangid
2,#Homecoming,(2021),90 min,"Drama, Musical",,,Soumyajit Majumdar,Sayani Gupta,Plabita Borthakur,Roy Angana
3,#Yaaram,(2019),110 min,"Comedy, Romance",4.4,35.0,Ovais Khan,Prateik,Ishita Raj,Siddhant Kapoor
4,...And Once Again,(2010),105 min,Drama,,,Amol Palekar,Rajat Kapoor,Rituparna Sengupta,Antara Mali


In [3]:
print(df)

                                     Name    Year Duration            Genre  \
0                                             NaN      NaN            Drama   
1      #Gadhvi (He thought he was Gandhi)  (2019)  109 min            Drama   
2                             #Homecoming  (2021)   90 min   Drama, Musical   
3                                 #Yaaram  (2019)  110 min  Comedy, Romance   
4                       ...And Once Again  (2010)  105 min            Drama   
...                                   ...     ...      ...              ...   
15504                 Zulm Ko Jala Doonga  (1988)      NaN           Action   
15505                               Zulmi  (1999)  129 min    Action, Drama   
15506                           Zulmi Raj  (2005)      NaN           Action   
15507                       Zulmi Shikari  (1988)      NaN           Action   
15508                        Zulm-O-Sitam  (1998)  130 min    Action, Drama   

       Rating Votes            Director           A

In [7]:
df.duplicated().sum()

6

In [8]:
df.drop_duplicates(inplace=True)

In [9]:
df.isnull().sum()

Name           0
Year         527
Duration    8264
Genre       1876
Rating      7584
Votes       7583
Director     524
Actor 1     1615
Actor 2     2381
Actor 3     3140
dtype: int64

In [10]:
def clean_year(year):
    if pd.isna(year):
        return year
    year_str = str(year)
    # Remove any non-digit characters
    year_str = ''.join(filter(str.isdigit, year_str))
    # If we have a valid 4-digit year, return it as an integer
    if len(year_str) == 4:
        return int(year_str)
    # If we don't have a valid year, return NaN
    return pd.np.nan

df['Year'] = df['Year'].apply(clean_year)

In [11]:
print(df['Year'].isna().sum())

527


In [12]:
median_year = df['Year'].median()
df['Year'].fillna(median_year, inplace=True)

In [13]:
df['Votes'] = pd.to_numeric(df['Votes'], errors='coerce')

In [14]:
df['Votes'] = np.log1p(df['Votes'])

In [16]:
median_rating = df['Rating'].median()
median_votes = df['Votes'].median()

df['Rating'].fillna(median_rating, inplace=True)
df['Votes'].fillna(median_votes, inplace=True)

In [17]:
df.head()


Unnamed: 0,Name,Year,Duration,Genre,Rating,Votes,Director,Actor 1,Actor 2,Actor 3
0,,1991.0,,Drama,6.0,3.583519,J.S. Randhawa,Manmauji,Birbal,Rajendra Bhatia
1,#Gadhvi (He thought he was Gandhi),2019.0,109 min,Drama,7.0,2.197225,Gaurav Bakshi,Rasika Dugal,Vivek Ghamande,Arvind Jangid
2,#Homecoming,2021.0,90 min,"Drama, Musical",6.0,3.583519,Soumyajit Majumdar,Sayani Gupta,Plabita Borthakur,Roy Angana
3,#Yaaram,2019.0,110 min,"Comedy, Romance",4.4,3.583519,Ovais Khan,Prateik,Ishita Raj,Siddhant Kapoor
4,...And Once Again,2010.0,105 min,Drama,6.0,3.583519,Amol Palekar,Rajat Kapoor,Rituparna Sengupta,Antara Mali


In [18]:
df.shape

(15503, 10)

In [19]:
df.drop(['Duration', 'Name'], axis=1, inplace=True)

In [20]:
df.isnull().sum()

Year           0
Genre       1876
Rating         0
Votes          0
Director     524
Actor 1     1615
Actor 2     2381
Actor 3     3140
dtype: int64

In [21]:
# Drop rows with any remaining NaN values
df.dropna(inplace=True)

In [22]:
df.isnull().sum()

Year        0
Genre       0
Rating      0
Votes       0
Director    0
Actor 1     0
Actor 2     0
Actor 3     0
dtype: int64

In [23]:
df.head()

Unnamed: 0,Year,Genre,Rating,Votes,Director,Actor 1,Actor 2,Actor 3
0,1991.0,Drama,6.0,3.583519,J.S. Randhawa,Manmauji,Birbal,Rajendra Bhatia
1,2019.0,Drama,7.0,2.197225,Gaurav Bakshi,Rasika Dugal,Vivek Ghamande,Arvind Jangid
2,2021.0,"Drama, Musical",6.0,3.583519,Soumyajit Majumdar,Sayani Gupta,Plabita Borthakur,Roy Angana
3,2019.0,"Comedy, Romance",4.4,3.583519,Ovais Khan,Prateik,Ishita Raj,Siddhant Kapoor
4,2010.0,Drama,6.0,3.583519,Amol Palekar,Rajat Kapoor,Rituparna Sengupta,Antara Mali


In [24]:
categorical_features = ['Genre', 'Director', 'Actor 1', 'Actor 2', 'Actor 3']
encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')
encoded_features = encoder.fit_transform(df[categorical_features])



In [25]:
print(encoded_features)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [1. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [26]:
categorical_features = ['Genre', 'Director', 'Actor 1', 'Actor 2', 'Actor 3']
numerical_features = ['Year', 'Votes']

In [29]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(sparse=False, handle_unknown='ignore'), categorical_features)
    ])

In [30]:
X = df[numerical_features + categorical_features]
y = df['Rating']

In [31]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [32]:
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

In [33]:
pipeline.fit(X_train, y_train)



In [34]:
y_pred = pipeline.predict(X_test)

In [35]:
mse = mean_squared_error(y_test, y_pred)

print(f"Mean Squared Error: {mse}")

Mean Squared Error: 6.792387173129162e+22
