In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# 1.Cleaning and Imputing the Data

In [None]:
# Load the IMDb India movies CSV (decoded as latin-1) and preview the first rows.
csv_path = '/kaggle/input/imdb-india-movies/IMDb Movies India.csv'
df = pd.read_csv(csv_path, encoding='latin-1')
df.head()

In [None]:
# Column names, dtypes and non-null counts of the freshly loaded frame.
df.info()

In [None]:
# Per-column number of missing values and their share of all rows.
missing_values = df.isnull().sum()
percentages = missing_values.div(len(df)).mul(100)
result_df = pd.DataFrame(
    {
        'no of missing values': missing_values.to_numpy(),
        # share rendered as text with two decimals and a percent sign
        'percentage': percentages.map('{:.2f}%'.format),
    },
    index=df.columns,
)
result_df

**Rating is the target variable for prediction, so I am dropping the rows where it is missing.**

In [None]:
# Remove every row whose Rating is missing; df is modified in place.
df.dropna(subset=['Rating'],inplace=True)

In [None]:
# Recompute the missing-value report now that rows without a Rating are gone.
missing_values = df.isnull().sum()
percentages = missing_values.div(len(df)).mul(100)

result_df = pd.DataFrame(
    {
        'no of missing values': missing_values.to_numpy(),
        # share rendered as text with two decimals and a percent sign
        'percentage': percentages.map('{:.2f}%'.format),
    },
    index=df.columns,
)

result_df

**For the remaining columns the share of missing values is small (below 4% for every column except Genre), so the rows with a missing actor, director or genre are dropped as well.**

In [None]:
# Remove rows missing any of the three actors, the director or the genre; df is modified in place.
df.dropna(subset=['Actor 1','Actor 2','Actor 3','Director','Genre'],inplace=True)

In [None]:
# Missing-value report after the actor/director/genre rows were dropped.
missing_values = df.isnull().sum()
percentages = missing_values.div(len(df)).mul(100)

result_df = pd.DataFrame(
    {
        'no of missing values': missing_values.to_numpy(),
        # share rendered as text with two decimals and a percent sign
        'percentage': percentages.map('{:.2f}%'.format),
    },
    index=df.columns,
)

result_df

**Before imputing the Duration values I will clean three text columns: Year is stored as "(2019)" and becomes the integer 2019; Votes uses commas as thousands separators, which are removed before converting to integer; and Duration is given in minutes with a trailing "min", which has to be removed before it can be converted to integer.**

In [None]:
# Clean the three text columns so they can be used as numbers:
#   Votes    : remove the thousands separator (a literal comma) and convert to int
#   Year     : remove the surrounding parentheses and convert to int
#   Duration : remove the trailing "min" unit; the column stays text here because
#              missing durations are only filled in by the imputation cell below
df['Votes'] = df['Votes'].str.replace(',', '', regex=False).astype(int)
df['Year'] = df['Year'].str.strip('()').astype(int)
# str.strip(' min') strips any of the characters ' ', 'm', 'i', 'n' from both ends
# rather than the literal suffix, so the unit is removed with an anchored pattern.
df['Duration'] = df['Duration'].str.replace(r'\s*min\s*$', '', regex=True).str.strip()

**I will impute the Duration column with random values between 90 and 180 minutes, because most of the original (non-missing) durations fall in this range. With random imputation the shape of the original distribution is maintained, whereas filling with the mean creates a spike at the mean. The graphs below make this clear.**

In [None]:
# Keep the un-imputed durations so the distributions can be compared in the next cell.
df['Duration_copy'] = df['Duration']
mask = df['Duration'].isnull()
# Seeded local generator: the imputed values (and every result derived from them)
# are reproducible on Restart & Run All. Upper bound 181 is exclusive -> 90..180.
random_values = np.random.RandomState(42).randint(90, 181, size=mask.sum())
# Convert to numbers first, then fill through .loc on a standalone Series.
# The previous chained assignment df['Duration'][mask] = ... writes to a temporary
# object: it raises SettingWithCopyWarning and is a no-op under copy-on-write.
duration_numeric = pd.to_numeric(df['Duration'])
duration_numeric.loc[mask] = random_values
# Original (non-missing) durations as integers, used for the comparison plots.
org_duration = df.loc[~df['Duration_copy'].isnull(), 'Duration_copy'].astype(int)
df['Duration'] = duration_numeric.astype(int)

In [None]:
# Compare three Duration distributions: original values only, missing values
# filled with the mean, and missing values filled with random 90-180 values.
sns.set_style('darkgrid')
fig, ax = plt.subplots(2, 2, figsize=(15, 10))
mean_filled = df['Duration_copy'].fillna(org_duration.mean()).astype(int)

sns.histplot(data=org_duration, bins=20, kde=True, ax=ax[0][0])
sns.histplot(data=df, x=mean_filled, bins=20, kde=True, ax=ax[0][1])
sns.histplot(data=df, x=df['Duration'], bins=20, kde=True, ax=ax[1][0])

panel_titles = [
    (ax[0][0], 'original distribution of duration of movies'),
    (ax[0][1], 'missing values filled with mean'),
    (ax[1][0], 'missing values filled with random values between 90 and 180'),
]
for panel, panel_title in panel_titles:
    panel.set_xlabel('Duration in minutes')
    panel.set_title(panel_title)

# The fourth grid cell is not used.
fig.delaxes(ax[1][1])
plt.show()

In [None]:
# The helper column was only needed for the comparison plots; remove it in place.
df.drop(columns=['Duration_copy'],inplace=True)

In [None]:
# Dtypes and non-null counts after cleaning and imputation.
df.info()

**Now the data is cleaned and imputed**

# 2. EDA

**Top 10 rated movies**

In [None]:
# Order the ratings from highest to lowest and look up the first ten rows by label.
rating_desc = df['Rating'].sort_values(ascending=False)
top_10_movies = df.loc[rating_desc.head(10).index]
top_10_movies

**Below dataframe contains top rated movie for every year**

In [None]:
# For each release year keep the single row with the highest Rating.
df.groupby('Year').apply(lambda year_rows: year_rows.nlargest(n=1, columns=['Rating']))

**Below graph shows average rating for every year and according to it the year with best ratings should be 1948 and worst is 2002**

In [None]:
# Line chart of the mean Rating per release year.
sns.set_style('darkgrid')
yearly_rating = df.groupby('Year')[['Rating']].mean()
rating_ax = yearly_rating.plot(figsize=(15, 5))
rating_ax.set_xlabel('Year')
rating_ax.set_ylabel('Rating')
rating_ax.set_title('Average movie ratings by year')
rating_ax.set_xticks(np.arange(1917, 2023, 5))
rating_ax.set_xlim(1917, 2023)
plt.show()

**The graph below adds more detail by also showing the average number of votes per year next to the average rating. It also shows that years with fewer votes tend to have higher ratings, and that ratings drop as the number of votes grows.**

In [None]:
# Average votes (left axis) and average rating (right axis) per year on one chart.
fig, ax1 = plt.subplots(figsize=(15, 6))
# legend=False: a labelled seaborn lineplot otherwise draws its own legend on the
# axes, which left a stray "Average Votes" legend on ax1 next to the combined one.
sns.lineplot(data=df, x='Year', y='Votes', errorbar=None, ax=ax1,
             label='Average Votes', color='#2ca02c', legend=False)
ax1.set_xlabel('Year')
ax1.set_ylabel('Average Votes')
ax1.set_xlim(1917, 2023)
ax1.set_ylim(0, 10000)
ax1.set_xticks(np.arange(1917, 2023, 5))
# Second y-axis sharing the same x-axis for the rating curve.
ax2 = ax1.twinx()
sns.lineplot(data=df, x='Year', y='Rating', errorbar=None, ax=ax2,
             color='#17becf', label='Average Rating', legend=False)
ax2.set_ylabel('Average Rating')
ax2.set_ylim(4, 8)
# Build one legend from the labelled lines of both axes.
lines, labels = ax1.get_legend_handles_labels()
lines2, labels2 = ax2.get_legend_handles_labels()
ax2.legend(lines + lines2, labels + labels2, loc='upper left')
plt.show()

In [None]:
# sns.set_style('darkgrid')
# df.groupby('Year')[['Votes']].mean().plot(figsize=(15,5))
# plt.xlabel('Year')
# plt.ylabel('Average number of votes')
# plt.title('Average votes by year')
# plt.xticks(np.arange(1917,2023,5))
# plt.xlim(1917,2023)
# plt.show()

**The graph below shows the number of movies released each year, which keeps increasing.**

In [None]:
# Line chart of how many movies (rows with a Name) exist for each release year.
sns.set_style('darkgrid')
movies_per_year = df.groupby(['Year'])['Name'].count()
count_ax = movies_per_year.plot(figsize=(15, 5))
count_ax.set_xlabel('Year')
count_ax.set_ylabel('Number of movies')
count_ax.set_title('Number of movies released every year')
count_ax.set_ylim(0, 250)
count_ax.set_xlim(1917, 2023)
count_ax.set_xticks(np.arange(1917, 2023, 5))
plt.show()

In [None]:
# Number of movies (left axis) and average rating (right axis) per year.
sns.set_style('darkgrid')
fig, ax1 = plt.subplots(figsize=(15, 6))
df.groupby(['Year'])['Name'].count().plot(ax=ax1, label='Number of movies')
ax1.set_xlabel('Year')
ax1.set_ylabel('Number of movies')
ax1.set_ylim(0, 250)
ax1.set_xlim(1917, 2023)
ax1.set_xticks(np.arange(1917, 2023, 5))
ax2 = ax1.twinx()
# Plot the rating as a Series (single brackets): Series.plot honours `label`,
# whereas DataFrame.plot labels the line with the column name ("Rating"), so the
# intended 'Average rating' legend entry was lost.
df.groupby('Year')['Rating'].mean().plot(ax=ax2, color='#17becf', label='Average rating')
ax2.set_ylabel('Average Rating')
# Build one legend from the labelled lines of both axes.
lines, labels = ax1.get_legend_handles_labels()
lines2, labels2 = ax2.get_legend_handles_labels()
ax2.legend(lines + lines2, labels + labels2, loc='upper left')
plt.show()

**The graph below shows the average number of votes for each rating value. Movies rated 9.5-10 have few votes, so their rating may drop as more votes come in, or they may keep it; that depends on the voters.**

In [None]:
# Mean number of votes for every distinct rating value.
sns.set_style('darkgrid')
plt.figure(figsize=(15, 5))
votes_ax = sns.lineplot(data=df, x='Rating', y='Votes', errorbar=None)
votes_ax.set_xlabel('Rating')
votes_ax.set_ylabel('Average Votes')
votes_ax.set_xticks(np.arange(0, 10.5, 0.5))
votes_ax.set_title('Average votes for each rating')
plt.show()

**The graph below shows the top movies with a rating greater than 8 and more than 10,000 votes, so we can say that these movies are genuinely good. 3 Idiots is certainly a great movie; you can see more below.**

In [None]:
# Bar chart of movies that are both highly rated (> 8) and widely voted (> 10000).
sns.set_style('darkgrid')
is_high_rated = df['Rating'] > 8
is_popular = df['Votes'] > 10000
d = df.loc[is_high_rated & is_popular, ['Rating', 'Votes', 'Name']]
plt.figure(figsize=(15, 6))
ax = sns.barplot(data=d, x='Name', y='Votes', hue='Rating',
                 dodge=False, width=0.5, palette='muted')
# Rotate the movie names so they do not overlap.
ax.set_xticklabels(ax.get_xticklabels(), rotation=90, ha='right')
ax.legend(loc='upper right')
ax.set(
    xlabel='Movie Name',
    ylabel='Votes',
    title='Movies with rating greater than 8 and votes greater than 10000',
)
plt.show()

In [None]:
# Mean movie duration per release year.
sns.set_style('darkgrid')
plt.figure(figsize=(15, 6))
duration_ax = sns.lineplot(data=df, x='Year', y='Duration', errorbar=None)
duration_ax.set_xlabel('Year')
duration_ax.set_ylabel('Duration in minutes')
duration_ax.set_title('Duration of movies by year')
duration_ax.set_xticks(np.arange(1917, 2023, 5))
plt.show()

**Average duration has a messy relationship with average rating.**

In [None]:
# Average duration (left axis) and average rating (right axis) per year.
fig, ax1 = plt.subplots(figsize=(15, 6))
# legend=False: a labelled seaborn lineplot otherwise draws its own legend on the
# axes, which left a stray "Average Duration" legend on ax1 next to the combined one.
sns.lineplot(data=df, x='Year', y='Duration', errorbar=None, ax=ax1,
             label='Average Duration', legend=False)
ax1.set_xlabel('Year')
ax1.set_ylabel('Average Duration')
ax1.set_xlim(1917, 2023)
ax1.set_xticks(np.arange(1917, 2023, 5))
# Second y-axis sharing the same x-axis for the rating curve.
ax2 = ax1.twinx()
sns.lineplot(data=df, x='Year', y='Rating', errorbar=None, ax=ax2,
             color='red', label='Average Rating', legend=False)
ax2.set_ylabel('Average Rating')
# Build one legend from the labelled lines of both axes.
lines, labels = ax1.get_legend_handles_labels()
lines2, labels2 = ax2.get_legend_handles_labels()
ax2.legend(lines + lines2, labels + labels2, loc='upper right')
plt.show()

**Now i will perform EDA on every single genre**

In [None]:
# Per-genre statistics. A movie's Genre cell holds a comma-separated list, so the
# frame is first expanded to one row per (movie, single genre) pair.
#
# This replaces the previous approach, which
#   * matched genres with str.contains(k), a substring/regex match, so a genre whose
#     name is contained in another one (e.g. "Music" in "Musical") also picked up
#     the other genre's movies when averaging ratings;
#   * hard-coded range(3), i.e. at most three genres per movie;
#   * relied on .drop(np.nan), which raises when no NaN key is present.
genre = df['Genre']
genre_long = df[['Genre', 'Rating']].assign(Genre=genre.str.split(',')).explode('Genre')
genre_long['Genre'] = genre_long['Genre'].str.strip()

# One entry per (movie, genre); keeps the original row labels of df.
genre_stack = genre_long['Genre']

# Number of movies carrying each genre, most frequent first.
genres_count = genre_stack.value_counts()
g_dict = genres_count.to_dict()

# Mean rating of the movies carrying each genre (exact genre match), rounded to 1 decimal.
genre_rating = (
    genre_long.groupby('Genre')['Rating']
    .mean()
    .round(1)
    .sort_values(ascending=False)
)

# Both statistics side by side, ordered by movie count.
genres_single = pd.DataFrame(
    {'Movie count': genres_count, 'Average rating': genre_rating}
).sort_values(by='Movie count', ascending=False)

In [None]:
# genres_count = pd.Series(g_dict).sort_values(ascending=False).drop(np.nan)
# Bar chart of the number of movies per single genre.
sns.set_style('darkgrid')
plt.figure(figsize=(15, 5))
genre_names = genres_single.index.to_numpy()
count_bars = sns.barplot(data=genres_single, x=genre_names, y='Movie count', palette='coolwarm')
count_bars.set_xlabel('Genre')
count_bars.set_ylabel('Number of movies')
count_bars.set_title('Number of movies in each genre')
plt.xticks(rotation=90)
plt.show()

In [None]:
from wordcloud import WordCloud
from random import choice
# Palette the word cloud picks from.
colors = [
    "#476A2A", "#7851B8", "#BD3430", "#4A2D4E", "#875525", "#A83683",
    "#4E655E", "#853541", "#3A3120", "#535D8E", '#17becf',
]


def color_func(word, *args, **kwargs):
    """Return a randomly chosen palette colour; the word and all other arguments are ignored."""
    picked = choice(colors)
    return picked


# Word size follows the per-genre movie counts.
cloud_builder = WordCloud(
    width=800,
    height=400,
    background_color='white',
    color_func=color_func,
)
wordcloud = cloud_builder.generate_from_frequencies(genres_count)
plt.figure(figsize=(10, 6))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.title('Genre Word Cloud')
plt.show()

**So there are more movies with genre of Drama followed by Action and Romance**

**The graph below shows the average rating for each genre. Drama has the most movies, so it is logical for its average rating to be lower, since some of those movies may have performed badly.**

In [None]:
# Bar chart of the average rating per single genre.
sns.set_style('darkgrid')
plt.figure(figsize=(15, 5))
genre_names = genres_single.index.to_numpy()
rating_bars = sns.barplot(data=genres_single, x=genre_names, y='Average rating', palette='coolwarm')
rating_bars.set_xlabel('Genre')
rating_bars.set_ylabel('Average Rating')
rating_bars.set_title('Average rating of movies in each genre')
plt.xticks(rotation=90)
plt.show()

**For the prediction of the rating I will replace every genre combination with the average rating of all movies having that genre combination, and I will do the same for directors and actors.**

In [None]:
# Mean rating and number of movies for every distinct Genre string.
genre_df = df.groupby('Genre')['Rating'].agg(['mean', 'count']).reset_index()
genre_df.columns = ['Genre', 'Average Rating', 'Movie Count']
genre_df['Average Rating'] = genre_df['Average Rating'].round(1)
genre_df

In [None]:
# Lookup table Genre string -> average rating, used later to encode the Genre column.
genre_dict = genre_df.set_index('Genre')['Average Rating'].to_dict()

**Directors Analysis**

In [None]:
# Mean rating and number of movies per director, most prolific directors first.
directors = df.groupby('Director')['Rating'].agg(['mean', 'count']).reset_index()
directors.columns = ['Director', 'Average Rating', 'Movie count']
directors['Average Rating'] = directors['Average Rating'].round(1)
directors = directors.sort_values(by='Movie count', ascending=False)
directors.head()

In [None]:
# Lookup table director name -> average rating, used later to encode the Director column.
directors_dict = directors.set_index('Director')['Average Rating'].to_dict()

In [None]:
# Movie counts of the 30 directors with the most movies.
plt.figure(figsize=(15, 5))
sns.set_style('darkgrid')
busiest_directors = directors.head(30)
director_bars = sns.barplot(data=busiest_directors, x='Director', y='Movie count', palette='coolwarm')
director_bars.set_xlabel('Director')
director_bars.set_ylabel('Number of movies')
plt.xticks(rotation=90)
director_bars.set_title('Top 30 directors with most number of movies')
plt.show()

In [None]:
# Average rating of the 30 directors with the most movies.
sns.set_style('darkgrid')
plt.figure(figsize=(15, 5))
busiest_directors = directors.head(30)
director_bars = sns.barplot(data=busiest_directors, x='Director', y='Average Rating', palette='coolwarm')
plt.xticks(rotation=90)
director_bars.set_xlabel('Directors')
director_bars.set_ylabel('Average Rating')
director_bars.set_title('Average rating of top 30 directors with most movies')
plt.show()

**Now below bar plot shows top rated directors**

In [None]:
# The 30 directors with the highest average rating.
sns.set_style('darkgrid')
plt.figure(figsize=(15, 5))
best_rated_directors = directors.sort_values(by='Average Rating', ascending=False).head(30)
director_bars = sns.barplot(data=best_rated_directors, x='Director', y='Average Rating', palette='muted')
plt.xticks(rotation=90)
director_bars.set_xlabel('Directors')
director_bars.set_ylabel('Average Rating')
director_bars.set_title('Top 30 rated directors')
plt.show()

**Actors Analysis**

In [None]:
# Stack the three actor columns into one so every (movie, actor) pair is a row,
# then compute each actor's mean rating and number of movies.
actor_columns = ['Actor 1', 'Actor 2', 'Actor 3']
df_melted = df.melt(
    id_vars='Rating',
    value_vars=actor_columns,
    var_name='role',
    value_name='actor',
)
actor_scores = df_melted.groupby('actor')['Rating'].agg(['mean', 'count']).reset_index()
actor_scores.columns = ['Actor', 'Average Score', 'Number of movies']
# Most frequently appearing actors first.
actor_scores = actor_scores.sort_values('Number of movies', ascending=False)
actor_scores['Average Score'] = actor_scores['Average Score'].round(1)
actor_scores

In [None]:
# Lookup table actor name -> average rating, used later to encode the actor columns.
actor_score_dict = actor_scores.set_index('Actor')['Average Score'].to_dict()

In [None]:
# Movie counts of the first 30 rows of actor_scores.
plt.figure(figsize=(15, 5))
busiest_actors = actor_scores.head(30)
actor_bars = sns.barplot(data=busiest_actors, x='Actor', y='Number of movies',
                         dodge=False, palette='coolwarm')
plt.xticks(rotation=90)
actor_bars.set_xlabel('Actors')
actor_bars.set_ylabel('Number of movies')
actor_bars.set_title('Top 30 actors by number of movies')
plt.show()

**Below graph also shows their average rating**

In [None]:
# Movie count (left axis) and average rating (right axis) for the first 30 rows
# of actor_scores, drawn as overlaid bars.
fig, ax1 = plt.subplots(figsize=(15, 6))
sns.set_style('white')
busiest_actors = actor_scores.head(30)
sns.barplot(data=busiest_actors, x='Actor', y='Number of movies', dodge=True,
            ax=ax1, label='Number of movies', color='blue')
ax1.set_xlabel('Name of Actor')
ax1.set_ylabel('Number of movies')
ax1.set_xticklabels(ax1.get_xticklabels(), rotation=90, ha='right')
# Second y-axis for the semi-transparent rating bars.
ax2 = ax1.twinx()
sns.barplot(data=busiest_actors, x='Actor', y='Average Score', dodge=True,
            ax=ax2, color='#17becf', label='Average Rating', alpha=0.5)
ax2.set_ylabel('Average Rating')
# One legend combining the labelled artists of both axes.
lines, labels = ax1.get_legend_handles_labels()
lines2, labels2 = ax2.get_legend_handles_labels()
ax2.legend(lines + lines2, labels + labels2, loc='upper right')
plt.show()

In [None]:
# Re-order actor_scores in place so the highest average score comes first;
# the next cell takes its first 30 rows.
actor_scores.sort_values(by='Average Score',ascending=False,inplace=True)

In [None]:
# Average rating of the first 30 rows of actor_scores, coloured by their movie count.
sns.set_style('darkgrid')
plt.figure(figsize=(15, 5))
leading_actors = actor_scores.head(30)
actor_bars = sns.barplot(data=leading_actors, x='Actor', y='Average Score',
                         dodge=True, hue='Number of movies', palette='muted')
plt.xticks(rotation=90)
actor_bars.set_xlabel('Actors')
actor_bars.set_ylabel('Average Rating')
actor_bars.set_title('Average rating of top 30 rated actors with their number of movies')
plt.show()

# 3. Data Preprocessing

In [None]:
# Histogram (with KDE) of every numeric column of df on a square grid of axes.
num_columns = list(df.select_dtypes(include=np.number).columns)
# Grid side length: at least 2 (so `ax` is always a 2-D array), the len // 2 layout
# used so far, and never fewer cells than columns (len // 2 is too small for 5 columns).
num = max(2, len(num_columns) // 2, int(np.ceil(np.sqrt(len(num_columns)))))
fig, ax = plt.subplots(num, num, figsize=(12, 10))
axes_flat = ax.ravel()  # row-major: same fill order as ax[j][i] with j outer, i inner
# One plot per column. No try/except: the previous bare `except:` was used to detect
# "no columns left" and silently swallowed genuine plotting errors as well.
for axis, column in zip(axes_flat, num_columns):
    sns.histplot(data=df, x=column, kde=True, bins=20, ax=axis)
# Remove the grid cells that received no plot.
for axis in axes_flat[len(num_columns):]:
    fig.delaxes(axis)
fig.suptitle('Histograms of numerical columns', fontsize=16)
plt.show()

In [None]:
# Boxplot of every numeric column of df on a square grid of axes.
num_columns = list(df.select_dtypes(include=np.number).columns)
# Grid side length: at least 2 (so `ax` is always a 2-D array), the len // 2 layout
# used so far, and never fewer cells than columns (len // 2 is too small for 5 columns).
num = max(2, len(num_columns) // 2, int(np.ceil(np.sqrt(len(num_columns)))))
fig, ax = plt.subplots(num, num, figsize=(12, 10))
axes_flat = ax.ravel()  # row-major: same fill order as ax[j][i] with j outer, i inner
# One plot per column. No try/except: the previous bare `except:` was used to detect
# "no columns left" and silently swallowed genuine plotting errors as well.
for axis, column in zip(axes_flat, num_columns):
    sns.boxplot(data=df, x=column, ax=axis)
# Remove the grid cells that received no plot.
for axis in axes_flat[len(num_columns):]:
    fig.delaxes(axis)
fig.suptitle('Boxplots to show outliers', fontsize=16)
plt.show()

In [None]:
from sklearn.preprocessing import StandardScaler,MinMaxScaler,FunctionTransformer,RobustScaler,PowerTransformer,QuantileTransformer
# Numeric columns of df only; this is the input of the power transform below.
num_df = df.select_dtypes(include=np.number)
num_df

**I will use a power transform (scikit-learn's `PowerTransformer`, whose default method is Yeo-Johnson rather than Box-Cox) to make the feature distributions more normal and to control outliers in the data.**

In [None]:
# Fit a power transform on the numeric columns and wrap the resulting array in a
# frame with the same column names (the frame gets a fresh default index).
pt = PowerTransformer()
transformed_values = pt.fit_transform(num_df)
num_df_pt = pd.DataFrame(transformed_values, columns=num_df.columns)

In [None]:
# Boxplot of every power-transformed numeric column on a square grid of axes.
num_columns = list(num_df_pt.select_dtypes(include=np.number).columns)
# Grid side length: at least 2 (so `ax` is always a 2-D array), the len // 2 layout
# used so far, and never fewer cells than columns (len // 2 is too small for 5 columns).
num = max(2, len(num_columns) // 2, int(np.ceil(np.sqrt(len(num_columns)))))
fig, ax = plt.subplots(num, num, figsize=(12, 10))
axes_flat = ax.ravel()  # row-major: same fill order as ax[j][i] with j outer, i inner
# One plot per column. No try/except: the previous bare `except:` was used to detect
# "no columns left" and silently swallowed genuine plotting errors as well.
for axis, column in zip(axes_flat, num_columns):
    sns.boxplot(data=num_df_pt, x=column, ax=axis)
# Remove the grid cells that received no plot.
for axis in axes_flat[len(num_columns):]:
    fig.delaxes(axis)
fig.suptitle('Boxplots of features', fontsize=16)
plt.show()

**So the transform works well on Rating, Year and Votes and reduces their outliers, but not on Duration.**

In [None]:
# Histogram (with KDE) of every power-transformed numeric column on a square grid of axes.
num_columns = list(num_df_pt.select_dtypes(include=np.number).columns)
# Grid side length: at least 2 (so `ax` is always a 2-D array), the len // 2 layout
# used so far, and never fewer cells than columns (len // 2 is too small for 5 columns).
num = max(2, len(num_columns) // 2, int(np.ceil(np.sqrt(len(num_columns)))))
fig, ax = plt.subplots(num, num, figsize=(12, 10))
axes_flat = ax.ravel()  # row-major: same fill order as ax[j][i] with j outer, i inner
# One plot per column. No try/except: the previous bare `except:` was used to detect
# "no columns left" and silently swallowed genuine plotting errors as well.
for axis, column in zip(axes_flat, num_columns):
    sns.histplot(data=num_df_pt, x=column, ax=axis, kde=True, bins=20)
# Remove the grid cells that received no plot.
for axis in axes_flat[len(num_columns):]:
    fig.delaxes(axis)
fig.suptitle('Histograms of features', fontsize=16)
plt.show()

In [None]:
# Modelling frame: drop the movie name and replace each categorical value with the
# average rating stored for it in the lookup tables built during the EDA.
# NOTE(review): those lookup tables are computed from Rating over all rows of df,
# before any train/test split, so the encoded features carry target information
# from rows that later end up in the test set.
df_2 = df.drop(columns=['Name'])
df_2['Genre'] = df_2['Genre'].map(genre_dict)
df_2['Director'] = df_2['Director'].map(directors_dict)
for actor_col in ('Actor 1', 'Actor 2', 'Actor 3'):
    df_2[actor_col] = df_2[actor_col].map(actor_score_dict)
df_2

In [None]:
# Reshape the distributions: power transform for Rating/Votes/Year, quantile
# transform (mapped to a normal distribution) for the remaining columns.
# NOTE(review): both transformers are fitted on the full frame, target included,
# before the train/test split performed further down.
pt = PowerTransformer()
qt = QuantileTransformer(output_distribution='normal')
power_columns = ['Rating', 'Votes', 'Year']
quantile_columns = ['Genre', 'Director', 'Duration', 'Actor 1', 'Actor 2', 'Actor 3']
df_2[power_columns] = pt.fit_transform(df_2[power_columns])
df_2[quantile_columns] = qt.fit_transform(df_2[quantile_columns])

In [None]:
# Histogram (with KDE) of every numeric column of df_2 on a square grid of axes.
num_columns = list(df_2.select_dtypes(include=np.number).columns)
# Grid side length: at least 2 (so `ax` is always a 2-D array), the len // 2 layout
# used so far, and never fewer cells than columns (len // 2 is too small for 5 columns).
num = max(2, len(num_columns) // 2, int(np.ceil(np.sqrt(len(num_columns)))))
fig, ax = plt.subplots(num, num, figsize=(15, 15))
axes_flat = ax.ravel()  # row-major: same fill order as ax[j][i] with j outer, i inner
# One plot per column. No try/except: the previous bare `except:` was used to detect
# "no columns left" and silently swallowed genuine plotting errors as well.
for axis, column in zip(axes_flat, num_columns):
    sns.histplot(data=df_2, x=column, ax=axis, kde=True, bins=20)
# Remove the grid cells that received no plot.
for axis in axes_flat[len(num_columns):]:
    fig.delaxes(axis)
fig.suptitle('Histograms of features', fontsize=16)
plt.show()

In [None]:
# Boxplot of every numeric column of df_2 on a square grid of axes.
num_columns = list(df_2.select_dtypes(include=np.number).columns)
# Grid side length: at least 2 (so `ax` is always a 2-D array), the len // 2 layout
# used so far, and never fewer cells than columns (len // 2 is too small for 5 columns).
num = max(2, len(num_columns) // 2, int(np.ceil(np.sqrt(len(num_columns)))))
fig, ax = plt.subplots(num, num, figsize=(15, 15))
axes_flat = ax.ravel()  # row-major: same fill order as ax[j][i] with j outer, i inner
# One plot per column. No try/except: the previous bare `except:` was used to detect
# "no columns left" and silently swallowed genuine plotting errors as well.
for axis, column in zip(axes_flat, num_columns):
    sns.boxplot(data=df_2, x=column, ax=axis)
# Remove the grid cells that received no plot.
for axis in axes_flat[len(num_columns):]:
    fig.delaxes(axis)
fig.suptitle('Boxplots of features', fontsize=16)
plt.show()

**As you can see from the distributions above, this is the best I could do so far to make the data more normal and to control outliers.**

In [None]:
# Pairwise correlations between the numeric columns of df_2, then each column's
# correlation with Rating from strongest positive downwards.
corr_df = df_2.corr(numeric_only=True)
corr_df['Rating'].sort_values(ascending=False)

In [None]:
# Heatmap of the correlation matrix; cell values are not annotated.
sns.heatmap(corr_df,annot=False,cmap='coolwarm')

**Our transformed columns are now clearly correlated with the target variable, so we are ready to build a model.**

# 4. Model Building

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error,mean_absolute_error,r2_score


In [None]:
# Baseline: linear regression evaluated on a 20% hold-out split.
y = df_2['Rating']
X = df_2.drop(columns='Rating')
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
lr = LinearRegression().fit(X_train, y_train)
y_pred = lr.predict(X_test)
# Report each hold-out metric on its own line.
holdout_metrics = (
    ('Mean squared error: ', mean_squared_error),
    ('Mean absolute error: ', mean_absolute_error),
    ('R2 score: ', r2_score),
)
for metric_label, metric_fn in holdout_metrics:
    print(metric_label, metric_fn(y_test, y_pred))

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn import svm
# 5-fold cross-validation of the linear regression on the full feature frame.
X = df_2.drop('Rating', axis=1)
y = df_2['Rating']
lr = LinearRegression()
# No `scoring` is given, so each fold is scored with the estimator's own score
# method, which for LinearRegression is R^2 - not a classification accuracy.
scores = cross_val_score(lr, X, y, cv=5)
print("%0.2f R2 score with a standard deviation of %0.2f" % (scores.mean(), scores.std()))


**So the mean cross-validated R² score is about 0.73 (73 percent).**

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV

# Candidate models for the grid search, keyed by display name.
# random_state is fixed so the forest (and therefore the grid-search result) is
# reproducible, consistent with the random_state=42 used for the data splits.
models = {
    'RandomForestRegressor': RandomForestRegressor(random_state=42)
    # ,'LinearRegression': LinearRegression()
}

# Hyper-parameter grid per model; keys must match the keys of `models`.
params = {
    'RandomForestRegressor': { 'n_estimators': [75,100,125,150], 'max_features': ['sqrt', 'log2'] }
    # ,'LinearRegression': {  }
}


In [None]:
# Tune every candidate model with a 5-fold grid search on the training split.
y = df_2['Rating']
X = df_2.drop(columns='Rating')
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
for model_name, model in models.items():
    param_grid = params[model_name]
    model_to_tune = GridSearchCV(estimator=model, param_grid=param_grid, cv=5)
    model_to_tune.fit(X_train, y_train)

    # Best parameter combination and its mean cross-validated score.
    best_params = model_to_tune.best_params_
    best_score = model_to_tune.best_score_
    print(f"Best parameters for {model_name}: {best_params}")
    print(f"Best score for {model_name}: {best_score}")


**So the best cross-validated score my model reaches is about 77 percent.**

In [None]:
# Depth-limited decision tree: compare R^2 on the training and test splits to
# check for overfitting.
X = df_2.drop('Rating', axis=1)
y = df_2['Rating']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
from sklearn.tree import DecisionTreeRegressor
dt = DecisionTreeRegressor(max_depth=7, random_state=42)
# Fit and evaluate the decision tree itself. Previously `dt` was created but never
# used: a LinearRegression (bound to the name `rf`) was fitted and scored instead.
dt.fit(X_train, y_train)
y_pred = dt.predict(X_train)
y_pred_test = dt.predict(X_test)
print('R2 score for training data: ', r2_score(y_train, y_pred))
print('R2 score for testing data: ', r2_score(y_test, y_pred_test))

**So 72.5 is best score for Decision Tree regressor avoiding overfitting**

**I am a beginner and still learning ML models, so if you can suggest improvements or spot any mistake I made, kindly tell me in the comments. If you like the notebook, kindly upvote.**