In [None]:
# Importing libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the dataset

In [None]:
# Read the movie dataset from the notebook's working directory.
# NOTE(review): lineterminator='\n' makes the parser split rows on '\n' only —
# presumably because some text fields contain other line-break characters; confirm.
df = pd.read_csv('mymoviedb.csv', lineterminator='\n')
df.head()

In [None]:
# Check dataset info

In [None]:
# Column names, dtypes and non-null counts of the frame as loaded (before any cleaning).
df.info()

In [None]:
# Check first few rows of Genre

In [None]:
# Peek at the raw Genre values; a later cell splits them on ', ', so each value
# is presumably a comma-separated list of genres.
df['Genre'].head()

In [None]:
# Check for duplicate rows

In [None]:
# Number of rows that repeat an earlier row across all columns.
# This only reports the count; no rows are removed here.
df.duplicated().sum()

In [None]:
# Summary statistics

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the frame's columns.
df.describe()

In [None]:
# Convert Release_Date to datetime and extract year

In [None]:
# Replace Release_Date with the release year only.
# Guarded so the cell is safe to re-run: once the column already holds numeric
# years, pd.to_datetime would interpret them as nanoseconds since the epoch and
# silently turn every value into 1970.
if not pd.api.types.is_numeric_dtype(df['Release_Date']):
    df['Release_Date'] = pd.to_datetime(df['Release_Date']).dt.year

In [None]:
# Drop unnecessary columns

In [None]:
# Columns that none of the analysis cells visible below read.
cols = ['Overview', 'Original_Language', 'Poster_Url']
# Rebind instead of mutating with inplace=True, and skip columns that are
# already gone, so re-running this cell does not raise KeyError.
df = df.drop(columns=cols, errors='ignore')

In [None]:
# Categorize Vote_Average into labels

In [None]:
def catigorize_col(df, col, labels):
    """
    Bin a numeric column into quartile-based categories, in place.

    The bin edges are the column's minimum, 25th, 50th and 75th percentiles
    and maximum, so ``labels`` needs one entry per quartile bin (four),
    ordered from the lowest bin to the highest.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column; ``df[col]`` is overwritten.
    col : str
        Name of a numeric column of ``df``.
    labels : list
        Labels for the bins, lowest bin first.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with ``df[col]`` replaced by the categories.

    Raises
    ------
    ValueError
        Raised by ``pd.cut`` when the number of labels does not match the
        number of bins. This also happens when two quartile edges coincide,
        because the duplicate edge is dropped and one bin disappears.
    """
    values = df[col]
    # Compute the five edges once instead of calling describe() per edge.
    edges = [values.min(), *values.quantile([0.25, 0.5, 0.75]), values.max()]
    # include_lowest=True closes the first bin on the left; without it, rows
    # equal to the column minimum fall outside every bin and become NaN.
    df[col] = pd.cut(values, edges, labels=labels,
                     duplicates='drop', include_lowest=True)
    return df

In [None]:
# One label per quartile bin, ordered from the lowest bin to the highest.
labels = ['not_popular', 'below_avg', 'average', 'popular']
# Overwrites the numeric Vote_Average column with these labels, so the raw
# scores are no longer available to later cells.
df = catigorize_col(df, 'Vote_Average', labels)

In [None]:
# Drop NaNs

In [None]:
# Removes every row with a missing value in any remaining column, including any
# row that the binning step left without a Vote_Average category.
df.dropna(inplace=True)

In [None]:
# Split Genre into list and explode

In [None]:
# Turn each ', '-separated Genre string into a list, then give every genre its
# own row. From here on a film appears once per genre it belongs to.
df = (
    df.assign(Genre=df['Genre'].str.split(', '))
      .explode('Genre')
      .reset_index(drop=True)
)

In [None]:
# Convert Genre to category

In [None]:
# Store Genre as a pandas categorical instead of plain strings.
df['Genre'] = df['Genre'].astype('category')

In [None]:
# Summary after cleaning

In [None]:
# Same overview as before cleaning, now reflecting the dropped columns,
# removed NaN rows and the one-row-per-genre layout.
df.info()

In [None]:
# Setup seaborn

In [None]:
# Global plot style; set once here, before the figures in the cells below are drawn.
sns.set_style('whitegrid')

In [None]:
# Q1: Most frequent genre

In [None]:
# Genres sorted from most to least frequent; used as the bar order in the plot.
genre_order = df['Genre'].value_counts().index

# Text summary: count, number of unique genres, most frequent genre and its frequency.
print(df['Genre'].describe())

sns.catplot(data=df, y='Genre', kind='count',
            order=genre_order, color='#4287f5')
plt.title('Genre Distribution')
plt.show()

In [None]:
# Q2: Distribution of Vote_Average categories (counted per movie-genre row)

In [None]:
# Vote_Average categories sorted from most to least frequent, for the bar order.
vote_counts = df['Vote_Average'].value_counts()

sns.catplot(data=df, y='Vote_Average', kind='count',
            order=vote_counts.index, color='#4287f5')
plt.title('Votes Distribution')
plt.show()

In [None]:
# Q3: Movie with highest popularity

In [None]:
# Rows whose Popularity equals the column maximum (one row per genre of the film).
peak_popularity = df['Popularity'].max()
print(df.loc[df['Popularity'] == peak_popularity])

In [None]:
# Q4: Movie with lowest popularity

In [None]:
# Rows whose Popularity equals the column minimum (one row per genre of the film).
lowest_popularity = df['Popularity'].min()
print(df.loc[df['Popularity'] == lowest_popularity])

In [None]:
# Q5: Year with most films

In [None]:
# The Genre explode left one row per (film, genre) pair, so counting rows of df
# directly would count a film once for every genre it has. Collapse back to one
# row per film by dropping Genre and de-duplicating the remaining columns.
# NOTE(review): assumes the remaining columns identify a film uniquely — confirm.
films = df.drop(columns='Genre').drop_duplicates()

# Count films for each individual year. The default 10-bin histogram merges
# several years into one bar, so it cannot show which single year is busiest.
films_per_year = films['Release_Date'].value_counts().sort_index()
print('Year with most films:', films_per_year.idxmax())

fig, ax = plt.subplots()
ax.bar(films_per_year.index, films_per_year.to_numpy())
ax.set_title('Movies Released per Year')
ax.set_xlabel('Year')
ax.set_ylabel('Count')
plt.show()