In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Loading the Dataset

In [2]:
df = pd.read_csv("mymoviedb.csv",lineterminator="\n")
df.head()

Unnamed: 0,Release_Date,Title,Overview,Popularity,Vote_Count,Vote_Average,Original_Language,Genre,Poster_Url
0,2021-12-15,Spider-Man: No Way Home,Peter Parker is unmasked and no longer able to...,5083.954,8940,8.3,en,"Action, Adventure, Science Fiction",https://image.tmdb.org/t/p/original/1g0dhYtq4i...
1,2022-03-01,The Batman,"In his second year of fighting crime, Batman u...",3827.658,1151,8.1,en,"Crime, Mystery, Thriller",https://image.tmdb.org/t/p/original/74xTEgt7R3...
2,2022-02-25,No Exit,Stranded at a rest stop in the mountains durin...,2618.087,122,6.3,en,Thriller,https://image.tmdb.org/t/p/original/vDHsLnOWKl...
3,2021-11-24,Encanto,"The tale of an extraordinary family, the Madri...",2402.201,5076,7.7,en,"Animation, Comedy, Family, Fantasy",https://image.tmdb.org/t/p/original/4j0PNHkMr5...
4,2021-12-22,The King's Man,As a collection of history's worst tyrants and...,1895.511,1793,7.0,en,"Action, Adventure, Thriller, War",https://image.tmdb.org/t/p/original/aq4Pwv5Xeu...


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9827 entries, 0 to 9826
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Release_Date       9827 non-null   object 
 1   Title              9827 non-null   object 
 2   Overview           9827 non-null   object 
 3   Popularity         9827 non-null   float64
 4   Vote_Count         9827 non-null   int64  
 5   Vote_Average       9827 non-null   float64
 6   Original_Language  9827 non-null   object 
 7   Genre              9827 non-null   object 
 8   Poster_Url         9827 non-null   object 
dtypes: float64(2), int64(1), object(6)
memory usage: 691.1+ KB


* After viewing dataset info. I came to know that Release_Date column needs to be casted into date_time to extract year as we are only concerned 
about years second thing is we do not need Overview,Original_Language and Poster_Url columns for our data anaysis

In [4]:
# Checking Genre column
df["Genre"].head()

0    Action, Adventure, Science Fiction
1              Crime, Mystery, Thriller
2                              Thriller
3    Animation, Comedy, Family, Fantasy
4      Action, Adventure, Thriller, War
Name: Genre, dtype: object

* Genres are seprated by commas followed by whitespaces

In [6]:
# Checking Null Values in the dataset
df.isnull().sum()

Release_Date         0
Title                0
Overview             0
Popularity           0
Vote_Count           0
Vote_Average         0
Original_Language    0
Genre                0
Poster_Url           0
dtype: int64

* There are no null values in our dataset

In [7]:
df.describe() # Checking the summary of the dataset

Unnamed: 0,Popularity,Vote_Count,Vote_Average
count,9827.0,9827.0,9827.0
mean,40.326088,1392.805536,6.439534
std,108.873998,2611.206907,1.129759
min,13.354,0.0,0.0
25%,16.1285,146.0,5.9
50%,21.199,444.0,6.5
75%,35.1915,1376.0,7.1
max,5083.954,31077.0,10.0


# Exploration Summary


* The dataframe contains 9,827 rows and 9 columns.
* The dataset appears tidy with no missing (NaN) or duplicated values.
* Release_Date needs to be converted to datetime format, and only the year part should be extracted.
* Overview, Original_Language, and Poster_URL are likely not useful for analysis and can be dropped.
* Vote_Average should be categorized into groups for better analysis.
* Genre contains comma-separated values and extra white spaces that need to be cleaned and handled properly.

# Data Preprocessing

In [45]:
# Data Cleaning
df["Release_Date"] = pd.to_datetime(df["Release_Date"])
print(df['Release_Date'].dtypes) # Verifying the Changes

datetime64[ns]


In [46]:
# Extracting Year
df['Release_Date'] = df["Release_Date"].dt.year
df.head()

Unnamed: 0,Release_Date,Title,Popularity,Vote_Count,Vote_Average,Genre
0,1970,Spider-Man: No Way Home,5083.954,8940,popular,"Action,"
1,1970,Spider-Man: No Way Home,5083.954,8940,popular,"Adventure,"
2,1970,Spider-Man: No Way Home,5083.954,8940,popular,Science
3,1970,Spider-Man: No Way Home,5083.954,8940,popular,Fiction
4,1970,The Batman,3827.658,1151,popular,"Crime,"


* Removing Unnecssary Columns

In [15]:
df.drop(columns=["Overview", "Original_Language", "Poster_Url"], inplace=True)
df.head() # Verifying Changes

Unnamed: 0,Release_Date,Title,Popularity,Vote_Count,Vote_Average,Genre
0,1970,Spider-Man: No Way Home,5083.954,8940,8.3,"Action, Adventure, Science Fiction"
1,1970,The Batman,3827.658,1151,8.1,"Crime, Mystery, Thriller"
2,1970,No Exit,2618.087,122,6.3,Thriller
3,1970,Encanto,2402.201,5076,7.7,"Animation, Comedy, Family, Fantasy"
4,1970,The King's Man,1895.511,1793,7.0,"Action, Adventure, Thriller, War"


* Categorizing Vote Average

In [16]:
df.describe()

Unnamed: 0,Release_Date,Popularity,Vote_Count,Vote_Average
count,9827.0,9827.0,9827.0,9827.0
mean,1970.0,40.326088,1392.805536,6.439534
std,0.0,108.873998,2611.206907,1.129759
min,1970.0,13.354,0.0,0.0
25%,1970.0,16.1285,146.0,5.9
50%,1970.0,21.199,444.0,6.5
75%,1970.0,35.1915,1376.0,7.1
max,1970.0,5083.954,31077.0,10.0


In [21]:
def categorize_voteavg(df,col,labels):

    edges = [
        df[col].describe()['min'],
        df[col].describe()['25%'],
        df[col].describe()['50%'],
        df[col].describe()['75%'],
        df[col].describe()['max']

    ]

    df[col] = pd.cut(df[col],edges,labels= labels,duplicates='drop')
    return df

In [22]:
# define labels for edges
labels = ['not_popular', 'below_avg', 'average', 'popular']
categorize_voteavg(df,"Vote_Average",labels)
df["Vote_Average"].unique()


['popular', 'below_avg', 'average', 'not_popular', NaN]
Categories (4, object): ['not_popular' < 'below_avg' < 'average' < 'popular']

In [23]:
df.head()

Unnamed: 0,Release_Date,Title,Popularity,Vote_Count,Vote_Average,Genre
0,1970,Spider-Man: No Way Home,5083.954,8940,popular,"Action, Adventure, Science Fiction"
1,1970,The Batman,3827.658,1151,popular,"Crime, Mystery, Thriller"
2,1970,No Exit,2618.087,122,below_avg,Thriller
3,1970,Encanto,2402.201,5076,popular,"Animation, Comedy, Family, Fantasy"
4,1970,The King's Man,1895.511,1793,average,"Action, Adventure, Thriller, War"


In [24]:

df["Vote_Average"].value_counts()

Vote_Average
not_popular    2467
popular        2450
average        2412
below_avg      2398
Name: count, dtype: int64

In [None]:
print(df["Vote_Average"].isnull().sum()) # Checking Number of NAN values

100


In [28]:
df.dropna(inplace=True)
df.isnull().sum()

Release_Date    0
Title           0
Popularity      0
Vote_Count      0
Vote_Average    0
Genre           0
dtype: int64

In [29]:
df.head()

Unnamed: 0,Release_Date,Title,Popularity,Vote_Count,Vote_Average,Genre
0,1970,Spider-Man: No Way Home,5083.954,8940,popular,"Action, Adventure, Science Fiction"
1,1970,The Batman,3827.658,1151,popular,"Crime, Mystery, Thriller"
2,1970,No Exit,2618.087,122,below_avg,Thriller
3,1970,Encanto,2402.201,5076,popular,"Animation, Comedy, Family, Fantasy"
4,1970,The King's Man,1895.511,1793,average,"Action, Adventure, Thriller, War"


* First, we split the Genre column into a list for each movie. Then, we use explode to expand the dataframe so that each movie–genre pair appears on its own row.

In [30]:
df["Genre"] = df["Genre"].str.split()
df = df.explode('Genre').reset_index(drop=True)
df.head()

Unnamed: 0,Release_Date,Title,Popularity,Vote_Count,Vote_Average,Genre
0,1970,Spider-Man: No Way Home,5083.954,8940,popular,"Action,"
1,1970,Spider-Man: No Way Home,5083.954,8940,popular,"Adventure,"
2,1970,Spider-Man: No Way Home,5083.954,8940,popular,Science
3,1970,Spider-Man: No Way Home,5083.954,8940,popular,Fiction
4,1970,The Batman,3827.658,1151,popular,"Crime,"


In [None]:
# df["Genre"].dtypes

dtype('O')

In [41]:
# Casting Genre Column into a category for better analysis
df["Genre"] = df['Genre'].astype('category')
df["Genre"].dtypes



CategoricalDtype(categories=['Action', 'Action,', 'Adventure', 'Adventure,', 'Animation',
                  'Animation,', 'Comedy', 'Comedy,', 'Crime', 'Crime,',
                  'Documentary', 'Documentary,', 'Drama', 'Drama,', 'Family',
                  'Family,', 'Fantasy', 'Fantasy,', 'Fiction', 'Fiction,',
                  'History', 'History,', 'Horror', 'Horror,', 'Movie',
                  'Movie,', 'Music', 'Music,', 'Mystery', 'Mystery,',
                  'Romance', 'Romance,', 'Science', 'TV', 'Thriller',
                  'Thriller,', 'War', 'War,', 'Western', 'Western,'],
, ordered=False, categories_dtype=object)

In [42]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27021 entries, 0 to 27020
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype   
---  ------        --------------  -----   
 0   Release_Date  27021 non-null  int32   
 1   Title         27021 non-null  object  
 2   Popularity    27021 non-null  float64 
 3   Vote_Count    27021 non-null  int64   
 4   Vote_Average  27021 non-null  category
 5   Genre         27021 non-null  category
dtypes: category(2), float64(1), int32(1), int64(1), object(1)
memory usage: 793.3+ KB


In [43]:
df.nunique()


Release_Date       1
Title           9415
Popularity      8088
Vote_Count      3265
Vote_Average       4
Genre             40
dtype: int64