In [33]:
# 1. Import Required Libraries
import pandas as pd
import numpy as np

In [8]:
# 2. Load Dataset
df = pd.read_csv("../data/raw/mymoviedb.csv",lineterminator='\n')
df.head()

Unnamed: 0,Release_Date,Title,Overview,Popularity,Vote_Count,Vote_Average,Original_Language,Genre,Poster_Url
0,2021-12-15,Spider-Man: No Way Home,Peter Parker is unmasked and no longer able to...,5083.954,8940,8.3,en,"Action, Adventure, Science Fiction",https://image.tmdb.org/t/p/original/1g0dhYtq4i...
1,2022-03-01,The Batman,"In his second year of fighting crime, Batman u...",3827.658,1151,8.1,en,"Crime, Mystery, Thriller",https://image.tmdb.org/t/p/original/74xTEgt7R3...
2,2022-02-25,No Exit,Stranded at a rest stop in the mountains durin...,2618.087,122,6.3,en,Thriller,https://image.tmdb.org/t/p/original/vDHsLnOWKl...
3,2021-11-24,Encanto,"The tale of an extraordinary family, the Madri...",2402.201,5076,7.7,en,"Animation, Comedy, Family, Fantasy",https://image.tmdb.org/t/p/original/4j0PNHkMr5...
4,2021-12-22,The King's Man,As a collection of history's worst tyrants and...,1895.511,1793,7.0,en,"Action, Adventure, Thriller, War",https://image.tmdb.org/t/p/original/aq4Pwv5Xeu...


In [13]:
# 3. Convert Release_Date to Datetime
df["Release_Date"] = pd.to_datetime(df["Release_Date"])
df["Release_Date"].dtype

dtype('<M8[ns]')

In [15]:
# 4. Extract Release Year
df["Release_Year"] = df["Release_Date"].dt.year
df[["Release_Year","Release_Date"]].head()

Unnamed: 0,Release_Year,Release_Date
0,2021,2021-12-15
1,2022,2022-03-01
2,2022,2022-02-25
3,2021,2021-11-24
4,2021,2021-12-22


In [17]:
# 5. Remove Duplicates
df.duplicated().sum()
df = df.drop_duplicates()

In [18]:
# 6. Check Missing Values
df.isnull().sum()

Release_Date         0
Title                0
Overview             0
Popularity           0
Vote_Count           0
Vote_Average         0
Original_Language    0
Genre                0
Poster_Url           0
Release_Year         0
dtype: int64

In [24]:
# 7. Clean Genre Column
df["Genre"] = df["Genre"].str.strip()
df['Genre'] = df['Genre'].str.replace(" ", "")

In [23]:
df.head()

Unnamed: 0,Release_Date,Title,Overview,Popularity,Vote_Count,Vote_Average,Original_Language,Genre,Poster_Url,Release_Year
0,2021-12-15,Spider-Man: No Way Home,Peter Parker is unmasked and no longer able to...,5083.954,8940,8.3,en,"Action, Adventure, Science Fiction",https://image.tmdb.org/t/p/original/1g0dhYtq4i...,2021
1,2022-03-01,The Batman,"In his second year of fighting crime, Batman u...",3827.658,1151,8.1,en,"Crime, Mystery, Thriller",https://image.tmdb.org/t/p/original/74xTEgt7R3...,2022
2,2022-02-25,No Exit,Stranded at a rest stop in the mountains durin...,2618.087,122,6.3,en,Thriller,https://image.tmdb.org/t/p/original/vDHsLnOWKl...,2022
3,2021-11-24,Encanto,"The tale of an extraordinary family, the Madri...",2402.201,5076,7.7,en,"Animation, Comedy, Family, Fantasy",https://image.tmdb.org/t/p/original/4j0PNHkMr5...,2021
4,2021-12-22,The King's Man,As a collection of history's worst tyrants and...,1895.511,1793,7.0,en,"Action, Adventure, Thriller, War",https://image.tmdb.org/t/p/original/aq4Pwv5Xeu...,2021


In [25]:
df["Primary_Genre"] = df["Genre"].str.split(',').str[0]

In [26]:
# 9. Drop Unnecessary Columns
df = df.drop(columns=['Poster_Url'])

In [27]:
# 10. Reorder Columns
df = df[
    ['Title',
     'Release_Date',
     'Release_Year',
     'Overview',
     'Popularity',
     'Vote_Count',
     'Vote_Average',
     'Original_Language',
     'Genre',
     'Primary_Genre']
]

In [28]:
df.head()

Unnamed: 0,Title,Release_Date,Release_Year,Overview,Popularity,Vote_Count,Vote_Average,Original_Language,Genre,Primary_Genre
0,Spider-Man: No Way Home,2021-12-15,2021,Peter Parker is unmasked and no longer able to...,5083.954,8940,8.3,en,"Action,Adventure,ScienceFiction",Action
1,The Batman,2022-03-01,2022,"In his second year of fighting crime, Batman u...",3827.658,1151,8.1,en,"Crime,Mystery,Thriller",Crime
2,No Exit,2022-02-25,2022,Stranded at a rest stop in the mountains durin...,2618.087,122,6.3,en,Thriller,Thriller
3,Encanto,2021-11-24,2021,"The tale of an extraordinary family, the Madri...",2402.201,5076,7.7,en,"Animation,Comedy,Family,Fantasy",Animation
4,The King's Man,2021-12-22,2021,As a collection of history's worst tyrants and...,1895.511,1793,7.0,en,"Action,Adventure,Thriller,War",Action


In [30]:
# 11. Save Cleaned Dataset
df.to_excel("../data/clean/mymoviedb_cleaned.xlsx", index=False)

<br></br>

## Preprocessing Summary

- Converted Release_Date to datetime format.
- Extracted Release_Year for trend analysis.
- Checked and removed duplicate records.
- Verified missing values.
- Standardized Genre column.
- Created Primary_Genre feature.
- Dropped unnecessary columns (Poster_Url).
- Saved cleaned dataset for analysis stage.