In [11]:
import os
print(os.path.splitext('Netflix.csv'))


('Netflix', '.csv')


In [12]:
import pandas as pd
df = pd.read_csv('Netflix.csv', engine='python', on_bad_lines='skip', encoding_errors='ignore')
print(df.head())


  Release_Date                    Title  \
0   2021-12-15  Spider-Man: No Way Home   
1   2022-03-01               The Batman   
2   2022-02-25                  No Exit   
3   2021-11-24                  Encanto   
4   2021-12-22           The King's Man   

                                            Overview  Popularity Vote_Count  \
0  Peter Parker is unmasked and no longer able to...    5083.954       8940   
1  In his second year of fighting crime, Batman u...    3827.658       1151   
2  Stranded at a rest stop in the mountains durin...    2618.087        122   
3  The tale of an extraordinary family, the Madri...    2402.201       5076   
4  As a collection of history's worst tyrants and...    1895.511       1793   

  Vote_Average Original_Language                               Genre  \
0          8.3                en  Action, Adventure, Science Fiction   
1          8.1                en            Crime, Mystery, Thriller   
2          6.3                en                  

In [16]:
# Identify and handle missing values using .isnull() in Python or filters in Excel
print("Missing values in each column:")
print(df.isnull().sum())
print("=" * 50)
#Remove duplicate rows using
df.drop_duplicates(inplace=True)
print("Data after removing duplicates:")
print(df.head())

Missing values in each column:
Release_Date          0
Title                 9
Overview              9
Popularity           10
Vote_Count           10
Vote_Average         10
Original_Language    10
Genre                11
Poster_Url           11
dtype: int64
Data after removing duplicates:
  Release_Date                    Title  \
0   2021-12-15  Spider-Man: No Way Home   
1   2022-03-01               The Batman   
2   2022-02-25                  No Exit   
3   2021-11-24                  Encanto   
4   2021-12-22           The King's Man   

                                            Overview  Popularity Vote_Count  \
0  Peter Parker is unmasked and no longer able to...    5083.954       8940   
1  In his second year of fighting crime, Batman u...    3827.658       1151   
2  Stranded at a rest stop in the mountains durin...    2618.087        122   
3  The tale of an extraordinary family, the Madri...    2402.201       5076   
4  As a collection of history's worst tyrants and...  

In [21]:
#Standardize text values of Languages
df['Original_Language'] = df['Original_Language'].str.strip().str.lower().replace({
    'en':'english',
    'hi':'hindi',
    'fr':'french',
    'es':'spanish',
    'ja':'japanese',
    'de':'german',
})
print("Data after standardizing Language column:")
print(df[['Original_Language']].drop_duplicates().head())


Data after standardizing Language column:
   Original_Language
0            english
11          japanese
12            french
18             hindi
35           spanish


In [24]:
#Convert date formats to a consistent type
df['Release_Date'] = pd.to_datetime(df['Release_Date'], errors='coerce')
print("Data after converting Date_Added to datetime:")
print(df[['Release_Date']])


Data after converting Date_Added to datetime:
     Release_Date
0      2021-12-15
1      2022-03-01
2      2022-02-25
3      2021-11-24
4      2021-12-22
...           ...
9832   1973-10-15
9833   2020-10-01
9834   2016-05-06
9835   2021-03-31
9836   1984-09-23

[9837 rows x 1 columns]


In [25]:
#Rename column headers to be clean and uniform 
df.rename(columns=lambda x: x.strip().lower().replace(' ', '_'), inplace=True)
print("Data after renaming columns:")
print(df.head())

Data after renaming columns:
  release_date                    title  \
0   2021-12-15  Spider-Man: No Way Home   
1   2022-03-01               The Batman   
2   2022-02-25                  No Exit   
3   2021-11-24                  Encanto   
4   2021-12-22           The King's Man   

                                            overview  popularity vote_count  \
0  Peter Parker is unmasked and no longer able to...    5083.954       8940   
1  In his second year of fighting crime, Batman u...    3827.658       1151   
2  Stranded at a rest stop in the mountains durin...    2618.087        122   
3  The tale of an extraordinary family, the Madri...    2402.201       5076   
4  As a collection of history's worst tyrants and...    1895.511       1793   

  vote_average original_language                               genre  \
0          8.3           english  Action, Adventure, Science Fiction   
1          8.1           english            Crime, Mystery, Thriller   
2          6.3       

In [26]:
#Check and fix data types
print("Data types of each column:")
print(df.dtypes) 

Data types of each column:
release_date         datetime64[ns]
title                        object
overview                     object
popularity                  float64
vote_count                   object
vote_average                 object
original_language            object
genre                        object
poster_url                   object
dtype: object
