# DATA CLEANING FOR NETFLIX DATASET USING PANDAS

### First the environment created by importing necessary libraries
> And first look to dataset shows us some null values. This must be handled before data analysis. 

In [112]:
import pandas  as pd
import seaborn as sns

df = pd.read_csv("/Users/boryabaghir/Desktop/netflix.csv")
df

Unnamed: 0,title,type,genres,releaseYear,imdbId,imdbAverageRating,imdbNumVotes,availableCountries
0,Forrest Gump,movie,"Drama, Romance",1994.0,tt0109830,8.8,2314731.0,MX
1,The Fifth Element,movie,"Action, Adventure, Sci-Fi",1997.0,tt0119116,7.6,516750.0,"AT, CH, DE"
2,Kill Bill: Vol. 1,movie,"Action, Crime, Thriller",2003.0,tt0266697,8.2,1221088.0,"AE, AL, AO, AT, AU, AZ, BG, BH, BY, CA, CI, CM..."
3,Jarhead,movie,"Biography, Drama, War",2005.0,tt0418763,7.0,211418.0,"AD, AE, AG, AL, AO, AR, AT, AZ, BA, BB, BE, BG..."
4,Unforgiven,movie,"Drama, Western",1992.0,tt0105695,8.2,443508.0,"AU, BA, BE, BG, CZ, HR, HU, MD, ME, MK, NZ, PL..."
...,...,...,...,...,...,...,...,...
15855,,tv,"Drama, Mystery",,,,,"AD, AE, AG, AL, AO, AR, AT, AU, AZ, BA, BB, BE..."
15856,,tv,,2024.0,,,,"HK, ID, IN, KR, MY, PH, SG, TH"
15857,,tv,,2024.0,,,,"AL, AO, AU, AZ, BA, BB, BG, BM, BS, BY, BZ, CA..."
15858,,tv,,,,,,"AG, AO, AR, AU, BB, BM, BO, BS, BZ, CA, CI, CL..."


## Checking for the null values
> Using pandas built in functions to check if there is any null values in the dataset

In [113]:
df.isna().sum()

title                 497
type                    0
genres                147
releaseYear            12
imdbId                679
imdbAverageRating     724
imdbNumVotes          724
availableCountries      0
dtype: int64

## Handling missing values
1. The title of movies are specific and string values and must be unique so the only option to handle missing values for movie names is to drop null values.
2. Dropping null values for imdbId where there is no id generated. 

In [114]:
dfr = df.dropna(subset=['title'])
dfr = df.dropna(subset=['imdbId'])
dfr

Unnamed: 0,title,type,genres,releaseYear,imdbId,imdbAverageRating,imdbNumVotes,availableCountries
0,Forrest Gump,movie,"Drama, Romance",1994.0,tt0109830,8.8,2314731.0,MX
1,The Fifth Element,movie,"Action, Adventure, Sci-Fi",1997.0,tt0119116,7.6,516750.0,"AT, CH, DE"
2,Kill Bill: Vol. 1,movie,"Action, Crime, Thriller",2003.0,tt0266697,8.2,1221088.0,"AE, AL, AO, AT, AU, AZ, BG, BH, BY, CA, CI, CM..."
3,Jarhead,movie,"Biography, Drama, War",2005.0,tt0418763,7.0,211418.0,"AD, AE, AG, AL, AO, AR, AT, AZ, BA, BB, BE, BG..."
4,Unforgiven,movie,"Drama, Western",1992.0,tt0105695,8.2,443508.0,"AU, BA, BE, BG, CZ, HR, HU, MD, ME, MK, NZ, PL..."
...,...,...,...,...,...,...,...,...
15848,Blood Legacy,tv,"Drama, Thriller",2024.0,tt33344252,7.4,105.0,"AD, AE, AG, AL, AO, AR, AT, AU, AZ, BA, BB, BE..."
15849,Chef's Table: Noodles,tv,Documentary,2024.0,tt33323674,7.5,212.0,"AD, AE, AL, AO, AR, AT, AU, AZ, BA, BB, BE, BG..."
15851,After School Doctor,tv,Drama,2024.0,tt33889224,,,"JP, LI, MC"
15854,Making It in Marbella,tv,Reality-TV,2024.0,tt31828643,4.6,144.0,"AD, AE, AL, AO, AR, AT, AU, AZ, BA, BB, BE, BG..."


## Handling missing values continued with filling missing values instead of dropping
3. Handling missing values for the values that could be averaged or found best fit values based on column data. 
4. so for average ratings I choose to mean the value and get average of average for those missing values
5. For the number of votes mean wouldn't work because the data in this column is skewed, means there is high and low values with massive difference that's why I choose to median it to find the best fit data to fill in

In [115]:
dfr['imdbAverageRating'] = dfr['imdbAverageRating'].fillna(dfr['imdbAverageRating'].mean())
dfr['imdbNumVotes'] = dfr['imdbNumVotes'].fillna(dfr['imdbNumVotes'].median())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfr['imdbAverageRating'] = dfr['imdbAverageRating'].fillna(dfr['imdbAverageRating'].mean())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfr['imdbNumVotes'] = dfr['imdbNumVotes'].fillna(dfr['imdbNumVotes'].median())


### Checking for final version of cleaned data again

In [116]:
dfr.isnull().sum()

title                 0
type                  0
genres                0
releaseYear           0
imdbId                0
imdbAverageRating     0
imdbNumVotes          0
availableCountries    0
dtype: int64

In [117]:
dfr

Unnamed: 0,title,type,genres,releaseYear,imdbId,imdbAverageRating,imdbNumVotes,availableCountries
0,Forrest Gump,movie,"Drama, Romance",1994.0,tt0109830,8.800000,2314731.0,MX
1,The Fifth Element,movie,"Action, Adventure, Sci-Fi",1997.0,tt0119116,7.600000,516750.0,"AT, CH, DE"
2,Kill Bill: Vol. 1,movie,"Action, Crime, Thriller",2003.0,tt0266697,8.200000,1221088.0,"AE, AL, AO, AT, AU, AZ, BG, BH, BY, CA, CI, CM..."
3,Jarhead,movie,"Biography, Drama, War",2005.0,tt0418763,7.000000,211418.0,"AD, AE, AG, AL, AO, AR, AT, AZ, BA, BB, BE, BG..."
4,Unforgiven,movie,"Drama, Western",1992.0,tt0105695,8.200000,443508.0,"AU, BA, BE, BG, CZ, HR, HU, MD, ME, MK, NZ, PL..."
...,...,...,...,...,...,...,...,...
15848,Blood Legacy,tv,"Drama, Thriller",2024.0,tt33344252,7.400000,105.0,"AD, AE, AG, AL, AO, AR, AT, AU, AZ, BA, BB, BE..."
15849,Chef's Table: Noodles,tv,Documentary,2024.0,tt33323674,7.500000,212.0,"AD, AE, AL, AO, AR, AT, AU, AZ, BA, BB, BE, BG..."
15851,After School Doctor,tv,Drama,2024.0,tt33889224,6.491675,2093.0,"JP, LI, MC"
15854,Making It in Marbella,tv,Reality-TV,2024.0,tt31828643,4.600000,144.0,"AD, AE, AL, AO, AR, AT, AU, AZ, BA, BB, BE, BG..."


### Checking for duplicated data 
> And there is no any duplicates in our dataset

In [118]:
dfr.duplicated().sum()

0

### Downloading the cleaned data for data analysis using bigQuery

In [119]:
df.to_csv('/Users/boryabaghir/Desktop/python pandas/dfr.csv', index=False)


