# GDP Data Cleaning 

This notebook cleans the World Bank GDP data, keeping only the 2020 GDP values together with the Region and IncomeGroup variables. After cleaning, the data is merged with the Netflix data. 
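Every country in the Netflix dataset is kept in the final analytic dataset (left join on country name). World Bank countries that do not appear in the Netflix data are dropped, and Netflix countries whose name has no match in the World Bank data have missing GDP, Region, and IncomeGroup values.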

In [1]:
import pandas as pd 
import numpy as np 
import os 

In [2]:
# Switch to the project root so the relative output paths used later in this
# notebook ('gdp_data/gdp_final.csv', 'Netflix_GDP_Merged.csv') resolve inside
# the repository. The root can be overridden through the
# DS_NETFLIX_PROJECT_ROOT environment variable; the original checkout location
# stays the default, so existing runs behave exactly as before.
os.chdir(
    os.environ.get(
        "DS_NETFLIX_PROJECT_ROOT",
        "/Users/steffenerickson/Desktop/repos/DS-Netflix-Project",
    )
)

In [3]:
# World Bank GDP export. The path is relative to the project root that the
# os.chdir() cell above switches into, so it is not tied to one user's machine.
path_to_gdp = os.path.join("gdp_data", "API_NY.GDP.MKTP.CD_DS2_en_csv_v2_3628616.csv")

In [4]:
# World Bank country metadata file. The path is relative to the project root
# that the os.chdir() cell above switches into.
path_to_metadata = os.path.join(
    "gdp_data",
    "Metadata_Country_API_NY.GDP.MKTP.CD_DS2_en_csv_v2_3628616.csv",
)

In [5]:
# Netflix subscription-fee table. It sits directly in the project root, which
# the os.chdir() cell above makes the working directory.
path_to_netflix = "Netflix subscription fee Dec-2021.csv"

 ### Cleaning GDP Data 

In [6]:
# Load the raw GDP export with every column kept as text.
# skiprows=3 drops the first three lines of the file, and header=0 takes the
# next parsed line as the column names.
df_gdp = pd.read_csv(
    path_to_gdp,
    dtype=str,
    header=0,
    skiprows=3,
)

In [7]:
# Keep only the two identifier columns and the 2020 value column. The .copy()
# makes df_gdp2 independent of df_gdp, so later edits do not touch the raw frame.
df_gdp2 = df_gdp.loc[:, ['Country Code', 'Country Name', '2020']].copy()

In [8]:
# Give the value column an explicit name, then convert it from text to float
# with one vectorized cast instead of a per-element Python lambda.
# Missing entries remain NaN; a non-numeric string still raises ValueError.
df_gdp2 = (
    df_gdp2
    .rename(columns={'2020': '2020_GDP'})
    .astype({'2020_GDP': float})
)

In [9]:
# Preview the first five rows of the trimmed GDP table.
df_gdp2.head(n=5)

Unnamed: 0,Country Code,Country Name,2020_GDP
0,ABW,Aruba,
1,AFE,Africa Eastern and Southern,920792300000.0
2,AFG,Afghanistan,20116140000.0
3,AFW,Africa Western and Central,784587600000.0
4,AGO,Angola,58375980000.0


### Cleaning Meta Data 

In [10]:
# Load the country metadata as text, keeping only the first three columns
# (selected by position). The first line of the file supplies the column names.
df_meta = pd.read_csv(
    path_to_metadata,
    usecols=[0, 1, 2],
    dtype=str,
    header=0,
)

In [11]:
# Only the last expression of a cell is rendered automatically, so the preview
# is shown explicitly with display(); the row count remains the cell's output.
display(df_meta.head())
len(df_meta)

265

### Merging data 

In [12]:
# Attach the metadata columns to the GDP values, matching on "Country Code".
# how="right" keeps every row of df_meta, even where df_gdp2 has no matching code.
GDP_Final = df_gdp2.merge(df_meta, how="right", on="Country Code")

In [20]:
# Inspect the first 30 rows of the merged GDP / metadata table.
GDP_Final.head(n=30)

Unnamed: 0,Country Code,Country Name,2020_GDP,Region,IncomeGroup
0,ABW,Aruba,,Latin America & Caribbean,High income
1,AFE,Africa Eastern and Southern,920792300000.0,,
2,AFG,Afghanistan,20116140000.0,South Asia,Low income
3,AFW,Africa Western and Central,784587600000.0,,
4,AGO,Angola,58375980000.0,Sub-Saharan Africa,Lower middle income
5,ALB,Albania,14887630000.0,Europe & Central Asia,Upper middle income
6,AND,Andorra,,Europe & Central Asia,High income
7,ARB,Arab World,2447584000000.0,,
8,ARE,United Arab Emirates,358868800000.0,Middle East & North Africa,High income
9,ARG,Argentina,389288100000.0,Latin America & Caribbean,Upper middle income


### Exporting GDP Data 

In [21]:
# Save the merged GDP table; the path is relative to the working directory.
# The DataFrame index is written as the first, unnamed column (pandas default).
GDP_Final.to_csv(path_or_buf='gdp_data/gdp_final.csv')

### Merging with Netflix data

In [22]:
# Read the Netflix table; the first line of the file supplies the column names.
netflix = pd.read_csv(
    path_to_netflix,
    header=0,
)

In [23]:
# Rename the country column to 'Country Name', the key used for the merge
# with GDP_Final.
netflix = netflix.rename({'Country': 'Country Name'}, axis='columns')

In [24]:
# Left join on the country name: every Netflix row is kept, and rows whose
# name has no match in GDP_Final get NaN in the columns coming from GDP_Final.
netflix_gdp = netflix.merge(GDP_Final, how="left", on="Country Name")

In [25]:
# Remove the 'Country Code' column brought in from GDP_Final.
netflix_gdp2 = netflix_gdp.drop('Country Code', axis='columns')

In [26]:
# Restore the Netflix table's original column name for the country.
netflix_gdp2 = netflix_gdp2.rename({'Country Name': 'Country'}, axis='columns')

In [27]:
# Preview the first ten rows of the merged Netflix / GDP table.
netflix_gdp2.head(n=10)

Unnamed: 0,Country_code,Country,Total Library Size,No. of TV Shows,No. of Movies,Cost Per Month - Basic ($),Cost Per Month - Standard ($),Cost Per Month - Premium ($),2020_GDP,Region,IncomeGroup
0,ar,Argentina,4760,3154,1606,3.74,6.3,9.26,389288100000.0,Latin America & Caribbean,Upper middle income
1,au,Australia,6114,4050,2064,7.84,12.12,16.39,1327836000000.0,East Asia & Pacific,High income
2,at,Austria,5640,3779,1861,9.03,14.67,20.32,433258500000.0,Europe & Central Asia,High income
3,be,Belgium,4990,3374,1616,10.16,15.24,20.32,521861300000.0,Europe & Central Asia,High income
4,bo,Bolivia,4991,3155,1836,7.99,10.99,13.99,36572760000.0,Latin America & Caribbean,Lower middle income
5,br,Brazil,4972,3162,1810,4.61,7.11,9.96,1444733000000.0,Latin America & Caribbean,Upper middle income
6,bg,Bulgaria,6797,4819,1978,9.03,11.29,13.54,69889350000.0,Europe & Central Asia,Upper middle income
7,ca,Canada,6239,4311,1928,7.91,11.87,15.03,1645423000000.0,North America,High income
8,cl,Chile,4994,3156,1838,7.07,9.91,12.74,252940000000.0,Latin America & Caribbean,High income
9,co,Colombia,4991,3156,1835,4.31,6.86,9.93,271437600000.0,Latin America & Caribbean,Upper middle income


### Exporting Final Analytic Dataset

In [28]:
# Write the final analytic dataset; the path is relative to the working directory.
# The DataFrame index is written as the first, unnamed column (pandas default).
netflix_gdp2.to_csv(path_or_buf='Netflix_GDP_Merged.csv')