In [152]:
import pandas as pd
import numpy as np
import os
import seaborn as sns
import matplotlib.pyplot as plt
import scipy
from scipy import stats
from scipy.stats import pearsonr
import sklearn
from sklearn import preprocessing
from sklearn.preprocessing import scale
from pylab import rcParams
import matplotlib.ticker as mt
from matplotlib.ticker import ScalarFormatter

<a id='top'></a>
# World Food Programme: food price and affordability analysis
#### Data understanding and preparation
### [The World Bank: Population by country ](https://databank.worldbank.org/reports.aspx?source=2&series=SP.POP.TOTL&country=#) 

#### [Data understanding](#understanding)

#### [Data wrangling](#wrangling)

#### [Data cleaning](#cleaning)

#### [Distribution analysis](#distribution)

<a id='understanding'></a>
### Data understanding
[Back to top](#top)

In [153]:
path=r'C:\Users\frauz\Documents\Python Projects\Final Project\Data' #creating a path

In [154]:
df_pop=pd.read_csv(os.path.join(path,'country_iso_population.csv'),index_col=False) #importing data

In [155]:
df_pop.head()

Unnamed: 0,Series Name,Series Code,Country Name,Country Code,1990 [YR1990],1998 [YR1998],1999 [YR1999],2000 [YR2000],2001 [YR2001],2002 [YR2002],...,2013 [YR2013],2014 [YR2014],2015 [YR2015],2016 [YR2016],2017 [YR2017],2018 [YR2018],2019 [YR2019],2020 [YR2020],2021 [YR2021],2022 [YR2022]
0,"Population, total",SP.POP.TOTL,Afghanistan,AFG,10694796,18493132,19262847,19542982,19688632,21000256,...,31541209,32716210,33753499,34636207,35643418,36686784,37769499,38972230,40099462,41128771
1,"Population, total",SP.POP.TOTL,Albania,ALB,3286542,3128530,3108778,3089027,3060173,3051010,...,2895092,2889104,2880703,2876101,2873457,2866376,2854191,2837849,2811666,2775634
2,"Population, total",SP.POP.TOTL,Algeria,DZA,25518074,29924668,30346083,30774621,31200985,31624696,...,38000626,38760168,39543154,40339329,41136546,41927007,42705368,43451666,44177969,44903225
3,"Population, total",SP.POP.TOTL,American Samoa,ASM,47818,56603,57594,58230,58324,58177,...,52995,52217,51368,50448,49463,48424,47321,46189,45035,44273
4,"Population, total",SP.POP.TOTL,Andorra,AND,53569,65186,65655,66097,67820,70849,...,71367,71621,71746,72540,73837,75013,76343,77700,79034,79824


In [156]:
df_pop.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 271 entries, 0 to 270
Data columns (total 30 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Series Name    268 non-null    object
 1   Series Code    266 non-null    object
 2   Country Name   266 non-null    object
 3   Country Code   266 non-null    object
 4   1990 [YR1990]  266 non-null    object
 5   1998 [YR1998]  266 non-null    object
 6   1999 [YR1999]  266 non-null    object
 7   2000 [YR2000]  266 non-null    object
 8   2001 [YR2001]  266 non-null    object
 9   2002 [YR2002]  266 non-null    object
 10  2003 [YR2003]  266 non-null    object
 11  2004 [YR2004]  266 non-null    object
 12  2005 [YR2005]  266 non-null    object
 13  2006 [YR2006]  266 non-null    object
 14  2007 [YR2007]  266 non-null    object
 15  2008 [YR2008]  266 non-null    object
 16  2009 [YR2009]  266 non-null    object
 17  2010 [YR2010]  266 non-null    object
 18  2011 [YR2011]  266 non-null   

In [157]:
df_pop.describe()

Unnamed: 0,Series Name,Series Code,Country Name,Country Code,1990 [YR1990],1998 [YR1998],1999 [YR1999],2000 [YR2000],2001 [YR2001],2002 [YR2002],...,2013 [YR2013],2014 [YR2014],2015 [YR2015],2016 [YR2016],2017 [YR2017],2018 [YR2018],2019 [YR2019],2020 [YR2020],2021 [YR2021],2022 [YR2022]
count,268,266,266,266,266,266,266,266,266,266,...,266,266,266,266,266,266,266,266,266,266
unique,3,1,266,266,264,264,264,264,264,264,...,264,264,264,264,264,264,264,264,264,264
top,"Population, total",SP.POP.TOTL,Afghanistan,AFG,516629688,637198518,653883261,671212486,689161982,707693440,...,955096702,981506608,1008698799,1036155989,1063885274,1092403973,1121549049,1151302081,1181163013,1211170184
freq,266,266,1,1,2,2,2,2,2,2,...,2,2,2,2,2,2,2,2,2,2


<a id='wrangling'></a>
### Data wrangling
[Back to top](#top)

In [158]:
# Unpivoting the data

df_pop=pd.melt(df_pop,id_vars=['Series Name','Series Code','Country Name','Country Code'], var_name='year', value_name='population')
df_pop.head()

Unnamed: 0,Series Name,Series Code,Country Name,Country Code,year,population
0,"Population, total",SP.POP.TOTL,Afghanistan,AFG,1990 [YR1990],10694796
1,"Population, total",SP.POP.TOTL,Albania,ALB,1990 [YR1990],3286542
2,"Population, total",SP.POP.TOTL,Algeria,DZA,1990 [YR1990],25518074
3,"Population, total",SP.POP.TOTL,American Samoa,ASM,1990 [YR1990],47818
4,"Population, total",SP.POP.TOTL,Andorra,AND,1990 [YR1990],53569


In [159]:
# Dropping unnecessary columns
df_pop=df_pop.drop(columns=['Series Name','Series Code'])

In [160]:
#Renaming columns
df_pop.rename(columns={'Country Name':'country','Country Code':'iso'},inplace=True)

In [161]:
df_pop.head()

Unnamed: 0,country,iso,year,population
0,Afghanistan,AFG,1990 [YR1990],10694796
1,Albania,ALB,1990 [YR1990],3286542
2,Algeria,DZA,1990 [YR1990],25518074
3,American Samoa,ASM,1990 [YR1990],47818
4,Andorra,AND,1990 [YR1990],53569


In [162]:
#Splitting year column 

df_pop[['year','year code']]=df_pop['year'].str.split(' ', expand=True)

In [163]:
df_pop.head()

Unnamed: 0,country,iso,year,population,year code
0,Afghanistan,AFG,1990,10694796,[YR1990]
1,Albania,ALB,1990,3286542,[YR1990]
2,Algeria,DZA,1990,25518074,[YR1990]
3,American Samoa,ASM,1990,47818,[YR1990]
4,Andorra,AND,1990,53569,[YR1990]


In [164]:
# Dropping unnecessary columns

df_pop=df_pop.drop(columns=['year code'])

In [165]:
df_pop[['country','iso']].value_counts() #checking the number of records for each country

country        iso
Afghanistan    AFG    26
Norway         NOR    26
Mozambique     MOZ    26
Myanmar        MMR    26
Namibia        NAM    26
                      ..
Guam           GUM    26
Guatemala      GTM    26
Guinea         GIN    26
Guinea-Bissau  GNB    26
Zimbabwe       ZWE    26
Length: 266, dtype: int64

In [166]:
df_pop['year'].value_counts() #checking the number of records for each year

1990    271
1998    271
2021    271
2020    271
2019    271
2018    271
2017    271
2016    271
2015    271
2014    271
2013    271
2012    271
2011    271
2010    271
2009    271
2008    271
2007    271
2006    271
2005    271
2004    271
2003    271
2002    271
2001    271
2000    271
1999    271
2022    271
Name: year, dtype: int64

In [167]:
# Converting population to float is impossible since some values are recorded as '..'

df_pop[df_pop['population']=='..']

Unnamed: 0,country,iso,year,population
252,Not classified,INX,1990,..
523,Not classified,INX,1998,..
794,Not classified,INX,1999,..
1065,Not classified,INX,2000,..
1336,Not classified,INX,2001,..
1607,Not classified,INX,2002,..
1878,Not classified,INX,2003,..
2149,Not classified,INX,2004,..
2420,Not classified,INX,2005,..
2691,Not classified,INX,2006,..


In [168]:
# Removing the records where population is recorded as '..'

df_pop=df_pop.drop(df_pop[df_pop['population']=='..'].index)

In [169]:
# Changing data type

df_pop=df_pop.astype({'year':'int', 'population':'float'})

In [170]:
df_pop.head()

Unnamed: 0,country,iso,year,population
0,Afghanistan,AFG,1990,10694796.0
1,Albania,ALB,1990,3286542.0
2,Algeria,DZA,1990,25518074.0
3,American Samoa,ASM,1990,47818.0
4,Andorra,AND,1990,53569.0


In [171]:
df_pop.dtypes

country        object
iso            object
year            int32
population    float64
dtype: object

In [172]:
df_pop.describe()

Unnamed: 0,year,population
count,7020.0,6890.0
mean,2009.230769,279244300.0
std,8.049977,881232100.0
min,1990.0,9182.0
25%,2003.0,1411245.0
50%,2009.5,9374909.0
75%,2016.0,58322220.0
max,2022.0,7951150000.0


<a id='cleaning'></a>
### Data cleaning
[Back to top](#top)

In [173]:
# Checking for missing values
df_pop.isnull().sum()

country       130
iso           130
year            0
population    130
dtype: int64

In [174]:
df_pop[df_pop['country'].isnull()]

Unnamed: 0,country,iso,year,population
266,,,1990,
267,,,1990,
268,,,1990,
269,,,1990,
270,,,1990,
...,...,...,...,...
7041,,,2022,
7042,,,2022,
7043,,,2022,
7044,,,2022,


In [175]:
# Removing the records that don't contain country values

df_population=df_pop.dropna()

In [176]:
df_population.head()

Unnamed: 0,country,iso,year,population
0,Afghanistan,AFG,1990,10694796.0
1,Albania,ALB,1990,3286542.0
2,Algeria,DZA,1990,25518074.0
3,American Samoa,ASM,1990,47818.0
4,Andorra,AND,1990,53569.0


In [177]:
df_population.isnull().sum() #checking for missing values

country       0
iso           0
year          0
population    0
dtype: int64

In [178]:
df_population.reset_index(inplace=True) #reseting index after removing missing values

In [179]:
df_population.index #checking index

RangeIndex(start=0, stop=6890, step=1)

In [180]:
df_population.drop(columns=['index'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_population.drop(columns=['index'], inplace=True)


In [181]:
df_population.head()

Unnamed: 0,country,iso,year,population
0,Afghanistan,AFG,1990,10694796.0
1,Albania,ALB,1990,3286542.0
2,Algeria,DZA,1990,25518074.0
3,American Samoa,ASM,1990,47818.0
4,Andorra,AND,1990,53569.0


<a id='distribution'></a>
### Distribution analysis
[Back to top](#top)

In [182]:
# Verifying the maximum population value

df_population.loc[df_population['population'].idxmax()]

country              World
iso                    WLD
year                  2022
population    7951149546.0
Name: 6889, dtype: object

In [183]:
# Verifying the minimum population value

df_population.loc[df_population['population'].idxmin()]

country       Tuvalu
iso              TUV
year            1990
population    9182.0
Name: 201, dtype: object

In [184]:
df_population.describe()

Unnamed: 0,year,population
count,6890.0,6890.0
mean,2009.230769,279244300.0
std,8.049988,881232100.0
min,1990.0,9182.0
25%,2003.0,1411245.0
50%,2009.5,9374909.0
75%,2016.0,58322220.0
max,2022.0,7951150000.0


In [185]:
df_population.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6890 entries, 0 to 6889
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   country     6890 non-null   object 
 1   iso         6890 non-null   object 
 2   year        6890 non-null   int32  
 3   population  6890 non-null   float64
dtypes: float64(1), int32(1), object(2)
memory usage: 188.5+ KB


In [186]:
# Exporting the data for further analysis
df_population.to_pickle(os.path.join(path,'Data Prepared','population.pkl'))