In [1]:
import pandas as pd
import numpy as np
import os
import seaborn as sns
import matplotlib.pyplot as plt
import scipy
from scipy import stats
from scipy.stats import pearsonr
import sklearn
from sklearn import preprocessing
from sklearn.preprocessing import scale
from pylab import rcParams
import matplotlib.ticker as mt
from matplotlib.ticker import ScalarFormatter
from datetime import datetime
import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff

<a id='top'></a>
# World Food Programme: food price and affordability analysis
#### Data understanding and preparation
### [World Food Programme: Commodity prices in 99 countries](https://data.humdata.org/dataset?dataseries_name=WFP+-+Food+Prices)

#### [Data understanding](#understanding)

#### [Data wrangling](#wrangling)

#### [Data cleaning](#cleaning)

#### [Distribution analysis](#distribution)


<a id='understanding'></a>
### Data understanding
[Back to top](#top)

In [2]:
# Folder holding the prepared data. Set the WFP_DATA_DIR environment variable to point
# the notebook at another location; the original local folder stays the default.
path=os.environ.get('WFP_DATA_DIR', r'C:\Users\frauz\Documents\Python Projects\Final Project\Data\Data Prepared')

In [3]:
# Load the prepared WFP food price table from the data folder.
# NOTE(review): unpickling can execute arbitrary code - only load pickle files from a trusted source.
df_food=pd.read_pickle(os.path.join(path,'global_food_prices.pkl')) # importing data

In [4]:
df_food.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3542751 entries, 0 to 3542750
Data columns (total 16 columns):
 #   Column     Dtype  
---  ------     -----  
 0   index      int64  
 1   date       object 
 2   admin1     object 
 3   admin2     object 
 4   market     object 
 5   latitude   float64
 6   longitude  float64
 7   category   object 
 8   commodity  object 
 9   unit       object 
 10  priceflag  object 
 11  pricetype  object 
 12  currency   object 
 13  price      float64
 14  usdprice   float64
 15  iso        object 
dtypes: float64(4), int64(1), object(11)
memory usage: 432.5+ MB


In [5]:
# Removing an extra index column.
# The frame is re-assigned (no inplace=True) and a missing column is ignored, so
# re-running this cell after 'index' is already gone does not raise KeyError.

df_food=df_food.drop(columns='index', errors='ignore')

In [6]:
df_food.index

RangeIndex(start=0, stop=3542751, step=1)

In [7]:
# basic statistics, every value rendered as a string with five decimals
df_food.describe().apply(lambda column: column.map('{0:.5f}'.format))

Unnamed: 0,latitude,longitude,price,usdprice
count,3515876.0,3515876.0,3542751.0,3538809.0
mean,13.5938,37.25309,6791.21714,13.54921
std,16.61378,46.51168,83441.09058,1075.81118
min,-34.61,-107.386,0.0,0.0
25%,1.7268,8.68138,1.54,0.1678
50%,12.32634,34.25617,130.5,0.7407
75%,27.65266,71.55583,1200.0,1.995
max,59.93,179.37736,17250000.0,599999.9911


In [8]:
df_food.head()

Unnamed: 0,date,admin1,admin2,market,latitude,longitude,category,commodity,unit,priceflag,pricetype,currency,price,usdprice,iso
0,2000-01-15,Badakhshan,Faiz Abad,Fayzabad,37.116638,70.580022,non-food,"Wage (non-qualified labour, non-agricultural)",Day,actual,Retail,AFN,150000.0,3205.7378,AFG
1,2000-01-15,Balkh,Mazar-e-Sharif,Mazar,36.725116,67.109571,non-food,"Wage (non-qualified labour, non-agricultural)",Day,actual,Retail,AFN,100000.0,2137.1586,AFG
2,2000-01-15,Hirat,Hirat,Hirat,34.346944,62.198333,cereals and tubers,Bread,KG,actual,Retail,AFN,15.63,0.334,AFG
3,2000-01-15,Hirat,Hirat,Hirat,34.346944,62.198333,cereals and tubers,Wheat,KG,actual,Retail,AFN,9.13,0.1951,AFG
4,2000-01-15,Hirat,Hirat,Hirat,34.346944,62.198333,cereals and tubers,Wheat flour,KG,actual,Retail,AFN,10.06,0.215,AFG


In [9]:
df_food.index # index

RangeIndex(start=0, stop=3542751, step=1)

<a id='wrangling'></a>
### Data wrangling
[Back to top](#top)

In [10]:
# Analyzing priceflag and pricetype columns

df_food[['priceflag','pricetype']].value_counts()

priceflag         pricetype
actual            Retail       1939444
forecast          Retail        754906
aggregate         Retail        550001
actual            Wholesale     158295
aggregate         Wholesale      69322
forecast          Wholesale      61364
actual,aggregate  Retail          7813
                  Wholesale        801
actual            Farm Gate        437
                  Producer         248
forecast          Producer          72
                  Farm Gate         48
dtype: int64

In [11]:
# Dropping forecast rows so that only observed prices remain ('actual', 'aggregate'
# and 'actual,aggregate'). NOTE: this step filters on priceflag only - rows with a
# non-retail pricetype (Wholesale, Farm Gate, Producer) are still in the dataframe.

df_food=df_food.loc[df_food['priceflag'].ne('forecast')]

In [12]:
df_food[['priceflag','pricetype']].value_counts()

priceflag         pricetype
actual            Retail       1939444
aggregate         Retail        550001
actual            Wholesale     158295
aggregate         Wholesale      69322
actual,aggregate  Retail          7813
                  Wholesale        801
actual            Farm Gate        437
                  Producer         248
dtype: int64

In [13]:
df_food['iso'].nunique()

99

In [14]:
df_food.shape

(2726361, 15)

In [15]:
# Analyzing the category and commodity columns to figure out which records could be removed
# (cell values are row counts per commodity/category pair)

pivot_table=df_food.pivot_table(values='date', index=['commodity'], columns=['category'], aggfunc='count')

pivot_table # the dataframe contains 730 commodities and 8 categories

category,cereals and tubers,"meat, fish and eggs",milk and dairy,miscellaneous food,non-food,oil and fats,pulses and nuts,vegetables and fruits
commodity,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Alcohol (spray),,,,,2470.0,,,
Anchovies,,1640.0,,,,,,
Antibacterial wipes,,,,,195.0,,,
Antibiotics (imported),,,,,1350.0,,,
Antibiotics (local),,,,,1350.0,,,
...,...,...,...,...,...,...,...,...
Yam (yellow),157.0,,,,,,,
Yeast,,,,131.0,,,,
Yogurt,,,6364.0,,,,,
Young fern,,,,,,,,35.0


In [16]:
# Analyzing the non-food category: number of rows per non-food commodity

df_food.loc[df_food['category']=='non-food', 'commodity'].value_counts()

Fuel (diesel)                  35670
Fuel (petrol-gasoline)         24929
Fuel (gas)                     12573
Wage (non-qualified labour)     9948
Exchange rate (unofficial)      7977
                               ...  
Pole                              87
Straw                             63
Transport (public, moto)          42
Feed (wheat bran)                 16
Cotton                            10
Name: commodity, Length: 85, dtype: int64

In [17]:
# Working copy for the unit/commodity wrangling below. Despite the name, no rows are
# filtered at this point: non-food commodities are still present in df_just_food.
df_just_food=df_food.copy(deep=True)
df_just_food.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2726361 entries, 0 to 3541113
Data columns (total 15 columns):
 #   Column     Dtype  
---  ------     -----  
 0   date       object 
 1   admin1     object 
 2   admin2     object 
 3   market     object 
 4   latitude   float64
 5   longitude  float64
 6   category   object 
 7   commodity  object 
 8   unit       object 
 9   priceflag  object 
 10  pricetype  object 
 11  currency   object 
 12  price      float64
 13  usdprice   float64
 14  iso        object 
dtypes: float64(4), object(11)
memory usage: 332.8+ MB


In [18]:
# Analyzing the commodity and unit columns: row counts per commodity/unit pair

df_just_food.pivot_table(values='date', index=['commodity'], columns=['unit'], aggfunc='count')

unit,0.13 KG,0.5 KG,0.8 KG,1 GB,1 piece,1 sachet,1 ticket,1.1 KG,1.2 KG,1.3 KG,...,Month,Package,Packet,Pair,Pile,Pound,Sack,USD/LCU,Unit,kWh
commodity,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Alcohol (spray),,,,,,,,,,,...,,,,,,,,,,
Anchovies,,,,,,,,,,,...,,,,,,,,,,
Antibacterial wipes,,,,,,,,,,,...,,,195.0,,,,,,,
Antibiotics (imported),,,,,,,,,,,...,,,,,,,,,,
Antibiotics (local),,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Yam (yellow),,,,,,,,,,,...,,,,,,,,,,
Yeast,,,,,,,,,,,...,,,,,,,,,,
Yogurt,,,,,,,,,,,...,,,,,,,,,,
Young fern,,,,,,,,,,,...,,,,,,,,,,


In [19]:
# Analyzing the unit column

df_just_food['unit'].value_counts() # the Length field of the output is the number of distinct units (156 in the saved output)

KG         1988591
L           197180
100 KG       62897
Unit         54099
Head         25092
            ...   
0.13 KG         17
450 ML          17
240 G           17
450 G           17
4.5 KG           2
Name: unit, Length: 156, dtype: int64

To be able to compare the prices of various products, I need to unify the units according to the following plan:
1. Split the unit column into two columns: measure (the quantity) and measure_unit (the unit of measure)
2. Calculate the price per KG, the price per L, and the price per single unit of each product

In [20]:
# Splitting the unit column on the first space only, e.g. "100 KG" -> ('100', 'KG').
# n=1 caps the result at two columns, so a unit label containing two or more spaces
# cannot break the two-column assignment. Labels without a space land entirely in
# 'measure' and get measure_unit=None; the following cells repair those rows.

df_just_food[['measure','measure_unit']]=df_just_food['unit'].str.split(' ', n=1, expand=True)

In [21]:
df_just_food.head()

Unnamed: 0,date,admin1,admin2,market,latitude,longitude,category,commodity,unit,priceflag,pricetype,currency,price,usdprice,iso,measure,measure_unit
0,2000-01-15,Badakhshan,Faiz Abad,Fayzabad,37.116638,70.580022,non-food,"Wage (non-qualified labour, non-agricultural)",Day,actual,Retail,AFN,150000.0,3205.7378,AFG,Day,
1,2000-01-15,Balkh,Mazar-e-Sharif,Mazar,36.725116,67.109571,non-food,"Wage (non-qualified labour, non-agricultural)",Day,actual,Retail,AFN,100000.0,2137.1586,AFG,Day,
2,2000-01-15,Hirat,Hirat,Hirat,34.346944,62.198333,cereals and tubers,Bread,KG,actual,Retail,AFN,15.63,0.334,AFG,KG,
3,2000-01-15,Hirat,Hirat,Hirat,34.346944,62.198333,cereals and tubers,Wheat,KG,actual,Retail,AFN,9.13,0.1951,AFG,KG,
4,2000-01-15,Hirat,Hirat,Hirat,34.346944,62.198333,cereals and tubers,Wheat flour,KG,actual,Retail,AFN,10.06,0.215,AFG,KG,


In [22]:
# Rows whose unit had no space-separated prefix (measure_unit is still empty) are
# treated as exactly one of that unit: measure becomes the string '1'.

df_just_food.loc[df_just_food['measure_unit'].isna(), 'measure'] = '1'

In [23]:
df_just_food.head()

Unnamed: 0,date,admin1,admin2,market,latitude,longitude,category,commodity,unit,priceflag,pricetype,currency,price,usdprice,iso,measure,measure_unit
0,2000-01-15,Badakhshan,Faiz Abad,Fayzabad,37.116638,70.580022,non-food,"Wage (non-qualified labour, non-agricultural)",Day,actual,Retail,AFN,150000.0,3205.7378,AFG,1,
1,2000-01-15,Balkh,Mazar-e-Sharif,Mazar,36.725116,67.109571,non-food,"Wage (non-qualified labour, non-agricultural)",Day,actual,Retail,AFN,100000.0,2137.1586,AFG,1,
2,2000-01-15,Hirat,Hirat,Hirat,34.346944,62.198333,cereals and tubers,Bread,KG,actual,Retail,AFN,15.63,0.334,AFG,1,
3,2000-01-15,Hirat,Hirat,Hirat,34.346944,62.198333,cereals and tubers,Wheat,KG,actual,Retail,AFN,9.13,0.1951,AFG,1,
4,2000-01-15,Hirat,Hirat,Hirat,34.346944,62.198333,cereals and tubers,Wheat flour,KG,actual,Retail,AFN,10.06,0.215,AFG,1,


In [24]:
df_just_food['measure'].value_counts()

1       2381617
100       74709
10        25311
50        21231
90        14836
         ...   
450          34
240          17
180          17
0.13         17
4.5           2
Name: measure, Length: 94, dtype: int64

In [25]:
# Imputing missing measure_unit values with the values from the unit column

df_just_food['measure_unit']=df_just_food['measure_unit'].fillna(df_just_food['unit'])

In [26]:
df_just_food.head()

Unnamed: 0,date,admin1,admin2,market,latitude,longitude,category,commodity,unit,priceflag,pricetype,currency,price,usdprice,iso,measure,measure_unit
0,2000-01-15,Badakhshan,Faiz Abad,Fayzabad,37.116638,70.580022,non-food,"Wage (non-qualified labour, non-agricultural)",Day,actual,Retail,AFN,150000.0,3205.7378,AFG,1,Day
1,2000-01-15,Balkh,Mazar-e-Sharif,Mazar,36.725116,67.109571,non-food,"Wage (non-qualified labour, non-agricultural)",Day,actual,Retail,AFN,100000.0,2137.1586,AFG,1,Day
2,2000-01-15,Hirat,Hirat,Hirat,34.346944,62.198333,cereals and tubers,Bread,KG,actual,Retail,AFN,15.63,0.334,AFG,1,KG
3,2000-01-15,Hirat,Hirat,Hirat,34.346944,62.198333,cereals and tubers,Wheat,KG,actual,Retail,AFN,9.13,0.1951,AFG,1,KG
4,2000-01-15,Hirat,Hirat,Hirat,34.346944,62.198333,cereals and tubers,Wheat flour,KG,actual,Retail,AFN,10.06,0.215,AFG,1,KG


In [27]:
df_just_food['measure_unit'].value_counts()

KG           2190364
L             210455
G              62431
Unit           54099
pcs            30011
Head           25092
Day            24104
ML             20446
USD/LCU        15660
Pound          11377
Packet          9001
Marmite         8379
tablets         6750
Gallon          6208
Pounds          5853
MT              4256
Loaf            3875
piece           3635
Libra           3591
Bar             3557
Bunch           2852
Pair            2606
GB              2415
Course          2159
Tubers          2051
Sack            1807
Dozen           1353
Cylinder        1263
ticket          1260
sachet          1245
Box             1194
Bundle          1178
LCU/3.5kg       1142
meter           1079
Brush           1026
ml               738
Cuartilla        586
Dozens           337
Package          269
kWh              262
Pile             244
Month             88
Heap              63
Name: measure_unit, dtype: int64

In [28]:
df_just_food['measure'].value_counts()

1       2381617
100       74709
10        25311
50        21231
90        14836
         ...   
450          34
240          17
180          17
0.13         17
4.5           2
Name: measure, Length: 94, dtype: int64

In [29]:
# Checking what the 'Day' measure unit means

df_just_food[df_just_food['measure_unit'].eq('Day')]

Unnamed: 0,date,admin1,admin2,market,latitude,longitude,category,commodity,unit,priceflag,pricetype,currency,price,usdprice,iso,measure,measure_unit
0,2000-01-15,Badakhshan,Faiz Abad,Fayzabad,37.116638,70.580022,non-food,"Wage (non-qualified labour, non-agricultural)",Day,actual,Retail,AFN,150000.0,3205.7378,AFG,1,Day
1,2000-01-15,Balkh,Mazar-e-Sharif,Mazar,36.725116,67.109571,non-food,"Wage (non-qualified labour, non-agricultural)",Day,actual,Retail,AFN,100000.0,2137.1586,AFG,1,Day
5,2000-01-15,Hirat,Hirat,Hirat,34.346944,62.198333,non-food,"Wage (non-qualified labour, non-agricultural)",Day,actual,Retail,AFN,60000.0,1282.2951,AFG,1,Day
9,2000-01-15,Kabul,Kabul,Kabul,34.516667,69.183333,non-food,"Wage (non-qualified labour, non-agricultural)",Day,actual,Retail,AFN,55000.0,1175.4372,AFG,1,Day
13,2000-01-15,Kandahar,Kandahar,Kandahar,31.612500,65.709444,non-food,"Wage (non-qualified labour, non-agricultural)",Day,actual,Retail,AFN,75000.0,1602.8689,AFG,1,Day
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3448957,2023-07-15,Shabwah,Bayhan,Attaq Town,14.801087,45.719959,non-food,Wage (qualified labour),Day,actual,Retail,YER,15000.0,10.7603,YEM,1,Day
3448975,2023-07-15,Socotra,Hidaybu,Soqatra (Hudaibo),12.650000,54.033333,non-food,Wage (non-qualified labour),Day,actual,Retail,YER,15000.0,10.7603,YEM,1,Day
3448976,2023-07-15,Socotra,Hidaybu,Soqatra (Hudaibo),12.650000,54.033333,non-food,Wage (qualified labour),Day,actual,Retail,YER,20000.0,14.3470,YEM,1,Day
3448992,2023-07-15,Taizz,Al Qahirah,Taiz City,13.580000,44.020000,non-food,Wage (non-qualified labour),Day,actual,Retail,YER,8000.0,5.7388,YEM,1,Day


In [30]:
# Analyzing commodity values that include details in parentheses
# (selected here as the names that end with a closing parenthesis)

df_comodity=df_just_food[df_just_food['commodity'].str.endswith(')')]

In [31]:
df_comodity 

# I will remove the information in parentheses so that an average price can be computed for the same product in a given country and on a given date

Unnamed: 0,date,admin1,admin2,market,latitude,longitude,category,commodity,unit,priceflag,pricetype,currency,price,usdprice,iso,measure,measure_unit
0,2000-01-15,Badakhshan,Faiz Abad,Fayzabad,37.116638,70.580022,non-food,"Wage (non-qualified labour, non-agricultural)",Day,actual,Retail,AFN,150000.00,3205.7378,AFG,1,Day
1,2000-01-15,Balkh,Mazar-e-Sharif,Mazar,36.725116,67.109571,non-food,"Wage (non-qualified labour, non-agricultural)",Day,actual,Retail,AFN,100000.00,2137.1586,AFG,1,Day
5,2000-01-15,Hirat,Hirat,Hirat,34.346944,62.198333,non-food,"Wage (non-qualified labour, non-agricultural)",Day,actual,Retail,AFN,60000.00,1282.2951,AFG,1,Day
9,2000-01-15,Kabul,Kabul,Kabul,34.516667,69.183333,non-food,"Wage (non-qualified labour, non-agricultural)",Day,actual,Retail,AFN,55000.00,1175.4372,AFG,1,Day
13,2000-01-15,Kandahar,Kandahar,Kandahar,31.612500,65.709444,non-food,"Wage (non-qualified labour, non-agricultural)",Day,actual,Retail,AFN,75000.00,1602.8689,AFG,1,Day
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3541102,2023-01-15,Mashonaland West,Kariba Urban,Mahombekombe,-16.525861,28.778411,oil and fats,Oil (vegetable),L,aggregate,Retail,ZWL,2018.75,2.7643,ZWE,1,L
3541103,2023-01-15,Mashonaland West,Kariba Urban,Mahombekombe,-16.525861,28.778411,pulses and nuts,Beans (sugar),KG,aggregate,Retail,ZWL,1545.00,2.1156,ZWE,1,KG
3541106,2023-01-15,Matabeleland North,Nkayi,Nkayi Growth Point,-18.999158,28.898316,"meat, fish and eggs",Fish (kapenta),KG,aggregate,Retail,ZWL,9636.00,13.1947,ZWE,1,KG
3541112,2023-01-15,Matabeleland North,Nkayi,Nkayi Growth Point,-18.999158,28.898316,oil and fats,Oil (vegetable),L,aggregate,Retail,ZWL,2000.00,2.7386,ZWE,1,L


In [32]:
# Analyzing powdered products to decide whether they can be treated as the same
# product as their non-powdered counterpart

df_powder=df_just_food[df_just_food['commodity'].str.contains('powder')]

In [33]:
df_powder[['commodity','measure_unit']].value_counts()

# Powdered milk will be analyzed as a different product - Powdered Milk

commodity                      measure_unit
Milk (powder)                  KG              5047
                               G               4252
                               Unit            1522
Cocoa (powder)                 G               1120
Milk (powder, infant formula)  KG               429
Cocoa (powder)                 KG               287
Milk (powder)                  Pounds           156
                               Pound             84
dtype: int64

In [34]:
# Both powdered-milk spellings are merged into a single commodity, 'Powdered milk'

powdered_milk_names=['Milk (powder)', 'Milk (powder, infant formula)']
df_just_food.loc[df_just_food['commodity'].isin(powdered_milk_names), 'commodity'] = 'Powdered milk'

In [35]:
# Remaining milk commodities; the match is case-sensitive on 'Milk'
df_milk=df_just_food[df_just_food['commodity'].str.contains('Milk')]

In [36]:
df_milk[['commodity','measure_unit']].value_counts()

commodity                measure_unit
Milk (pasteurized)       L               11064
Milk                     L                9307
Milk (non-pasteurized)   L                4654
Milk                     KG               2484
Milk (condensed)         ML               2071
Milk (cow, fresh)        L                1563
Milk (camel)             L                1248
Milk                     G                 882
Milk (fresh)             L                 828
Milk (pasteurized)       KG                610
Milk (UHT)               ML                356
Milk (cow, pasteurized)  ML                178
Milk (condensed)         G                 158
Milk (pasteurized)       ML                100
Milk (camel, fresh)      L                  78
Milk (pasteurized)       G                  68
Milk (cow, pasteurized)  L                  21
dtype: int64

In [37]:
# Splitting the commodity column at the first '(' to separate the product name from the
# notes in parentheses. n=1 caps the result at two columns, so a commodity containing
# more than one '(' cannot break the two-column assignment. product_name keeps its
# trailing space and product_notes keeps the closing ')' at this stage.

df_just_food[['product_name','product_notes']]=df_just_food['commodity'].str.split('(', n=1, expand=True)

In [38]:
df_just_food.head()

Unnamed: 0,date,admin1,admin2,market,latitude,longitude,category,commodity,unit,priceflag,pricetype,currency,price,usdprice,iso,measure,measure_unit,product_name,product_notes
0,2000-01-15,Badakhshan,Faiz Abad,Fayzabad,37.116638,70.580022,non-food,"Wage (non-qualified labour, non-agricultural)",Day,actual,Retail,AFN,150000.0,3205.7378,AFG,1,Day,Wage,"non-qualified labour, non-agricultural)"
1,2000-01-15,Balkh,Mazar-e-Sharif,Mazar,36.725116,67.109571,non-food,"Wage (non-qualified labour, non-agricultural)",Day,actual,Retail,AFN,100000.0,2137.1586,AFG,1,Day,Wage,"non-qualified labour, non-agricultural)"
2,2000-01-15,Hirat,Hirat,Hirat,34.346944,62.198333,cereals and tubers,Bread,KG,actual,Retail,AFN,15.63,0.334,AFG,1,KG,Bread,
3,2000-01-15,Hirat,Hirat,Hirat,34.346944,62.198333,cereals and tubers,Wheat,KG,actual,Retail,AFN,9.13,0.1951,AFG,1,KG,Wheat,
4,2000-01-15,Hirat,Hirat,Hirat,34.346944,62.198333,cereals and tubers,Wheat flour,KG,actual,Retail,AFN,10.06,0.215,AFG,1,KG,Wheat flour,


In [39]:
df_just_food['product_name'].value_counts() # There are 261 food and non-food commodities in the dataframe

Rice               235299
Meat               189651
Oil                145195
Beans              104911
Fuel                94612
                    ...  
Meat                   15
Cotton                 10
Corn Soy Blend          5
Butter                  4
Kocho                   1
Name: product_name, Length: 261, dtype: int64

In [40]:
# The same product can be quite different in various parts of the world,
# but this analysis focuses on product types in general and ignores those differences.

df_just_food['product_notes'].value_counts()

white)             117928
imported)           78495
local)              78252
red)                50883
vegetable)          45218
                    ...  
basmati)                2
Turkey)                 2
haricot, white)         2
cow milk)               2
haricot, red)           1
Name: product_notes, Length: 430, dtype: int64

In [41]:
df_just_food['product_name'].iloc[0]

'Wage '

In [42]:
# Trimming surrounding whitespace in the product_name column.
# The vectorized .str accessor replaces the per-row lambda: it avoids a Python-level
# call per row and passes missing values through instead of raising AttributeError.

df_just_food['product_name']=df_just_food['product_name'].str.strip()

In [43]:
df_just_food['product_name'].iloc[0]

'Wage'

In [44]:
# Dropping the columns that are not needed for the rest of the analysis

df_food_removed=df_just_food.drop(['admin1','admin2','priceflag','product_notes'], axis='columns')

In [45]:
df_food_removed.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2726361 entries, 0 to 3541113
Data columns (total 15 columns):
 #   Column        Dtype  
---  ------        -----  
 0   date          object 
 1   market        object 
 2   latitude      float64
 3   longitude     float64
 4   category      object 
 5   commodity     object 
 6   unit          object 
 7   pricetype     object 
 8   currency      object 
 9   price         float64
 10  usdprice      float64
 11  iso           object 
 12  measure       object 
 13  measure_unit  object 
 14  product_name  object 
dtypes: float64(4), object(11)
memory usage: 397.3+ MB


In [46]:
# Analyzing non-numerical measure values: rows whose measure_unit is 'meter'

df_food_removed[df_food_removed['measure_unit'].eq('meter')]

Unnamed: 0,date,market,latitude,longitude,category,commodity,unit,pricetype,currency,price,usdprice,iso,measure,measure_unit,product_name
1523058,2022-04-15,Tehran Market,35.720000,51.400000,non-food,Fuel (gas),Cubic meter,Retail,IRR,966.00,0.0230,IRN,Cubic,meter,Fuel
1523061,2022-04-15,Tehran Market,35.720000,51.400000,non-food,Water,Cubic meter,Retail,IRR,11640.00,0.2771,IRN,Cubic,meter,Water
1523092,2022-05-15,Tehran Market,35.720000,51.400000,non-food,Fuel (gas),Cubic meter,Retail,IRR,966.00,0.0230,IRN,Cubic,meter,Fuel
1523095,2022-05-15,Tehran Market,35.720000,51.400000,non-food,Water,Cubic meter,Retail,IRR,11640.00,0.2771,IRN,Cubic,meter,Water
1523126,2022-06-15,Tehran Market,35.720000,51.400000,non-food,Fuel (gas),Cubic meter,Retail,IRR,966.00,0.0230,IRN,Cubic,meter,Fuel
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3216009,2022-03-15,Izmir,38.423698,27.142799,non-food,Water,Cubic meter,Retail,TRY,10.84,0.7335,TUR,Cubic,meter,Water
3216052,2022-04-15,National Average,,,non-food,Water,Cubic meter,Retail,TRY,8.59,0.5875,TUR,Cubic,meter,Water
3216092,2022-04-15,Ankara,39.933399,32.859699,non-food,Water,Cubic meter,Retail,TRY,10.25,0.7013,TUR,Cubic,meter,Water
3216132,2022-04-15,Istanbul,41.012639,28.966742,non-food,Water,Cubic meter,Retail,TRY,9.47,0.6477,TUR,Cubic,meter,Water


In [47]:
# "Cubic meter" was split into measure='Cubic' / measure_unit='meter';
# restore the full label in measure_unit

df_food_removed.loc[df_food_removed['measure'].eq('Cubic') & df_food_removed['measure_unit'].eq('meter'), 'measure_unit'] = 'Cubic meter'


In [48]:
# Sanity check: lists any 'Cubic' measure whose unit was not relabelled to 'Cubic meter'
df_food_removed[df_food_removed['measure'].eq('Cubic') & df_food_removed['measure_unit'].ne('Cubic meter')]

Unnamed: 0,date,market,latitude,longitude,category,commodity,unit,pricetype,currency,price,usdprice,iso,measure,measure_unit,product_name


In [49]:
# The leftover 'Cubic' text in measure becomes the quantity 1
df_food_removed.loc[df_food_removed['measure'].eq('Cubic'), 'measure'] = 1

In [50]:
# Casting the measure column to float64 and confirming the resulting dtype

df_food_removed['measure']=df_food_removed['measure'].astype('float64')
df_food_removed['measure'].dtype

dtype('float64')

In [51]:
# Parsing the date column into datetime values

df_food_removed['date']=df_food_removed['date'].pipe(pd.to_datetime)

In [52]:
df_food_removed.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2726361 entries, 0 to 3541113
Data columns (total 15 columns):
 #   Column        Dtype         
---  ------        -----         
 0   date          datetime64[ns]
 1   market        object        
 2   latitude      float64       
 3   longitude     float64       
 4   category      object        
 5   commodity     object        
 6   unit          object        
 7   pricetype     object        
 8   currency      object        
 9   price         float64       
 10  usdprice      float64       
 11  iso           object        
 12  measure       float64       
 13  measure_unit  object        
 14  product_name  object        
dtypes: datetime64[ns](1), float64(5), object(9)
memory usage: 397.3+ MB


<a id='cleaning'></a>
### Data cleaning
[Back to top](#top)

Converting inconsistent measure units to KG and L using the following (rounded) factors:  
G = 0.001 KG   
ML, ml = 0.001 L  
Gallon = 3.8 L  
Cuartilla = 13.9 L  
Pound/Pounds/Libra = 0.45 KG  
MT = 1000 KG  
LCU/3.5kg = 3.5 KG  
All remaining units are relabelled as Unit (their measure is not converted)  

In [53]:
df_food_removed.loc[df_food_removed['measure_unit']=='LCU/3.5kg']

Unnamed: 0,date,market,latitude,longitude,category,commodity,unit,pricetype,currency,price,usdprice,iso,measure,measure_unit,product_name
2816729,2018-08-15,Konyokonyo,4.845972,31.601203,non-food,Milling cost (sorghum),LCU/3.5kg,Retail,SSP,70.0,0.4809,SSD,1.0,LCU/3.5kg,Milling cost
2816758,2018-08-15,Bor,6.210000,31.570000,non-food,Milling cost (sorghum),LCU/3.5kg,Retail,SSP,85.0,0.5840,SSD,1.0,LCU/3.5kg,Milling cost
2816837,2018-08-15,Jau,7.701110,27.989719,non-food,Milling cost (sorghum),LCU/3.5kg,Retail,SSP,100.0,0.6870,SSD,1.0,LCU/3.5kg,Milling cost
2816866,2018-09-15,Konyokonyo,4.845972,31.601203,non-food,Milling cost (sorghum),LCU/3.5kg,Retail,SSP,100.0,0.6715,SSD,1.0,LCU/3.5kg,Milling cost
2816896,2018-09-15,Bor,6.210000,31.570000,non-food,Milling cost (sorghum),LCU/3.5kg,Retail,SSP,100.0,0.6715,SSD,1.0,LCU/3.5kg,Milling cost
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2845984,2023-08-15,Jau,7.701110,27.989719,non-food,Milling cost (sorghum),LCU/3.5kg,Retail,SSP,700.0,0.7011,SSD,1.0,LCU/3.5kg,Milling cost
2846011,2023-08-15,Makpandu,4.739905,28.679766,non-food,Milling cost (maize),LCU/3.5kg,Retail,SSP,900.0,0.9015,SSD,1.0,LCU/3.5kg,Milling cost
2846012,2023-08-15,Makpandu,4.739905,28.679766,non-food,Milling cost (sorghum),LCU/3.5kg,Retail,SSP,900.0,0.9015,SSD,1.0,LCU/3.5kg,Milling cost
2846043,2023-08-15,Yambio,4.550833,28.416668,non-food,Milling cost (maize),LCU/3.5kg,Retail,SSP,400.0,0.4007,SSD,1.0,LCU/3.5kg,Milling cost


In [54]:
# Changing measures to KG, L, and Units.
# The conversion factors live in one table and are applied in a single pass instead of
# seven separate full-frame assignments; the factors themselves are unchanged.
# NOTE: run this cell once per kernel session. measure_unit keeps its original label
# until the relabelling cell further down, so a second run would scale the same rows again.

UNIT_TO_BASE_FACTOR={
    'G': 0.001,          # G -> KG
    'Pound': 0.45,       # Pound -> KG (rounded)
    'Pounds': 0.45,
    'Libra': 0.45,
    'MT': 1000,          # MT -> KG
    'ML': 0.001,         # ML -> L
    'ml': 0.001,
    'Gallon': 3.8,       # Gallon -> L (rounded)
    'Cuartilla': 13.9,   # Cuartilla -> L
    'LCU/3.5kg': 3.5,    # quoted per 3.5 kg -> KG
}

# Factor per row; NaN for units that are not converted (those rows are left untouched)
conversion_factor=df_food_removed['measure_unit'].map(UNIT_TO_BASE_FACTOR)
df_food_removed.loc[conversion_factor.notna(), 'measure']=df_food_removed['measure']*conversion_factor

In [55]:
df_food_removed['measure'].value_counts()

1.00      2355536
100.00      65063
10.00       25311
50.00       18298
0.45        15002
           ...   
360.00         52
0.24           17
0.18           17
0.13           17
4.50            2
Name: measure, Length: 99, dtype: int64

In [56]:
#Checking the result

df_food_removed.loc[df_food_removed['measure_unit']=='ml']

Unnamed: 0,date,market,latitude,longitude,category,commodity,unit,pricetype,currency,price,usdprice,iso,measure,measure_unit,product_name
1539726,2020-03-15,Anbar,33.416312,43.302097,non-food,Toothpaste,150 ml,Retail,IQD,2000.00,1.6920,IRQ,0.15,ml,Toothpaste
1539736,2020-03-15,Babylon,32.483182,44.437380,non-food,Toothpaste,150 ml,Retail,IQD,1500.00,1.2690,IRQ,0.15,ml,Toothpaste
1539746,2020-03-15,Baghdad,33.338611,44.393889,non-food,Toothpaste,150 ml,Retail,IQD,3000.00,2.5381,IRQ,0.15,ml,Toothpaste
1539756,2020-03-15,Basrah,30.534884,47.788846,non-food,Toothpaste,150 ml,Retail,IQD,4000.00,3.3841,IRQ,0.15,ml,Toothpaste
1539766,2020-03-15,Dohuk,36.869498,42.993982,non-food,Toothpaste,150 ml,Retail,IQD,3000.00,2.5381,IRQ,0.15,ml,Toothpaste
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1564924,2023-07-15,Qadisiya,31.987723,44.930177,non-food,Toothpaste,150 ml,Retail,IQD,3000.00,2.3077,IRQ,0.15,ml,Toothpaste
1564960,2023-07-15,Salah al-deen,34.597186,43.685675,non-food,Toothpaste,150 ml,Retail,IQD,5000.00,3.8462,IRQ,0.15,ml,Toothpaste
1564996,2023-07-15,Sulaimaniyah,35.561127,45.437487,non-food,Toothpaste,150 ml,Retail,IQD,1883.33,1.4487,IRQ,0.15,ml,Toothpaste
1565032,2023-07-15,Thi-Qar,31.048263,46.271753,non-food,Toothpaste,150 ml,Retail,IQD,4000.00,3.0769,IRQ,0.15,ml,Toothpaste


In [57]:
df_food_removed.loc[df_food_removed['measure_unit']=='GB']

Unnamed: 0,date,market,latitude,longitude,category,commodity,unit,pricetype,currency,price,usdprice,iso,measure,measure_unit,product_name
2933966,2020-06-15,Al-Hasakeh,36.481689,40.756191,non-food,Internet bundle,1 GB,Retail,SYP,2750.0,6.3001,SYR,1.0,GB,Internet bundle
2934012,2020-06-15,Quamishli,37.052361,41.241371,non-food,Internet bundle,1 GB,Retail,SYP,2750.0,6.3001,SYR,1.0,GB,Internet bundle
2934054,2020-06-15,A'zaz,36.585899,37.045650,non-food,Internet bundle,1 GB,Retail,SYP,2000.0,4.5819,SYR,1.0,GB,Internet bundle
2934094,2020-06-15,Afrin,36.452709,36.816860,non-food,Internet bundle,1 GB,Retail,SYP,2000.0,4.5819,SYR,1.0,GB,Internet bundle
2934136,2020-06-15,Der Hafir,36.156387,37.703697,non-food,Internet bundle,1 GB,Retail,SYP,2750.0,6.3001,SYR,1.0,GB,Internet bundle
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3042159,2023-04-15,Qudsiya,33.549999,36.216671,non-food,Internet bundle,1 GB,Retail,SYP,7800.0,17.8694,SYR,1.0,GB,Internet bundle
3042205,2023-04-15,Saqba,33.516521,36.379803,non-food,Internet bundle,1 GB,Retail,SYP,7800.0,17.8694,SYR,1.0,GB,Internet bundle
3042252,2023-04-15,Rural Safita,34.875881,36.257259,non-food,Internet bundle,1 GB,Retail,SYP,7800.0,17.8694,SYR,1.0,GB,Internet bundle
3042298,2023-04-15,Sheikh Badr,34.990974,36.079144,non-food,Internet bundle,1 GB,Retail,SYP,7800.0,17.8694,SYR,1.0,GB,Internet bundle


In [58]:
# Unit labels that will be relabelled to KG and to L respectively

to_kg=[
    'G',
    'Pound',
    'Pounds',
    'Libra',
    'MT',
    'LCU/3.5kg',
]
to_l=[
    'ML',
    'Gallon',
    'Cuartilla',
    'ml',
]

In [59]:
# Relabel the mass units as 'KG', then the volume units as 'L'
df_food_removed['measure_unit']=(
    df_food_removed['measure_unit']
    .replace(to_kg, 'KG')
    .replace(to_l, 'L')
)

In [60]:
df_food_removed.loc[df_food_removed['measure_unit'].isin(to_l)]

Unnamed: 0,date,market,latitude,longitude,category,commodity,unit,pricetype,currency,price,usdprice,iso,measure,measure_unit,product_name


In [61]:
df_food_removed['measure_unit'].value_counts()

KG             2279014
L               238433
Unit             54099
pcs              30011
Head             25092
Day              24104
USD/LCU          15660
Packet            9001
Marmite           8379
tablets           6750
Loaf              3875
piece             3635
Bar               3557
Bunch             2852
Pair              2606
GB                2415
Course            2159
Tubers            2051
Sack              1807
Dozen             1353
Cylinder          1263
ticket            1260
sachet            1245
Box               1194
Bundle            1178
Cubic meter       1079
Brush             1026
Dozens             337
Package            269
kWh                262
Pile               244
Month               88
Heap                63
Name: measure_unit, dtype: int64

In [62]:
# Every label other than KG and L becomes the generic 'Unit'.
# Only the label changes here; the measure column is left untouched by this step.

df_food_removed['measure_unit']=df_food_removed['measure_unit'].where(df_food_removed['measure_unit'].isin(['KG','L']), 'Unit')

In [63]:
df_food_removed['measure_unit'].value_counts()

KG      2279014
L        238433
Unit     208914
Name: measure_unit, dtype: int64

In [64]:
# Checking for missing values: number of nulls in each column

df_food_removed.isnull().sum() # missing USD prices (usdprice) are left as they are for now

date                0
market              0
latitude        24268
longitude       24268
category            0
commodity           0
unit                0
pricetype           0
currency            0
price               0
usdprice         2740
iso                 0
measure             0
measure_unit        0
product_name        0
dtype: int64

In [65]:
# Listing the records where price is 0.
# Zero-price rows are removed from the dataframe in the next cell.

df_food_removed.loc[df_food_removed['price']==0]

Unnamed: 0,date,market,latitude,longitude,category,commodity,unit,pricetype,currency,price,usdprice,iso,measure,measure_unit,product_name
3399682,2010-06-15,National Average,,,non-food,Exchange rate (unofficial),USD/LCU,Retail,VEF,0.0,0.0,VEN,1.0,Unit,Exchange rate
3399683,2010-07-15,National Average,,,non-food,Exchange rate (unofficial),USD/LCU,Retail,VEF,0.0,0.0,VEN,1.0,Unit,Exchange rate
3399684,2010-08-15,National Average,,,non-food,Exchange rate (unofficial),USD/LCU,Retail,VEF,0.0,0.0,VEN,1.0,Unit,Exchange rate
3399685,2010-09-15,National Average,,,non-food,Exchange rate (unofficial),USD/LCU,Retail,VEF,0.0,0.0,VEN,1.0,Unit,Exchange rate
3399686,2010-10-15,National Average,,,non-food,Exchange rate (unofficial),USD/LCU,Retail,VEF,0.0,0.0,VEN,1.0,Unit,Exchange rate
3399687,2010-11-15,National Average,,,non-food,Exchange rate (unofficial),USD/LCU,Retail,VEF,0.0,0.0,VEN,1.0,Unit,Exchange rate
3399688,2010-12-15,National Average,,,non-food,Exchange rate (unofficial),USD/LCU,Retail,VEF,0.0,0.0,VEN,1.0,Unit,Exchange rate
3399689,2011-01-15,National Average,,,non-food,Exchange rate (unofficial),USD/LCU,Retail,VEF,0.0,0.0,VEN,1.0,Unit,Exchange rate
3399690,2011-02-15,National Average,,,non-food,Exchange rate (unofficial),USD/LCU,Retail,VEF,0.0,0.0,VEN,1.0,Unit,Exchange rate
3399691,2011-03-15,National Average,,,non-food,Exchange rate (unofficial),USD/LCU,Retail,VEF,0.0,0.0,VEN,1.0,Unit,Exchange rate


In [66]:
# Keep only the rows whose price is different from 0.
has_nonzero_price = df_food_removed['price'].ne(0)
df_food_removed = df_food_removed.loc[has_nonzero_price]

In [67]:
# Renumber the rows from 0; the previous row labels are kept as a new 'index' column.
df_food_removed = df_food_removed.reset_index()

In [68]:
# Column overview after removing zero-price rows and resetting the index
# (the frame now has an extra 'index' column).
df_food_removed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2726301 entries, 0 to 2726300
Data columns (total 16 columns):
 #   Column        Dtype         
---  ------        -----         
 0   index         int64         
 1   date          datetime64[ns]
 2   market        object        
 3   latitude      float64       
 4   longitude     float64       
 5   category      object        
 6   commodity     object        
 7   unit          object        
 8   pricetype     object        
 9   currency      object        
 10  price         float64       
 11  usdprice      float64       
 12  iso           object        
 13  measure       float64       
 14  measure_unit  object        
 15  product_name  object        
dtypes: datetime64[ns](1), float64(5), int64(1), object(9)
memory usage: 332.8+ MB


In [69]:
# Checking for duplicates
#
# The frame still carries the 'index' column created by reset_index(). It holds the
# previous row labels (presumably unique), so including it would make every row look
# distinct and the check could never report a duplicate. Compare the data columns only.

data_columns = [col for col in df_food_removed.columns if col != 'index']
df_food_removed.duplicated(subset=data_columns).value_counts()

# in the original data set, the same price measure was taken at the same time in different locations and for the same products of different types

False    2726301
dtype: int64

The duplicates will be handled according to the following plan:

1. Calculate price per unit for each product
2. Calculate average price for the same product name on a country-year-month level


In [70]:
# Price per single measure unit, in local currency and in USD:
# the recorded price divided by the recorded quantity (`measure`).

measure_qty = df_food_removed['measure']
df_food_removed['price_unit'] = df_food_removed['price'].div(measure_qty)
df_food_removed['usdprice_unit'] = df_food_removed['usdprice'].div(measure_qty)

In [71]:
# Removing the measure column and the leftover 'index' column from reset_index()

df_food_removed = df_food_removed.drop(columns=['measure', 'index'])

In [72]:
# Preview of the first rows, now including price_unit and usdprice_unit.
df_food_removed.head()

Unnamed: 0,date,market,latitude,longitude,category,commodity,unit,pricetype,currency,price,usdprice,iso,measure_unit,product_name,price_unit,usdprice_unit
0,2000-01-15,Fayzabad,37.116638,70.580022,non-food,"Wage (non-qualified labour, non-agricultural)",Day,Retail,AFN,150000.0,3205.7378,AFG,Unit,Wage,150000.0,3205.7378
1,2000-01-15,Mazar,36.725116,67.109571,non-food,"Wage (non-qualified labour, non-agricultural)",Day,Retail,AFN,100000.0,2137.1586,AFG,Unit,Wage,100000.0,2137.1586
2,2000-01-15,Hirat,34.346944,62.198333,cereals and tubers,Bread,KG,Retail,AFN,15.63,0.334,AFG,KG,Bread,15.63,0.334
3,2000-01-15,Hirat,34.346944,62.198333,cereals and tubers,Wheat,KG,Retail,AFN,9.13,0.1951,AFG,KG,Wheat,9.13,0.1951
4,2000-01-15,Hirat,34.346944,62.198333,cereals and tubers,Wheat flour,KG,Retail,AFN,10.06,0.215,AFG,KG,Wheat flour,10.06,0.215


In [73]:
# Checking if any products have been measured multiple times using various measure units:
# count the distinct measure units per product, largest count first.

units_per_product = df_food_removed.groupby('product_name')['measure_unit'].nunique()
df_mult_units = (
    units_per_product
    .sort_values(ascending=False)
    .rename_axis('product')
    .reset_index(name='count')
)

In [74]:
# Keep only the products that use more than one measure unit.
uses_several_units = df_mult_units['count'].gt(1)
df_mult_units = df_mult_units.loc[uses_several_units]
df_mult_units  # for 51 products, multiple units are used for measuring

Unnamed: 0,product,count
0,Noodles,3
1,Rice,3
2,Handwash soap,3
3,Wage,3
4,Fuel,3
5,Salt,2
6,Fish,2
7,Firewood,2
8,Eggs,2
9,Maize meal,2


Using different measure units will complicate further analysis. I will handle it using the following logic:

1. For bread, the unit will be converted to KG and the price will be doubled, since a loaf of bread typically weighs about 500 g
2. For yogurt and sour cream, the unit will be converted to L and the price will be doubled, since a typical unit is about 0.5 L
3. For other products, units will be converted 1 to 1, although this might skew the results: I have no way to check the actual weight of the recorded unit

In [75]:
# Creating a list of products we need to handle (those measured in more than one unit).
# Deliberately not bound to the name `list`: at notebook level that would shadow the
# built-in and break every later `list(...)` call in the same kernel.

products_multi_unit = df_mult_units['product'].to_list()

In [76]:
# Inspecting one affected product: the first 80 'Noodles' rows, to compare `unit` with `measure_unit`.
df_food_removed[df_food_removed['product_name']=='Noodles'].head(80)

Unnamed: 0,date,market,latitude,longitude,category,commodity,unit,pricetype,currency,price,usdprice,iso,measure_unit,product_name,price_unit,usdprice_unit
278383,2010-01-15,Trinidad,-14.837491,-64.904286,cereals and tubers,Noodles (short),KG,Retail,BOB,7.75,1.1260,BOL,KG,Noodles,7.750000,1.126000
278393,2010-01-15,Sucre,-19.043056,-65.259167,cereals and tubers,Noodles (short),KG,Retail,BOB,6.95,1.0097,BOL,KG,Noodles,6.950000,1.009700
278405,2010-01-15,Cochabamba City,-17.389498,-66.156797,cereals and tubers,Noodles (short),Pound,Retail,BOB,3.61,0.5245,BOL,KG,Noodles,8.022222,1.165556
278422,2010-01-15,La Paz City,-16.500000,-68.150000,cereals and tubers,Noodles (short),Pound,Retail,BOB,3.36,0.4882,BOL,KG,Noodles,7.466667,1.084889
278438,2010-01-15,Oruro City,-17.983333,-67.150000,cereals and tubers,Noodles (short),KG,Retail,BOB,7.00,1.0170,BOL,KG,Noodles,7.000000,1.017000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
279392,2010-09-15,La Paz City,-16.500000,-68.150000,cereals and tubers,Noodles (short),Pound,Retail,BOB,3.44,0.5011,BOL,KG,Noodles,7.644444,1.113556
279410,2010-09-15,Oruro City,-17.983333,-67.150000,cereals and tubers,Noodles (short),KG,Retail,BOB,7.42,1.0808,BOL,KG,Noodles,7.420000,1.080800
279426,2010-09-15,Cobija,-11.783333,-67.900000,cereals and tubers,Noodles (short),KG,Retail,BOB,14.00,2.0392,BOL,KG,Noodles,14.000000,2.039200
279432,2010-09-15,Potosi,-19.583611,-65.753056,cereals and tubers,Noodles (short),Cuartilla,Retail,BOB,17.72,2.5811,BOL,L,Noodles,1.274820,0.185691


In [77]:
# Changing bread price: bread recorded per 'Unit' gets both unit prices doubled
# (a loaf is assumed to be about 500 g, so twice its price approximates 1 KG).

bread_per_piece = (
    df_food_removed['measure_unit'].eq('Unit')
    & df_food_removed['product_name'].eq('Bread')
)
for price_col in ('price_unit', 'usdprice_unit'):
    df_food_removed.loc[bread_per_piece, price_col] = df_food_removed[price_col] * 2

In [78]:
# Changing yogurt and sour cream price: rows recorded per 'Unit' get both unit prices
# doubled (a unit is assumed to be about 0.5 L, so twice its price approximates 1 L).

dairy_per_piece = (
    df_food_removed['measure_unit'].eq('Unit')
    & df_food_removed['product_name'].isin(['Sour cream', 'Yogurt'])
)
for price_col in ('price_unit', 'usdprice_unit'):
    df_food_removed.loc[dairy_per_piece, price_col] = df_food_removed[price_col] * 2

In [79]:
# Changing measure units for bread, sour cream and yogurt:
# every sour cream / yogurt row becomes 'L', every bread row becomes 'KG'.

is_dairy = df_food_removed['product_name'].isin(['Sour cream', 'Yogurt'])
is_bread = df_food_removed['product_name'].isin(['Bread'])
df_food_removed.loc[is_dairy, 'measure_unit'] = 'L'
df_food_removed.loc[is_bread, 'measure_unit'] = 'KG'

In [80]:
# Re-running the check: products still measured in more than one unit
# after the bread / yogurt / sour cream adjustment.
units_per_product = df_food_removed.groupby('product_name')['measure_unit'].nunique()
df_mult_units = (
    units_per_product
    .sort_values(ascending=False)
    .rename_axis('product')
    .reset_index(name='count')
)
df_mult_units = df_mult_units.loc[df_mult_units['count'].gt(1)]
df_mult_units  # products for which multiple units are still used for measuring

Unnamed: 0,product,count
0,Fuel,3
1,Rice,3
2,Handwash soap,3
3,Noodles,3
4,Wage,3
5,Potato Leaves,2
6,Garlic,2
7,Charcoal,2
8,Cheese,2
9,Chicken,2


In [81]:
# For all other products the most popular (mode) measure unit is calculated and then
# used as the only measure unit for that product.

# Most common measure unit per product, looked up for every row through product_name
most_common_unit = (
    df_food_removed
    .groupby('product_name')['measure_unit']
    .agg(lambda units: units.value_counts().idxmax())
)
df_food_removed['measure_mode'] = df_food_removed['product_name'].map(most_common_unit)
df_food_removed.head(50)

Unnamed: 0,date,market,latitude,longitude,category,commodity,unit,pricetype,currency,price,usdprice,iso,measure_unit,product_name,price_unit,usdprice_unit,measure_mode
0,2000-01-15,Fayzabad,37.116638,70.580022,non-food,"Wage (non-qualified labour, non-agricultural)",Day,Retail,AFN,150000.0,3205.7378,AFG,Unit,Wage,150000.0,3205.7378,Unit
1,2000-01-15,Mazar,36.725116,67.109571,non-food,"Wage (non-qualified labour, non-agricultural)",Day,Retail,AFN,100000.0,2137.1586,AFG,Unit,Wage,100000.0,2137.1586,Unit
2,2000-01-15,Hirat,34.346944,62.198333,cereals and tubers,Bread,KG,Retail,AFN,15.63,0.334,AFG,KG,Bread,15.63,0.334,KG
3,2000-01-15,Hirat,34.346944,62.198333,cereals and tubers,Wheat,KG,Retail,AFN,9.13,0.1951,AFG,KG,Wheat,9.13,0.1951,KG
4,2000-01-15,Hirat,34.346944,62.198333,cereals and tubers,Wheat flour,KG,Retail,AFN,10.06,0.215,AFG,KG,Wheat flour,10.06,0.215,KG
5,2000-01-15,Hirat,34.346944,62.198333,non-food,"Wage (non-qualified labour, non-agricultural)",Day,Retail,AFN,60000.0,1282.2951,AFG,Unit,Wage,60000.0,1282.2951,Unit
6,2000-01-15,Kabul,34.516667,69.183333,cereals and tubers,Bread,KG,Retail,AFN,14.26,0.3048,AFG,KG,Bread,14.26,0.3048,KG
7,2000-01-15,Kabul,34.516667,69.183333,cereals and tubers,Wheat,KG,Retail,AFN,13.75,0.2939,AFG,KG,Wheat,13.75,0.2939,KG
8,2000-01-15,Kabul,34.516667,69.183333,cereals and tubers,Wheat flour,KG,Retail,AFN,18.57,0.3969,AFG,KG,Wheat flour,18.57,0.3969,KG
9,2000-01-15,Kabul,34.516667,69.183333,non-food,"Wage (non-qualified labour, non-agricultural)",Day,Retail,AFN,55000.0,1175.4372,AFG,Unit,Wage,55000.0,1175.4372,Unit


In [82]:
# Replacing the measure unit with the most popular one for each product.
# Only the label changes; price_unit and usdprice_unit are not rescaled here.

df_food_removed = df_food_removed.assign(measure_unit=df_food_removed['measure_mode'])

In [83]:
# Verifying the result: every product should now have exactly one distinct measure unit.
df_food_removed.groupby('product_name')['measure_unit'].nunique().sort_values(ascending=False) #Fixed!

product_name
Alcohol          1
Potato Leaves    1
Peas             1
Pen              1
Pencil           1
                ..
Fuel             1
Gari             1
Garlic           1
Ghee             1
Zucchini         1
Name: measure_unit, Length: 202, dtype: int64

### Handling multiple categories

In [84]:
# Products recorded under two categories are handled the same way as multiple measure units.

# Checking if the same product is always recorded in the same category:
# number of distinct categories per product, largest first.

df_food_removed.groupby('product_name')['category'].nunique().sort_values(ascending=False)

product_name
Groundnuts    2
Tomatoes      2
Lemons        2
Beans         2
Peas          2
             ..
Fuel          1
Gari          1
Garlic        1
Ghee          1
Zucchini      1
Name: category, Length: 202, dtype: int64

In [85]:
# Finding the most common category for each product and attaching it to every row

most_common_category = (
    df_food_removed
    .groupby('product_name')['category']
    .agg(lambda categories: categories.value_counts().idxmax())
)
df_food_removed['category_mode'] = df_food_removed['product_name'].map(most_common_category)
df_food_removed.head(50)

Unnamed: 0,date,market,latitude,longitude,category,commodity,unit,pricetype,currency,price,usdprice,iso,measure_unit,product_name,price_unit,usdprice_unit,measure_mode,category_mode
0,2000-01-15,Fayzabad,37.116638,70.580022,non-food,"Wage (non-qualified labour, non-agricultural)",Day,Retail,AFN,150000.0,3205.7378,AFG,Unit,Wage,150000.0,3205.7378,Unit,non-food
1,2000-01-15,Mazar,36.725116,67.109571,non-food,"Wage (non-qualified labour, non-agricultural)",Day,Retail,AFN,100000.0,2137.1586,AFG,Unit,Wage,100000.0,2137.1586,Unit,non-food
2,2000-01-15,Hirat,34.346944,62.198333,cereals and tubers,Bread,KG,Retail,AFN,15.63,0.334,AFG,KG,Bread,15.63,0.334,KG,cereals and tubers
3,2000-01-15,Hirat,34.346944,62.198333,cereals and tubers,Wheat,KG,Retail,AFN,9.13,0.1951,AFG,KG,Wheat,9.13,0.1951,KG,cereals and tubers
4,2000-01-15,Hirat,34.346944,62.198333,cereals and tubers,Wheat flour,KG,Retail,AFN,10.06,0.215,AFG,KG,Wheat flour,10.06,0.215,KG,cereals and tubers
5,2000-01-15,Hirat,34.346944,62.198333,non-food,"Wage (non-qualified labour, non-agricultural)",Day,Retail,AFN,60000.0,1282.2951,AFG,Unit,Wage,60000.0,1282.2951,Unit,non-food
6,2000-01-15,Kabul,34.516667,69.183333,cereals and tubers,Bread,KG,Retail,AFN,14.26,0.3048,AFG,KG,Bread,14.26,0.3048,KG,cereals and tubers
7,2000-01-15,Kabul,34.516667,69.183333,cereals and tubers,Wheat,KG,Retail,AFN,13.75,0.2939,AFG,KG,Wheat,13.75,0.2939,KG,cereals and tubers
8,2000-01-15,Kabul,34.516667,69.183333,cereals and tubers,Wheat flour,KG,Retail,AFN,18.57,0.3969,AFG,KG,Wheat flour,18.57,0.3969,KG,cereals and tubers
9,2000-01-15,Kabul,34.516667,69.183333,non-food,"Wage (non-qualified labour, non-agricultural)",Day,Retail,AFN,55000.0,1175.4372,AFG,Unit,Wage,55000.0,1175.4372,Unit,non-food


In [86]:
# Replacing category with category_mode, so each product keeps a single category

df_food_removed = df_food_removed.assign(category=df_food_removed['category_mode'])

In [87]:
# Checking the result: every product should now have exactly one distinct category

df_food_removed.groupby('product_name')['category'].nunique().sort_values(ascending=False) #perfect!

product_name
Alcohol          1
Potato Leaves    1
Peas             1
Pen              1
Pencil           1
                ..
Fuel             1
Gari             1
Garlic           1
Ghee             1
Zucchini         1
Name: category, Length: 202, dtype: int64

In [88]:
# Removing both helper columns (category_mode and measure_mode)

df_food_removed = df_food_removed.drop(columns=['category_mode', 'measure_mode'])

In [89]:
# Fixing the different fuel types issue: diesel, kerosene and the other fuels are
# recorded as one product. First list the commodity labels that contain 'Fuel'.

mentions_fuel = df_food_removed['commodity'].str.contains('Fuel')
df_food_removed.loc[mentions_fuel, 'commodity'].value_counts()


Fuel (diesel)                                35670
Fuel (petrol-gasoline)                       24929
Fuel (gas)                                   12573
Fuel (kerosene)                               4360
Fuel (petrol-gasoline, 95 octane)             3316
Fuel (diesel, transport, parallel market)     2602
Fuel (gas, parallel market)                   2501
Fuel (diesel, heating, parallel market)       2187
Fuel (Super Petrol)                           1902
Fuel (LPG)                                    1350
Fuel (petrol-gasoline, 92 octane)             1342
Fuel (diesel, parallel market)                 781
Fuel (petrol-gasoline, parallel market)        762
Fuel (kerosene, paraffin)                      182
Fuel (petrol)                                  155
Name: commodity, dtype: int64

In [90]:
# Split the generic 'Fuel' product into one product per fuel type, based on the
# commodity label.
#
# Matching is restricted to commodities whose label contains 'Fuel', ignores case
# (so 'Fuel (Super Petrol)' is recognised) and uses word boundaries. A plain substring
# test for 'gas' also hits 'petrol-gasoline'; applied after the petrol rule it would
# relabel every petrol-gasoline row as 'Gas'.
# Labels that match none of the patterns (e.g. 'Fuel (LPG)') keep their current product_name.

fuel_type_patterns = {
    'Diesel': r'\bdiesel\b',
    'Petrol': r'\bpetrol\b',
    'Gas': r'\bgas\b',
    'Kerosene': r'\bkerosene\b',
}
is_fuel_commodity = df_food_removed['commodity'].str.contains('Fuel', regex=False)

for fuel_type, pattern in fuel_type_patterns.items():
    matches_type = df_food_removed['commodity'].str.contains(pattern, case=False, regex=True)
    df_food_removed.loc[is_fuel_commodity & matches_type, 'product_name'] = fuel_type

### Handling multiple currencies

In [91]:
# Checking if only one local currency is used per country:
# number of distinct currency codes per iso code, largest first

df_food_removed.groupby('iso')['currency'].nunique().sort_values(ascending=False) #6 countries have more than one currency

iso
ZWE    2
HND    2
PAN    2
SOM    2
NIC    2
      ..
GEO    1
GAB    1
FJI    1
ETH    1
LBN    1
Name: currency, Length: 99, dtype: int64

In [92]:
# Local currency in ZWE was changed from ZWD to ZWL, with an exchange rate of 1 to 10^25 (author's note)

df_food_removed[df_food_removed['iso']=='ZWE'] #until the currency was changed, the prices were recorded in USD

Unnamed: 0,date,market,latitude,longitude,category,commodity,unit,pricetype,currency,price,usdprice,iso,measure_unit,product_name,price_unit,usdprice_unit
2704689,2010-01-15,Kombai,,,cereals and tubers,Maize,KG,Retail,USD,0.26,0.2571,ZWE,KG,Maize,0.26,0.2571
2704690,2010-01-15,Mandava,,,cereals and tubers,Maize,KG,Retail,USD,0.26,0.2571,ZWE,KG,Maize,0.26,0.2571
2704691,2010-01-15,Mucheke,,,cereals and tubers,Maize,KG,Retail,USD,0.31,0.3143,ZWE,KG,Maize,0.31,0.3143
2704692,2010-01-15,Murombedzi,,,cereals and tubers,Maize,KG,Retail,USD,0.23,0.2286,ZWE,KG,Maize,0.23,0.2286
2704693,2010-01-15,Renkini Bus Terminus,,,cereals and tubers,Maize,KG,Retail,USD,0.31,0.3143,ZWE,KG,Maize,0.31,0.3143
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2726296,2023-01-15,Nkayi Growth Point,-18.999158,28.898316,non-food,Handwash soap,250 G,Retail,ZWL,280.00,0.3834,ZWE,KG,Handwash soap,1120.00,1.5336
2726297,2023-01-15,Nkayi Growth Point,-18.999158,28.898316,non-food,Laundry soap,KG,Retail,ZWL,1408.00,1.9280,ZWE,KG,Laundry soap,1408.00,1.9280
2726298,2023-01-15,Nkayi Growth Point,-18.999158,28.898316,non-food,Toothpaste,100 ML,Retail,ZWL,1400.00,1.9170,ZWE,L,Toothpaste,14000.00,19.1700
2726299,2023-01-15,Nkayi Growth Point,-18.999158,28.898316,oil and fats,Oil (vegetable),L,Retail,ZWL,2000.00,2.7386,ZWE,L,Oil,2000.00,2.7386


In [93]:
# To unify the currency for Zimbabwe, rows priced in ZWL (the new currency) take their USD unit price as price_unit
zwl_rows = df_food_removed['iso'].eq('ZWE') & df_food_removed['currency'].eq('ZWL')
df_food_removed.loc[zwl_rows, 'price_unit'] = df_food_removed['usdprice_unit']

In [94]:
# Changing the currency label of the Zimbabwean ZWL rows to USD
zwl_rows = df_food_removed['iso'].eq('ZWE') & df_food_removed['currency'].eq('ZWL')
df_food_removed.loc[zwl_rows, 'currency'] = 'USD'

In [95]:
# Local currency in Somalia is abbreviated as SOS; the rows below also include prices recorded as SLS

df_food_removed[df_food_removed['iso']=='SOM'] 

Unnamed: 0,date,market,latitude,longitude,category,commodity,unit,pricetype,currency,price,usdprice,iso,measure_unit,product_name,price_unit,usdprice_unit
2132135,1995-01-15,Bakaara,2.048031,45.32000,cereals and tubers,Sorghum (red),KG,Retail,SOS,700.0,0.0850,SOM,KG,Sorghum,700.0,0.0850
2132136,1995-02-15,Bakaara,2.048031,45.32000,cereals and tubers,Sorghum (red),KG,Retail,SOS,525.0,0.0628,SOM,KG,Sorghum,525.0,0.0628
2132137,1995-03-15,Bakaara,2.048031,45.32000,cereals and tubers,Sorghum (red),KG,Retail,SOS,600.0,0.0708,SOM,KG,Sorghum,600.0,0.0708
2132138,1995-04-15,Bakaara,2.048031,45.32000,cereals and tubers,Sorghum (red),KG,Retail,SOS,900.0,0.1046,SOM,KG,Sorghum,900.0,0.1046
2132139,1995-05-15,Bakaara,2.048031,45.32000,cereals and tubers,Sorghum (red),KG,Retail,SOS,1025.0,0.1174,SOM,KG,Sorghum,1025.0,0.1174
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2155497,2023-07-15,Hargeysa,9.559750,44.06678,non-food,Exchange rate,USD/LCU,Retail,SLS,8600.0,,SOM,Unit,Exchange rate,8600.0,
2155498,2023-07-15,Hargeysa,9.559750,44.06678,non-food,Fuel (diesel),L,Retail,SLS,7500.0,,SOM,L,Diesel,7500.0,
2155499,2023-07-15,Hargeysa,9.559750,44.06678,oil and fats,"Oil (vegetable, imported)",L,Retail,SLS,15500.0,,SOM,L,Oil,15500.0,
2155500,2023-07-15,Hargeysa,9.559750,44.06678,pulses and nuts,Cowpeas,KG,Retail,SLS,18000.0,,SOM,KG,Cowpeas,18000.0,


In [96]:
# Changing the currency label of the Somali SLS rows to SOS.
# NOTE(review): only the label changes; price and price_unit keep their recorded values.
# Confirm that SLS and SOS prices are really quoted in the same currency before comparing them.
sls_rows = df_food_removed['iso'].eq('SOM') & df_food_removed['currency'].eq('SLS')
df_food_removed.loc[sls_rows, 'currency'] = 'SOS'

In [97]:
# MWK is the official currency in Malawi. It's unclear why SOS was used to record some of the values
# (author's observation; the truncated preview below only shows MWK rows)
df_food_removed[df_food_removed['iso']=='MWI']

Unnamed: 0,date,market,latitude,longitude,category,commodity,unit,pricetype,currency,price,usdprice,iso,measure_unit,product_name,price_unit,usdprice_unit
1693836,1990-10-15,National Average,,,non-food,Fuel (diesel),L,Retail,MWK,1.90,0.7162,MWI,L,Diesel,1.90,0.7162
1693837,1990-10-15,National Average,,,non-food,"Fuel (kerosene, paraffin)",L,Retail,MWK,1.22,0.4599,MWI,L,Kerosene,1.22,0.4599
1693838,1991-08-15,National Average,,,non-food,Fuel (diesel),L,Retail,MWK,2.00,0.6893,MWI,L,Diesel,2.00,0.6893
1693839,1991-08-15,National Average,,,non-food,"Fuel (kerosene, paraffin)",L,Retail,MWK,1.22,0.4204,MWI,L,Kerosene,1.22,0.4204
1693840,1992-06-15,National Average,,,non-food,Fuel (diesel),L,Retail,MWK,2.58,0.7097,MWI,L,Diesel,2.58,0.7097
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1712131,2022-09-15,Luchenza,-16.007655,35.311648,cereals and tubers,Maize,KG,Retail,MWK,360.00,0.3512,MWI,KG,Maize,360.00,0.3512
1712132,2022-09-15,Thyolo Boma,-16.067166,35.145327,cereals and tubers,Maize,KG,Retail,MWK,355.00,0.3463,MWI,KG,Maize,355.00,0.3463
1712133,2022-09-15,Chinamwali,-15.379600,35.360822,cereals and tubers,Maize,KG,Retail,MWK,351.50,0.3429,MWI,KG,Maize,351.50,0.3429
1712134,2022-09-15,Songani,-15.317222,35.393137,cereals and tubers,Maize,KG,Retail,MWK,354.00,0.3453,MWI,KG,Maize,354.00,0.3453


In [98]:
# To avoid the confusion, every Malawi row takes its USD unit price as price_unit

malawi_rows = df_food_removed['iso'].eq('MWI')
df_food_removed.loc[malawi_rows, 'price_unit'] = df_food_removed['usdprice_unit']

In [99]:
# Changing the currency label of every Malawi row to USD

malawi_rows = df_food_removed['iso'].eq('MWI')
df_food_removed.loc[malawi_rows, 'currency'] = 'USD'

In [100]:
# In Panama, wholesale prices are recorded in USD, which really complicates the conversion.
# For now the currency is replaced with PAB and the local-currency unit price of these rows with NaN.

df_food_removed[(df_food_removed['iso']=='PAN')&(df_food_removed['currency']=='USD')]

Unnamed: 0,date,market,latitude,longitude,category,commodity,unit,pricetype,currency,price,usdprice,iso,measure_unit,product_name,price_unit,usdprice_unit
1848732,2007-09-15,National Average,,,cereals and tubers,Maize (yellow),MT,Wholesale,USD,304.35,304.3500,PAN,KG,Maize,0.30435,0.304350
1848733,2007-09-15,National Average,,,cereals and tubers,Rice (milled 80-20),MT,Wholesale,USD,739.13,739.1300,PAN,KG,Rice,0.73913,0.739130
1848745,2007-10-15,National Average,,,cereals and tubers,Rice (milled 80-20),MT,Wholesale,USD,739.13,739.1300,PAN,KG,Rice,0.73913,0.739130
1848755,2007-11-15,National Average,,,cereals and tubers,Maize (yellow),MT,Wholesale,USD,369.57,369.5667,PAN,KG,Maize,0.36957,0.369567
1848756,2007-11-15,National Average,,,cereals and tubers,Rice (milled 80-20),MT,Wholesale,USD,739.13,739.1300,PAN,KG,Rice,0.73913,0.739130
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1849800,2015-03-15,National Average,,,cereals and tubers,Rice (milled 80-20),MT,Wholesale,USD,847.83,847.8300,PAN,KG,Rice,0.84783,0.847830
1849833,2015-06-15,National Average,,,cereals and tubers,Maize (yellow),MT,Wholesale,USD,524.71,524.7100,PAN,KG,Maize,0.52471,0.524710
1849834,2015-06-15,National Average,,,cereals and tubers,Rice (milled 80-20),MT,Wholesale,USD,841.02,841.0167,PAN,KG,Rice,0.84102,0.841017
1849846,2015-07-15,National Average,,,cereals and tubers,Maize (yellow),MT,Wholesale,USD,522.61,522.6100,PAN,KG,Maize,0.52261,0.522610


In [101]:
# Panama rows recorded in USD have no local-currency price: set their price_unit to NaN

pan_usd_rows = df_food_removed['iso'].eq('PAN') & df_food_removed['currency'].eq('USD')
df_food_removed.loc[pan_usd_rows, 'price_unit'] = np.nan

In [102]:
# Changing the currency label of every Panama row to PAB

panama_rows = df_food_removed['iso'].eq('PAN')
df_food_removed.loc[panama_rows, 'currency'] = 'PAB'

In [103]:
# In Nicaragua, retail prices are recorded in USD, which really complicates the conversion.
# For now the currency is replaced with NIO and the local-currency unit price of these rows with NaN.

df_food_removed[(df_food_removed['iso']=='NIC')&(df_food_removed['currency']=='USD')]

Unnamed: 0,date,market,latitude,longitude,category,commodity,unit,pricetype,currency,price,usdprice,iso,measure_unit,product_name,price_unit,usdprice_unit
1809315,2010-01-15,National Average,,,cereals and tubers,Tortilla (maize),Pound,Retail,USD,0.44,0.4386,NIC,KG,Tortilla,0.977778,0.974667
1809316,2010-01-15,National Average,,,"meat, fish and eggs",Fish (fresh),Pound,Retail,USD,1.95,1.9473,NIC,KG,Fish,4.333333,4.327333
1809317,2010-01-15,National Average,,,"meat, fish and eggs",Meat (beef),Pound,Retail,USD,1.82,1.8238,NIC,KG,Meat,4.044444,4.052889
1809318,2010-01-15,National Average,,,"meat, fish and eggs",Meat (pork),Pound,Retail,USD,2.10,2.0986,NIC,KG,Meat,4.666667,4.663556
1809337,2010-02-15,National Average,,,cereals and tubers,Tortilla (maize),Pound,Retail,USD,0.45,0.4483,NIC,KG,Tortilla,1.000000,0.996222
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1815934,2022-04-15,National Average,,,vegetables and fruits,Oranges,Pound,Retail,USD,0.29,0.2900,NIC,KG,Oranges,0.644444,0.644444
1815935,2022-04-15,National Average,,,vegetables and fruits,Peppers (sweet),Pound,Retail,USD,1.07,1.0700,NIC,KG,Peppers,2.377778,2.377778
1815936,2022-04-15,National Average,,,vegetables and fruits,Plantains,Pound,Retail,USD,0.31,0.3100,NIC,KG,Plantains,0.688889,0.688889
1815937,2022-04-15,National Average,,,vegetables and fruits,Squashes,Pound,Retail,USD,0.25,0.2500,NIC,KG,Squashes,0.555556,0.555556


In [104]:
# Nicaragua rows recorded in USD have no local-currency price: set their price_unit to NaN

nic_usd_rows = df_food_removed['iso'].eq('NIC') & df_food_removed['currency'].eq('USD')
df_food_removed.loc[nic_usd_rows, 'price_unit'] = np.nan

In [105]:
# Changing the currency label of every Nicaragua row to NIO

nicaragua_rows = df_food_removed['iso'].eq('NIC')
df_food_removed.loc[nicaragua_rows, 'currency'] = 'NIO'

In [106]:
# In Honduras, wholesale prices are recorded in USD, which really complicates the conversion.
# For now the currency is replaced with the local one (HNL) and the local-currency unit price of these rows with NaN.

df_food_removed[(df_food_removed['iso']=='HND')&(df_food_removed['currency']=='USD')]

Unnamed: 0,date,market,latitude,longitude,category,commodity,unit,pricetype,currency,price,usdprice,iso,measure_unit,product_name,price_unit,usdprice_unit
772244,2007-07-15,National Average,,,cereals and tubers,Sorghum (white),MT,Wholesale,USD,597.83,597.8260,HND,KG,Sorghum,0.59783,0.597826
772245,2007-07-15,National Average,,,pulses and nuts,Beans (red),MT,Wholesale,USD,1358.70,1358.6960,HND,KG,Beans,1.35870,1.358696
772252,2007-08-15,National Average,,,cereals and tubers,Sorghum (white),MT,Wholesale,USD,652.17,652.1740,HND,KG,Sorghum,0.65217,0.652174
772253,2007-08-15,National Average,,,pulses and nuts,Beans (red),MT,Wholesale,USD,1739.13,1739.1305,HND,KG,Beans,1.73913,1.739130
772260,2007-09-15,National Average,,,cereals and tubers,Rice (milled 80-20),MT,Wholesale,USD,636.09,636.0900,HND,KG,Rice,0.63609,0.636090
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
773621,2019-10-15,National Average,,,cereals and tubers,Rice (milled 80-20),MT,Wholesale,USD,706.00,706.0000,HND,KG,Rice,0.70600,0.706000
773622,2019-10-15,National Average,,,pulses and nuts,Beans (red),MT,Wholesale,USD,1007.00,1007.0000,HND,KG,Beans,1.00700,1.007000
773629,2019-11-15,National Average,,,cereals and tubers,Maize (white),MT,Wholesale,USD,338.00,338.0000,HND,KG,Maize,0.33800,0.338000
773630,2019-11-15,National Average,,,cereals and tubers,Rice (milled 80-20),MT,Wholesale,USD,703.00,703.0000,HND,KG,Rice,0.70300,0.703000


In [107]:
# Honduras rows recorded in USD have no local-currency price: set their price_unit to NaN

hnd_usd_rows = df_food_removed['iso'].eq('HND') & df_food_removed['currency'].eq('USD')
df_food_removed.loc[hnd_usd_rows, 'price_unit'] = np.nan

In [108]:
# Changing the currency label of every Honduras row to HNL

honduras_rows = df_food_removed['iso'].eq('HND')
df_food_removed.loc[honduras_rows, 'currency'] = 'HNL'

In [109]:
# Row and column count after the currency clean-up (no rows are removed by the steps above).
df_food_removed.shape

(2726301, 16)

### Some data sets contain the exchange rate as one of the commodities; I will isolate it and use it for price conversion

In [110]:
# Exchange-rate records, isolated for later price conversion.
# .copy() makes this an independent frame, so the in-place column drop applied to it
# a few cells further down does not operate on a slice of df_food_removed
# (which makes pandas emit SettingWithCopyWarning).
df_exchange_rate = df_food_removed[df_food_removed['product_name'] == 'Exchange rate'].copy()

In [111]:
# Wage records are isolated into their own frame and displayed.
is_wage = df_food_removed['product_name'].eq('Wage')
df_earnings = df_food_removed.loc[is_wage]
df_earnings

Unnamed: 0,date,market,latitude,longitude,category,commodity,unit,pricetype,currency,price,usdprice,iso,measure_unit,product_name,price_unit,usdprice_unit
0,2000-01-15,Fayzabad,37.116638,70.580022,non-food,"Wage (non-qualified labour, non-agricultural)",Day,Retail,AFN,150000.0,3205.7378,AFG,Unit,Wage,150000.0,3205.7378
1,2000-01-15,Mazar,36.725116,67.109571,non-food,"Wage (non-qualified labour, non-agricultural)",Day,Retail,AFN,100000.0,2137.1586,AFG,Unit,Wage,100000.0,2137.1586
5,2000-01-15,Hirat,34.346944,62.198333,non-food,"Wage (non-qualified labour, non-agricultural)",Day,Retail,AFN,60000.0,1282.2951,AFG,Unit,Wage,60000.0,1282.2951
9,2000-01-15,Kabul,34.516667,69.183333,non-food,"Wage (non-qualified labour, non-agricultural)",Day,Retail,AFN,55000.0,1175.4372,AFG,Unit,Wage,55000.0,1175.4372
13,2000-01-15,Kandahar,31.612500,65.709444,non-food,"Wage (non-qualified labour, non-agricultural)",Day,Retail,AFN,75000.0,1602.8689,AFG,Unit,Wage,75000.0,1602.8689
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2657459,2023-07-15,Attaq Town,14.801087,45.719959,non-food,Wage (qualified labour),Day,Retail,YER,15000.0,10.7603,YEM,Unit,Wage,15000.0,10.7603
2657477,2023-07-15,Soqatra (Hudaibo),12.650000,54.033333,non-food,Wage (non-qualified labour),Day,Retail,YER,15000.0,10.7603,YEM,Unit,Wage,15000.0,10.7603
2657478,2023-07-15,Soqatra (Hudaibo),12.650000,54.033333,non-food,Wage (qualified labour),Day,Retail,YER,20000.0,14.3470,YEM,Unit,Wage,20000.0,14.3470
2657494,2023-07-15,Taiz City,13.580000,44.020000,non-food,Wage (non-qualified labour),Day,Retail,YER,8000.0,5.7388,YEM,Unit,Wage,8000.0,5.7388


In [112]:
# Columns that carry no information for exchange-rate records are removed.
unused_rate_columns = ['pricetype', 'measure_unit', 'product_name', 'price_unit', 'usdprice_unit']
df_exchange_rate = df_exchange_rate.drop(columns=unused_rate_columns)

In [113]:
# Saving the exchange-rate table to the prepared-data folder, as CSV and as pickle.
exchange_rate_csv = os.path.join(path, 'wfp_exchange_rate.csv')
exchange_rate_pkl = os.path.join(path, 'wfp_exchange_rate.pkl')
df_exchange_rate.to_csv(exchange_rate_csv)
df_exchange_rate.to_pickle(exchange_rate_pkl)

In [114]:
# Row and column count of the exported exchange-rate table.
df_exchange_rate.shape

(15600, 11)

In [115]:
# Saving the wage records to the prepared-data folder as pickle.
wage_pkl = os.path.join(path, 'wfp_wage_earning.pkl')
df_earnings.to_pickle(wage_pkl)

In [116]:
# Row and column count of the exported wage table.
df_earnings.shape

(24196, 16)

In [117]:
# Removing exchange rate and wage records from the main dataframe
# (both were saved separately above)

is_rate_or_wage = df_food_removed['product_name'].isin(['Exchange rate', 'Wage'])
df_food_removed = df_food_removed.loc[~is_rate_or_wage]
df_food_removed.shape

(2686505, 16)

In [118]:
# Saving the cleaned, still unaggregated market-level prices at this stage

unaggregated_pkl = os.path.join(path, 'all_products_prices_market_unaggregated.pkl')
df_food_removed.to_pickle(unaggregated_pkl)

### Next steps:  
- adding missing prices using exchange rate data
- aggregating average price for the same product-day-market-country 
- merging the data with other datasets  
- adding country names
- adding missing coordinates 