In [1]:
import os
import re
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Directory holding the raw CSV inputs. The trailing slash matters: later
# cells build file paths by plain string formatting, not os.path.join.
src_dir = "./crops_livestock_data/"
# List the directory to confirm which input files are available.
os.listdir(src_dir)

['area_codes.csv', 'flags.csv', 'item_codes.csv', 'production_data.csv']

In [3]:
# Build the two input paths first, then read each CSV into its own frame:
# `all_data` holds the production records, `flags` the flag lookup table.
production_path = "{}production_data.csv".format(src_dir)
flags_path = "{}flags.csv".format(src_dir)
all_data = pd.read_csv(production_path)
flags = pd.read_csv(flags_path)

In [4]:
"Total rows: {}".format(all_data.shape[0])

'Total rows: 3761216'

In [5]:
# Preview the first two rows to see the column layout of the raw data.
all_data.head(2)

Unnamed: 0,Area Code,Area Code (M49),Area,Item Code,Item Code (CPC),Item,Element Code,Element,Year Code,Year,Unit,Value,Flag
0,2,'004,Afghanistan,221,'01371,"Almonds, in shell",5312,Area harvested,1975,1975,ha,0.0,E
1,2,'004,Afghanistan,221,'01371,"Almonds, in shell",5312,Area harvested,1976,1976,ha,5900.0,E


In [6]:
# Preview the first two rows of the flag lookup table.
flags.head(2)

Unnamed: 0,Flag,Description
0,A,Official figure
1,E,Estimated value


In [7]:
# Count rows that are exact duplicates of an earlier row (all columns equal).
all_data.duplicated().sum()

0

In [8]:
# Count missing values per column.
all_data.isna().sum()

Area Code          0
Area Code (M49)    0
Area               0
Item Code          0
Item Code (CPC)    0
Item               0
Element Code       0
Element            0
Year Code          0
Year               0
Unit               0
Value              0
Flag               0
dtype: int64

In [9]:
# Distinct values present in the "Element" column.
all_data["Element"].unique()

array(['Area harvested', 'Yield', 'Production', 'Stocks',
       'Producing Animals/Slaughtered', 'Laying', 'Yield/Carcass Weight',
       'Milk Animals', 'Prod Popultn'], dtype=object)

In [10]:
# Distinct values present in the "Unit" column.
all_data["Unit"].unique()

array(['ha', 'hg/ha', 'tonnes', 'Head', '1000 Head', '100mg/An', 'No/An',
       '1000 No', 'hg/An', '0.1g/An', 'No', 'hg'], dtype=object)

In [11]:
# Distinct flag codes actually used in the data.
all_data["Flag"].unique()

array(['E', 'I', 'A', 'T', 'M'], dtype=object)

In [12]:
# Show the first two rows again (same preview as In [5]) as a reference
# point before the text columns are normalised below.
all_data.head(2)

Unnamed: 0,Area Code,Area Code (M49),Area,Item Code,Item Code (CPC),Item,Element Code,Element,Year Code,Year,Unit,Value,Flag
0,2,'004,Afghanistan,221,'01371,"Almonds, in shell",5312,Area harvested,1975,1975,ha,0.0,E
1,2,'004,Afghanistan,221,'01371,"Almonds, in shell",5312,Area harvested,1976,1976,ha,5900.0,E


In [13]:
def process_strings(string):
    """Normalise a text label: trim, drop non-ASCII characters, lower-case.

    The steps run in this exact order: surrounding whitespace is stripped
    first, then every character outside the ASCII range (code points
    0-127) is deleted, and finally the result is lower-cased. Whitespace
    that only becomes leading/trailing after a non-ASCII character is
    removed is therefore kept.

    Parameters
    ----------
    string : str
        Raw label.

    Returns
    -------
    str
        The cleaned label.
    """
    trimmed = string.strip()
    ascii_only = re.sub(r'[^\x00-\x7F]', '', trimmed)
    return ascii_only.lower()

In [14]:
# Run process_strings over each free-text label column, overwriting the
# column with its normalised values.
for text_column in ("Area", "Item", "Element"):
    all_data[text_column] = all_data[text_column].apply(process_strings)

In [15]:
# Area name -> area code. Names repeat across rows; for a repeated name the
# code from its last row is the one kept.
area_to_code_map = dict(zip(all_data["Area"], all_data["Area Code"]))
# Inverse lookup: area code -> area name.
reverse_area_to_code_map = {code: name for name, code in area_to_code_map.items()}

In [16]:
# Item name -> item code. Names repeat across rows; for a repeated name the
# code from its last row is the one kept.
item_to_code_map = dict(zip(list(all_data["Item"]),
                            list(all_data["Item Code"])))
# Inverse lookup: item code -> item name. It must be built from
# item_to_code_map's own keys; pairing the item codes with the area map's keys
# would label item codes with area names and cut the mapping short.
reverse_item_to_code_map = dict(zip(item_to_code_map.values(), item_to_code_map.keys()))

In [19]:
# Output locations: the JSON lookup tables are written under components_dir,
# the cleaned CSV under data_dir.
components_dir = "./components/"
data_dir = "./clean-data/"
# exist_ok=True creates missing directories and is a no-op for existing ones,
# so the cell can be re-run and there is no check-then-create race.
for output_dir in (components_dir, data_dir):
    os.makedirs(output_dir, exist_ok=True)

In [20]:
# Persist every lookup table as "<components_dir><name>.json".
# Each file is opened in a `with` block so its handle is flushed and closed
# before the next one is written (a bare open() passed to json.dump is never
# explicitly closed).
# Note: JSON object keys are always strings, so the numeric codes used as keys
# in the reverse maps are stored as strings in the files.
lookup_tables = {
    "area_to_code_map": area_to_code_map,
    "reverse_area_to_code_map": reverse_area_to_code_map,
    "item_to_code_map": item_to_code_map,
    "reverse_item_to_code_map": reverse_item_to_code_map,
}
for table_name, table in lookup_tables.items():
    with open("{}{}.json".format(components_dir, table_name), "w") as json_file:
        json.dump(table, json_file)

In [21]:
# Columns to discard: the Area/Item name columns (their name<->code mappings
# were saved as JSON above) plus the additional code columns.
columns_to_remove = ["Area Code (M49)", "Area", "Element Code", "Item Code (CPC)", "Item", "Year Code"]

# Reassign rather than mutate with inplace=True, and ignore columns that are
# already gone, so re-running this cell does not raise KeyError.
all_data = all_data.drop(columns=columns_to_remove, errors="ignore")

all_data.head(5)

Unnamed: 0,Area Code,Item Code,Element,Year,Unit,Value,Flag
0,2,221,area harvested,1975,ha,0.0,E
1,2,221,area harvested,1976,ha,5900.0,E
2,2,221,area harvested,1977,ha,6000.0,E
3,2,221,area harvested,1978,ha,6000.0,E
4,2,221,area harvested,1979,ha,6000.0,E


In [22]:
# Write the reduced table to the clean-data directory, omitting the row index.
cleaned_csv_path = "{}production-data.csv".format(data_dir)
all_data.to_csv(cleaned_csv_path, index=False)