# Import Packages

In [54]:
import os 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import math
import matplotlib.pyplot as plt
# For Prediction:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

from statistics import mean
from sklearn.metrics import auc, accuracy_score, confusion_matrix, mean_squared_error

# For nice Plots
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Use matplotlib's 'default' style sheet for every figure in this notebook.
plt.style.use('default')

# Read in the FAOSTAT Information

In [55]:
# Load the FAOSTAT export that sits next to this notebook.
# os.path.join is used instead of concatenating '\F...': the backslash form
# only works on Windows and relies on an invalid string escape sequence.
dir_path = os.path.abspath("")
fname = os.path.join(dir_path, 'FAOSTAT_data_en_12-16-2022.csv')
# NOTE(review): a later cell matches the literal "CÃ´te d'Ivoire", which
# appears to rely on this latin-1 decoding -- confirm before changing it.
FAO_df = pd.read_csv(fname, encoding='latin-1')

# Read in Port Information

In [56]:
# Load the per-country port/hub table that sits next to this notebook.
# os.path.join replaces the '\c...' concatenation, which only worked on
# Windows and relied on an invalid string escape sequence.
dir_path = os.path.abspath("")
fname = os.path.join(dir_path, 'countryport.csv')
Port_df = pd.read_csv(fname, encoding='latin-1')

# Read in Scraper Information

In [57]:
# Load the scraped monthly NDVI table that sits next to this notebook.
# os.path.join replaces the '\s...' concatenation, which only worked on
# Windows and relied on an invalid string escape sequence.
dir_path = os.path.abspath("")
fname = os.path.join(dir_path, 'scraper_ndvi_data.csv')
Scraper_df = pd.read_csv(fname, encoding='latin-1')

# Column Names / First Five Rows

In [58]:
# Preview the first rows of the FAOSTAT table to inspect its columns.
FAO_df.head()

Unnamed: 0,ï»¿Domain Code,Domain,Area Code (M49),Area,Element Code,Element,Item Code,Item,Year Code,Year,Unit,Value,Flag,Flag Description,Note
0,FS,Suite of Food Security Indicators,12,Algeria,6121,Value,21010,Average dietary energy supply adequacy (percen...,20002002,2000-2002,%,127,E,Estimated value,
1,FS,Suite of Food Security Indicators,12,Algeria,6121,Value,21010,Average dietary energy supply adequacy (percen...,20012003,2001-2003,%,129,E,Estimated value,
2,FS,Suite of Food Security Indicators,12,Algeria,6121,Value,21010,Average dietary energy supply adequacy (percen...,20022004,2002-2004,%,130,E,Estimated value,
3,FS,Suite of Food Security Indicators,12,Algeria,6121,Value,21010,Average dietary energy supply adequacy (percen...,20032005,2003-2005,%,130,E,Estimated value,
4,FS,Suite of Food Security Indicators,12,Algeria,6121,Value,21010,Average dietary energy supply adequacy (percen...,20042006,2004-2006,%,131,E,Estimated value,


In [59]:
# Preview the first rows of the port/hub table to inspect its columns.
Port_df.head()

Unnamed: 0,Country,Travel?,Paved Airports,Seaport,Hub?,Res,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,Unnamed: 10,Unnamed: 11
0,Algeria,2,67,9,1,19.882026,-0.171717,2.287611,3.823529,0.029487,5.233162,14.619377
1,Angola,1,32,4,1,3.264609,-1.181818,0.738938,1.149733,1.396694,0.546029,1.321885
2,Benin,2,1,1,0,-0.577489,-0.171717,-0.632743,-0.454545,0.029487,-0.400364,-0.206612
3,Botswana,1,10,0,1,0.362974,-1.181818,-0.234513,-0.989305,1.396694,-0.054996,-0.978724
4,Burkina Faso,4,2,0,0,-4.741947,1.848485,-0.588496,-0.989305,-3.416896,-0.346327,-0.978724


In [60]:
# Preview the first rows of the scraped NDVI table to inspect its columns.
Scraper_df.head()

Unnamed: 0,Country,Year,Jan-NDVI,Feb-NDVI,Mar-NDVI,Apr-NDVI,May-NDVI,Jun-NDVI,Jul-NDVI,Aug-NDVI,Sep-NDVI,Oct-NDVI,Nov-NDVI,Dec-NDVI
0,Cotr d lvoire,2001,0.49275,0.4875,0.5515,0.631667,0.672,0.683,0.72225,0.74325,0.74025,0.728667,0.671,0.57175
1,Cotr d lvoire,2002,0.49275,0.4875,0.5515,0.631667,0.672,0.683,0.72225,0.74325,0.74025,0.728667,0.671,0.57175
2,Cotr d lvoire,2003,0.49275,0.4875,0.5515,0.631667,0.672,0.683,0.72225,0.74325,0.74025,0.728667,0.671,0.57175
3,Cotr d lvoire,2004,0.49275,0.4875,0.5515,0.63825,0.676667,0.683,0.72225,0.74325,0.74025,0.72175,0.661,0.57175
4,Cotr d lvoire,2005,0.49275,0.4875,0.5515,0.631667,0.672,0.683,0.72225,0.74325,0.74025,0.728667,0.671,0.57175


# Number of Areas in this Dataset

In [61]:
# Count the distinct values of the 'Area' column once, then report the count
# twice: as a bare number and inside a sentence.
area_count = len(FAO_df['Area'].unique())
print(area_count)
print("There are " + str(area_count) + " unique areas in this African dataset.")

55
There are 55 unique areas in this African dataset.


# Unique categories in a list

In [62]:
# Gather the distinct 'Item' labels as a plain Python list (in order of first
# appearance), then print how many there are followed by the labels.
categories = FAO_df['Item'].unique().tolist()
print(len(categories))
print(categories)

63
['Average dietary energy supply adequacy (percent) (3-year average)', 'Dietary energy supply used in the estimation of prevalence of undernourishment (kcal/cap/day) (3-year average)', 'Share of dietary energy supply derived from cereals, roots and tubers (kcal/cap/day) (3-year average)', 'Average protein supply (g/cap/day) (3-year average)', 'Average supply of protein of animal origin (g/cap/day) (3-year average)', 'Rail lines density (total route in km per 100 square km of land area)', 'Gross domestic product per capita, PPP, (constant 2017 international $)', 'Prevalence of undernourishment (percent) (3-year average)', 'Number of people undernourished (million) (3-year average)', 'Prevalence of severe food insecurity in the total population (percent) (3-year average)', 'Prevalence of severe food insecurity in the male adult population (percent) (3-year average)', 'Prevalence of severe food insecurity in the female adult population (percent) (3-year average)', 'Prevalence of moderat

# Changing the year range

In [63]:
def _end_year(label):
    """Return the year a FAOSTAT 'Year' label resolves to, as an int.

    A range such as '2000-2002' yields the part after the first '-' (2002);
    a single year such as '2005' (or an already-converted int) yields itself.
    Raises ValueError if the selected part is not an integer.
    """
    parts = str(label).split('-')
    return int(parts[1]) if len(parts) > 1 else int(parts[0])


# Convert the whole column in one pass. This replaces the per-row
# `FAO_df.loc[i]['Year']` loop, which built a full row for every record,
# assumed the index labels were exactly 0..n-1, and left the column as
# object dtype. The mapped column holds plain integers.
FAO_df['Year'] = FAO_df['Year'].map(_end_year)
print(FAO_df['Year'].unique())

[2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015
 2016 2017 2018 2019 2020 2021 2000 2001]


# Create New Country Dataframe

In [64]:
# Skeleton table: one row per (country, year) pair, one column per FAO item.
columns = ['Country', 'Year'] + categories
numCols = len(columns)

# Build all (country, year) pairs up front and create the frame once.
# Appending with `countrydf.loc[len(countrydf)] = ...` re-allocates the frame
# for every row, which is quadratic in the number of rows.
# The pair order matches the original nested loop: countries in order of
# first appearance, and within each country the years in the order returned
# by FAO_df['Year'].unique().
country_year_pairs = [
    (country, year)
    for country in FAO_df['Area'].unique()
    for year in FAO_df['Year'].unique()
]
# reindex() adds every item column, filled with NaN until populated.
countrydf = pd.DataFrame(country_year_pairs, columns=['Country', 'Year']).reindex(columns=columns)

# Populate Data in Country Dataframe 

In [65]:
# Copy every FAO record into the matching (Country, Year) row of countrydf,
# in the column named after the record's 'Item'.
# The original loop ran over range(0, len(FAO_df)-1) and therefore never
# processed the last record; iterating the columns directly covers all rows.
# Records are applied in file order, so a later duplicate overwrites an
# earlier one.
fao_records = zip(FAO_df['Item'], FAO_df['Area'], FAO_df['Year'], FAO_df['Value'])
for item, country, year, value in fao_records:
    target_rows = (countrydf['Country'] == country) & (countrydf['Year'] == year)
    countrydf.loc[target_rows, item] = value

# Add Hub Data to DataFrame

In [66]:
# Attach the 'Hub?' and 'Res' values from Port_df to every countrydf row,
# matched on the country name.
# One dictionary lookup per row replaces the nested scan of Port_df for every
# countrydf row, which also mixed positional reads (.iloc[i]) with label-based
# writes (.at[i] / .loc[i]).
# dict(zip(...)) keeps the last entry for a repeated country, matching the
# original "last match wins" behaviour.
hub_by_country = dict(zip(Port_df['Country'], Port_df['Hub?']))
res_by_country = dict(zip(Port_df['Country'], Port_df['Res']))

# Countries missing from Port_df keep the 'NAN' text placeholder; the
# float-conversion cell below turns string cells into floats, so it becomes
# a real NaN there.
countrydf['Port'] = [hub_by_country.get(country, 'NAN') for country in countrydf['Country']]
countrydf['Resiliency'] = [res_by_country.get(country, 'NAN') for country in countrydf['Country']]

# Convert numerical data to floats

In [67]:
def _leading_float(text):
    """Convert a text cell such as '12.5' or '<2.5' to a float.

    The text is split on '<'. The first piece is used unless it is empty
    (the text started with '<'), in which case the next piece is used.
    """
    pieces = text.split('<')
    if pieces[0] == '':
        # Written as '<x': discard the empty prefix and keep x.
        pieces.pop(0)
    return float(pieces[0])


# Visit every cell in the columns after 'Country' and 'Year'. String cells
# are overwritten with their float value; all other cells are left as-is.
for column in countrydf.columns[2:]:
    for row_label in countrydf.index:
        cell = countrydf.loc[row_label, column]
        if not isinstance(cell, str):
            continue
        countrydf.loc[row_label, column] = _leading_float(cell)

# Populate Response Column

In [68]:
Future_Undernourishment_Col_Name = "Prevalence of undernourishment (percent) (3-year average)"

# 'Future Undernourishment' is the change in undernourishment over the next
# year: value(country, year + 1) - value(country, year).
#
# The next-year value is looked up by its (Country, Year) key rather than
# taken from row i+1. The rows of countrydf follow the order of
# FAO_df['Year'].unique(), which is not chronological (the printed list ends
# ..., 2021, 2000, 2001). With row i+1, every 2001 row was differenced
# against the NEXT country's 2002 value.
# pd.to_numeric raises if a non-numeric string is still present in the column.
prevalence = pd.to_numeric(countrydf[Future_Undernourishment_Col_Name])
value_by_key = dict(zip(zip(countrydf['Country'], countrydf['Year']), prevalence))

# Rows whose following year is absent (e.g. the final year) get NaN.
next_year_value = pd.Series(
    [
        value_by_key.get((country, year + 1), float("nan"))
        for country, year in zip(countrydf['Country'], countrydf['Year'])
    ],
    index=countrydf.index,
    dtype=float,
)
countrydf['Future Undernourishment'] = next_year_value - prevalence

In [69]:
# Drop the rows for 2000 and 2021 (treated as having no response variable).
# reset_index() is called without drop=True on purpose: it keeps the old row
# labels in a new 'index' column, which a later cell removes explicitly.
edge_years = [2000, 2021]
countrydf = countrydf[~countrydf['Year'].isin(edge_years)].reset_index()

# Country Differences from Datascraper

In [70]:
# Collect the distinct country names used by each table, then print the names
# that occur in the scraper data but not in countrydf.
scraperc = Scraper_df['Country'].unique()
countryc = countrydf['Country'].unique()
only_in_scraper = list(set(scraperc).difference(countryc))
print("Differnces in Sraper Data ", only_in_scraper)

Differnces in Sraper Data  ['Congo, Democratic Republic of the', 'Congo, Republic of the', 'Swaziland', 'Tanzania', 'Cotr d lvoire', 'Western Saraha']


In [71]:
# Rename scraper country names to the spelling used in countrydf.
# A single mapping + replace() replaces the row-by-row if/elif loop; names
# that are not keys of the mapping are left unchanged.
# NOTE(review): 'Swaziland' (scraper) and 'Eswatini' (countrydf) both remain
# unmatched in the output below. If they denote the same country, mapping
# them would also require changing the NDVI cell that skips 'Swaziland'.
scraper_to_country_name = {
    'Congo, Democratic Republic of the': 'Democratic Republic of the Congo',
    'Cotr d lvoire': "CÃ´te d'Ivoire",
    'Congo, Republic of the': "Congo",
    'Tanzania': "United Republic of Tanzania",
}
Scraper_df['Country'] = Scraper_df['Country'].replace(scraper_to_country_name)

# Re-check which names are still unmatched in either direction.
countryc = countrydf['Country'].unique()
scraperc = Scraper_df['Country'].unique()
print("Differnces in Sraper Data ", list(set(scraperc).difference(countryc)))
print("Differnces in Country Data ", list(set(countryc).difference(scraperc)))

Differnces in Sraper Data  ['Western Saraha', 'Swaziland']
Differnces in Country Data  ['Seychelles', 'Sao Tome and Principe', 'Gambia', 'Eswatini', 'Mauritius', 'Comoros', 'Africa', 'Cabo Verde']


# Scraper Incorporation

In [72]:
# Add the NDVI column (initialised to 0.0) immediately before the response
# column. The position is derived from the response column instead of the
# hard-coded 68, which was only correct for one specific number of FAO item
# columns.
ndvi_position = countrydf.columns.get_loc('Future Undernourishment')
countrydf.insert(ndvi_position, 'MODIS NDVI Annual Average', float(0))

In [73]:
# Monthly NDVI columns: everything after the first two columns of Scraper_df.
Months = Scraper_df.columns[2:]

# Mean NDVI per year across all scraper rows and months.
# numeric_only=True keeps the text 'Country' column out of the aggregation;
# without it, recent pandas versions raise a TypeError instead of silently
# dropping that column.
yearly_mean = Scraper_df.groupby('Year').mean(numeric_only=True).mean(axis=1)
yearly_mean_df = pd.DataFrame({'Year': yearly_mean.index, 'Yearly Mean NDVI': yearly_mean})

# Scraper names that have no matching country in countrydf; their rows are skipped.
unmatched_scraper_names = {"Swaziland", "Western Saraha"}
# countrydf entries that have no rows in the scraper data.
countries_without_ndvi = [
    "Gambia", "Cabo Verde", "Seychelles", "Mauritius",
    "Africa", "Sao Tome and Principe", "Comoros", "Eswatini",
]

# Provide the per-country annual average: the mean of the monthly values of
# each scraper row, written to the matching (Country, Year) rows of countrydf.
monthly_rows = Scraper_df[Months].to_numpy().tolist()
for country, year, monthly_values in zip(Scraper_df['Country'], Scraper_df['Year'], monthly_rows):
    if country in unmatched_scraper_names:
        continue
    target_rows = (countrydf['Country'] == country) & (countrydf['Year'] == year)
    countrydf.loc[target_rows, 'MODIS NDVI Annual Average'] = mean(monthly_values)

# Provide the overall yearly average for locations that were not in the
# scraper data. The yearly mean is looked up only for the rows that need it
# (the original looked it up for every row and could raise KeyError on rows
# it never used). A year with no scraper data yields NaN.
needs_fill = countrydf['Country'].isin(countries_without_ndvi)
countrydf.loc[needs_fill, 'MODIS NDVI Annual Average'] = countrydf.loc[needs_fill, 'Year'].map(yearly_mean)


In [74]:
# Inspect the first rows for Equatorial Guinea to see how much data is present.
is_equatorial_guinea = countrydf['Country'] == 'Equatorial Guinea'
countrydf[is_equatorial_guinea].head()

Unnamed: 0,index,Country,Year,Average dietary energy supply adequacy (percent) (3-year average),Dietary energy supply used in the estimation of prevalence of undernourishment (kcal/cap/day) (3-year average),"Share of dietary energy supply derived from cereals, roots and tubers (kcal/cap/day) (3-year average)",Average protein supply (g/cap/day) (3-year average),Average supply of protein of animal origin (g/cap/day) (3-year average),Rail lines density (total route in km per 100 square km of land area),"Gross domestic product per capita, PPP, (constant 2017 international $)",...,Number of severely food insecure people (million) (annual value),Number of severely food insecure male adults (million) (annual value),Number of severely food insecure female adults (million) (annual value),Number of moderately or severely food insecure people (million) (annual value),Number of moderately or severely food insecure male adults (million) (annual value),Number of moderately or severely food insecure female adults (million) (annual value),Port,Resiliency,MODIS NDVI Annual Average,Future Undernourishment
320,352,Equatorial Guinea,2002,,,,,,,,...,,,,,,,1,1.60555,0.766347,
321,353,Equatorial Guinea,2003,,,,,,,,...,,,,,,,1,1.60555,0.766347,
322,354,Equatorial Guinea,2004,,,,,,,,...,,,,,,,1,1.60555,0.766,
323,355,Equatorial Guinea,2005,,,,,,,,...,,,,,,,1,1.60555,0.766347,
324,356,Equatorial Guinea,2006,,,,,,,,...,,,,,,,1,1.60555,0.766347,


In [75]:
# Drop every Equatorial Guinea row (insufficient data) and renumber the rows.
# reset_index() again keeps the previous labels as an extra column, which the
# next cell removes.
keep_rows = countrydf['Country'] != 'Equatorial Guinea'
countrydf = countrydf[keep_rows].reset_index()

In [76]:
# Discard the two helper columns left behind by the earlier reset_index()
# calls, then drop any column whose name starts with 'Unnamed'.
countrydf = countrydf.drop(columns=['level_0', 'index'])
keep_column = ~countrydf.columns.str.contains('^Unnamed')
countrydf = countrydf.loc[:, keep_column]
countrydf.head()

Unnamed: 0,Country,Year,Average dietary energy supply adequacy (percent) (3-year average),Dietary energy supply used in the estimation of prevalence of undernourishment (kcal/cap/day) (3-year average),"Share of dietary energy supply derived from cereals, roots and tubers (kcal/cap/day) (3-year average)",Average protein supply (g/cap/day) (3-year average),Average supply of protein of animal origin (g/cap/day) (3-year average),Rail lines density (total route in km per 100 square km of land area),"Gross domestic product per capita, PPP, (constant 2017 international $)",Prevalence of undernourishment (percent) (3-year average),...,Number of severely food insecure people (million) (annual value),Number of severely food insecure male adults (million) (annual value),Number of severely food insecure female adults (million) (annual value),Number of moderately or severely food insecure people (million) (annual value),Number of moderately or severely food insecure male adults (million) (annual value),Number of moderately or severely food insecure female adults (million) (annual value),Port,Resiliency,MODIS NDVI Annual Average,Future Undernourishment
0,Algeria,2002,127.0,2938.0,59.0,77.0,19.0,0.1,,8.0,...,,,,,,,1,19.882026,0.120326,-0.7
1,Algeria,2003,129.0,2993.0,59.0,79.4,19.3,0.1,,7.3,...,,,,,,,1,19.882026,0.120326,-0.3
2,Algeria,2004,130.0,3029.0,59.0,81.4,20.0,0.1,,7.0,...,,,,,,,1,19.882026,0.120215,0.0
3,Algeria,2005,130.0,3042.0,59.0,83.0,20.3,0.1,,7.0,...,,,,,,,1,19.882026,0.120326,-0.3
4,Algeria,2006,131.0,3071.0,59.0,83.3,21.0,0.1,,6.7,...,,,,,,,1,19.882026,0.120326,-0.3


# Generate Food Security Dataset

In [77]:
# Write the assembled table to 'countrydf.csv' (a relative path, so it is
# resolved against the current working directory). index=False is not passed,
# so the row index is written as the first, unnamed column.
countrydf.to_csv('countrydf.csv')