In [1]:
import os 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import math
import matplotlib.pyplot as plt
# For Prediction:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from statistics import mean
from sklearn.metrics import auc, accuracy_score, confusion_matrix, mean_squared_error

# Plot configuration: render matplotlib figures inline in the notebook output
%matplotlib inline
# Apply matplotlib's built-in 'default' style sheet
plt.style.use('default')

  from pandas import MultiIndex, Int64Index


# Read in the data

In [2]:
# Load the FAOSTAT export that sits in the current working directory.
# The path is assembled with os.path.join rather than by appending a
# '\'-prefixed file name: the hand-written backslash only resolves on Windows,
# and '\F' is an invalid escape sequence that newer Python versions warn about.
dir_path = os.path.abspath("")
fname = os.path.join(dir_path, 'FAOSTAT_data_en_12-16-2022.csv')
# The file is decoded as latin-1 (unchanged from the original load call)
df = pd.read_csv(fname, encoding='latin-1')

# Read in Port Information

In [3]:
# Load the country/port table from the current working directory.
# os.path.join replaces the hand-written '\countryport.csv' suffix, which only
# resolved on Windows and relied on '\c' being an (invalid) escape sequence
# that Python happens to leave untouched.
dir_path = os.path.abspath("")
fname = os.path.join(dir_path, 'countryport.csv')
# The file is decoded as latin-1 (unchanged from the original load call)
df2 = pd.read_csv(fname, encoding='latin-1')

# Read in Scraper Information

In [4]:
# Load the scraped NDVI table from the current working directory.
# os.path.join replaces the hand-written '\scraper_ndvi_data.csv' suffix, which
# only resolved on Windows and relied on '\s' being an (invalid) escape
# sequence that Python happens to leave untouched.
dir_path = os.path.abspath("")
fname = os.path.join(dir_path, 'scraper_ndvi_data.csv')
# The file is decoded as latin-1 (unchanged from the original load call)
df3 = pd.read_csv(fname, encoding='latin-1')

# Unique categories in a list

In [5]:
# Gather the distinct values of the 'Item' column as a plain Python list
categories = df['Item'].unique().tolist()

# Show how many distinct items there are, followed by the items themselves
print(len(categories))
print(categories)

63
['Average dietary energy supply adequacy (percent) (3-year average)', 'Dietary energy supply used in the estimation of prevalence of undernourishment (kcal/cap/day) (3-year average)', 'Share of dietary energy supply derived from cereals, roots and tubers (kcal/cap/day) (3-year average)', 'Average protein supply (g/cap/day) (3-year average)', 'Average supply of protein of animal origin (g/cap/day) (3-year average)', 'Rail lines density (total route in km per 100 square km of land area)', 'Gross domestic product per capita, PPP, (constant 2017 international $)', 'Prevalence of undernourishment (percent) (3-year average)', 'Number of people undernourished (million) (3-year average)', 'Prevalence of severe food insecurity in the total population (percent) (3-year average)', 'Prevalence of severe food insecurity in the male adult population (percent) (3-year average)', 'Prevalence of severe food insecurity in the female adult population (percent) (3-year average)', 'Prevalence of moderat

# Collapse hyphenated year ranges to a single (end) year

In [6]:
# Reduce every 'Year' entry to one integer year.
# Each entry is split on '-': when there is a second piece (a hyphenated range)
# that second piece is kept, otherwise the single piece is kept. This is the
# same rule the previous row-by-row loop applied, but done column-wise so it
# does not build a full row Series per iteration and does not depend on the
# frame having a 0..n-1 index. The column ends up with an integer dtype.
year_pieces = df['Year'].astype(str).str.split('-')
# .str[1] is NaN where no second piece exists, so fall back to the first piece
end_year_text = year_pieces.str[1].fillna(year_pieces.str[0])
df['Year'] = end_year_text.astype(int)
print(df['Year'].unique())

[2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015
 2016 2017 2018 2019 2020 2021 2000 2001]


# Create New Country Dataframe

In [7]:
# Wide table layout: two key columns followed by one column per indicator
columns = ['Country', 'Year'] + categories
numCols = len(columns)

# Build one row per (country, year) pair with every indicator set to NaN.
# All rows are collected first and the frame is constructed once; appending
# with `.loc[len(countrydf)]` inside the loop re-allocates the frame on every
# row and is quadratic in the number of rows.
blank_indicators = [float("NAN")] * (numCols - 2)
all_years = df['Year'].unique()
skeleton_rows = [
    [country, year] + blank_indicators
    for country in df['Area'].unique()
    for year in all_years
]
countrydf = pd.DataFrame(skeleton_rows, columns=columns)

# Populate Data in Country Dataframe 

In [8]:
# Copy every long-format observation (Item, Area, Year, Value) from df into the
# matching (Country, Year) row and Item column of countrydf.
# The loop now visits ALL rows of df: the previous `range(0, len(df)-1)` bound
# stopped one short, so the final observation was never written.
for item, country, year, value in zip(df['Item'], df['Area'], df['Year'], df['Value']):
    # Boolean mask selecting the row(s) of countrydf for this country and year
    target_rows = (countrydf['Country'] == country) & (countrydf['Year'] == year)
    countrydf.loc[target_rows, item] = value

# Add Hub Data to DataFrame

In [9]:
# Attach the 'Hub?' and 'Res' values from the port table (df2) to every row of
# countrydf whose 'Country' equals df2's 'Country'.
# Rows without a match keep the 'NAN' placeholder string.
countrydf['Port'] = 'NAN'
countrydf['Resiliency'] = 'NAN'

# Build the country -> value lookups once instead of rescanning df2 for every
# row of countrydf. When a country appears several times in df2 the last entry
# wins, which is what the previous nested loop produced as well. Rows of df2
# with a missing country are skipped because they can never compare equal.
port_rows = df2[df2['Country'].notna()]
hub_by_country = dict(zip(port_rows['Country'], port_rows['Hub?']))
res_by_country = dict(zip(port_rows['Country'], port_rows['Res']))

# Iterate over the frame's own index labels so the writes below address the
# intended row even if the index is not 0..n-1.
for row_label, country in zip(countrydf.index, countrydf['Country']):
    if country in hub_by_country:
        countrydf.at[row_label, 'Port'] = hub_by_country[country]
        countrydf.at[row_label, 'Resiliency'] = res_by_country[country]

# Convert numerical data to floats

In [10]:
def _parse_bounded_number(text):
    """Convert a string cell to a float.

    The text is split on '<'. If nothing precedes the first '<' (e.g. '<2.5'),
    the empty leading piece is discarded and the piece after it is parsed;
    otherwise the piece before the first '<' is parsed.
    """
    pieces = text.split('<')
    if pieces[0] == '':
        pieces.pop(0)
    return float(pieces[0])


# Walk every cell from the third column onwards and replace string cells with
# their float value; cells that are not strings are left untouched.
for column_name in countrydf.columns[2:]:
    for row_label in countrydf.index:
        cell = countrydf.loc[row_label, column_name]
        if not isinstance(cell, str):
            continue
        countrydf.loc[row_label, column_name] = _parse_bounded_number(cell)

# Populate Response Column

In [11]:
Future_Undernourishment_Col_Name = "Prevalence of undernourishment (percent) (3-year average)"

# 'Future Undernourishment' is the change in undernourishment for the next
# year: (same country, year + 1) minus (same country, year).
#
# The next-year value is looked up by (Country, Year) key instead of reading
# row i+1. Rows within a country follow the order of df['Year'].unique(), which
# is not chronological, so a positional i+1 lookup can pair the last row of one
# country with the first row of the next country. Rows with no following year
# in the table (e.g. 2021) get NaN.
prevalence_by_key = {
    (country, year): prevalence
    for country, year, prevalence in zip(
        countrydf['Country'],
        countrydf['Year'],
        countrydf[Future_Undernourishment_Col_Name],
    )
}
countrydf['Future Undernourishment'] = [
    prevalence_by_key.get((country, year + 1), float('nan')) - prevalence
    for country, year, prevalence in zip(
        countrydf['Country'],
        countrydf['Year'],
        countrydf[Future_Undernourishment_Col_Name],
    )
]

In [12]:
# Remove data for years 2000 and 2021, which do not have a response variable
countrydf = countrydf[~countrydf['Year'].isin([2000, 2021])]
# Renumber the rows 0..n-1. drop=True discards the old labels; without it
# reset_index() stores them in a new 'index' column, which would then travel
# along as if it were data and makes the cell fail when it is run again.
countrydf = countrydf.reset_index(drop=True)

# Scraper Incorporation

In [13]:
# Add a 'Scraper Average year' column at column position 68, initialised to 0.0
# for every row.
countrydf.insert(loc=68, column='Scraper Average year', value=0.0)

In [14]:
# Every column of df3 after the first two is averaged per row.
# NOTE(review): presumably the first two columns are 'Country' and 'Year' and
# the remainder are monthly readings — confirm against the CSV header.
Months = df3.columns[2:]
monthly_readings = df3[Months]

for position in df3.index:
    # Arithmetic mean of this row's readings (statistics.mean)
    yearly_mean = mean(monthly_readings.iloc[position])
    # Write the mean into the countrydf row(s) for the same country and year
    same_country = countrydf['Country'] == df3['Country'].iloc[position]
    same_year = countrydf['Year'] == df3['Year'].iloc[position]
    countrydf.loc[same_country & same_year, 'Scraper Average year'] = yearly_mean

# Countries with Data in the Response Column

In [15]:
# Remove rows for Equatorial Guinea (because there is insufficient data)
countrydf = countrydf[countrydf['Country'] != 'Equatorial Guinea']
# Renumber the rows 0..n-1. drop=True discards the old labels; without it
# reset_index() adds them as an extra column ('index', or 'level_0' when an
# 'index' column already exists), which would end up in the saved CSV and in
# any feature matrix built from this frame.
countrydf = countrydf.reset_index(drop=True)
# Persist the assembled table next to the notebook
countrydf.to_csv('countrydf.csv')

# Prediction:

In [16]:
# Fixed seed so the train/test partition is the same on every run
RANDOM_STATE = 42

response_col = "Future Undernourishment"
# Response and identifier columns are never used as model inputs
non_feature_cols = [response_col, "Country", "Year"]
# Columns that reset_index() adds when called without drop=True; they only
# record row positions and must not be treated as features if present
bookkeeping_cols = ["index", "level_0"]


def _feature_columns(frame):
    """Return `frame` without the response, identifier and bookkeeping columns."""
    features = frame.drop(columns=non_feature_cols)
    return features.drop(columns=bookkeeping_cols, errors="ignore")


# Set 1: raw split (NaNs left in place); two thirds train, one third test
X_train, X_test, y_train, y_test = train_test_split(
    _feature_columns(countrydf), countrydf[response_col],
    test_size=1/3, random_state=RANDOM_STATE)

# Set 2: replace all the NaNs with the column mean, and reuse the row labels
# of the first split so all sets contain the same rows.
# Every column except the 'Country' string is coerced to a numeric dtype first,
# so the means are computed over explicitly numeric columns instead of relying
# on DataFrame.mean() silently skipping columns it cannot average (deprecated,
# and an error in newer pandas).
# NOTE(review): the means are computed over train and test rows together, so
# the imputed values carry information from the test rows.
countrydf2 = countrydf.copy()
for column_name in countrydf2.columns.drop("Country"):
    countrydf2[column_name] = pd.to_numeric(countrydf2[column_name], errors="coerce")
countrydf2 = countrydf2.fillna(countrydf2.mean(numeric_only=True))

features2 = _feature_columns(countrydf2)
# .loc: the split's index holds row labels, so select by label rather than by
# position
X_train2 = features2.loc[X_train.index]
X_test2 = features2.loc[X_test.index]
y_train2 = countrydf2[response_col].loc[y_train.index]
y_test2 = countrydf2[response_col].loc[y_test.index]

# Set 3: as set 2, but keep only rows whose ORIGINAL response was not NaN
# (set 2 has mean-imputed responses for those rows).
keep_indices_test = np.where(y_test.notna())
keep_indices_train = np.where(y_train.notna())
# np.where returns a 1-tuple; element 0 holds the row positions to keep
X_train3 = X_train2.iloc[keep_indices_train[0]]
X_test3 = X_test2.iloc[keep_indices_test[0]]
y_train3 = y_train2.iloc[keep_indices_train[0]]
y_test3 = y_test2.iloc[keep_indices_test[0]]

  countrydf2 = countrydf.fillna(countrydf.mean())


# ANN

In [27]:
import tensorflow as tf
from tensorflow import keras
# Take layers and the optimizer from tensorflow.keras as well, so every Keras
# object in this cell comes from the same package as `keras.Sequential`
from tensorflow.keras import layers
from tensorflow.keras.optimizers import Adam

# Seed TensorFlow's random generator before the model is built
tf.random.set_seed(42)

# number of input features and number of output units (a single regression
# target, despite the `num_classes` name)
input_shape = X_train3.shape[1]
num_classes = 1

# define the model architecture: an input Dense layer, 30 further hidden Dense
# layers of 30 ReLU units each, and a linear output layer
model = keras.Sequential()
model.add(layers.Dense(30, activation='relu', input_shape=(input_shape,)))
for _ in range(30):
    model.add(layers.Dense(30, activation='relu'))
model.add(layers.Dense(num_classes, activation='linear'))

# `learning_rate` is the supported argument name; the old `lr` alias is
# deprecated and triggers a warning (or is rejected, depending on the version)
optimizer = Adam(learning_rate=0.0001)
# compile the model with mean-squared-error loss
model.compile(loss='mean_squared_error', optimizer=optimizer)

# train the model for 50 epochs; the held-out test split is passed as
# validation data, so its loss is evaluated after every epoch
history = model.fit(X_train3, y_train3, epochs=50, validation_data=(X_test3, y_test3))

Epoch 1/50


  super().__init__(name, **kwargs)


Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
