# Modeling Agricultural Variables
## Python modules

In [1]:
import warnings
import time
import os

import dask
from dask.distributed import Client

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.pyplot as plt
import matplotlib.colors as colors

import geopandas as gpd

import pyarrow
from sklearn.linear_model import RidgeCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from scipy.stats import spearmanr
from scipy.linalg import LinAlgWarning
from scipy.stats import pearsonr

import math
import seaborn as sns

# Mild Preprocessing
## 1. Load in the Data

### Read in the features
First, we load in the feature data. This data was aggregated in the ___ notebook

In [2]:
# Let's read in the new concatenated features:
#features = pd.read_feather("/capstone/mosaiks/repos/modeling/data/cropmosaiks_features_landsat8.feather")
features = pd.read_feather("/capstone/mosaiks/repos/modeling/data/features_sea_save.feather")
features

Unnamed: 0,index,0,1,2,3,4,5,6,7,8,...,994,995,996,997,998,999,lon,lat,year,month
0,0,0.001570,0.000480,0.005897,1.353835,0.509915,0.002976,0.112365,2.114384,0.0,...,0.004179,3.965036,5.546379,0.026053,1.071133,0.000490,22.24466,-16.440364,2015,11
1,1,0.001463,0.000276,0.007684,0.980061,0.378347,0.001967,0.056292,1.592373,0.0,...,0.004066,3.541692,4.875215,0.024436,1.112796,0.000805,22.25466,-16.450364,2015,11
2,2,0.003168,0.000671,0.005166,0.726874,0.222981,0.002820,0.039191,1.190421,0.0,...,0.005325,3.108610,4.367601,0.023106,1.221709,0.000022,22.25466,-16.440364,2015,11
3,3,0.001240,0.000455,0.022193,1.314633,0.528798,0.002496,0.106302,2.018325,0.0,...,0.006516,3.932435,5.520713,0.028360,1.162592,0.002150,22.26466,-16.440364,2015,11
4,4,0.004002,0.000425,0.012011,0.937913,0.405020,0.002348,0.068812,1.462166,0.0,...,0.007511,3.435787,4.772962,0.031827,1.193514,0.001942,22.26466,-16.450364,2015,11
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
918658,3729,0.000000,0.000000,0.003351,0.506587,0.075253,0.000000,0.008294,0.860708,0.0,...,0.005945,2.868543,3.981516,0.016459,1.289775,0.000281,32.87466,-11.390364,2022,12
918659,3730,0.000000,0.000000,0.001638,0.500491,0.068168,0.000000,0.003469,0.879823,0.0,...,0.005039,2.861625,3.945390,0.016634,1.234191,0.000129,32.86466,-11.390364,2022,12
918660,3731,0.000771,0.000000,0.003275,0.559318,0.070403,0.000192,0.007627,0.958161,0.0,...,0.007142,2.917027,4.074458,0.024755,1.257929,0.002708,32.87466,-11.380364,2022,12
918661,3732,0.000508,0.000003,0.003023,0.259634,0.059728,0.000642,0.002495,0.409499,0.0,...,0.006365,2.543631,3.537266,0.026562,1.458724,0.005747,31.78466,-10.800364,2022,12


### Read in Ground-Truth Data
Next, we read in our ground truth data, which was processed in the Preprocessing notebook. Make sure to run that if you are getting any errors from running this code.

In [3]:
# Read in the survey data
country_sea = gpd.read_file('/capstone/mosaiks/repos/preprocessing/featurizeme/total.shp')
country_sea

ERROR 1: PROJ: proj_create_from_database: Open of /Users/hveirs/.conda/envs/mosaiks/share/proj failed


Unnamed: 0,sea_unq,year,bunding,cnvntnl,plntngb,ploghng,ridging,ripping,zertllg,xna,...,rice,sedcttn,sorghum,soybens,sugarcn,sunflwr,swtptts,vlvtbns,vrgntbc,geometry
0,1,2008.0,11.4875,0.1250,0.25,2.250000,29.017500,0.0,0.0000,0.00,...,0.0,0.0,1.437500,0.25,0.0,0.000,1.330000,0.0,0.0,"POLYGON ((27.82327 -13.65772, 27.82294 -13.657..."
1,1,2009.0,0.3775,0.0000,4.75,0.000000,29.845000,0.0,0.0000,0.00,...,0.0,0.0,0.197500,0.19,0.0,0.000,0.250000,0.0,0.0,"POLYGON ((27.82327 -13.65772, 27.82294 -13.657..."
2,1,2010.0,0.0000,8.8750,0.00,5.000000,18.340000,0.0,0.0000,0.00,...,0.0,0.0,0.000000,0.25,0.0,0.000,0.625000,0.0,0.0,"POLYGON ((27.82327 -13.65772, 27.82294 -13.657..."
3,1,2011.0,0.3725,18.0275,0.00,41.025000,0.375000,0.0,0.6075,0.00,...,0.0,2.0,1.500000,0.00,0.0,0.625,0.682500,0.0,0.0,"POLYGON ((27.82327 -13.65772, 27.82294 -13.657..."
4,1,2012.0,0.6250,22.5600,0.81,58.560000,0.062500,0.0,2.0000,0.00,...,0.0,0.0,0.810000,1.00,0.0,0.000,0.625000,0.0,0.0,"POLYGON ((27.82327 -13.65772, 27.82294 -13.657..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2746,300,2013.0,0.0000,7.9900,0.00,48.685000,0.000000,0.0,0.0000,0.00,...,0.0,0.0,3.550000,0.00,0.0,0.000,0.000000,0.0,0.0,"POLYGON ((23.23962 -16.31204, 23.23876 -16.312..."
2747,300,2014.0,0.0000,5.6750,0.00,65.085000,0.000000,1.2,0.0000,0.00,...,0.0,0.0,14.695000,0.00,0.0,0.000,0.000000,0.0,0.0,"POLYGON ((23.23962 -16.31204, 23.23876 -16.312..."
2748,300,2015.0,0.0000,1.4000,0.81,36.275894,1.737886,0.0,0.0000,4.78,...,0.0,0.0,2.089372,0.00,0.0,0.000,2.737887,0.0,0.0,"POLYGON ((23.23962 -16.31204, 23.23876 -16.312..."
2749,300,2016.0,0.0000,1.4000,0.81,36.275894,1.737886,0.0,0.0000,4.78,...,0.0,0.0,2.089372,0.00,0.0,0.000,2.737887,0.0,0.0,"POLYGON ((23.23962 -16.31204, 23.23876 -16.312..."


We're going to make another object `sea_unq_join` which contains the spatial information and a unique key for each SEA. This will be handy later, when we need to join the features to the ground-truth data.

In [4]:
# Filter country_sea for unique values of 'seq_unq' and 'geometry'
sea_unq_join = country_sea[['sea_unq', 'geometry']].drop_duplicates()

# Display the filtered DataFrame
print(sea_unq_join)


      sea_unq                                           geometry
0           1  POLYGON ((27.82327 -13.65772, 27.82294 -13.657...
10          2  POLYGON ((27.99349 -13.46497, 27.99352 -13.464...
20          3  POLYGON ((28.09909 -13.51864, 28.09867 -13.516...
30          4  POLYGON ((28.31924 -13.42915, 28.31911 -13.426...
40          5  POLYGON ((28.39982 -13.51544, 28.40012 -13.514...
...       ...                                                ...
2722      296  POLYGON ((25.07771 -14.63920, 25.07732 -14.638...
2728      297  POLYGON ((22.74142 -14.00343, 22.73856 -14.002...
2734      298  POLYGON ((23.08604 -14.20026, 23.08957 -14.202...
2740      299  POLYGON ((24.36764 -16.62208, 24.36564 -16.621...
2746      300  POLYGON ((23.23962 -16.31204, 23.23876 -16.312...

[298 rows x 2 columns]


## 2. Organize the features by growing season



In [5]:
# Organize the features by growing season
# Carry months October, November, and December over to the following year's data
# These months represent the start of the growing season for the following year's maize yield
year_end = 2022

features['year'] = np.where(
    features['month'].isin([10, 11, 12]),
    features['year'] + 1, 
    features['year'])

features_gs = features[features['year'] <= year_end]

features_gs.sort_values(['year', 'month'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  features_gs.sort_values(['year', 'month'], inplace=True)


## 3. Pivot Wider by months

Since we want each row to represent one location per year, we can use the .unstack() function to pivot wider all rows with the same lat/lon and year. This results in a dataframe with 12,000 columns (1,000 columns for each month). 

In [6]:
# Store the 'geometry' column separately before unstacking
#geometry_col = features_new_gdf[['lon', 'lat', 'geometry']].drop_duplicates(subset=['lon', 'lat'])

# Perform the unstacking operation without the 'geometry' column
#features_gs_no_geometry = features_gs.drop(columns=['geometry'])
#features = features_gs_no_geometry.set_index(['lon', 'lat', 'year', 'month']).unstack()

# This line applies a transformation to the columns' names.
#features.columns = features.columns.map(lambda x: '{}_{}'.format(*x))

# Merge the 'geometry' column back into the features DataFrame
#features = features.reset_index().merge(geometry_col, on=['lon', 'lat'])


In [7]:
# Use the unstack() function to pivot wider the rows with the same lat/lon 
features = features_gs.set_index(['lon', 'lat', 'year', 'month']).unstack()

# Apply a transformation to the columns' names
features.columns = features.columns.map(lambda x: '{}_{}'.format(*x))

In [8]:
# Since our features have infinite values, it is important to replace those with NaN values.
features.replace([np.inf, -np.inf], np.nan, inplace=True)
features = features.reset_index()
features.iloc[:, 12000:]

Unnamed: 0,998_10,998_11,998_12,999_1,999_2,999_3,999_4,999_5,999_6,999_7,999_8,999_9,999_10,999_11,999_12
0,,1.297913,1.044488,,0.006922,,1.000000,1.000000,0.002402,1.000000,0.000927,0.000613,,0.003262,0.000239
1,0.930201,0.935190,,,,,0.009427,0.005681,0.004284,0.002694,0.001640,0.000689,0.000878,0.021311,
2,0.898067,1.440304,,0.006652,,,0.008678,0.007007,0.005162,0.003032,0.001828,0.000936,0.000820,0.004444,
3,0.903305,1.131946,1.193247,0.006303,0.006878,0.005010,0.004234,0.003123,0.002475,0.001411,0.000419,0.000322,0.001031,0.002016,0.002855
4,0.829283,,,,0.076561,0.007508,0.007276,0.004638,0.002851,0.002013,0.001561,0.000924,0.000347,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101159,0.823307,0.859537,,,0.003429,,,0.001952,0.001340,0.000046,0.000009,0.000000,0.000000,0.000000,
101160,0.820138,0.869622,1.039315,0.002017,,,0.004004,0.005244,0.002288,0.000166,0.000149,0.000000,0.000000,0.000049,0.000000
101161,0.824495,0.893293,0.946694,,,,0.003117,0.002498,,0.005067,0.000013,0.000000,0.000000,0.000009,0.000000
101162,0.878204,0.938874,,,,0.003815,,0.002397,0.001376,0.014282,0.000009,0.000000,0.000000,0.000000,


### 4. Convert the features into a Geo Dataframe

This step allows us to join the features with our clean, ground-truth survey data based on the geometries.

In [9]:
# Create a geodataframe of the new features
features_gdf = gpd.GeoDataFrame(
    features, # could change to features_gs
    geometry = gpd.points_from_xy(x = features.lon, y = features.lat), 
    crs='EPSG:4326'
)

In [10]:
features_gdf

Unnamed: 0,lon,lat,year,index_1,index_2,index_3,index_4,index_5,index_6,index_7,...,999_4,999_5,999_6,999_7,999_8,999_9,999_10,999_11,999_12,geometry
0,22.00466,-16.190364,2016,,220.0,,427.0,427.0,427.0,427.0,...,1.000000,1.000000,0.002402,1.000000,0.000927,0.000613,,0.003262,0.000239,POINT (22.00466 -16.19036)
1,22.00466,-16.190364,2017,,,,427.0,427.0,427.0,427.0,...,0.009427,0.005681,0.004284,0.002694,0.001640,0.000689,0.000878,0.021311,,POINT (22.00466 -16.19036)
2,22.00466,-16.190364,2018,427.0,,,427.0,427.0,427.0,427.0,...,0.008678,0.007007,0.005162,0.003032,0.001828,0.000936,0.000820,0.004444,,POINT (22.00466 -16.19036)
3,22.00466,-16.190364,2019,363.0,427.0,427.0,427.0,427.0,427.0,427.0,...,0.004234,0.003123,0.002475,0.001411,0.000419,0.000322,0.001031,0.002016,0.002855,POINT (22.00466 -16.19036)
4,22.00466,-16.190364,2020,,53.0,332.0,427.0,427.0,427.0,427.0,...,0.007276,0.004638,0.002851,0.002013,0.001561,0.000924,0.000347,,,POINT (22.00466 -16.19036)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101159,33.50466,-10.190364,2018,,3421.0,,,11737.0,11737.0,11737.0,...,,0.001952,0.001340,0.000046,0.000009,0.000000,0.000000,0.000000,,POINT (33.50466 -10.19036)
101160,33.50466,-10.190364,2019,4297.0,,,10704.0,11686.0,11737.0,11737.0,...,0.004004,0.005244,0.002288,0.000166,0.000149,0.000000,0.000000,0.000049,0.000000,POINT (33.50466 -10.19036)
101161,33.50466,-10.190364,2020,,,,11404.0,11737.0,,11686.0,...,0.003117,0.002498,,0.005067,0.000013,0.000000,0.000000,0.000009,0.000000,POINT (33.50466 -10.19036)
101162,33.50466,-10.190364,2021,,,6816.0,,11686.0,11737.0,11686.0,...,,0.002397,0.001376,0.014282,0.000009,0.000000,0.000000,0.000000,,POINT (33.50466 -10.19036)


In [11]:
features_gdf.iloc[:, 12000:]

Unnamed: 0,998_10,998_11,998_12,999_1,999_2,999_3,999_4,999_5,999_6,999_7,999_8,999_9,999_10,999_11,999_12,geometry
0,,1.297913,1.044488,,0.006922,,1.000000,1.000000,0.002402,1.000000,0.000927,0.000613,,0.003262,0.000239,POINT (22.00466 -16.19036)
1,0.930201,0.935190,,,,,0.009427,0.005681,0.004284,0.002694,0.001640,0.000689,0.000878,0.021311,,POINT (22.00466 -16.19036)
2,0.898067,1.440304,,0.006652,,,0.008678,0.007007,0.005162,0.003032,0.001828,0.000936,0.000820,0.004444,,POINT (22.00466 -16.19036)
3,0.903305,1.131946,1.193247,0.006303,0.006878,0.005010,0.004234,0.003123,0.002475,0.001411,0.000419,0.000322,0.001031,0.002016,0.002855,POINT (22.00466 -16.19036)
4,0.829283,,,,0.076561,0.007508,0.007276,0.004638,0.002851,0.002013,0.001561,0.000924,0.000347,,,POINT (22.00466 -16.19036)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101159,0.823307,0.859537,,,0.003429,,,0.001952,0.001340,0.000046,0.000009,0.000000,0.000000,0.000000,,POINT (33.50466 -10.19036)
101160,0.820138,0.869622,1.039315,0.002017,,,0.004004,0.005244,0.002288,0.000166,0.000149,0.000000,0.000000,0.000049,0.000000,POINT (33.50466 -10.19036)
101161,0.824495,0.893293,0.946694,,,,0.003117,0.002498,,0.005067,0.000013,0.000000,0.000000,0.000009,0.000000,POINT (33.50466 -10.19036)
101162,0.878204,0.938874,,,,0.003815,,0.002397,0.001376,0.014282,0.000009,0.000000,0.000000,0.000000,,POINT (33.50466 -10.19036)


## 5. Join features to ground data

This is an important step, since this is how we can use both the features and the data to use in models.

In [12]:
# Now lets combine the sea data 
spatial_join = gpd.sjoin(features_gdf, sea_unq_join, how='right', predicate = 'within')

In [13]:
spatial_join.iloc[:, 11500:12000]

Unnamed: 0,957_1,957_2,957_3,957_4,957_5,957_6,957_7,957_8,957_9,957_10,...,997_11,997_12,998_1,998_2,998_3,998_4,998_5,998_6,998_7,998_8
0,,,0.983364,,0.945928,0.920352,0.823232,0.777358,0.697412,0.737463,...,0.168942,,,,1.457966,,1.339860,1.290358,1.193511,1.163463
0,1.000000,,,1.000000,0.979923,1.000000,0.865064,0.773598,0.626654,,...,,1.000000,1.000000,,,1.000000,1.396225,1.000000,1.228299,1.141174
0,0.873285,,,0.941566,0.928260,0.847460,0.841477,0.793532,0.681616,0.832658,...,0.088273,,1.398278,,,1.375731,1.310140,1.214462,1.195083,1.150016
0,,0.978222,0.946075,0.949381,0.939434,0.885348,0.840925,0.670889,0.574123,0.786810,...,0.124661,,,1.483887,1.415645,1.382602,1.360404,1.265396,1.179830,1.027930
0,,0.962126,,0.938912,0.909080,0.871684,0.816542,0.606559,0.669342,0.743301,...,0.052343,0.069399,,1.462514,,1.351274,1.317412,1.251404,1.182634,0.963667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2746,1.000000,1.000000,,1.000000,1.000000,1.000000,1.000000,0.272329,0.201271,,...,,1.000000,1.000000,1.000000,,1.000000,1.000000,1.000000,1.000000,0.674812
2746,,0.770079,0.746128,0.725549,0.654701,0.665806,0.546920,0.476296,0.344494,0.270541,...,0.022531,0.024437,,1.514097,1.418604,1.190200,1.060640,1.058592,0.931092,0.884308
2746,0.754445,,0.855209,0.869016,0.870043,0.792487,0.688476,0.540588,0.637261,0.463695,...,0.022981,0.023586,1.233723,,1.056474,1.060046,1.040321,1.048731,0.992816,0.877198
2746,,0.818951,0.802681,0.872399,0.831065,0.769247,0.704621,0.693517,0.647633,0.566866,...,0.157391,0.020901,,1.239900,1.179530,1.075677,1.055449,1.034508,1.012649,1.053936


In [14]:
features_join = spatial_join.merge(country_sea, on=['year', 'sea_unq'], how='inner')

In [15]:
# Drop the redundant independent lon and lat columns because now that they are in a separate geometry column
features_join = features_join.drop(['lon', 'lat', 'geometry_x'], axis = 1)


In [16]:
features_join.iloc[:, 12000:]

Unnamed: 0,998_11,998_12,999_1,999_2,999_3,999_4,999_5,999_6,999_7,999_8,...,rice,sedcttn,sorghum,soybens,sugarcn,sunflwr,swtptts,vlvtbns,vrgntbc,geometry_y
0,1.314470,,,,0.062471,,0.065213,0.040456,0.006207,0.004970,...,0.0,0.0,0.404686,6.0,0.0,0.0,0.000000,0.0,0.0,"POLYGON ((27.82327 -13.65772, 27.82294 -13.657..."
1,1.042192,,,,0.017338,,0.010601,0.003372,0.001397,0.000511,...,0.0,0.0,0.404686,6.0,0.0,0.0,0.000000,0.0,0.0,"POLYGON ((27.82327 -13.65772, 27.82294 -13.657..."
2,1.141773,,,,0.032274,,0.521606,0.461205,0.008552,0.006931,...,0.0,0.0,0.404686,6.0,0.0,0.0,0.000000,0.0,0.0,"POLYGON ((27.82327 -13.65772, 27.82294 -13.657..."
3,1.183810,,,,0.065942,,0.540266,0.494341,0.005150,0.004876,...,0.0,0.0,0.404686,6.0,0.0,0.0,0.000000,0.0,0.0,"POLYGON ((27.82327 -13.65772, 27.82294 -13.657..."
4,1.012487,,,,0.131616,,0.137247,0.100964,0.001779,0.001050,...,0.0,0.0,0.404686,6.0,0.0,0.0,0.000000,0.0,0.0,"POLYGON ((27.82327 -13.65772, 27.82294 -13.657..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28899,,1.0,1.0,1.0,,1.0,1.000000,1.000000,1.000000,0.000741,...,0.0,0.0,2.089372,0.0,0.0,0.0,2.737887,0.0,0.0,"POLYGON ((23.23962 -16.31204, 23.23876 -16.312..."
28900,,1.0,1.0,1.0,,1.0,1.000000,1.000000,1.000000,0.000005,...,0.0,0.0,2.089372,0.0,0.0,0.0,2.737887,0.0,0.0,"POLYGON ((23.23962 -16.31204, 23.23876 -16.312..."
28901,,1.0,1.0,1.0,,1.0,1.000000,1.000000,1.000000,0.012826,...,0.0,0.0,2.089372,0.0,0.0,0.0,2.737887,0.0,0.0,"POLYGON ((23.23962 -16.31204, 23.23876 -16.312..."
28902,,1.0,1.0,1.0,,1.0,1.000000,1.000000,1.000000,0.006424,...,0.0,0.0,2.089372,0.0,0.0,0.0,2.737887,0.0,0.0,"POLYGON ((23.23962 -16.31204, 23.23876 -16.312..."


In [17]:
# Set some parameters 
# Number of features:
num_features = 1000

# Imputing
#impute_manual = True
impute_manual = False

### 6. Impute missing values

Imputing "manually" by descending group levels imputes NA values in multiple "cascading" steps, decreasing the proportion of inputed values with each step. First, the NA values are imputed at by both `year` and `geometry`, which should yield imputed values that most closely match the feature values that would be present in the data if there was no clouds obscuring the satellite images. Next, the remaining NA values that could not be imputed by both `year` and `district` are imputed by only `district`. Lastly, the remaining NA vlaues that could not be imputed by both `year` and `district` or by just `district` are imputed by `year` only. This option gives the user more control and transparency over how the imputation is executed.

Imputing using `scikit learn`'s simple imputer executes standard imputation, the details of which can be found in the `scikitlearn` documentation [here.](https://scikit-learn.org/stable/modules/generated/sklearn.impute.SimpleImputer.html)

The imputation approach depends on the selection made at the top of this notebook for `impute_manual`.

In [18]:
# Cropmosaiks used this line to calculate the number of cells, but I don't know what this does. 
#num_cells = len(features) * len(month_range) * num_features

# Extract the size of the features dataframe
rows, cols = features_join.shape

# compute the number of feature cells in the features dataframe
num_cells = rows * cols
num_cells

# Let's check how many rows do not have NA values:
len(features_join.dropna()) 
# This makes sense that all rows contain at least 1 NA value 
# since the features we currently use were randomly sampled from 10% of the points in Zambia.


0

In [19]:
# This code chunk will help us keep track of the imputation process by defining colors for the messages.
class bcolors:
    BL = '\x1b[1;34m' #GREEN
    GR = '\x1b[1;36m' #GREEN
    YL = '\x1b[1;33m' #YELLOW
    RD = '\x1b[1;31m' #RED
    RESET = '\033[0m' #RESET COLOR

#### Notes:
Using cropmosaiks' features from 2022 has a starting row count of 1893, and an ending total row count of 325. 
This might be due to the lack of temporal overlap between those features and our new survey data (the survey data we use currently only covers 2 years).

In [29]:
# Notes: Have to change the year, get an error rn. Also, check to make sure the number of cells is correct
# The if section of this chunk is the manual imputation method
if impute_manual:
    ln_ft = len(features_join) # This saves the total length of the features_join dataframe 
    ln_na = len(features_join.dropna()) # This saves the length of the features_join dataframe without NA values 
    # (which is none since every row has at least 1 missing value)
    
    # This print statement simply helps us keep track of the number of rows 
    # and what we are currently on before starting the process.
    print(f'Starting total row count: {bcolors.BL}{ln_ft}{bcolors.RESET}',
          f'\nPre-Impute NaN row count: {bcolors.RD}{ln_ft - ln_na}{bcolors.RESET}',
          f'\nPre-Impute NaN row %: {bcolors.RD}{((ln_ft - ln_na) / ln_ft)*100:.02f}{bcolors.RESET}',
          f'\nPre-Impute NaN cell %: {bcolors.RD}{(features_join.isna().sum().sum() / num_cells)*100:.02f}{bcolors.RESET}',
          f'\n\nStep 1: Filling NaN values by month, year, and SEA group average') 
    # This is the line that takes the values and imputes the missing values based on the average.
    features_join = ( 
        features_join
        .fillna(features_join # .fillna is saying to use the next statement to replace NA values with the resulting statement
                .groupby(['year', 'sea_unq'], as_index=False) # This groups the data based on year and unique SEA
                                                              # and we don't want to index based on the resulting df
                .transform('mean'), inplace=True # Then this means to take the average of the non-missing values 
                                                 # (based on the groups above)
               )
    )
    ln_ft = len(features_join)
    ln_na = len(features_join.dropna())
    print(f'Post step 1 NaN row count: {bcolors.YL}{ln_ft - ln_na}{bcolors.RESET}',
          f'\nPost step 1 NaN row %: {bcolors.YL}{((ln_ft - ln_na) / ln_ft)*100:.02f}{bcolors.RESET}',
          f'\nPost step 1 NaN cell %: {bcolors.YL}{(features_join.isna().sum().sum() / num_cells)*100:.02f}{bcolors.RESET}',
          f'\n\nStep 2: Filling NaN values by month and SEA across group average')
    features_join = (
        features_join
        .fillna(features_join
                .groupby(['sea_unq'], as_index=False)
                .transform('mean'), inplace=True
               )
    )
    ln_ft = len(features_join)
    ln_na = len(features_join.dropna())
    print(f'Post step 2 NaN row count: {bcolors.GR}{ln_ft - ln_na}{bcolors.RESET}',
          f'\nPost step 2 NaN row %: {bcolors.GR}{((ln_ft - ln_na) / ln_ft)*100:.02f}{bcolors.RESET}',
          f'\nPost step 2 NaN cell %: {bcolors.GR}{(features_join.isna().sum().sum() / num_cells)*100:.02f}{bcolors.RESET}',
          f'\n\nStep 3: Drop remaining NaN values\n')
    features_join = features_join.dropna(axis=0)
    print(f'Ending total row count: {bcolors.BL}{len(features_join)}{bcolors.RESET}')
    
# The else section is a basic simple imputation
else: 
    # Store the geometry column separately
    geometry_col = features_join['geometry_y']
    # Remove the geometry column from the DataFrame
    features_join = features_join.drop(columns=['geometry_y'])
    features_join = features_join.set_index(['year', 'sea_unq'])
    imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
    imputer.fit_transform(features_join)
    features_join[:] = imputer.transform(features_join)
    features_join = features_join.reset_index()
    # Add the geometry column back to the DataFrame
    features_join['geometry'] = geometry_col

  features_join[:] = imputer.transform(features_join)


In [34]:
features_join

Unnamed: 0,year,sea_unq,index_left,index_1,index_2,index_3,index_4,index_5,index_6,index_7,...,rice,sedcttn,sorghum,soybens,sugarcn,sunflwr,swtptts,vlvtbns,vrgntbc,geometry
0,2017.0,1,62455.0,2157.484192,1083.61945,3090.000000,5312.318605,13652.0,14347.0,14398.0,...,0.0,0.0,0.404686,6.0,0.0,0.0,0.000000,0.0,0.0,"POLYGON ((27.82327 -13.65772, 27.82294 -13.657..."
1,2017.0,1,62462.0,2157.484192,1083.61945,3091.000000,5312.318605,13653.0,14348.0,14399.0,...,0.0,0.0,0.404686,6.0,0.0,0.0,0.000000,0.0,0.0,"POLYGON ((27.82327 -13.65772, 27.82294 -13.657..."
2,2017.0,1,62525.0,2157.484192,1083.61945,3089.000000,5312.318605,13651.0,14346.0,14397.0,...,0.0,0.0,0.404686,6.0,0.0,0.0,0.000000,0.0,0.0,"POLYGON ((27.82327 -13.65772, 27.82294 -13.657..."
3,2017.0,1,62532.0,2157.484192,1083.61945,3092.000000,5312.318605,13654.0,14349.0,14400.0,...,0.0,0.0,0.404686,6.0,0.0,0.0,0.000000,0.0,0.0,"POLYGON ((27.82327 -13.65772, 27.82294 -13.657..."
4,2017.0,1,62469.0,2157.484192,1083.61945,3093.000000,5312.318605,13655.0,14350.0,14401.0,...,0.0,0.0,0.404686,6.0,0.0,0.0,0.000000,0.0,0.0,"POLYGON ((27.82327 -13.65772, 27.82294 -13.657..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28899,2016.0,300,12257.0,275.000000,832.00000,1523.422262,1049.000000,1049.0,1049.0,1049.0,...,0.0,0.0,2.089372,0.0,0.0,0.0,2.737887,0.0,0.0,"POLYGON ((23.23962 -16.31204, 23.23876 -16.312..."
28900,2016.0,300,12439.0,274.000000,831.00000,1523.422262,1048.000000,1048.0,1048.0,1048.0,...,0.0,0.0,2.089372,0.0,0.0,0.0,2.737887,0.0,0.0,"POLYGON ((23.23962 -16.31204, 23.23876 -16.312..."
28901,2016.0,300,12614.0,278.000000,835.00000,1523.422262,1052.000000,1052.0,1052.0,1052.0,...,0.0,0.0,2.089372,0.0,0.0,0.0,2.737887,0.0,0.0,"POLYGON ((23.23962 -16.31204, 23.23876 -16.312..."
28902,2016.0,300,12446.0,276.000000,833.00000,1523.422262,1050.000000,1050.0,1050.0,1050.0,...,0.0,0.0,2.089372,0.0,0.0,0.0,2.737887,0.0,0.0,"POLYGON ((23.23962 -16.31204, 23.23876 -16.312..."


### Test of IterativeImputer to try to Impute missing values more effectively

The Simple Imputer from sklearn is fine, but our ending number of points has been below 20% of the original dataframe (~2% using Carlo's features and Sitian's old joined ground data)

In [21]:
# This is a test to see if IterativeImputer would work with this data
# First going to make a copy of our joined features and just save the first 1000 rows or so.
features_test = features_join.copy().iloc[:1000,:]
features_test

Unnamed: 0,index_left,year,index_1,index_2,index_3,index_4,index_5,index_6,index_7,index_8,...,rice,sedcttn,sorghum,soybens,sugarcn,sunflwr,swtptts,vlvtbns,vrgntbc,geometry_y
0,62455.0,2017.0,,,3090.0,,13652.0,14347.0,14398.0,14398.0,...,0.0,0.0,0.404686,6.00,0.0,0.0,0.00,0.0,0.0,"POLYGON ((27.82327 -13.65772, 27.82294 -13.657..."
1,62462.0,2017.0,,,3091.0,,13653.0,14348.0,14399.0,14399.0,...,0.0,0.0,0.404686,6.00,0.0,0.0,0.00,0.0,0.0,"POLYGON ((27.82327 -13.65772, 27.82294 -13.657..."
2,62525.0,2017.0,,,3089.0,,13651.0,14346.0,14397.0,14397.0,...,0.0,0.0,0.404686,6.00,0.0,0.0,0.00,0.0,0.0,"POLYGON ((27.82327 -13.65772, 27.82294 -13.657..."
3,62532.0,2017.0,,,3092.0,,13654.0,14349.0,14400.0,14400.0,...,0.0,0.0,0.404686,6.00,0.0,0.0,0.00,0.0,0.0,"POLYGON ((27.82327 -13.65772, 27.82294 -13.657..."
4,62469.0,2017.0,,,3093.0,,13655.0,14350.0,14401.0,14401.0,...,0.0,0.0,0.404686,6.00,0.0,0.0,0.00,0.0,0.0,"POLYGON ((27.82327 -13.65772, 27.82294 -13.657..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,62741.0,2016.0,,,,11789.0,14094.0,14094.0,14145.0,14000.0,...,0.0,0.0,0.125000,0.31,0.0,0.0,2.26,0.0,0.0,"POLYGON ((27.77832 -13.19012, 27.77791 -13.191..."
996,62349.0,2016.0,,,,11778.0,14083.0,14083.0,14134.0,13989.0,...,0.0,0.0,0.125000,0.31,0.0,0.0,2.26,0.0,0.0,"POLYGON ((27.77832 -13.19012, 27.77791 -13.191..."
997,62699.0,2016.0,,,,11788.0,14093.0,14093.0,14144.0,13999.0,...,0.0,0.0,0.125000,0.31,0.0,0.0,2.26,0.0,0.0,"POLYGON ((27.77832 -13.19012, 27.77791 -13.191..."
998,62489.0,2016.0,,,,11784.0,14089.0,14089.0,14140.0,13995.0,...,0.0,0.0,0.125000,0.31,0.0,0.0,2.26,0.0,0.0,"POLYGON ((27.77832 -13.19012, 27.77791 -13.191..."


In [23]:
# Load the Iterative Imputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

import random

In [25]:
# Try out the Iterative Imputer
# Set the seed
random.seed(987)
# Store the geometry column separately
geometry_col = features_test['geometry_y']
# Remove the geometry column from the DataFrame
features_test_new = features_test.drop(columns=['geometry_y'])
features_test_new = features_test_new.set_index(['year', 'sea_unq'])
imputer = IterativeImputer(missing_values=np.nan, max_iter=10)
imputer.fit_transform(features_test_new)
features_test_new[:] = imputer.transform(features_test_new)
features_test_new = features_test_new.reset_index()
# Add the geometry column back to the DataFrame
features_test_new['geometry'] = geometry_col

KeyboardInterrupt: 

### Save copy of processed features before sumarizing training features to district level

Duplicate the features dataframe at this stage so we can retain a copy of features at point resolution for all years available, which is `2013/2014/2016-2021`.

    - The start year is `2016` if the satellite selected is Sentinel 2 (due to the fact that Sentinel 2 launched in June of `2015`)
    - The start year is `2013` if the satellite selected is Landsat 8 and the month range selected was anything besides all months (due to the fact that Landsat 8 launched in February of `2013`)
    - The start year is `2014` if the satellite selected is Landsat 8 and the month range selected was all months
    
This duplicated dataframe we create in the following code is called `features_all_years`. The purpose for this dataframe comes into play after the model is trained; we will be able to plug in point-resolution features from _any and all_ years from this dataframe into the trained model and observe how the model predicts crop years across space and time. It would be interesting to plot these features for each year sequentially to show how the crop prediction landscape changes by year. These point-resolution features increase the spatial resolution of the ground-truth crop data we have for the years through 2018, because our ground-truth crop data is at a the coarser  district-resolution. Furthermore, these point-resolution features are the _only_ crop data we have for the years 2020-2021. The reason we lack data from 2019 is because the Zanbia Sattistics Agency has not yet released their Crop Forecast Survey data for that year. The reason we do not have data for 2020-2021 is because Covid-19 prevented any Crop Forecast Surveys from being conducted. 


After we create the dataframe `features_all_years`, we are free to further process the original features dataframe, `features_join`, in order to train the model with these features and their paired ground-truth crop yields. Processing this dataframe further requires us to subset the years to the start year through the years for which we have crop data: `2013/2014/2016-2019`. This dataframe is called `features_through_2019`. The reason we subset this dataframe is because we are training the model using _supervised_ machine learning, which means we are feeding it only features that have ground-truth crop data accosicated with them.

In [35]:
features_all_years = features_join.copy()

# assign the geometry column to features_2014_2021 so it can serve 2 purposes:
# 1. plotting features sequentially by year
# 2. the entire dataframe can be fed into the model after the model is trained on only the summarized features for 2014-2018 and the associated crop data
# moving forward in the immediate sections, summarize the `features` dataframe to SEA level

### Summarise to administrative boundary level
Weighted by cropped area, or simple mean, depending on the selection at the top of this notebook for `weighted_avg`. 

In [36]:
# check the order of the columns in the dataframe that will be summarized and then fed into the ridge regression in order to train the model
# we care about the order of columns specifically because in the following steps we assign only the feature columns to an object, so we need to know which 3 columns to omit by indexing
features_join.columns

Index(['year', 'sea_unq', 'index_left', 'index_1', 'index_2', 'index_3',
       'index_4', 'index_5', 'index_6', 'index_7',
       ...
       'rice', 'sedcttn', 'sorghum', 'soybens', 'sugarcn', 'sunflwr',
       'swtptts', 'vlvtbns', 'vrgntbc', 'geometry'],
      dtype='object', length=12060)

The output above show that the 3 columns that are _not_ features are the first 2 columns `year` and `district`, and the last column, `crop_perc`.

In [37]:
# check the shape to the dataframe as a sanity check
features_join.shape

(28904, 12060)

In [38]:
features_join.iloc[:, 2]

0        62455.0
1        62462.0
2        62525.0
3        62532.0
4        62469.0
          ...   
28899    12257.0
28900    12439.0
28901    12614.0
28902    12446.0
28903    12264.0
Name: index_left, Length: 28904, dtype: float64

The output above shows the number of rows and columns in the dataframe, respectively. Recall that the number of rows represents the number of points for which we have features, and the number of columns is all features for all months selected plus the columns joined from the ground-truth data. There are _____ rows, meaning that is the amount of training points we have to feed into the model _before they are summarized to SEA level, so this number will shrink after we summarize to district level. There are 12060 columns, which will not change after we summarize the features to district level. The number of columns that we include in the features object in the next chunk will be this number minus the _____ non-feature columns.

In [39]:
# create object that contains only feature columns, rather than all columns that would include `district`, `year`, and `crop_perc`
# python index starts at 0, so here we specify to retain columns starting at 3 through every column besides the last column
# the columns we omit stay in the dataframe, because we assign the selected columns to an object, but the omitted columns are not included in the calculation in the next chunk
var_cols = features_join.columns[2:12001].values.tolist()

# call the object `var_cols` to check that it only includes feature columns, but do not view it in list format because it is more readable not as a list 
features_join.columns[2:12001]
# these are all the feature columns that will be fed into the `weighted_avg` calculation in the next chunk

Index(['index_left', 'index_1', 'index_2', 'index_3', 'index_4', 'index_5',
       'index_6', 'index_7', 'index_8', 'index_9',
       ...
       '998_1', '998_2', '998_3', '998_4', '998_5', '998_6', '998_7', '998_8',
       '998_9', '998_10'],
      dtype='object', length=11999)

In [40]:
%%time
# Group by 'year' and 'sea_unq' and calculate the mean for the specified columns
grouped_features = features_join.groupby(['year', 'sea_unq']).mean()


CPU times: user 1.85 s, sys: 168 ms, total: 2.02 s
Wall time: 2.02 s




Now that the features have been summarized to district and year, there are fewer rows. The dataframe we were working with before this step,  `features_through_2018`, had 13866 rows that represented points. Now we have 216 rows, as shown by the following output. Notice we still have all 12003 columns. 

In [43]:
grouped_features.iloc[:, 12001:]

Unnamed: 0_level_0,Unnamed: 1_level_0,999_1,999_2,999_3,999_4,999_5,999_6,999_7,999_8,999_9,999_10,...,popcorn,rice,sedcttn,sorghum,soybens,sugarcn,sunflwr,swtptts,vlvtbns,vrgntbc
year,sea_unq,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
2016.0,1,1.000000,0.483945,0.167272,1.000000,0.211239,1.000000,0.086030,0.003092,0.001454,0.003303,...,0.404686,0.0,0.00,0.000000,0.500000,0.0,0.000000,0.000000,0.0,0.0
2016.0,2,0.943929,0.483945,0.167272,0.885844,0.049124,0.041234,0.004182,0.002311,0.002178,0.003303,...,0.250000,0.0,1.00,0.000000,6.054058,0.0,0.809372,0.725000,0.0,0.0
2016.0,3,0.001593,0.483945,0.167272,0.004801,0.001091,0.157373,0.000343,0.000103,0.037756,0.003303,...,0.500000,0.0,0.00,0.000000,9.665604,0.0,0.000000,1.000000,0.0,0.0
2016.0,4,0.006728,0.483945,0.167272,0.010231,0.005338,0.004577,0.002421,0.000945,0.000467,0.003303,...,0.000000,0.0,2.25,0.125000,2.000000,0.0,0.000000,1.875000,0.0,0.0
2016.0,5,0.007167,0.483945,0.167272,0.009445,0.025973,0.025197,0.022631,0.001154,0.000526,0.003303,...,0.500000,0.0,0.00,0.000000,0.000000,0.0,0.000000,3.857029,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2017.0,296,0.511381,0.483945,0.167272,0.008287,0.005605,0.004261,0.002733,0.001859,0.000728,0.000963,...,0.000000,1.0,0.00,0.809372,0.000000,0.0,0.000000,1.561715,0.0,0.5
2017.0,297,0.511381,0.483945,0.014979,0.200632,0.007343,0.004114,0.002398,0.001973,0.001217,0.000802,...,0.000000,0.0,0.00,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.0
2017.0,298,0.511381,0.483945,0.167272,0.041498,0.011164,0.004721,0.004188,0.003522,0.002490,0.001910,...,0.000000,0.5,0.00,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.0
2017.0,299,0.511381,0.483945,0.167272,0.008407,0.012032,0.008613,0.005902,0.004621,0.003972,0.004047,...,0.000000,0.0,0.00,0.030000,0.000000,0.0,0.000000,0.000000,0.0,0.0


## Model

### Define `x`'s and `y`'s that will be a part of training the model

Since our independent variable is the features, these are the `x`'s. Our dependent variable is the crop yield in metric tonnes per hectare planted, so that will be the `y`'s.

In [44]:
# Separate features (X) and target variables (y)
X = grouped_features.iloc[:, 2:12000]
y = grouped_features.iloc[:, 12001:12045]

In [45]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### Split into train and test sets

This step is executed right before training the model so we can train on 80% of the data and preserve 20% for testing.

In [46]:
print("Number of total points: ", len(X), "\n", 
      "Number of training points: ", len(X_train), "\n",
      "Number of testing points: ", len(X_test), sep = "")

Number of total points: 556
Number of training points: 444
Number of testing points: 112


### Train model using cross-validated ridge regression

Please see the documentation for the function that executes this regression [here.](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.RidgeCV.html)

In [47]:
# Create a pipeline with normalization
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('regressor', RidgeCV(cv = 5, alphas = np.logspace(-8, 8, base = 10, num = 17)))
])


In [48]:
# Train the pipeline on the training data
pipeline.fit(X_train, y_train)

# Make predictions on the test data
y_pred = pipeline.predict(X_test)

In [49]:
y_pred

array([[ 4.37840867e-01,  4.41322229e-01,  1.51381053e-01, ...,
         1.90486232e+00,  2.05285926e+00, -1.26716273e-03],
       [ 4.29852732e-01,  4.73590075e-01,  1.74765696e-01, ...,
         1.10217113e+00,  1.28711478e+00,  2.89603457e-04],
       [ 4.90227562e-01,  4.67174985e-01,  1.67469449e-01, ...,
         5.51074753e-01,  1.44735647e+00,  1.39071274e-04],
       ...,
       [ 1.33145899e-01,  4.71904790e-01,  1.68929204e-01, ...,
         9.39516516e-01,  8.40310414e-01,  7.59464863e-04],
       [ 4.99204944e-01,  4.72268236e-01,  1.70473224e-01, ...,
         2.28071068e-01,  1.43742727e+00,  9.78722212e-04],
       [ 6.17190533e-01,  8.94936366e-01,  1.60247619e-01, ...,
         2.53564965e+00, -2.66810187e-01, -1.10980642e-03]])

In [50]:
# Initialize lists to store the predictions, RMSEs, and R-squared values
predictions = []
rmse_list = []
r2_list = []

# Loop through the target variables (columns)
for i in range(y_test.shape[1]):
    # Extract the true values and predictions for the current target variable
    y_test_i = y_test.iloc[:, i]
    y_pred_i = y_pred[:, i]
    
    # Compute the RMSE and R-squared
    rmse_i = np.sqrt(mean_squared_error(y_test_i, y_pred_i))
    r2_i = r2_score(y_test_i, y_pred_i)
    
    # Append the results to the corresponding lists
    predictions.append(y_pred_i)
    rmse_list.append(rmse_i)
    r2_list.append(r2_i)

# Print the RMSE and R-squared values for each target variable

for i, (column_name, rmse_i, r2_i) in enumerate(zip(y_test.columns, rmse_list, r2_list), start=1):
    print(f"{column_name}: RMSE = {rmse_i:.4f}, R-squared = {r2_i:.4f}")

999_1: RMSE = 0.0936, R-squared = 0.8066
999_2: RMSE = 0.0670, R-squared = 0.8410
999_3: RMSE = 0.0567, R-squared = 0.6771
999_4: RMSE = 0.1572, R-squared = 0.7955
999_5: RMSE = 0.1892, R-squared = 0.8029
999_6: RMSE = 0.1594, R-squared = 0.8280
999_7: RMSE = 0.1564, R-squared = 0.8235
999_8: RMSE = 0.0024, R-squared = 0.1887
999_9: RMSE = 0.0041, R-squared = -0.1885
999_10: RMSE = 0.0028, R-squared = -0.6669
999_11: RMSE = 0.0025, R-squared = -0.0882
999_12: RMSE = 0.1263, R-squared = 0.6546
bunding: RMSE = 1.6426, R-squared = 0.0599
cnvntnl: RMSE = 18.7723, R-squared = -6.6190
plntngb: RMSE = 0.4034, R-squared = -0.4260
ploghng: RMSE = 12.7749, R-squared = 0.0095
ridging: RMSE = 15.0129, R-squared = -1.4693
ripping: RMSE = 0.4520, R-squared = -2.2824
zertllg: RMSE = 1.9411, R-squared = -0.9076
xna: RMSE = 1.6706, R-squared = 0.0881
al: RMSE = 5.3693, R-squared = -1.4314
ttl_r_h: RMSE = 15.7203, R-squared = -0.2741
al_nmlb: RMSE = 0.7925, R-squared = -0.1724
al_flds: RMSE = 1.3812, R-

### Validation set $R^2$ performance

In [51]:
print(f"Validation R2 performance: {ridge_cv_random.best_score_:0.2f}")

NameError: name 'ridge_cv_random' is not defined

### Train set

In [52]:
y_pred = np.maximum(ridge_cv_random.predict(x_train), 0)
r2_train = r2_score(y_train, y_pred)

fig, ax = plt.subplots(ncols=1)
plt.scatter(y_pred, y_train, alpha=1, s=4)
plt.xlabel("Predicted", fontsize=15, x = .3)
plt.ylabel("Ground Truth", fontsize=15)
plt.suptitle(r"$\log_{10}(1 + Crop Yield)$", fontsize=20, y=1.02)
plt.title((f"Model applied to train data n = {len(x_train)}, R$^2$ = {r2_train:0.2f}"),
          fontsize=12, y=1.01)

plt.xticks(fontsize=14)
plt.yticks(fontsize=14)

ax.axline([0, 0], [1, 1], c = "k")

plt.gca().spines.right.set_visible(False)
plt.gca().spines.top.set_visible(False)


# plt.savefig(f'images/{feature_file_name}_train_data.jpg', dpi=300)
plt.show()
plt.close()
# the model is plotted with a black 45 degree line that serves as a reference of what a perfect correlation would look like
# deviation of the line indicates that there is not a perfect correlation

NameError: name 'ridge_cv_random' is not defined

In [None]:
print(f"Training R^2 = {r2_train:0.2f}\nPearsons r = {pearsonr(y_pred, y_train)[0]:0.2f}") 

In [None]:
# Pearson r^2
pearsonr(y_pred, y_train)[0] ** 2

In [None]:
# alternative way to calculate Training R^2
ridge_cv_random.score(x_train, y_train)

### Test set

In [None]:
y_pred = np.maximum(ridge_cv_random.predict(x_test), 0)
r2_test = r2_score(y_test, y_pred)

plt.figure()
plt.scatter(y_pred, y_test, alpha=1, s=4)
plt.xlabel("Predicted", fontsize=15)
plt.ylabel("Ground Truth", fontsize=15)
plt.suptitle(r"$\log_{10}(1 + Crop Yield)$", fontsize=20, y=1.02)
plt.title(f"Model applied to test data n = {len(x_test)}, R$^2$ = {r2_test:0.2f}",
          fontsize=12, y=1)

plt.xticks(fontsize=14)
plt.yticks(fontsize=14)

ax.axline([0, 0], [.75, .75], c = "k")

plt.gca().spines.right.set_visible(False)
plt.gca().spines.top.set_visible(False)

# plt.savefig(f'images/{feature_file_name}_test_data.jpg', dpi=300)
plt.show()
plt.close()

In [None]:
print(f"Testing set R^2 = {r2_test:0.2f}")
print(f"Testing set pearsons R = {pearsonr(y_pred, y_test)[0]:0.2f}")

Summary of both train and test data sets

In [None]:
y_pred = np.maximum(ridge_cv_random.predict(x_all), 0)

fig, ax = plt.subplots(figsize=(7, 7))
ax.axline([0, 0], [.75, .75], c = "k")
plt.scatter(y_pred, y_all, alpha=.9, s=15)
plt.xlabel("Predicted", fontsize=15)
plt.ylabel("Observed", fontsize=15)
plt.text(
    0, .8, fontsize=15, fontweight="bold",
    s=f"R$^2$={r2_train:0.2f} - Train set",
)
plt.text(
    0, .75, fontsize=15, fontweight="bold",
    s=f"R$^2$={ridge_cv_random.best_score_:0.2f} - Validation set",
)
plt.text(
    0, .7, fontsize=15, fontweight="bold",
    s=f"R$^2$={r2_test:0.2f} - Test set",
)
plt.xticks(fontsize=14)
plt.yticks(fontsize=14)

plt.gca().spines.right.set_visible(False)
plt.gca().spines.top.set_visible(False)

# plt.savefig(f'images/{feature_file_name}_all_data.jpg', dpi=300)
plt.show()
plt.close()

### Use the trained model to predict crop yields over all years from 1km grid-cell resolution features 

Recall that after we executed imputation on all feature years in the dataframe `features`, we copied the dataframe and named it `features_all_years`. Now we can plug that into the model to visualize how our model performs over time.

In [None]:
# recall the object we created earlier, before we split the features by year into those that would train the model 
# and those that would be fed into the trained model to predict crop yields
# in years for which we do not have crop data
features_all_years.head(3)

In the following chunk, we drop certain columns from `features_all_years` because we only need to feed the feature data into the model to generate predictions. Using the argument `axis = 1`, we specify that we are dropping columns rather than rows. 

In [None]:
x_all = features_all_years.drop([
    'year', 
    'geometry',
    'district',
    'crop_perc'
], axis = 1)

In the following chunk, we execute the model on the features from the dataframe `features_all_years`. The crop yield predictions for each row populate a new column in the dataframe.

The model is run inside the `np.maximum()` function because if we run it without being wrapped inside function, some crop predictions are negative values, but we need them all to be positive because conceptually crop yields cannot be negative.

In [None]:
features_all_years['yield_prediction'] = np.maximum(ridge_cv_random.predict(x_all), 0)

In [None]:
# check out the dataframe with the new column of predictions
features_all_years.head(3)

The dataframe is already a geodataframe, so we do not have to convert it to one before mapping predictions. However, we do need to replace all the zero value crop percentage areas with `NA`. We do this by applying the `mask()` function. This function is similar to an if-else statement. If the value of the `crop_perc` is equal to 0, that value is replaced by the value of the second argument, which is `NA`. If the value of `crop_prec` is _not_ equal to zero, we retain the current value. The argument `inplace = True` executes this replacement in the same cell. 

In [None]:
features_all_years['yield_prediction'].mask(features_all_years['crop_perc']==0, np.nan, inplace=True)

Recall that this dataframe has a geometry column, with latitude and longitude together. In order to map the predicted features, we separate this geometry column into separate `lon` and `lat` columns. 

In [None]:
# extract the longitude and latitude from the geometry column, and make then into independent columns
features_all_years['lon'], features_all_years['lat'] = features_all_years.geometry.x, features_all_years.geometry.y

Plot the predicted features for each year:

In [None]:
def scatter(x, y, c, **kwargs):
    plt.scatter(x, y, c=c, s = 1.25)
sns.color_palette("viridis", as_cmap=True)
g = sns.FacetGrid(
    features_all_years, 
    col="year", 
    col_wrap = 4, 
    height=5, 
    aspect=1
)
g.map(scatter, "lon", "lat", "yield_prediction")
g.set_axis_labels(r"Yield Prediction")
# save the figure and name the file so that it represents the model parameters that created the predictions
# plt.savefig(f'images/{feature_file_name}_all_predictions.jpg', dpi=300)

Plot the model's predicted features summarized to district level. In this visualization, we choose a specific year to examine rather than visualizing all years in one figure. Visualizing the the features summarized to district level is interesting because the crop data resolution provided by Zambia Statistics Agency is at the district level, and therefore it is easier to compare our model results to those ground-truth values when they are summarized to district level as well. Furthermore, our model's crop predictions for the years 2020 and 2021 might be more valuable when summarized to district level if Zambian governments, policy-makers, farmers, and researchers wish to use this data to determine crop imports, exports, and storage according to district summaries. 

In [None]:
features_all_years_summary = (
    features_all_years
    .groupby(['district',"year"], as_index = False)['yield_prediction']
    .mean()
    .set_index('district')
)

In [None]:
# join Zambia's shapefile to the summarized features to map the districts
# reset the index so it is a properly formatted dataframe
features_all_years_summary = features_all_years_summary.join(country_shp).reset_index()

Now that the geometries have been converted to districts from points, the geomatries are now polygons. There is still a row for each district for each year.

In order to change the year visualized, simply change the year in the following code and re-run the chunk.

In [None]:
features_all_years_summary[features_all_years_summary.year == 2020].plot(column = "yield_prediction")

Plot a boxplot for each year to visualize the range and quantile distribution of each year's crop predictions, summarized to district level. This enables us to identify years with exceptional disparities between the predicted yields by district. It also allows us to identify years that have many outliers.

In [None]:
plt.figure(figsize=(10, 5))
sns.boxplot(x="year", y="yield_prediction", data = features_all_years_summary)
plt.xlabel("Year", fontsize=15)
plt.ylabel("Predicted Yield", fontsize=15)

Visualize the total crop yield predictions by year. This bar chart shows the sum of all the district crop yields.

In [None]:
plt.figure(figsize=(10, 5))
sns.barplot(x="year", y="yield_prediction", data = features_all_years_summary, estimator = sum)

## Yield and Residual Plots

Create a dataframe of residuals called `residuals_df` from the `features_summary` dataframe. Note that we are _not_ using the predicted crop yields for _all_ years for these residuals, but rather the ground-truth crop yields for just the years through 2018.

The residuals give us an idea of the amount of uncertianty that is present in our model. By demeaning the residuals over space, we are able to remove the uncertainty over space and better determine our model performance over time and our uncertainty over time.

In [None]:
x_all = features_summary.drop(drop_cols, axis = 1)

# create empty dataframe to then populate with columns
residual_df = pd.DataFrame()

residual_df["yield_mt"] = features_summary.yield_mt.to_numpy()
residual_df["log_yield"] = np.log10(features_summary.yield_mt.to_numpy() + 1)
residual_df["prediction"] = np.maximum(ridge_cv_random.predict(x_all), 0)
residual_df["residual"] = residual_df["log_yield"] - residual_df["prediction"]
residual_df["year"] = features_summary.year
residual_df["district"] = features_summary.district
# join the district geometries
residual_df = residual_df.join(country_shp, how = "left", on = "district")

# demean by location so we can analyze the data over time
residual_df["district_yield_mean"] = residual_df.groupby('district')['log_yield'].transform('mean')
residual_df["district_prediction_mean"] = residual_df.groupby('district')['prediction'].transform('mean')
residual_df["demean_yield"] = residual_df["log_yield"] - residual_df["district_yield_mean"]
residual_df["demean_prediction"] = residual_df["prediction"] - residual_df["district_prediction_mean"]
residual_gdf = geopandas.GeoDataFrame(residual_df)

residual_gdf.head(3)

Visualize the residuals for the ground truth crop yields through 2018 with a boxplot.

In [None]:
plt.figure(figsize=(6, 5))
sns.boxplot(x="year", y="log_yield", data=residual_df)
plt.xlabel("Year", fontsize=15)
plt.ylabel("Log Yield", fontsize=15)

Visualize the residuals as a sum by year with a bar plot.

In [None]:
plt.figure(figsize=(6, 5))
sns.barplot(x="year", y="log_yield", data=residual_df, estimator = sum)

Visualize the crop yield residuals by year as a histogram to determine how they are distributed.

In [None]:
g = sns.FacetGrid(
    residual_gdf, 
    col="year", 
#     col_wrap = 3, 
    height=4, 
    aspect=1
)
g.map(sns.histplot, "yield_mt", bins = 20)
g.set_axis_labels("Yield (MT)")

Visualize the log-transformed crop yield residuals by year as a histogram to compare how they are distributed after the transformation.

In [None]:
g = sns.FacetGrid(
    residual_gdf, 
    col="year", 
#     col_wrap = 3, 
    height=4, 
    aspect=1
)
g.map(sns.histplot, "log_yield", bins = 20)
g.set_axis_labels(r"$\log_{10}(1 + Crop Yield)$")

#### Crop prediction histogram

In [None]:
g = sns.FacetGrid(
    residual_gdf, 
    col="year", 
#     col_wrap = 3, 
    height=4, 
    aspect=1
)
g.map(sns.histplot, "prediction", bins = 20)
g.set_axis_labels(r"Crop yield predictions")

#### Residual histogram

In [None]:
g = sns.FacetGrid(
    residual_gdf, 
    col="year", 
#     col_wrap = 3, 
    height=4, 
    aspect=1
)
g.map(sns.histplot, "residual", bins = 20)
g.set_axis_labels(r"Residuals")

In [None]:
residual_gdf.residual.min()

In [None]:
residual_gdf.residual.max()

#### Log crop yield vs residuals

In [None]:
g = sns.FacetGrid(
    residual_gdf, 
    col="year", 
#     col_wrap = 3, 
    height=4, 
    aspect=1
)
g.map(sns.scatterplot, "log_yield", "residual")
g.set_axis_labels(r"$\log_{10}(1 + Crop Yield)$")

#### District residuals 

In [None]:
if satellite == 'landsat-8-c2-l2':
    fig, (ax1,ax2) = plt.subplots(nrows=1, ncols=2, figsize=(13, 5))
    ax1 = (residual_gdf[residual_gdf.year == 2014]
           .plot(ax = ax1, column = "residual", legend = True, norm=colors.Normalize(vmin= -0.4, vmax=0.4), cmap = "BrBG")
           .set_title("2014 Residuals"))
    ax2 = (residual_gdf[residual_gdf.year == 2015]
           .plot(ax = ax2, column = "residual", legend = True, norm=colors.Normalize(vmin= -0.4, vmax=0.4), cmap = "BrBG")
           .set_title("2015 Residuals"))
else:
    pass
fig, (ax1,ax2,ax3) = plt.subplots(nrows=1, ncols=3, figsize=(20, 5))
ax1 = (residual_gdf[residual_gdf.year == 2016]
       .plot(ax = ax1, column = "residual", legend = True, norm=colors.Normalize(vmin= -0.4, vmax=0.4), cmap = "BrBG")
       .set_title("2016 Residuals"))
ax2 = (residual_gdf[residual_gdf.year == 2017]
       .plot(ax = ax2, column = "residual", legend = True, norm=colors.Normalize(vmin= -0.4, vmax=0.4), cmap = "BrBG")
       .set_title("2017 Residuals"))
ax3 = (residual_gdf[residual_gdf.year == 2018]
       .plot(ax = ax3, column = "residual", legend = True, norm=colors.Normalize(vmin= -0.4, vmax=0.4), cmap = "BrBG")
       .set_title("2018 Residuals"))

caption = "A positive value is an underestimated prediction (the prediction is lower than the actual yield), a negative value is an over estimated prediction"
plt.figtext(0.5, 0.01, caption, wrap=True, horizontalalignment='center', fontsize=12)


#### Difference from the mean

In [None]:
g = sns.FacetGrid(
    residual_gdf, 
    col="year", 
#     col_wrap = 3, 
    height=4, 
    aspect=1
)
g.map(sns.scatterplot, "demean_yield", "demean_prediction")
g.set_axis_labels('Difference from Yield Mean', 'Difference from Prediction Mean')

In [None]:
fig, ax = plt.subplots(figsize= (6, 5))
ax.axline([-.2, -.2], [.2, .2], c = "k")
plt.scatter(residual_gdf.demean_yield, residual_gdf.demean_prediction)
plt.title("Demeaned truth and predictions by district")
plt.xlabel('Difference from Yield Mean')
plt.ylabel('Difference from Predictions Mean')
r_squared = r2_score(residual_gdf["demean_yield"], residual_gdf["demean_prediction"])
plt.text(
    -0.2,
    .18,
    s=f"Demeaned R$^2$ = {r_squared:0.2f}",
    fontsize=15,
    fontweight="bold",
)
plt.savefig(f'images/{feature_file_name}_demean.jpg', dpi=300)

In [None]:
for yr in range(year_start+1, 2018):
    r_squared = r2_score(residual_gdf[residual_gdf.year == yr]["demean_yield"], residual_gdf[residual_gdf.year == yr]["demean_prediction"])
    pearson_r = pearsonr(residual_gdf[residual_gdf.year == yr]["demean_yield"], residual_gdf[residual_gdf.year == yr]["demean_prediction"])
    
    print(yr, f"    R^2: {r_squared:.2f}\n",
          f"Pearson's r: {pearson_r[0]:.2f}\n", 
          sep = "")
    
r_squared = r2_score(residual_gdf["demean_yield"], residual_gdf["demean_prediction"])
pearson_r = pearsonr(residual_gdf["demean_yield"], residual_gdf["demean_prediction"])
print(f"All     R^2: {r_squared:.2f}\n",
      f"Pearson's r: {pearson_r[0]:.2f}", sep = "")

In [None]:
r2 = round(pearson_r[0] ** 2, 2)
r2

#### Join residuals to the features for _all_ years to visualize the residuals of the features before they were summarized to district level.

In [None]:
complete_df = (
    features_all_years_summary
    .set_index(['district', 'year'])
    .join(residual_df
          .drop('geometry', axis = 1)
          .set_index(['district', 'year'])
         )
    .reset_index()
)

complete_df.head(3)

In [None]:
fig, ax1 = plt.subplots(figsize=(10, 5))
tidy = complete_df.melt(id_vars='year').rename(columns=str.title)
tidy = tidy[tidy.Variable.isin(['yield_prediction', 'log_yield'])]
sns.barplot(x='Year', y='Value', hue='Variable', data=tidy, ax=ax1, ci = None)
sns.despine(fig)

h, l = ax1.get_legend_handles_labels()
ax1.legend(h, ['Predicted Yield', 'Observed Yield'],loc='lower left')

plt.savefig(f'images/{feature_file_name}_yield_pred.jpg', dpi=300)

In [None]:
plt.figure(figsize=(10, 5))
sns.barplot(x="year", y="yield_prediction", data=complete_df, estimator = sum)

### Congratulations on completing this analysis!