# Modeling Agricultural Variables
## Python modules

In [1]:
import warnings
import time
import os

import dask
from dask.distributed import Client

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.pyplot as plt
import matplotlib.colors as colors

import geopandas as gpd

import pyarrow
from sklearn.linear_model import RidgeCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error, confusion_matrix, r2_score, roc_auc_score
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from scipy.stats import spearmanr
from scipy.linalg import LinAlgWarning
from scipy.stats import pearsonr

import math
import seaborn as sns

## Read in Data

We first read in the aggregated features and ground-truth data that were joined in `feature_preprocessing.ipynb`.

In [2]:
# Load the joined feature / ground-truth table produced by the preprocessing step.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
grouped_features = pd.read_csv("/capstone/mosaiks/repos/preprocessing/data/grouped_features.csv")

In [3]:
# Select the feature columns by position (columns 2 through 12001).
# NOTE(review): presumably 1,000 features x 12 months, judging by the `0_1` ... `999_12`
# column names in the preview — confirm against the preprocessing notebook.
features = grouped_features.iloc[:,2:12002]
features.head()

Unnamed: 0,0_1,0_2,0_3,0_4,0_5,0_6,0_7,0_8,0_9,0_10,...,999_3,999_4,999_5,999_6,999_7,999_8,999_9,999_10,999_11,999_12
0,0.0,0.000863,0.000783,0.0,0.0,0.0,0.0,6.157999e-06,0.000207,0.001568,...,0.060421,1.0,0.274676,1.0,0.115388,0.002708,0.001319,0.002867,0.003866,1.0
1,6.9e-05,0.000863,0.000783,0.0,2e-06,1.4e-05,4.7e-05,6.29924e-05,0.000168,0.001568,...,0.060421,0.939709,0.049106,0.039969,0.004752,0.002671,0.002439,0.002867,0.003866,0.071531
2,0.001141,0.000863,0.000783,0.000329,0.0,0.0,0.0,0.001008277,0.00136,0.002211,...,0.060421,0.006789,1.0,1.0,1.0,0.000517,0.000343,0.000396,0.003866,0.071531
3,0.001131,0.000863,0.000783,6e-06,4e-06,1e-05,1.4e-05,2.590917e-05,0.00011,0.001568,...,0.060421,0.005561,0.006391,0.004212,0.003235,0.001937,0.001683,0.002867,0.003866,0.071531
4,0.001131,0.000863,0.000783,0.0,0.0,0.0,0.0,3.113844e-07,1.2e-05,0.001568,...,0.060421,0.00557,0.006739,0.003991,0.002857,0.001979,0.001435,0.002867,0.003866,0.071531


In [4]:
# Outcome columns start at position 12003. Take a copy so the dtype conversion
# below operates on an independent frame rather than a slice of `grouped_features`.
outcomes = grouped_features.iloc[:, 12003:].copy()

# `astype` returns a new object; the result has to be assigned back for the
# loss indicators to actually become categorical.
loss_indicator_cols = [
    "loss_ind",
    "drought_loss_ind",
    "pest_loss_ind",
    "animal_loss_ind",
    "flood_loss_ind",
]
outcomes = outcomes.astype({col: "category" for col in loss_indicator_cols})
outcomes.head()

Unnamed: 0,total_area_harv_ha,total_area_lost_ha,total_harv_kg,yield_kgha,frac_area_harv,frac_area_loss,area_lost_fire,maize,groundnuts,mixed_beans,...,prop_mix,log_maize,log_sweetpotatoes,log_groundnuts,log_soybeans,loss_ind,drought_loss_ind,flood_loss_ind,animal_loss_ind,pest_loss_ind
0,71.0,5.0,4400.0,57.894737,0.934211,0.065789,0.0,57.894737,0.0,0.0,...,0.0,4.058626,6.364023,5.935403,6.565149,0.0,0.0,0.0,0.0,0.0
1,124.0,7.0,3150.0,24.045802,0.946565,0.053435,0.0,24.045802,0.0,0.0,...,0.0,3.17996,6.364023,5.935403,6.565149,0.0,0.0,0.0,0.0,0.0
2,607.0,409.0,8730.0,8.59252,0.597441,0.402559,0.0,29.583333,0.0,0.434783,...,0.181102,3.387211,0.689155,5.935403,6.565149,1.0,1.0,0.0,0.0,0.0
3,462.0,190.0,7930.0,12.162577,0.708589,0.291411,0.0,14.938398,0.244444,5.366667,...,0.069018,2.703935,6.364023,-1.408767,6.565149,1.0,0.0,0.0,0.0,0.0
4,410.0,135.0,19975.0,36.651376,0.752294,0.247706,0.0,41.048593,28.629032,0.0,...,0.0,3.714757,2.525729,3.354421,6.565149,1.0,0.0,0.0,0.0,0.0


## Model

We define a model to predict each of our outcome variables on our features for each SEA/year. The `train_and_evaluate_models` function trains and evaluates Ridge Linear Regression models for each target variable specified in the `target_columns` parameter. It handles both categorical and continuous target variables and provides the option to block sample on specific SEAs (Survey Enumeration Areas) by providing the SEA IDs to hold out for the validation set.

The function works as follows:

1. Read the grouped features and outcomes from a CSV file.
2. Define a helper function `block_sampling` to perform block sampling based on the provided SEA IDs.
3. For each target variable in `target_columns`, select the corresponding target variable data.
4. If `block_sea_ids` is provided and not empty, perform block sampling using the `block_sampling` helper function. Otherwise, use `train_test_split` to split the data into training and testing sets.
5. Train a Ridge Linear Regression model using RidgeCV with 5-fold cross-validation and a range of alpha values.
6. If the target variable is categorical, calculate and print the false positive rate and AUC-ROC. If the target variable is continuous, calculate and print the estimated regularization parameter, training R2 performance, validation R2 performance, and Pearson's correlation coefficient.

In [17]:
def calculate_confusion_matrix(y_true, y_pred, decision_boundary):
    """Threshold continuous predictions and return binary confusion-matrix counts.

    Parameters
    ----------
    y_true : array-like
        Ground-truth binary labels (0/1). Any array-like is accepted
        (pandas Series, NumPy array, list).
    y_pred : array-like
        Continuous model scores, one per entry of ``y_true``.
    decision_boundary : float
        Scores greater than or equal to this value are classified as 1,
        all others as 0.

    Returns
    -------
    tuple
        ``(tn, fp, fn, tp)`` counts.

    Raises
    ------
    ValueError
        If ``y_true`` contains values other than 0 and 1.
    """
    y_true_arr = np.asarray(y_true)
    y_pred_adj = np.where(np.asarray(y_pred) >= decision_boundary, 1, 0)

    unexpected_labels = np.setdiff1d(np.unique(y_true_arr), [0, 1])
    if unexpected_labels.size > 0:
        raise ValueError(
            f"Unexpected labels in y_true (expected only 0 and 1): {unexpected_labels}"
        )

    # Fixing the label set always yields a 2x2 matrix, even when the labels and
    # the thresholded predictions contain a single class, so no special-casing
    # of a 1x1 matrix is needed.
    cm = confusion_matrix(y_true_arr.astype(int), y_pred_adj, labels=[0, 1])
    tn, fp, fn, tp = cm.ravel()
    return tn, fp, fn, tp

In [18]:
def train_and_evaluate_models(target_columns, test_size, categorical_columns, decision_boundaries, block_sea_ids=None,
                              data_path="/capstone/mosaiks/repos/preprocessing/data/grouped_features.csv"):
    """Fit one RidgeCV model per target column, print evaluation metrics, and return test predictions.

    Parameters
    ----------
    target_columns : list of str
        Outcome columns to model, one model per column.
    test_size : float
        Passed to ``train_test_split`` when no block split is requested.
    categorical_columns : list of str
        Targets that are evaluated as binary classifiers (false positive rate
        at each decision boundary, plus AUC-ROC) instead of with R2 / Pearson's r.
    decision_boundaries : iterable of float
        Thresholds applied to the continuous predictions of categorical targets.
    block_sea_ids : collection, optional
        Values of the ``sea_unq`` column to hold out as the test set. When
        ``None`` or empty, a random split with ``random_state=42`` is used.
    data_path : str, optional
        CSV file containing the grouped features and outcomes.

    Returns
    -------
    pandas.DataFrame
        One column of test-set predictions per target column.
    """
    grouped_features = pd.read_csv(data_path)

    # Positional layout of the CSV: feature columns first, outcome columns after.
    features = grouped_features.iloc[:, 2:12002]
    outcomes = grouped_features.iloc[:, 12003:]

    # `is not None` rather than plain truthiness so NumPy arrays and pandas
    # Series can be passed without an ambiguous-truth-value error.
    use_block_split = block_sea_ids is not None and len(block_sea_ids) > 0
    if use_block_split:
        held_out = grouped_features['sea_unq'].isin(block_sea_ids)

    # Collect the predictions per target; converted to a DataFrame at the end.
    predictions = {}

    for target_column in target_columns:
        # Select the target variable
        y = outcomes[target_column]

        # Split the data into training and testing sets
        if use_block_split:
            X_train, X_test = features[~held_out], features[held_out]
            y_train, y_test = y[~held_out], y[held_out]
        else:
            X_train, X_test, y_train, y_test = train_test_split(features, y, test_size=test_size, random_state=42)

        # Train the model
        ridge_cv = RidgeCV(cv=5, alphas=np.logspace(-8, 8, base=10, num=17))
        ridge_cv.fit(X_train, y_train)

        # Make predictions on the test data
        y_pred = ridge_cv.predict(X_test)
        predictions[target_column] = y_pred

        if target_column in categorical_columns:
            # AUC-ROC does not depend on the decision boundary, so compute it once.
            # It is undefined when the test labels contain a single class.
            auc_roc = roc_auc_score(y_test, y_pred) if y_test.nunique() > 1 else float("nan")

            for decision_boundary in decision_boundaries:
                tn, fp, fn, tp = calculate_confusion_matrix(y_test, y_pred, decision_boundary)

                # The false positive rate is undefined without any true negatives
                # or false positives (i.e. no negative samples in the test set).
                negatives = fp + tn
                false_positive_rate = fp / negatives if negatives > 0 else float("nan")

                print(f"Target variable: {target_column} (Categorical)")
                print(f"Decision boundary: {decision_boundary}")
                print(f"False positive rate: {false_positive_rate:0.2f}")
                print(f"AUC-ROC: {auc_roc:0.2f}")
                print()
        else:
            # Calculate Pearson's correlation coefficient
            pearson_coeff, _ = pearsonr(y_test, y_pred)

            # Calculate training R squared
            train_r_squared = ridge_cv.score(X_train, y_train)

            print(f"Target variable: {target_column}")
            print(f"Estimated regularization parameter: {ridge_cv.alpha_}")
            print(f"Training R2 performance: {train_r_squared:0.2f}")
            print(f"Validation R2 performance: {ridge_cv.best_score_:0.2f}")
            print(f"Pearson's correlation coefficient: {pearson_coeff:0.2f}")

        print()

    return pd.DataFrame(predictions)


In [22]:
# Evaluation settings for the loss-indicator models.
test_size = 0.2
categorical_columns = [
    'loss_ind',
    'drought_loss_ind',
    'flood_loss_ind',
    'animal_loss_ind',
    'pest_loss_ind',
]
# Every target in this run is a binary indicator, so the target list mirrors the categorical list.
target_columns = list(categorical_columns)
block_sea_ids = None  # Change this to the desired SEA IDs or set to None for regular train_test_split
predictions_df = train_and_evaluate_models(
    target_columns,
    test_size,
    categorical_columns,
    decision_boundaries=[0.3, 0.4, 0.5, 0.6, 0.7],
    block_sea_ids=block_sea_ids,
)


Target variable: loss_ind (Categorical)
Decision boundary: 0.3
False positive rate: 0.81
AUC-ROC: 0.82

Target variable: loss_ind (Categorical)
Decision boundary: 0.4
False positive rate: 0.65
AUC-ROC: 0.82

Target variable: loss_ind (Categorical)
Decision boundary: 0.5
False positive rate: 0.52
AUC-ROC: 0.82

Target variable: loss_ind (Categorical)
Decision boundary: 0.6
False positive rate: 0.35
AUC-ROC: 0.82

Target variable: loss_ind (Categorical)
Decision boundary: 0.7
False positive rate: 0.29
AUC-ROC: 0.82


Target variable: drought_loss_ind (Categorical)
Decision boundary: 0.3
False positive rate: 0.20
AUC-ROC: 0.89

Target variable: drought_loss_ind (Categorical)
Decision boundary: 0.4
False positive rate: 0.10
AUC-ROC: 0.89

Target variable: drought_loss_ind (Categorical)
Decision boundary: 0.5
False positive rate: 0.04
AUC-ROC: 0.89

Target variable: drought_loss_ind (Categorical)
Decision boundary: 0.6
False positive rate: 0.03
AUC-ROC: 0.89

Target variable: drought_loss_i

In [45]:
# Preview the test-set predictions returned by `train_and_evaluate_models`.
# NOTE(review): the saved output below lists `frac_area_harv`, which is not among the
# targets of the run cell above — the output looks stale; re-run the cells in order.
predictions_df.head()

Unnamed: 0,frac_area_harv,drought_loss_ind,animal_loss_ind
0,0.512868,0.01367,0.034667
1,0.956633,-0.321268,0.034946
2,0.739097,0.42461,0.029251
3,0.607781,0.289455,0.035903
4,0.863619,0.004838,0.023138


In [47]:
# Load a saved feature table for Zambia from a Feather file.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
zambia = pd.read_feather("/capstone/mosaiks/repos/modeling/data/features_zmb_save.feather")

In [48]:
# Preview the table: feature columns `0` ... `999` followed by `lon`, `lat`, `year`, `month`.
zambia.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,994,995,996,997,998,999,lon,lat,year,month
0,0.000185,0.0,0.000639,0.520889,0.017403,0.0,0.003055,1.075246,0.0,0.0,...,0.004387,2.898772,3.899328,0.019313,0.956588,0.000149,27.800588,-16.343257,2015,7
1,0.000165,0.0,0.00014,0.506241,0.010501,0.0,0.001828,1.070585,0.0,0.0,...,0.003691,2.874976,3.857865,0.023663,0.927871,0.000587,27.790588,-16.343257,2015,7
2,0.000679,0.0,0.00109,0.584611,0.021318,0.0,0.006663,1.202663,0.0,0.0,...,0.00718,2.960961,3.987956,0.021282,0.891021,0.000574,27.780588,-16.353257,2015,7
3,0.000502,0.0,0.003195,0.692933,0.034732,0.0,0.009775,1.366068,0.0,0.0,...,0.008007,3.097285,4.168679,0.00919,0.84719,6e-06,27.720588,-16.363257,2015,7
4,0.000162,0.0,0.003317,0.766315,0.039693,0.0,0.013185,1.480798,0.0,0.0,...,0.008061,3.172538,4.296293,0.017217,0.834277,0.0,27.730588,-16.363257,2015,7


### Train set

In [None]:
# NOTE(review): `ridge_cv_random`, `x_train` and `y_train` are not defined in the
# visible part of this notebook; they must exist in the kernel before this cell runs.

# Predict on the training features; negative predictions are clipped to zero.
y_pred = np.clip(ridge_cv_random.predict(x_train), 0, None)
r2_train = r2_score(y_train, y_pred)

fig, ax = plt.subplots(ncols=1)
ax.scatter(y_pred, y_train, alpha=1, s=4)
ax.set_xlabel("Predicted", fontsize=15, x = .3)
ax.set_ylabel("Ground Truth", fontsize=15)
fig.suptitle(r"$\log_{10}(1 + Crop Yield)$", fontsize=20, y=1.02)
ax.set_title(
    f"Model applied to train data n = {len(x_train)}, R$^2$ = {r2_train:0.2f}",
    fontsize=12,
    y=1.01,
)
ax.tick_params(axis="both", labelsize=14)

# Black 45-degree reference line: points on it are perfectly predicted, and
# deviation from it shows where prediction and ground truth disagree.
ax.axline([0, 0], [1, 1], c = "k")

ax.spines[["right", "top"]].set_visible(False)

# plt.savefig(f'images/{feature_file_name}_train_data.jpg', dpi=300)
plt.show()
plt.close()

In [None]:
# Report the training R^2 next to Pearson's r between predictions and ground truth.
print(f"Training R^2 = {r2_train:0.2f}\nPearsons r = {pearsonr(y_pred, y_train)[0]:0.2f}") 

In [None]:
# Squared Pearson correlation between the (clipped) predictions and the ground truth.
pearsonr(y_pred, y_train)[0] ** 2

In [None]:
# Alternative way to calculate the training R^2, using the model's own `score` method.
# Unlike `r2_train` above, this scores the raw predictions (no clipping at zero),
# so the two values can differ when some predictions are negative.
ridge_cv_random.score(x_train, y_train)

### Test set

In [None]:
# Predict on the held-out test features; negative predictions are clipped to zero.
y_pred = np.maximum(ridge_cv_random.predict(x_test), 0)
r2_test = r2_score(y_test, y_pred)

# Create the figure and its axes together so the reference line below is drawn
# on this plot rather than on an `ax` left over from an earlier cell.
fig, ax = plt.subplots()
ax.scatter(y_pred, y_test, alpha=1, s=4)
ax.set_xlabel("Predicted", fontsize=15)
ax.set_ylabel("Ground Truth", fontsize=15)
fig.suptitle(r"$\log_{10}(1 + Crop Yield)$", fontsize=20, y=1.02)
ax.set_title(f"Model applied to test data n = {len(x_test)}, R$^2$ = {r2_test:0.2f}",
             fontsize=12, y=1)

ax.tick_params(axis="both", labelsize=14)

# Black 45-degree reference line marking perfect agreement.
ax.axline([0, 0], [.75, .75], c = "k")

ax.spines.right.set_visible(False)
ax.spines.top.set_visible(False)

# plt.savefig(f'images/{feature_file_name}_test_data.jpg', dpi=300)
plt.show()
plt.close()

In [None]:
# Report the test-set R^2 and Pearson's r between predictions and ground truth.
print(f"Testing set R^2 = {r2_test:0.2f}")
print(f"Testing set pearsons R = {pearsonr(y_pred, y_test)[0]:0.2f}")

Summary of both train and test data sets

In [None]:
# NOTE(review): `x_all` and `y_all` are not defined above this cell in the visible
# notebook (`x_all` is only assigned further down) — confirm the intended cell order.
y_pred = np.clip(ridge_cv_random.predict(x_all), 0, None)

fig, ax = plt.subplots(figsize=(7, 7))
# 45-degree reference line marking perfect agreement.
ax.axline([0, 0], [.75, .75], c = "k")
ax.scatter(y_pred, y_all, alpha=.9, s=15)
ax.set_xlabel("Predicted", fontsize=15)
ax.set_ylabel("Observed", fontsize=15)

# Annotate the plot with the three scores, stacked from top to bottom.
score_labels = [
    (.8, f"R$^2$={r2_train:0.2f} - Train set"),
    (.75, f"R$^2$={ridge_cv_random.best_score_:0.2f} - Validation set"),
    (.7, f"R$^2$={r2_test:0.2f} - Test set"),
]
for y_position, label in score_labels:
    ax.text(0, y_position, s=label, fontsize=15, fontweight="bold")

ax.tick_params(axis="both", labelsize=14)
ax.spines[["right", "top"]].set_visible(False)

# plt.savefig(f'images/{feature_file_name}_all_data.jpg', dpi=300)
plt.show()
plt.close()

### Use the trained model to predict crop yields over all years from 1km grid-cell resolution features 

Recall that after we executed imputation on all feature years in the dataframe `features`, we copied the dataframe and named it `features_all_years`. Now we can plug that into the model to visualize how our model performs over time.

In [None]:
# Recall the object created earlier, before the features were split by year into
# those used to train the model and those fed into the trained model to predict
# crop yields in years for which we do not have crop data.
# NOTE(review): `features_all_years` is not created in the visible part of this notebook.
features_all_years.head(3)

In the following chunk, we drop certain columns from `features_all_years` because we only need to feed the feature data into the model to generate predictions. Using the argument `axis = 1`, we specify that we are dropping columns rather than rows. 

In [None]:
# Keep only the model inputs: everything except the identifier / metadata columns.
non_feature_columns = ['year', 'geometry', 'district', 'crop_perc']
x_all = features_all_years.drop(columns=non_feature_columns)

In the following chunk, we execute the model on the features from the dataframe `features_all_years`. The crop yield predictions for each row populate a new column in the dataframe.

The prediction is wrapped in `np.maximum()` because, without it, some crop predictions are negative. We clip those to zero because, conceptually, crop yields cannot be negative.

In [None]:
# Predict for every row and store the result as a new column; negative predictions are clipped to zero.
features_all_years['yield_prediction'] = np.maximum(ridge_cv_random.predict(x_all), 0)

In [None]:
# Preview the dataframe, which now includes the `yield_prediction` column.
features_all_years.head(3)

The dataframe is already a geodataframe, so we do not have to convert it to one before mapping predictions. However, we do need to replace the predictions in all cells with zero crop percentage area with `NA`. We do this by applying the `mask()` function, which works like an if-else statement: if the value of `crop_perc` is equal to 0, the prediction is replaced by the value of the second argument, which is `NA`; if the value of `crop_perc` is _not_ equal to zero, we retain the current value. The result replaces the existing `yield_prediction` column.

In [None]:
# Blank out predictions where no cropland is present. The masked Series is
# assigned back to the column instead of using `inplace=True` on the column
# selection, which is not guaranteed to write through to the dataframe.
features_all_years['yield_prediction'] = features_all_years['yield_prediction'].mask(
    features_all_years['crop_perc'] == 0, np.nan
)

Recall that this dataframe has a geometry column, with latitude and longitude together. In order to map the predicted features, we separate this geometry column into separate `lon` and `lat` columns. 

In [None]:
# Split the point geometries into independent longitude (x) and latitude (y) columns.
point_geometry = features_all_years.geometry
features_all_years['lon'] = point_geometry.x
features_all_years['lat'] = point_geometry.y

Plot the predicted features for each year:

In [None]:
def scatter(x, y, c, **kwargs):
    """Draw one facet: points at (x, y) colored by the values in ``c``.

    Extra keyword arguments (e.g. ``cmap``, ``vmin``, ``vmax``) are forwarded to
    ``plt.scatter``. The ``color`` keyword that ``FacetGrid.map`` injects is
    dropped because it cannot be combined with ``c``.
    """
    kwargs.pop("color", None)
    plt.scatter(x, y, c=c, s=1.25, **kwargs)


# Use one colormap and one value range for every facet so that colors are
# comparable across years (otherwise each panel is scaled to its own min/max).
color_scale = dict(
    cmap=sns.color_palette("viridis", as_cmap=True),
    vmin=features_all_years["yield_prediction"].min(),
    vmax=features_all_years["yield_prediction"].max(),
)

g = sns.FacetGrid(
    features_all_years,
    col="year",
    col_wrap=4,
    height=5,
    aspect=1,
)
g.map(scatter, "lon", "lat", "yield_prediction", **color_scale)
# The axes show position; the plotted quantity goes in the figure title.
g.set_axis_labels("Longitude", "Latitude")
plt.suptitle("Yield Prediction", y=1.02)
# save the figure and name the file so that it represents the model parameters that created the predictions
# plt.savefig(f'images/{feature_file_name}_all_predictions.jpg', dpi=300)

Plot the model's predictions summarized to district level. In this visualization, we choose a specific year to examine rather than visualizing all years in one figure. Visualizing the predictions summarized to district level is interesting because the crop data provided by the Zambia Statistics Agency is at the district level, and therefore it is easier to compare our model results to those ground-truth values when they are summarized to district level as well. Furthermore, our model's crop predictions for the years 2020 and 2021 might be more valuable when summarized to district level if Zambian governments, policy-makers, farmers, and researchers wish to use this data to determine crop imports, exports, and storage according to district summaries.

In [None]:
# Mean predicted yield for every district/year combination, indexed by district.
district_year_mean = (
    features_all_years
    .groupby(['district', 'year'])['yield_prediction']
    .mean()
)
features_all_years_summary = district_year_mean.reset_index().set_index('district')

In [None]:
# Join the country shapefile (`country_shp`) to the summarized predictions on the
# district index so the districts can be mapped, then reset the index so that
# `district` becomes a regular column again.
# NOTE(review): this cell overwrites its own input and is therefore not idempotent —
# after `reset_index()` a second run would join on the positional index instead of
# on district. Re-run the previous cell first if this one needs to be repeated.
features_all_years_summary = features_all_years_summary.join(country_shp).reset_index()

Now that the data has been summarized from points to districts, the geometries are polygons. There is still a row for each district for each year.

In order to change the year visualized, simply change the year in the following code and re-run the chunk.

In [None]:
# The joined summary may be a plain pandas DataFrame (the join starts from a
# non-spatial frame), and only a GeoDataFrame understands `.plot(column=...)`.
# Wrapping is harmless if it already is a GeoDataFrame.
gpd.GeoDataFrame(
    features_all_years_summary[features_all_years_summary.year == 2020],
    geometry="geometry",
).plot(column = "yield_prediction")

Plot a boxplot for each year to visualize the range and quantile distribution of each year's crop predictions, summarized to district level. This enables us to identify years with exceptional disparities between the predicted yields by district. It also allows us to identify years that have many outliers.

In [None]:
# One box per year showing the spread of the district-level predicted yields.
fig, ax = plt.subplots(figsize=(10, 5))
sns.boxplot(x="year", y="yield_prediction", data=features_all_years_summary, ax=ax)
ax.set_xlabel("Year", fontsize=15)
ax.set_ylabel("Predicted Yield", fontsize=15)

Visualize the total crop yield predictions by year. This bar chart shows the sum of all the district crop yields.

In [None]:
# One bar per year: the sum of the district-level predicted yields.
fig, ax = plt.subplots(figsize=(10, 5))
sns.barplot(x="year", y="yield_prediction", data=features_all_years_summary, estimator=sum, ax=ax)

## Yield and Residual Plots

Create a dataframe of residuals called `residual_df` from the `features_summary` dataframe. Note that we are _not_ using the predicted crop yields for _all_ years for these residuals, but rather the ground-truth crop yields for just the years through 2018.

The residuals give us an idea of the amount of uncertainty that is present in our model. By demeaning the yields and predictions by district, we remove the variation across space and can better assess our model's performance and uncertainty over time.

In [None]:
# Model inputs for the district-level summary (ground-truth years only).
x_all = features_summary.drop(drop_cols, axis = 1)

log_yield = np.log10(features_summary.yield_mt.to_numpy() + 1)
# Negative predictions are clipped to zero.
prediction = np.maximum(ridge_cv_random.predict(x_all), 0)

# Every column is taken positionally (`to_numpy`) so the rows stay aligned even
# if `features_summary` does not carry a default 0..n-1 index.
residual_df = pd.DataFrame({
    "yield_mt": features_summary.yield_mt.to_numpy(),
    "log_yield": log_yield,
    "prediction": prediction,
    "residual": log_yield - prediction,
    "year": features_summary.year.to_numpy(),
    "district": features_summary.district.to_numpy(),
})
# join the district geometries
residual_df = residual_df.join(country_shp, how = "left", on = "district")

# demean by location so we can analyze the data over time
residual_df["district_yield_mean"] = residual_df.groupby('district')['log_yield'].transform('mean')
residual_df["district_prediction_mean"] = residual_df.groupby('district')['prediction'].transform('mean')
residual_df["demean_yield"] = residual_df["log_yield"] - residual_df["district_yield_mean"]
residual_df["demean_prediction"] = residual_df["prediction"] - residual_df["district_prediction_mean"]
# geopandas is imported under the alias `gpd` in this notebook.
residual_gdf = gpd.GeoDataFrame(residual_df)

residual_gdf.head(3)

Visualize the distribution of the log-transformed ground-truth crop yields by year (through 2018) with a boxplot.

In [None]:
# One box per year showing the spread of the log-transformed ground-truth yields.
fig, ax = plt.subplots(figsize=(6, 5))
sns.boxplot(x="year", y="log_yield", data=residual_df, ax=ax)
ax.set_xlabel("Year", fontsize=15)
ax.set_ylabel("Log Yield", fontsize=15)

Visualize the sum of the log-transformed ground-truth crop yields by year with a bar plot.

In [None]:
# One bar per year: the sum of the log-transformed ground-truth yields.
fig, ax = plt.subplots(figsize=(6, 5))
sns.barplot(x="year", y="log_yield", data=residual_df, estimator=sum, ax=ax)

Visualize the ground-truth crop yields (`yield_mt`) by year as histograms to determine how they are distributed.

In [None]:
# One histogram panel per year of the raw ground-truth yields.
# (Add `col_wrap=3` to the grid to wrap the panels onto several rows.)
g = sns.FacetGrid(residual_gdf, col="year", height=4, aspect=1)
g.map(sns.histplot, "yield_mt", bins=20)
g.set_axis_labels("Yield (MT)")

Visualize the log-transformed crop yields by year as histograms to compare how they are distributed after the transformation.

In [None]:
# One histogram panel per year of the log-transformed ground-truth yields.
# (Add `col_wrap=3` to the grid to wrap the panels onto several rows.)
g = sns.FacetGrid(residual_gdf, col="year", height=4, aspect=1)
g.map(sns.histplot, "log_yield", bins=20)
g.set_axis_labels(r"$\log_{10}(1 + Crop Yield)$")

#### Crop prediction histogram

In [None]:
# One histogram panel per year of the model's predictions.
# (Add `col_wrap=3` to the grid to wrap the panels onto several rows.)
g = sns.FacetGrid(residual_gdf, col="year", height=4, aspect=1)
g.map(sns.histplot, "prediction", bins=20)
g.set_axis_labels(r"Crop yield predictions")

#### Residual histogram

In [None]:
# One histogram panel per year of the residuals (log yield minus prediction).
# (Add `col_wrap=3` to the grid to wrap the panels onto several rows.)
g = sns.FacetGrid(residual_gdf, col="year", height=4, aspect=1)
g.map(sns.histplot, "residual", bins=20)
g.set_axis_labels(r"Residuals")

In [None]:
# Smallest residual (the largest over-prediction).
residual_gdf["residual"].min()

In [None]:
# Largest residual (the largest under-prediction).
residual_gdf["residual"].max()

#### Log crop yield vs residuals

In [None]:
# One scatter panel per year: log-transformed yield (x) against the residual (y).
# (Add `col_wrap=3` to the grid to wrap the panels onto several rows.)
g = sns.FacetGrid(residual_gdf, col="year", height=4, aspect=1)
g.map(sns.scatterplot, "log_yield", "residual")
g.set_axis_labels(r"$\log_{10}(1 + Crop Yield)$")

#### District residuals 

In [None]:
def plot_residual_maps(years, figsize):
    """Draw one district-level residual map per year, side by side.

    All panels share the same diverging color scale (-0.4 to 0.4, "BrBG") so
    the years are directly comparable. Returns the created figure.
    """
    fig, axes = plt.subplots(nrows=1, ncols=len(years), figsize=figsize)
    for ax, yr in zip(np.atleast_1d(axes), years):
        residual_gdf[residual_gdf.year == yr].plot(
            ax=ax,
            column="residual",
            legend=True,
            norm=colors.Normalize(vmin=-0.4, vmax=0.4),
            cmap="BrBG",
        )
        ax.set_title(f"{yr} Residuals")
    return fig


# The 2014 and 2015 maps are only drawn for Landsat 8.
# NOTE(review): presumably the other satellite has no imagery for those years — confirm.
if satellite == 'landsat-8-c2-l2':
    plot_residual_maps([2014, 2015], figsize=(13, 5))

fig = plot_residual_maps([2016, 2017, 2018], figsize=(20, 5))

caption = "A positive value is an underestimated prediction (the prediction is lower than the actual yield), a negative value is an over estimated prediction"
plt.figtext(0.5, 0.01, caption, wrap=True, horizontalalignment='center', fontsize=12)


#### Difference from the mean

In [None]:
# One scatter panel per year: demeaned yield (x) against demeaned prediction (y).
# (Add `col_wrap=3` to the grid to wrap the panels onto several rows.)
g = sns.FacetGrid(residual_gdf, col="year", height=4, aspect=1)
g.map(sns.scatterplot, "demean_yield", "demean_prediction")
g.set_axis_labels('Difference from Yield Mean', 'Difference from Prediction Mean')

In [None]:
# Pooled (all years) scatter of demeaned truth against demeaned predictions.
fig, ax = plt.subplots(figsize= (6, 5))
# 45-degree reference line marking perfect agreement.
ax.axline([-.2, -.2], [.2, .2], c = "k")
ax.scatter(residual_gdf.demean_yield, residual_gdf.demean_prediction)
ax.set_title("Demeaned truth and predictions by district")
ax.set_xlabel('Difference from Yield Mean')
ax.set_ylabel('Difference from Predictions Mean')

r_squared = r2_score(residual_gdf["demean_yield"], residual_gdf["demean_prediction"])
ax.text(
    -0.2,
    .18,
    s=f"Demeaned R$^2$ = {r_squared:0.2f}",
    fontsize=15,
    fontweight="bold",
)
# NOTE(review): `feature_file_name` is not defined in the visible part of this notebook.
plt.savefig(f'images/{feature_file_name}_demean.jpg', dpi=300)

In [None]:
# Per-year fit of the demeaned predictions to the demeaned ground truth.
# The upper bound comes from the data so that the final ground-truth year is
# included (a hard-coded `range(..., 2018)` stops at 2017).
# NOTE(review): the first year (`year_start`) is skipped, as in the original — confirm intent.
last_year = int(residual_gdf.year.max())
for yr in range(year_start + 1, last_year + 1):
    year_rows = residual_gdf[residual_gdf.year == yr]
    r_squared = r2_score(year_rows["demean_yield"], year_rows["demean_prediction"])
    pearson_r = pearsonr(year_rows["demean_yield"], year_rows["demean_prediction"])

    print(yr, f"    R^2: {r_squared:.2f}\n",
          f"Pearson's r: {pearson_r[0]:.2f}\n",
          sep = "")

# Pooled metrics over all years; `r_squared` and `pearson_r` keep these values afterwards.
r_squared = r2_score(residual_gdf["demean_yield"], residual_gdf["demean_prediction"])
pearson_r = pearsonr(residual_gdf["demean_yield"], residual_gdf["demean_prediction"])
print(f"All     R^2: {r_squared:.2f}\n",
      f"Pearson's r: {pearson_r[0]:.2f}", sep = "")

In [None]:
# Squared Pearson correlation, rounded to two decimals. `pearson_r` holds the
# pooled (all-years) result assigned at the end of the previous cell.
r2 = round(pearson_r[0] ** 2, 2)
r2

#### Join residuals to the features for _all_ years to visualize the residuals of the features before they were summarized to district level.

In [None]:
# Attach the residual columns to the district/year prediction summary. The
# residual frame's own geometry column is dropped first.
join_keys = ['district', 'year']
residuals_indexed = residual_df.drop(columns='geometry').set_index(join_keys)

complete_df = (
    features_all_years_summary
    .set_index(join_keys)
    .join(residuals_indexed)
    .reset_index()
)

complete_df.head(3)

In [None]:
fig, ax1 = plt.subplots(figsize=(10, 5))

# Reshape to long format using only the two columns being compared. Melting
# just these keeps the value column numeric instead of mixing in text and
# geometry columns that are filtered out again afterwards.
tidy = (
    complete_df
    .melt(id_vars='year', value_vars=['yield_prediction', 'log_yield'])
    .rename(columns=str.title)
)
sns.barplot(x='Year', y='Value', hue='Variable', data=tidy, ax=ax1, ci = None)
sns.despine(fig)

# Map each legend entry by name so the display labels cannot be swapped if the
# order of the plotted variables changes.
display_names = {'yield_prediction': 'Predicted Yield', 'log_yield': 'Observed Yield'}
h, l = ax1.get_legend_handles_labels()
ax1.legend(h, [display_names.get(name, name) for name in l], loc='lower left')

# NOTE(review): `feature_file_name` is not defined in the visible part of this notebook.
plt.savefig(f'images/{feature_file_name}_yield_pred.jpg', dpi=300)

In [None]:
# One bar per year: the sum of the district-level predicted yields in `complete_df`.
fig, ax = plt.subplots(figsize=(10, 5))
sns.barplot(x="year", y="yield_prediction", data=complete_df, estimator=sum, ax=ax)

### Congratulations on completing this analysis!