In [1]:
import pandas as pd

# Read the One Acre Fund maize survey (2016-2022) from the working directory
# into a DataFrame, then show the first five rows as the cell's output.
df = pd.read_csv(
    filepath_or_buffer='./One_Acre_Fund_MEL_maize_survey_data_2016-2022.csv',
)
df.head()

Unnamed: 0,year,country,iso,season,cz,lon,lat,strat,avg_season_gdd,avg_season_ai,avg_season_tavg,season_prec,season_prec_1,season_prec_2,season_prec_3,elev,twi,soil_rzpawhc,soil_clay,soil_pH,soil_orgC,soil_ECEC,plant_date,harvest_date,plant_date_dev,plant_doy,pl_m2,row_spacing,hybrid,hyb_type,hyb_mat,hyb_yor,hyb_tol_mln,hyb_tol_msv,hyb_tol_gls,hyb_tol_nclb,hyb_tol_rust,hyb_tol_ear_rot,N_kg_ha,P_kg_ha,K_kg_ha,compost,comp_kg_ha,manure,fert_in_hole,lime_kg_ha,weeding,pesticide,disease,pest,striga,water_excess,yield_kg_ha
0,2016,kenya,KEN,first season,7701,34.067,0.227,01_KEN7701S1,2509.099854,0.998492,21.275661,677.039708,228.759519,365.227443,83.052746,1280,8.322142,120.0,34.0,6.3,11.182494,13.879732,2016-04-13,,22.0,104.0,,70.666656,True,,2.0,2004.0,False,False,False,False,False,False,51.89205,24.7105,0.0,False,0.0,,True,0.0,,,False,False,,False,1462.951369
1,2017,kenya,KEN,first season,7701,34.1,0.238,01_KEN7701S1,2552.900146,1.038017,21.507408,760.532583,154.624451,464.134379,141.773752,1288,10.117007,120.0,38.0,5.9,12.463738,12.463738,2017-03-31,2017-07-14,8.0,90.0,2.736283,68.666664,True,,2.0,2004.0,False,False,False,False,False,False,51.89205,24.7105,0.0,False,0.0,,True,0.0,False,True,False,True,,False,2817.771269
2,2019,kenya,KEN,first season,7701,34.082,0.247,01_KEN7701S1,2580.099854,1.008526,21.651323,724.153837,181.67682,359.354439,183.122578,1243,7.909871,120.0,33.0,6.2,10.023176,10.023176,2019-04-15,2019-08-05,23.0,105.0,4.541739,72.666664,True,,2.0,2004.0,False,False,False,False,False,False,27.873444,13.83788,0.0,False,0.0,,True,0.0,False,False,False,True,,False,0.0
3,2020,kenya,KEN,first season,7701,34.098,0.232,01_KEN7701S1,2562.800049,1.018445,21.55979,998.342599,379.451999,428.776234,190.114366,1271,8.969049,120.0,32.0,6.0,10.023176,10.023176,2020-03-28,2020-07-16,6.0,88.0,3.986111,,True,3_way_hyb,1.0,2003.0,False,False,True,False,False,True,44.4789,49.421,0.0,True,,True,False,0.0,True,False,False,True,True,False,4788.722787
4,2020,kenya,KEN,first season,7701,34.094,0.272,01_KEN7701S1,2612.800049,1.023641,21.824339,998.342599,379.451999,428.776234,190.114366,1226,6.008638,120.0,33.0,5.9,12.463738,11.182494,2020-02-05,2020-08-04,-46.0,36.0,2.277778,,True,,2.0,2004.0,False,False,False,False,False,False,51.89205,24.7105,0.0,False,,,False,0.0,False,,False,False,True,False,2193.982268


In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Train a random-forest regressor to predict maize yield (kg/ha) from the
# survey features, report hold-out metrics, and plot the top feature
# importances.

# Load the dataset (re-read here so this cell does not rely on earlier cells).
df = pd.read_csv('./One_Acre_Fund_MEL_maize_survey_data_2016-2022.csv')

TARGET_COL = 'yield_kg_ha'
# 'Unnamed: 0' is the row index that was written into the CSV; it is a row
# identifier, not a predictor, so it must not be fed to the model.
NON_FEATURE_COLS = ['Unnamed: 0']
# Raw date strings. One-hot encoding them creates close to one dummy column per
# distinct date, which carries no generalisable signal and makes the forest
# extremely slow to fit.
DATE_COLS = ['plant_date', 'harvest_date']

# Rows without a measured yield cannot be used for supervised training or
# evaluation. `df` itself is left untouched.
labelled_df = df.dropna(subset=[TARGET_COL])

# Separate features and target variable
y = labelled_df[TARGET_COL]
X = labelled_df.drop(columns=[TARGET_COL] + NON_FEATURE_COLS, errors='ignore')

# Replace the raw date strings with a numeric season length. The planting date
# itself appears to be covered already by the `year` and `plant_doy` columns.
if set(DATE_COLS) <= set(X.columns):
    plant_ts = pd.to_datetime(X['plant_date'], errors='coerce')
    harvest_ts = pd.to_datetime(X['harvest_date'], errors='coerce')
    # Unparseable or missing dates become NaN and are mean-imputed below.
    X = X.assign(growing_days=(harvest_ts - plant_ts).dt.days)
X = X.drop(columns=DATE_COLS, errors='ignore')

# Identify categorical and numerical columns.
# 'number' catches every numeric width (int32, float32, ...), not only the
# 64-bit ones; pandas does not treat bool as 'number', so flags stay categorical.
categorical_cols = X.select_dtypes(include=['object', 'bool']).columns
numerical_cols = X.select_dtypes(include='number').columns

# Preprocessing for numerical data: fill gaps with the column mean.
numerical_transformer = SimpleImputer(strategy='mean')

# Preprocessing for categorical data: fill gaps with the mode, then one-hot
# encode. Categories unseen during fit are encoded as all zeros.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Bundle preprocessing for numerical and categorical data.
# verbose_feature_names_out=False keeps plain column names (no 'num__'/'cat__'
# prefixes) in the importance table.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ],
    verbose_feature_names_out=False,
)

# Define the model. n_jobs=-1 trains the trees on all cores; with a fixed
# random_state the fitted forest is the same regardless of n_jobs.
model = RandomForestRegressor(n_estimators=100, random_state=0, n_jobs=-1)

# Bundle preprocessing and modeling code in a pipeline so that imputation and
# encoding are learned from the training split only.
clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('model', model)
                     ])

# Split data into train and test sets (80/20, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Fit the model
clf.fit(X_train, y_train)

# Predict on the test set
y_pred = clf.predict(X_test)

# Calculate performance metrics on the held-out data
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Get feature importances
feature_importances = clf.named_steps['model'].feature_importances_

# Ask the fitted preprocessor for the output column names instead of rebuilding
# them by hand: this stays aligned with the model input even when the mean
# imputer drops a numeric column that was entirely missing in the training data.
all_feature_names = clf.named_steps['preprocessor'].get_feature_names_out()

# Create a DataFrame for feature importances, most important first
feature_importances_df = (
    pd.DataFrame({'feature': all_feature_names, 'importance': feature_importances})
    .sort_values(by='importance', ascending=False)
)

# Plot feature importances
fig, ax = plt.subplots(figsize=(10, 8))
sns.barplot(x='importance', y='feature', data=feature_importances_df.head(20), ax=ax)
ax.set_title('Top 20 Feature Importances for Predicting Yield')
ax.set_xlabel('importance')
ax.set_ylabel('feature')
plt.show()

mae, mse, r2, feature_importances_df.head(20)

KeyboardInterrupt: 