# Data Prep for Explainable Boosting Machine (EBM)

## Set Up

In [1]:
# load libraries
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

# read the raw data CSV
raw_df = pd.read_csv('../../data/raw/rawdata_new.csv')

# all preprocessing is done on a deep copy so `raw_df` itself stays untouched
processed_df = raw_df.copy(deep=True)

In [2]:
# dimensions of the working copy as (rows, columns), straight after loading
processed_df.shape

(105, 227)

## Prep predictors

In [3]:
# list the columns that contain at least one missing value
has_na = processed_df.isna().any(axis=0)
print(processed_df.columns[has_na])

Index(['total_agr_land', 'workforce', 'eco_proof', 'organic', 'farmtype',
       'age', 'legum_eff', 'conc_eff', 'add_eff', 'lact_eff',
       ...
       'net_name10_oth', 'net_name2_imp', 'net_name3_imp', 'net_name4_imp',
       'net_name5_imp', 'net_name6_imp', 'net_name7_imp', 'net_name8_imp',
       'net_name9_imp', 'net_name10_imp'],
      dtype='object', length=131)


In [4]:
# drop every column that has at least one NA value
na_cols = processed_df.columns[processed_df.isna().any()]
processed_df = processed_df.drop(columns=na_cols)

In [5]:
# dimensions after removing every column that contained an NA value
processed_df.shape

(105, 96)

In [6]:
# the ID column is removed from the predictors
processed_df = processed_df.drop(columns=['id'])

In [7]:
# dimensions after dropping `id` (expect one column fewer)
processed_df.shape

(105, 95)

In [8]:
# Convert the survey date into a numeric predictor: whole days elapsed since
# March 1, 2019 (the paper says the survey began in March).
processed_df['date'] = pd.to_datetime(processed_df['date'])

# `date_diff` is a timedelta column; it is kept as a named column because a later
# cell drops both `date` and `date_diff` explicitly.
processed_df['date_diff'] = processed_df['date'] - pd.to_datetime('March 1, 2019')

# vectorized `.dt.days` accessor instead of looping over Timedelta objects in Python
processed_df['days_since_first_survey_completed'] = processed_df['date_diff'].dt.days

In [9]:
# dimensions after adding `date_diff` and `days_since_first_survey_completed` (expect two columns more)
processed_df.shape

(105, 97)

In [10]:
# `date` and `date_diff` were only needed to derive the day count, so remove them
intermediate_date_cols = ['date', 'date_diff']
processed_df = processed_df.drop(columns=intermediate_date_cols)

In [11]:
# dimensions after dropping `date` and `date_diff` (expect two columns fewer)
processed_df.shape


(105, 95)

## Prep outcome

In [12]:
# Outcome: share of applicable measures that were implemented.
# A value of 1 in a measure column counts as implemented; a value of 3 is
# excluded from the applicable count.
implemented_cols = [
    'legum', 'conc', 'add', 'lact', 'breed', 'covman', 'comp',
    'drag', 'cov', 'plough', 'solar', 'biog', 'ecodr',
]
measures = processed_df[implemented_cols]

# row-wise counts across the measure columns
processed_df['num_implemented'] = measures.eq(1).sum(axis=1)
processed_df['num_applicable'] = measures.ne(3).sum(axis=1)

# NOTE(review): a row where every measure equals 3 gives 0 / 0 = NaN here
processed_df['prop_implemented'] = processed_df['num_implemented'].div(processed_df['num_applicable'])

In [13]:
# dimensions after adding `num_implemented`, `num_applicable` and `prop_implemented` (expect three columns more)
processed_df.shape

(105, 98)

In [14]:
# keep only `prop_implemented` as the outcome: remove the individual measure
# columns and the two helper counts in a single drop
helper_count_cols = ['num_implemented', 'num_applicable']
processed_df = processed_df.drop(columns=[*implemented_cols, *helper_count_cols])

In [15]:
# dimensions after dropping the measure columns and the two helper counts
processed_df.shape

(105, 83)

In [16]:
# persist the processed dataset; the row index is written as the first column
# (pandas' default, spelled out here so readers of the file know to expect it)
processed_df.to_csv('../../data/processed/ebm_data.csv', index=True)

## Split dataset

In [17]:
# load libraries
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import Normalizer, OrdinalEncoder

In [18]:
# The outcome is the last column of `processed_df`; everything before it is a predictor.
predictors = processed_df.iloc[:, :-1]
outcome = processed_df.iloc[:, -1]
X, y = predictors.to_numpy(), outcome.to_numpy()

In [19]:
# hold out 10% of the rows as a test set; the fixed seed makes the split reproducible
split_parts = train_test_split(X, y, test_size=0.1, random_state=2021)
X_train, X_test, y_train, y_test = split_parts

## Encode categorical feature

In [20]:
# Ordinal-encode the categorical column at position 22 (region, per the original note).
# The encoder is fitted on the training rows only and then reused on the test rows.
region_idx = [22]  # indexing with a list keeps the slice 2-D, so no reshape is needed
encoder = OrdinalEncoder()
X_train[:, region_idx] = encoder.fit_transform(X_train[:, region_idx])
X_test[:, region_idx] = encoder.transform(X_test[:, region_idx])

## Normalize all features

In [22]:
# scale features
# sklearn's `Normalizer` rescales each *row* (sample) to unit norm and learns
# nothing in `fit`, so it does not put the features on a common scale.
# `StandardScaler` works column-wise: each feature's mean and standard deviation
# are learned from the training set only and then applied to both splits.
# (Swap in `MinMaxScaler` if a [0, 1] range per feature is preferred.)
normalizer = StandardScaler()  # variable name kept in case later cells reference it
normalizer.fit(X_train)
X_train_norm = normalizer.transform(X_train)
X_test_norm  = normalizer.transform(X_test)