In [1]:
# Supress Warnings
import warnings
warnings.filterwarnings('ignore')
import random
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from torch.nn.utils import clip_grad_norm_
import torch.optim as optim
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error, explained_variance_score
from sklearn.model_selection import cross_validate

In [2]:
dftest1 = pd.read_csv("vh_vv_data_new_20.csv").drop("Unnamed: 0", axis =1)
dftest2 = pd.read_csv("vh_vv_data_new_20_100.csv").drop("Unnamed: 0", axis =1)
dftest3 = pd.read_csv("vh_vv_data_new_100_200.csv").drop("Unnamed: 0", axis =1)
dftest4 = pd.read_csv("vh_vv_data_new_200_225.csv").drop("Unnamed: 0", axis =1)
dftest5 = pd.read_csv("vh_vv_data_new_225_300.csv").drop("Unnamed: 0", axis =1)
dftest6 = pd.read_csv("vh_vv_data_new_300_350.csv").drop("Unnamed: 0", axis =1)
dftest7 = pd.read_csv("vh_vv_data_new_350_400.csv").drop("Unnamed: 0", axis =1)
dftest8 = pd.read_csv("vh_vv_data_new_400_450.csv").drop("Unnamed: 0", axis =1)
dftest9 = pd.read_csv("vh_vv_data_new_450_500.csv").drop("Unnamed: 0", axis =1)
dftest10 = pd.read_csv("vh_vv_data_new_500_557.csv").drop("Unnamed: 0", axis =1)
dftest = pd.concat([dftest1,dftest2,dftest3,dftest4, dftest5,dftest6, dftest7, dftest8, dftest9,dftest10])

### Prepare data (modified to select 550 random points for CV)

In [3]:
def clean_from_df_pad(string):
    """
    takes in each cell as a string and replaces it with an array of float value. 
    Also pads the sequence to length 26
    """
    splitted = string.split(" ")
    result = []
    for i in splitted:
        number = float(i.replace(",","").replace("[", "").replace("]", ""))
        result.append(number)
    if len(result) !=26:
        missing = 26-len(result)
        comp = [0] * missing
        result = result + comp
    return np.asarray(result)

In [4]:
# read in those dataframes again before this
dftest["vv_list"] = dftest.vv_list.apply(clean_from_df_pad)
dftest["vh_list"] = dftest.vh_list.apply(clean_from_df_pad)
dftest['vv/vh_list'] = dftest['vv/vh_list'].apply(clean_from_df_pad)

In [5]:
# add a climate index called rvi
def calculate_rvi(vv, vh):
    """
    takes in a list of vvs and a list of vhs, return a list of their rvis
    dop = vv/(vv+vh)
    m = 1-dop
    m = np.sqrt(m)
    power_func = 4*vh/(vv+vh)
    rvi = m*power_func
    """
    if len(vv)!= len(vh):
        return None
    else:
        rvi_lst = []
        for i in range(len(vv)):
            rvi = np.sqrt(1-vv[i]/(vv[i]+vh[i])) * 4*vh[i]/(vv[i]+vh[i])
            rvi_lst.append(rvi)
    return rvi_lst

calculate_rvi_func = np.vectorize(calculate_rvi)

In [6]:
dftest["rvi"] = calculate_rvi(list(dftest.vv_list), list(dftest.vh_list))

In [7]:
# length is equal
def df_to_arr_equal_len (df):
    """
    takes in a df, which will be data, and transforms it into a 3-d array with its original size
    """
    # loop through every row in df
    res = []
    for i in range(df.shape[0]):
        row = list(df.iloc[i,:]) # each row of df is turned into a list of three elements, where each element is an array
        new_row = [] # one day, one location, three variables
        for var in row:
            new_row.append(var)
        res.append(new_row)
    return np.asarray(res)

In [8]:
arr = df_to_arr_equal_len(dftest)
np.isnan(arr).sum()

2504

In [9]:
# replace null rvi's with 0
arr = np.nan_to_num(arr, nan=0)
np.isnan(arr).sum()

0

In [10]:
arr.shape

(557, 4, 26)

In [12]:
# get gpu
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)  

cuda:0


In [77]:
def cross_validation(arr, batch_size, num_epochs, print_freq, model_type):
    """
    takes in a model and data of size [557,26,4]. batch_size is 50, 10 , or None
    Applies 11-fold cross validation and returns criterion for each fold
    check that model has batch_size 10 or 50
    first randomly select 550 samples from the whole sample set
    divide these 550 samples into 11 parts
    for i in range(0,11):
    testing set would be the ith part, which has length 50
    train the model on the rest parts
    do the test
    and print the results
    """
    # prepare dataset
    cv_indices = random.sample(range(0,557), 550) # already in random order
    cv_input = arr
    cv_target = np.asarray(pd.read_csv("Crop_Yield_Data_challenge_2.csv")["Rice Yield (kg/ha)"])
    
    # get it into 11 parts
    cv_indices_grouped = np.asarray(cv_indices).reshape((11,50)) # each row is a group
    
    #training_losses_lst = []
    testing_metrices_lst = []
    for i in range(0,11):
        print("Testing Fold: ", i)
        # create a new model
        if model_type == "RNN":
            model = MyRNN(input_size=4, hidden_size=64, num_layers=5, output_size=1, 
                          batch_size=50, nonlinearity="tanh", dropout = 0.1)
        if model_type =="GRU":
            model = GRU2(input_size=4, hidden_size=100, output_size = 1, num_layers=10, 
                 batch_first=True, batch_size =10)
        if model_type =="LSTM":
            model = MyLSTM(input_size=4, hidden_size=300, output_size =1, num_layers=8)
            
        model.to(device)
        criterion = nn.L1Loss() # training criterion
        
        # partition data and select group
        testing_input = cv_input[cv_indices_grouped[i]] # use the ith group as testing
        testing_target = cv_target[cv_indices_grouped[i]]
        training_input = cv_input[[j for j in cv_indices if j not in cv_indices_grouped[i]]] # use the rest 10 groups for training
        training_target = cv_target[[j for j in cv_indices if j not in cv_indices_grouped[i]]]
        
        # change them into datasets and create dataloaders
        testing_input = torch.from_numpy(testing_input).float().view(50,26,4)
        testing_target = torch.from_numpy(testing_target).float().view(50)
        testing_dataloader = DataLoader(MyDataset(testing_input, testing_target), batch_size = batch_size, shuffle=True)
        
        training_input = torch.from_numpy(training_input).float().view(500,26,4)
        training_target = torch.from_numpy(training_target).float().view(500)
        training_dataloader = DataLoader(MyDataset(training_input, training_target), batch_size = batch_size, shuffle=True)
        
        # train the model on the training set and test its performance on the testing set
        if batch_size != None:
            model, training_losses = train_model_batch(training_dataloader, model, num_epochs, print_freq, model_type)
            test_results = test_model_batch(testing_dataloader, model) # a list of 4 metrices
            testing_metrices_lst.append(test_results)
        
        else:
            model, training_losses = train_model_no_batch(training_input, training_target, model, num_epochs, print_freq)
            test_results = test_model_no_batch(testing_input, testing_target, model) # a list of 4 metrices
            testing_metrices_lst.append(test_results)
    print("Finished CV")
    return (testing_metrices_lst)

In [54]:
def train_model_batch(training_dataloader, model, num_epochs, print_freq, model_type):
    losses= []
    if model_type =="GRU":
        optimizer = optim.SGD(model.parameters(), lr=0.03, momentum = 0.9)
    if model_type =="RNN":
        optimizer = optim.Adam(model.parameters(), lr=0.008)

    for epoch in range(num_epochs):
        running_loss = 0.0
        for i, data in enumerate(training_dataloader, 0):
            inputs, labels = data[0].to(device), data[1].to(device)
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
            losses.append(running_loss/ len(training_dataloader))
        if epoch% print_freq ==1:
            print('Epoch [%d/%d], Loss: %.4f' % (epoch+1, num_epochs, running_loss / len(training_dataloader)))
    print("finished training")
    return (model, losses)

In [96]:
def train_model_no_batch(training_input, training_target, model, num_epochs, print_freq):
    """
    takes in the training input and target separtely, and trains the model. Used for LSTMbest
    """
    losses = []
    training_input, training_target = training_input.to(device), training_target.to(device)
    optimizer = optim.SGD(model.parameters(), lr=0.09, momentum=0.9)
    for epoch in range(num_epochs):
        optimizer.zero_grad()
        output_seq = model(training_input)
        loss = criterion(output_seq.squeeze(), training_target)
        losses.append(loss.item())
        loss.backward()
        clip_grad_norm_(model.parameters(), max_norm = 1.0, norm_type=2, error_if_nonfinite=False)
        optimizer.step()
        if epoch % print_freq == 1:
            print('Epoch [%d/%d], Loss: %.4f' % (epoch+1, num_epochs, loss.item()))
            #print('Epoch [{}/{}], Loss: {:.4f}'.format(epoch+1, num_epochs, loss.item()))
    print("Finished training")
    return (model, losses)

In [50]:
def test_model_batch(testing_dataloader, model):
    """
    Evaluate the model on the test dataset, return four metrices, all averaged across batches
    takes in a dataloader
    """
    total_explained_variance = 0.0
    total_MSE= 0.0
    total_MAE = 0.0
    total_r2 = 0.0
    for batch in testing_dataloader:
        inputs, targets = batch[0].to(device), batch[1].cpu().numpy()
        outputs = model(inputs)
        outputs = outputs.data.cpu().numpy()
        total_explained_variance += explained_variance_score(y_true=targets, y_pred=outputs)
        total_MSE += mean_squared_error(y_true=targets, y_pred=outputs)
        total_MAE += mean_absolute_error(y_true=targets,y_pred=outputs)
        total_r2 += r2_score(y_true=targets,y_pred=outputs)
    a = len(testing_dataloader)
    #print("Explained varaince, MSE, MAE, R2: \n")
    return ([total_explained_variance/a, total_MSE/a, total_MAE/a, total_r2/a])

In [79]:
def test_model_no_batch(testing_input, testing_target, model):
    """
    Evaluate the model on the test dataset, return four metrices, all averaged across batches
    takes in the inputs and targets separately
    """
    inputs = testing_input.to(device)
    targets = testing_target.cpu().numpy()
    outputs = model(inputs)
    outputs = outputs.data.cpu().numpy()
    total_explained_variance = explained_variance_score(y_true=targets, y_pred=outputs)
    total_MSE = mean_squared_error(y_true=targets, y_pred=outputs)
    total_MAE = mean_absolute_error(y_true=targets,y_pred=outputs)
    total_r2 = r2_score(y_true=targets,y_pred=outputs)
    #print("Explained varaince, MSE, MAE, R2: \n")
    return([total_explained_variance, total_MSE, total_MAE, total_r2])

---
### Create models
- Best RNN: RNN model 4. Hidden_size = 64, num_layers = 5, batch_size = 50, optimizer = Adam(learning rate = 0.008), dropout = 0.1. Copied code from notebook Imrpove_RNN

In [37]:
# define baseline RNN, with batch norm 
class MyRNN(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, output_size, batch_size, nonlinearity, 
                 dropout, bidirectional=False):
        super(MyRNN, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.batch_size = batch_size
        self.rnn = nn.RNN(input_size, hidden_size, num_layers, batch_first=True)
        self.bn = nn.BatchNorm1d(hidden_size)
        self.fc = nn.Linear(hidden_size, output_size)
    def forward(self, x):
        h0 = torch.randn(self.num_layers, self.batch_size, self.hidden_size).requires_grad_().to(x.device)
        out, _ = self.rnn(x, h0)
        out = self.bn(out[:, -1, :])
        out = self.fc(out)
        return out

In [38]:
RNN_cv_results = cross_validation(arr, batch_size=50, num_epochs=400, print_freq=100, model_type="RNN")

Testing Fold:  0
MyRNN(
  (rnn): RNN(4, 64, num_layers=5, batch_first=True)
  (bn): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (fc): Linear(in_features=64, out_features=1, bias=True)
)
Epoch [2/400], Loss: 6620.6317
Epoch [102/400], Loss: 682.0594
Epoch [202/400], Loss: 682.1132
Epoch [302/400], Loss: 682.2592
finished training
Testing Fold:  1
MyRNN(
  (rnn): RNN(4, 64, num_layers=5, batch_first=True)
  (bn): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (fc): Linear(in_features=64, out_features=1, bias=True)
)
Epoch [2/400], Loss: 6651.5147
Epoch [102/400], Loss: 693.0118
Epoch [202/400], Loss: 692.4015
Epoch [302/400], Loss: 692.6214
finished training
Testing Fold:  2
MyRNN(
  (rnn): RNN(4, 64, num_layers=5, batch_first=True)
  (bn): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (fc): Linear(in_features=64, out_features=1, bias=True)
)
Epoch [2/400], Loss: 6657.0917
Epoch [

In [58]:
print("RNN CV results")
RNNdf = pd.DataFrame(np.asarray(RNN_cv_results))
RNNdf.columns=["Explained variance", "MSE", "MAE", "R2"]
RNNdf

RNN CV results


Unnamed: 0,Explained variance,MSE,MAE,R2
0,0.0006913543,648062.0625,691.936951,-0.07878
1,0.0003179312,506082.53125,607.80127,-0.075194
2,0.0,717951.4375,749.620605,-0.099635
3,-2.384186e-07,516508.625,577.372498,-0.004436
4,0.0,603303.8125,640.863037,-0.116493
5,-7.748604e-06,689362.5625,708.264893,-0.19902
6,0.0005429983,579992.5,643.106506,-0.229817
7,8.517504e-05,956042.625,844.587341,-0.017233
8,-0.0005966425,690654.875,709.087036,-0.051448
9,0.0,638227.8125,699.486389,-0.000966


---
### Best GRU
GRU model 5. Hidden_size = 100, num_layers = 10, batch_size = 10, optimizer = SGD(learning rate = 0.01, momentum = 0.9), dropout = 0.1
- copied from notebook Improve_GRU
- **NOTE: CHANGED LEARNING RATE TO 0.03 TO GET FASTER RESULTS**

In [52]:
class GRU2(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, num_layers, 
                 batch_first, batch_size):
        super(GRU2, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.num_layers = num_layers
        self.batch_size = batch_size
        self.gru = nn.GRU(input_size, hidden_size, num_layers=num_layers, 
                  batch_first=True, dropout=0.1, bidirectional=False)
        self.fc = nn.Linear(hidden_size, output_size)
        self.bn = nn.BatchNorm1d(num_features=self.hidden_size, affine = False)
    def forward(self, x):
        batch_size = x.size(0)
        h0 = torch.randn(self.gru.num_layers, batch_size, self.gru.hidden_size).to(x.device)
        out, _ = self.gru(x, h0)
        out, _ = self.gru(x, h0)
        out = self.bn(out[:,-1, :])
        out = self.fc(out)
        return out

In [57]:
GRU_cv_results = cross_validation(arr, batch_size=10, num_epochs=600, print_freq=100, model_type="GRU")

Testing Fold:  0
Epoch [2/600], Loss: 6595.8692
Epoch [102/600], Loss: 5095.8206
Epoch [202/600], Loss: 3595.5766
Epoch [302/600], Loss: 2095.6776
Epoch [402/600], Loss: 920.1450
Epoch [502/600], Loss: 712.0618
finished training
Testing Fold:  1
Epoch [2/600], Loss: 6610.9317
Epoch [102/600], Loss: 5110.8831
Epoch [202/600], Loss: 3610.6392
Epoch [302/600], Loss: 2110.7402
Epoch [402/600], Loss: 917.1657
Epoch [502/600], Loss: 694.4841
finished training
Testing Fold:  2
Epoch [2/600], Loss: 6610.4758
Epoch [102/600], Loss: 5110.4272
Epoch [202/600], Loss: 3610.1832
Epoch [302/600], Loss: 2110.2845
Epoch [402/600], Loss: 906.9373
Epoch [502/600], Loss: 673.5804
finished training
Testing Fold:  3
Epoch [2/600], Loss: 6627.6890
Epoch [102/600], Loss: 5127.6405
Epoch [202/600], Loss: 3627.3964
Epoch [302/600], Loss: 2127.4977
Epoch [402/600], Loss: 931.4793
Epoch [502/600], Loss: 694.6684
finished training
Testing Fold:  4
Epoch [2/600], Loss: 6611.6348
Epoch [102/600], Loss: 5111.5863
Epo

In [59]:
print("GRU CV results")
GRUdf = pd.DataFrame(np.asarray(GRU_cv_results))
GRUdf.columns=["Explained variance", "MSE", "MAE", "R2"]
GRUdf

GRU CV results


Unnamed: 0,Explained variance,MSE,MAE,R2
0,-4.1e-05,468164.4625,561.378479,-0.266563
1,0.00019,618182.2125,643.558917,-0.119006
2,-1.2e-05,866154.7625,819.188965,-0.173293
3,8.5e-05,469954.7625,615.200354,-0.304278
4,-0.000163,822290.275,767.333105,-0.066486
5,-0.0001,755496.0875,734.756726,-0.410618
6,6.8e-05,692739.2875,690.345813,-0.157411
7,-5e-06,574712.09375,651.396497,-0.014103
8,-4.4e-05,775363.025,765.190955,-0.198012
9,-0.000144,543813.21875,621.243823,-0.048067


---
### Best LSTM
LSTM model 2. Hidden_size = 300, num_layers = 8, no batch norm layer, optimizer = SGD(learning rate = 0.03, momentum = 0.9), dropout = 0
- copied from notebook Improve_LSTM
- **NOTE: CHANGED LEARNING RATE FROM 0.03 TO 0.09 TO GET FASTER RESULTS**

In [65]:
class MyLSTM(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, num_layers):
        super(MyLSTM, self).__init__()
        self.hidden_size = hidden_size
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers=num_layers, batch_first=True)
        self.linear = nn.Linear(hidden_size, output_size)
    def forward(self, input):
        batch_size = input.size(0)
        h0 = torch.zeros(self.lstm.num_layers, batch_size, self.hidden_size).to(device)
        c0 = torch.zeros(self.lstm.num_layers, batch_size, self.hidden_size).to(device)
        h0, c0 = h0.to(device), c0.to(device)
        lstm_out, _ = self.lstm(input, (h0, c0))
        output = self.linear(lstm_out[:, -1, :])
        return output

In [98]:
LSTM_cv_results = cross_validation(arr, batch_size=None, num_epochs=600, print_freq=100, model_type="LSTM")

Testing Fold:  0
Epoch [2/600], Loss: 6639.6626
Epoch [102/600], Loss: 5418.0366
Epoch [202/600], Loss: 3856.7141
Epoch [302/600], Loss: 2295.2795
Epoch [402/600], Loss: 870.6371
Epoch [502/600], Loss: 662.4599
Finished training
Testing Fold:  1
Epoch [2/600], Loss: 6609.5532
Epoch [102/600], Loss: 5376.8452
Epoch [202/600], Loss: 3815.4868
Epoch [302/600], Loss: 2254.0520
Epoch [402/600], Loss: 852.3378
Epoch [502/600], Loss: 680.7924
Finished training
Testing Fold:  2
Epoch [2/600], Loss: 6635.9175
Epoch [102/600], Loss: 5411.2812
Epoch [202/600], Loss: 3849.9290
Epoch [302/600], Loss: 2288.4946
Epoch [402/600], Loss: 872.9791
Epoch [502/600], Loss: 678.1746
Finished training
Testing Fold:  3
Epoch [2/600], Loss: 6636.3481
Epoch [102/600], Loss: 5414.5923
Epoch [202/600], Loss: 3853.2546
Epoch [302/600], Loss: 2291.8191
Epoch [402/600], Loss: 869.6761
Epoch [502/600], Loss: 667.5012
Finished training
Testing Fold:  4
Epoch [2/600], Loss: 6611.7305
Epoch [102/600], Loss: 5379.2334
Epo

In [99]:
print("LSTM CV results")
LSTMdf = pd.DataFrame(np.asarray(LSTM_cv_results))
LSTMdf.columns=["Explained variance", "MSE", "MAE", "R2"]
LSTMdf

LSTM CV results


Unnamed: 0,Explained variance,MSE,MAE,R2
0,0.0,817050.0625,816.748108,-0.000375
1,0.0,588380.25,633.319641,-0.211024
2,0.0,624038.875,660.234558,-0.004812
3,0.0,755863.375,767.54248,-0.003228
4,0.0,550588.375,602.386475,-0.195227
5,0.0,763971.9375,749.63501,-0.002836
6,1.192093e-07,534230.375,591.382996,-0.0061
7,0.0,641230.9375,667.585083,-0.103095
8,0.0,745144.75,746.799988,-0.034261
9,-1.192093e-07,513905.71875,583.262268,-0.016261


______
### Helper functions

In [14]:
class MyDataset(Dataset):
    def __init__(self, X, Y):
        self.X = X
        self.Y = Y
    def __len__(self):
        return len(self.X)
    def __getitem__(self, idx):
        x = self.X[idx]
        y = self.Y[idx]
        return x, y