<h1>Random Forest Regressor</h1>
<h2>Import</h2>

In [1]:
import glob
import math
from numba import jit
import numpy as np
import os
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score

<h2>SMAPE calculation</h2><br>
@param<br>
y_true = array of actual values<br>
y_pred = array of predicted values<br>

In [2]:
def smape_fast(y_true, y_pred):
    """Symmetric mean absolute percentage error, in percent (0 to 200).

    Computes ``200 / n * sum(|a - b| / (|a| + |b|))`` over paired elements
    ``a`` of ``y_true`` and ``b`` of ``y_pred``, where ``n`` is the length of
    ``y_true``. Pairs whose denominator is zero (both values are zero) add
    nothing to the sum but still count towards ``n``.

    The denominator uses absolute values, so negative inputs cannot produce
    negative terms and a pair such as (1, -1) is no longer skipped. For
    non-negative inputs the result is the same as with a plain ``a + b``
    denominator.

    Parameters
    ----------
    y_true : array of actual values (must expose ``.shape`` and indexing)
    y_pred : array of predicted values, at least as long as ``y_true``

    Returns
    -------
    The SMAPE value as a float.

    Raises
    ------
    ZeroDivisionError
        If ``y_true`` is empty.
    """
    n_samples = y_true.shape[0]
    total = 0
    for i in range(n_samples):
        actual = y_true[i]
        predicted = y_pred[i]
        denominator = abs(actual) + abs(predicted)
        if denominator == 0:
            # Both values are zero: the term is defined as 0 and skipped.
            continue
        total += abs(actual - predicted) / denominator
    total *= (200.0 / n_samples)
    return total

<h2>Evaluation printing format 1</h2><br>
@param<br>
array = target array to append<br>
type_eval = string of evaluation type<br>
eval_values = array of evaluation values<br><br>
Arrays of evaluation values for MA2, MA3 and MA4 respectively.<br>
Index 0 is 0 in all array because of the initialisation.<br>
Index 1 - 3 is MA2 DFma_1 which are MA2 without CD, with CD, and % improved respectively.<br>
Index 4 - 6 is MA2 DFma_2.<br>
...<br>
Index 16 - 18 is MA2 DFma_6.<br>
Index 19 - 21 is MA3 DFma_1.<br>
...<br><br>
Each DFma_X target has 3 values per MA in each type of evaluation.<br>
Each MA has 18 values in total (6 DFma_X targets &times; 3 values).
<h3>Thus,</h3>the formula for calculating the index to get the correct value is:<br>
<h4>array[3 * (which DFma_X in range [0, 5]) + 18 * (which MA_X in range [0, 2]) + 1]</h4>
<h4>array[3 * (which DFma_X in range [0, 5]) + 18 * (which MA_X in range [0, 2]) + 2]</h4>
<h4>array[3 * (which DFma_X in range [0, 5]) + 18 * (which MA_X in range [0, 2]) + 3]</h4>

In [3]:
def evaluation_print(array, type_eval, eval_values):
    """Append six summary rows (DFma_1 .. DFma_6) for one metric to ``array``.

    Each new row is ``'<type_eval> DFma_<n>'`` followed by nine values: for
    each of the three MA blocks, the three entries found at
    ``eval_values[3 * df + 18 * ma + 1 .. + 3]``.

    @param array: 2-D target array the rows are appended to
    @param type_eval: string naming the evaluation type
    @param eval_values: flat array of evaluation values (index 0 is unused)
    @return: a new array consisting of ``array`` plus the six rows
    """
    new_rows = []
    for df_idx in range(6):
        label = type_eval + ' DFma_' + str(df_idx + 1)
        values = []
        for ma_idx in range(3):
            base = 3 * df_idx + 18 * ma_idx
            values.extend(eval_values[base + offset] for offset in (1, 2, 3))
        # Joining the label with the numbers yields a string-typed row.
        new_rows.append(np.append(np.asarray([label]), values))
    return np.append(array, new_rows, axis=0)

<h2>Evaluation printing format 2</h2><br>
@param<br>
array = target array to append<br>
type_eval = string of evaluation type<br>
eval_values = array of evaluation values<br><br>
Instead of having multiple MAs, we have only 1 dataset for each DF.<br>
Index 0 is 0 in all array because of the initialisation.<br>
Index 1 - 3 is DF_1 which are without CD, with CD, and % improved respectively.<br>
Index 4 - 6 is DF_2.<br>
...<br>
Index 16 - 18 is DF_6.<br><br>
Each set of DF has 3 values in total.
<h3>Thus,</h3>the formula for calculating the index to get the correct value is:<br>
<h4>array[3 * (which DF_X in range [0, 5]) + 1]</h4>
<h4>array[3 * (which DF_X in range [0, 5]) + 2]</h4>
<h4>array[3 * (which DF_X in range [0, 5]) + 3]</h4>

In [4]:
def evaluation_print_original(array, type_eval, eval_values):
    """Append six summary rows (DF_1 .. DF_6) for one metric to ``array``.

    Each new row is ``'<type_eval> DF_<n>'`` followed by the three entries
    ``eval_values[3 * df + 1]``, ``[3 * df + 2]`` and ``[3 * df + 3]``.

    @param array: 2-D target array the rows are appended to
    @param type_eval: string naming the evaluation type
    @param eval_values: flat array of evaluation values (index 0 is unused)
    @return: a new array consisting of ``array`` plus the six rows
    """
    result = array
    for df_idx in range(6):
        start = 3 * df_idx
        triple = [eval_values[start + 1], eval_values[start + 2], eval_values[start + 3]]
        row = np.append(np.asarray([type_eval + ' DF_' + str(df_idx + 1)]), triple)
        result = np.append(result, [row], axis=0)
    return result

<h2>Evaluation printing format 3</h2><br>
@param<br>
array = target array to append<br>
type_eval = string of evaluation type<br>
eval_values = array of evaluation values<br><br>
This format is for modified lags, in which the response variable is always DFma_1 for the MA datasets or DF_1 for the dataset without smoothing.<br>
Index 0 is 0 in all array because of the initialisation.<br>
Index 1 - 3 is time horizon 1 week ahead which are without CD, with CD, and % improved respectively.<br>
Index 4 - 6 is time horizon 2 weeks ahead.<br>
...<br>
Index 16 - 18 is time horizon 6 weeks ahead.<br><br>
Each set of time horizon has 9 values in total.
<h3>Thus,</h3>the formula for calculating the index to get the correct value is:<br>
<h4>array[3 * (which time horizon is in range [0, 5]) + 18 * (which MA_X in range [0, 2]) + 1]</h4>
<h4>array[3 * (which time horizon is in range [0, 5]) + 18 * (which MA_X in range [0, 2]) + 2]</h4>
<h4>array[3 * (which time horizon is in range [0, 5]) + 18 * (which MA_X in range [0, 2]) + 3]</h4>

In [5]:
def evaluation_print_modified_lag(array, type_eval, eval_values):
    """Append six summary rows (1 .. 6 weeks ahead) for one metric to ``array``.

    Each new row is ``'<type_eval> <n>-week ahead'`` followed by nine values:
    for each of the three MA blocks, the three entries found at
    ``eval_values[3 * horizon + 18 * ma + 1 .. + 3]``.

    @param array: 2-D target array the rows are appended to
    @param type_eval: string naming the evaluation type
    @param eval_values: flat array of evaluation values (index 0 is unused)
    @return: a new array consisting of ``array`` plus the six rows
    """
    horizon = 0
    while horizon < 6:
        row = np.asarray([type_eval + ' ' + str(horizon + 1) + '-week ahead'])
        for ma_offset in (0, 18, 36):
            first = 3 * horizon + ma_offset + 1
            row = np.append(row, [eval_values[first],
                                  eval_values[first + 1],
                                  eval_values[first + 2]])
        array = np.append(array, [row], axis=0)
        horizon += 1
    return array

In [6]:
def evaluation_print_modified_lag_original(array, type_eval, eval_values):
    """Append six summary rows (1 .. 6 weeks ahead) for one metric to ``array``.

    Single-dataset variant: each new row is ``'<type_eval> <n>-week ahead'``
    followed by ``eval_values[3 * horizon + 1]``, ``[3 * horizon + 2]`` and
    ``[3 * horizon + 3]``.

    @param array: 2-D target array the rows are appended to
    @param type_eval: string naming the evaluation type
    @param eval_values: flat array of evaluation values (index 0 is unused)
    @return: a new array consisting of ``array`` plus the six rows
    """
    for week in range(1, 7):
        first = 3 * (week - 1) + 1
        label = np.asarray([type_eval + ' ' + str(week) + '-week ahead'])
        numbers = [eval_values[idx] for idx in (first, first + 1, first + 2)]
        row = np.append(label, numbers)
        array = np.append(array, [row], axis=0)
    return array

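<h2>Variables that you need to change before running the code</h2>
<b>province1</b> = province name as used in the directory names, e.g. 'Bangkok', 'NST' or 'Krabi'<br>
<b>province2</b> = province name as used in the data set file names, e.g. 'bangkok', '...nakhon...' or '...krabi...'<br>
<b>number of trees</b> = 10 (hard-coded as <code>n_estimators = 10</code> in each model cell)<br>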

In [40]:
# Province name as it appears in directory names
# (e.g. 'Data/<province1>/...' and 'Random Forest/<province1>/...').
province1 = 'Bangkok'
# Province name as it appears in file names
# (e.g. 'train_<province2>_dist_...' and 'RF_<province2>_...').
province2 = 'bangkok'

<h1>Normal Lags</h1>

- Predict DFma_1 to DFma_6<br>
- Predict DF_1 to DF_6

<h2>District level</h2>
For MAs (adjusted CD)

In [63]:
# District-level random forests on the moving-average datasets (adjusted CD).
#
# For every moving-average file MA2..MA4 and every target DFma_1..DFma_6, two
# forests are trained: one on columns 20-25 only ("without CD") and one that
# additionally uses columns 26-34 ("with CD"). Predictions, per-district
# metrics and an overall summary table are written as CSV files below
# 'Random Forest/<province1>/Normal Lags/Adjusted CD/'.
#
# Layout of the overall summary table (eval_array)
# row: head,
# RMSE (DFma_1 - DFma_6),
# MAE (DFma_1 - DFma_6),
# SMAPE (DFma_1 - DFma_6),
# R-squared (DFma_1 - DFma_6)
#
# col: head,
# MA2 (without CD, with CD, % improved),
# MA3 (without CD, with CD, % improved),
# MA4 (without CD, with CD, % improved)

eval_array = np.asarray([['Evaluation', 'MA2 without CD', 'MA2 with CD', 'MA2 % improved',
                         'MA3 without CD', 'MA3 with CD', 'MA3 % improved',
                         'MA4 without CD', 'MA4 with CD', 'MA4 % improved']])

# Flat metric accumulators. Index 0 is a dummy 0 from the initialisation;
# evaluation_print reads the real values starting at index 1.
rmse = np.zeros(1)
mae = np.zeros(1)
smape = np.zeros(1)
r2 = np.zeros(1)

dist_header = ['addrcode', 'RMSE without CD', 'RMSE with CD', '% improved RMSE',
               'MAE without CD', 'MAE with CD', '% improved MAE',
               'SMAPE without CD', 'SMAPE with CD', '% improved SMAPE',
               'R squared without CD', 'R squared with CD', '% improved R squared']
dist_array = np.asarray([dist_header])

# Feature columns, selected by position.
# Without CD:
# DFma_0 [col 20],
# DFma_wm1 [col 21],
# DFma_wm2 [col 22],
# DFma_wm3 [col 23],
# RF_wm6 [col 24],
# and LST_wm4 [col 25]
cols_withoutCD = [20, 21, 22, 23, 24, 25]
# With CD: the six columns above plus
# bin_pop9s [col 26],
# bowl_pop9s [col 27],
# bucket_pop9s [col 28],
# misc_short_pop9s [col 29],
# jar_pop9s [col 30],
# pottedplant_pop9s [col 31],
# tire_pop9s [col 32],
# misc_tall_pop9s [col 33],
# and total_pop9s [col 34]
cols_withCD = cols_withoutCD + [26, 27, 28, 29, 30, 31, 32, 33, 34]

# A flat list: a nested list here would create a one-level MultiIndex.
compare_columns = ['addrcode', 'Week', 'Year', 'actual', 'predicted']

# Starting from MA2 to MA4
for i in range(2, 5):
    # Get the input variables from CSV file
    train_file_dir = 'Data/' + province1 + '/Normal Lags/train_' + province2 + '_dist_total_mavg' + str(i) + '.csv'
    test_file_dir = 'Data/' + province1 + '/Normal Lags/test_' + province2 + '_dist_total_mavg' + str(i) + '.csv'

    df_train_dist = pd.read_csv(train_file_dir, header=0, skiprows=0)
    df_test_dist = pd.read_csv(test_file_dir, header=0, skiprows=0)

    # Make sure the output folder exists so a fresh run does not fail in to_csv.
    out_dir = 'Random Forest/' + province1 + '/Normal Lags/Adjusted CD/MA' + str(i)
    os.makedirs(out_dir, exist_ok=True)

    # The feature matrices do not depend on the target, so build them once per MA.
    train_features_withoutCD = df_train_dist.iloc[:, cols_withoutCD]
    train_features_withCD = df_train_dist.iloc[:, cols_withCD]

    test_features_withoutCD = df_test_dist.iloc[:, cols_withoutCD]
    test_features_withCD = df_test_dist.iloc[:, cols_withCD]

    dist_code = df_train_dist['addrcode'].unique()

    # Continue on DFma_1 to DFma_6
    for j in range(6):
        # Allocate the column of addrcode, week, year and actual values first.
        # Column 19 - j is expected to be DFma_(j + 1) (col 19 -> col 14).
        df_test_addrcode_week_year_dist = df_test_dist.iloc[:, [1, 2, 3, 19 - j]]

        # labels: response (target) variables from DFma_1 to DFma_6
        train_labels = np.array(df_train_dist['DFma_' + str(j + 1)])
        test_labels = np.array(df_test_dist['DFma_' + str(j + 1)])

        # Instantiate model with 10 decision trees
        rf_withoutCD = RandomForestRegressor(n_estimators = 10, random_state = 42)
        rf_withCD = RandomForestRegressor(n_estimators = 10, random_state = 42)

        # Train the model on training data
        rf_withoutCD.fit(train_features_withoutCD, train_labels)
        rf_withCD.fit(train_features_withCD, train_labels)

        # Use the forest's predict method on the test data
        predictions_withoutCD = rf_withoutCD.predict(test_features_withoutCD)
        predictions_withCD = rf_withCD.predict(test_features_withCD)

        df_pred_withoutCD = pd.DataFrame(predictions_withoutCD, columns = ['predicted'])
        df_pred_withCD = pd.DataFrame(predictions_withCD, columns = ['predicted'])

        # Store all of the predicted values to the CSV files
        file_stem = out_dir + '/RF_' + province2 + '_dist_MA' + str(i) + '_DFma_' + str(j + 1)
        withoutCD_file = file_stem + '_withoutCD_10.csv'
        withCD_file = file_stem + '_withCD_10.csv'

        df_compare_addrcode_dist_withoutCD = pd.concat([df_test_addrcode_week_year_dist, df_pred_withoutCD], axis = 1)
        df_compare_addrcode_dist_withoutCD.columns = compare_columns
        df_compare_addrcode_dist_withoutCD.to_csv(withoutCD_file, encoding = 'utf-8')

        df_compare_addrcode_dist_withCD = pd.concat([df_test_addrcode_week_year_dist, df_pred_withCD], axis = 1)
        df_compare_addrcode_dist_withCD.columns = compare_columns
        df_compare_addrcode_dist_withCD.to_csv(withCD_file, encoding = 'utf-8')

        # Calculate the evaluation values over the whole test set
        rmse_withoutCD = mean_squared_error(test_labels, predictions_withoutCD) ** 0.5
        mae_withoutCD = mean_absolute_error(test_labels, predictions_withoutCD)
        r2_withoutCD = r2_score(test_labels, predictions_withoutCD)
        smape_withoutCD = smape_fast(test_labels, predictions_withoutCD)

        rmse_withCD = mean_squared_error(test_labels, predictions_withCD) ** 0.5
        mae_withCD = mean_absolute_error(test_labels, predictions_withCD)
        r2_withCD = r2_score(test_labels, predictions_withCD)
        smape_withCD = smape_fast(test_labels, predictions_withCD)

        # Relative change (without - with) / without, stored as a fraction.
        # Computed on NumPy scalars so a zero baseline gives inf/nan instead
        # of raising ZeroDivisionError when a metric is a plain Python float.
        # NOTE(review): for R squared a higher value is better, so a positive
        # result here means the model got worse - confirm the intended sign.
        with np.errstate(divide='ignore', invalid='ignore'):
            rmse_percent_improved = np.float64(rmse_withoutCD - rmse_withCD) / np.float64(rmse_withoutCD)
            mae_percent_improved = np.float64(mae_withoutCD - mae_withCD) / np.float64(mae_withoutCD)
            smape_percent_improved = np.float64(smape_withoutCD - smape_withCD) / np.float64(smape_withoutCD)
            r2_percent_improved = np.float64(r2_withoutCD - r2_withCD) / np.float64(r2_withoutCD)

        rmse = np.append(rmse, [rmse_withoutCD, rmse_withCD, rmse_percent_improved])
        mae = np.append(mae, [mae_withoutCD, mae_withCD, mae_percent_improved])
        smape = np.append(smape, [smape_withoutCD, smape_withCD, smape_percent_improved])
        r2 = np.append(r2, [r2_withoutCD, r2_withCD, r2_percent_improved])

        # Read the stored predictions back for the per-district evaluation
        df_withoutCD = pd.read_csv(withoutCD_file, header = 0)
        df_withCD = pd.read_csv(withCD_file, header = 0)

        # For each district
        for k in dist_code:

            # Get the subset of actual and predicted values according to the district code
            subset_withoutCD = df_withoutCD.loc[df_withoutCD['addrcode'] == k]
            subset_withCD = df_withCD.loc[df_withCD['addrcode'] == k]

            # A district seen in training may have no test rows; the metrics
            # cannot be computed on empty arrays, so skip it.
            if subset_withoutCD.empty or subset_withCD.empty:
                continue

            # Pass the response values to the array for evaluation calculation
            array_true = np.array(subset_withoutCD['actual'])
            array_pred_withoutCD = np.array(subset_withoutCD['predicted'])
            array_pred_withCD = np.array(subset_withCD['predicted'])

            # Calculate the evaluation values
            rmse_withoutCD_dist = mean_squared_error(array_true, array_pred_withoutCD) ** 0.5
            mae_withoutCD_dist = mean_absolute_error(array_true, array_pred_withoutCD)
            smape_withoutCD_dist = smape_fast(array_true, array_pred_withoutCD)
            r2_withoutCD_dist = r2_score(array_true, array_pred_withoutCD)

            rmse_withCD_dist = mean_squared_error(array_true, array_pred_withCD) ** 0.5
            mae_withCD_dist = mean_absolute_error(array_true, array_pred_withCD)
            smape_withCD_dist = smape_fast(array_true, array_pred_withCD)
            r2_withCD_dist = r2_score(array_true, array_pred_withCD)

            # Same zero-baseline protection as for the overall metrics.
            with np.errstate(divide='ignore', invalid='ignore'):
                rmse_percent_improved_dist = (np.float64(rmse_withoutCD_dist - rmse_withCD_dist)
                                              / np.float64(rmse_withoutCD_dist))
                mae_percent_improved_dist = (np.float64(mae_withoutCD_dist - mae_withCD_dist)
                                             / np.float64(mae_withoutCD_dist))
                smape_percent_improved_dist = (np.float64(smape_withoutCD_dist - smape_withCD_dist)
                                               / np.float64(smape_withoutCD_dist))
                r2_percent_improved_dist = (np.float64(r2_withoutCD_dist - r2_withCD_dist)
                                            / np.float64(r2_withoutCD_dist))

            # Append
            dist_array = np.append(dist_array, [[k, rmse_withoutCD_dist, rmse_withCD_dist, rmse_percent_improved_dist,
                                                mae_withoutCD_dist, mae_withCD_dist, mae_percent_improved_dist,
                                                smape_withoutCD_dist, smape_withCD_dist, smape_percent_improved_dist,
                                                r2_withoutCD_dist, r2_withCD_dist, r2_percent_improved_dist]], axis = 0)

        pd.DataFrame(dist_array).to_csv(out_dir + '/RF_' + province2 + '_ByDistrict_MA' + str(i)
                                        + '_DFma_' + str(j + 1) + '_eval_10.csv',
                                        header = False, encoding = 'utf-8')

        # Clear the old memory to store a new one
        dist_array = np.asarray([dist_header])

# Evaluation file storing
# From RMSE DFma_1 to R squared DFma_6
eval_array = evaluation_print(eval_array, 'RMSE', rmse)
eval_array = evaluation_print(eval_array, 'MAE', mae)
eval_array = evaluation_print(eval_array, 'SMAPE', smape)
eval_array = evaluation_print(eval_array, 'R squared', r2)

# Store all of the evaluation values into a CSV file
pd.DataFrame(eval_array).to_csv('Random Forest/' + province1 + '/Normal Lags/Adjusted CD/RF_' + province2
                                + '_dist_eval_10.csv', header = False, encoding = 'utf-8')

For original DF_0 (Without smoothing, adjusted CD)

In [64]:
# District-level random forests on the original (unsmoothed) DF columns,
# adjusted CD.
#
# For every target DF_1..DF_6, two forests are trained: one on columns
# 10-13 and 24-25 only ("without CD") and one that additionally uses columns
# 26-34 ("with CD"). Predictions, per-district metrics and an overall summary
# table are written as CSV files below
# 'Random Forest/<province1>/Normal Lags/Adjusted CD/Original DF_0/'.
#
# Layout of the overall summary table (eval_array)
# row: head,
# RMSE (DF_1 - DF_6),
# MAE (DF_1 - DF_6),
# SMAPE (DF_1 - DF_6),
# R-squared (DF_1 - DF_6)
#
# col: head,
# DF_0 (without CD, with CD, % improved)

eval_array = np.asarray([['Evaluation', 'Without CD', 'With CD', '% improved']])

# Flat metric accumulators. Index 0 is a dummy 0 from the initialisation;
# evaluation_print_original reads the real values starting at index 1.
rmse = np.zeros(1)
mae = np.zeros(1)
smape = np.zeros(1)
r2 = np.zeros(1)

dist_header = ['addrcode', 'RMSE without CD', 'RMSE with CD', '% improved RMSE',
               'MAE without CD', 'MAE with CD', '% improved MAE',
               'SMAPE without CD', 'SMAPE with CD', '% improved SMAPE',
               'R squared without CD', 'R squared with CD', '% improved R squared']
dist_array = np.asarray([dist_header])

# Get the input variables from CSV file.
# The unsmoothed DF columns are read from the MA2 file.
train_file_dir = 'Data/' + province1 + '/Normal Lags/train_' + province2 + '_dist_total_mavg2.csv'
test_file_dir = 'Data/' + province1 + '/Normal Lags/test_' + province2 + '_dist_total_mavg2.csv'

df_train_dist = pd.read_csv(train_file_dir, header = 0, skiprows = 0)
df_test_dist = pd.read_csv(test_file_dir, header = 0, skiprows = 0)

# Make sure the output folder exists so a fresh run does not fail in to_csv.
out_dir = 'Random Forest/' + province1 + '/Normal Lags/Adjusted CD/Original DF_0'
os.makedirs(out_dir, exist_ok=True)

# Feature columns, selected by position.
# Without CD:
# DF_0 [col 10],
# DF_wm1 [col 11],
# DF_wm2 [col 12],
# DF_wm3 [col 13],
# RF_wm6 [col 24],
# and LST_wm4 [col 25]
cols_withoutCD = [10, 11, 12, 13, 24, 25]
# With CD: the six columns above plus
# bin_pop9s [col 26],
# bowl_pop9s [col 27],
# bucket_pop9s [col 28],
# misc_short_pop9s [col 29],
# jar_pop9s [col 30],
# pottedplant_pop9s [col 31],
# tire_pop9s [col 32],
# misc_tall_pop9s [col 33],
# and total_pop9s [col 34]
cols_withCD = cols_withoutCD + [26, 27, 28, 29, 30, 31, 32, 33, 34]

# The feature matrices do not depend on the target, so build them once.
train_features_withoutCD = df_train_dist.iloc[:, cols_withoutCD]
train_features_withCD = df_train_dist.iloc[:, cols_withCD]

test_features_withoutCD = df_test_dist.iloc[:, cols_withoutCD]
test_features_withCD = df_test_dist.iloc[:, cols_withCD]

# A flat list: a nested list here would create a one-level MultiIndex.
compare_columns = ['addrcode', 'Week', 'Year', 'actual', 'predicted']

dist_code = df_train_dist['addrcode'].unique()

# From DF_1 to DF_6
for i in range(6):
    # Allocate the column of addrcode, week, year and actual values first.
    # Column 10 - i is expected to be DF_(i + 1) (col 9 -> col 4).
    df_test_addrcode_week_year_dist = df_test_dist.iloc[:, [1, 2, 3, 10 - i]]

    # labels: response (target) variables from DF_1 to DF_6
    train_labels = np.array(df_train_dist['DF_' + str(i + 1)])
    test_labels = np.array(df_test_dist['DF_' + str(i + 1)])

    # Instantiate model with 10 decision trees
    rf_withoutCD = RandomForestRegressor(n_estimators = 10, random_state = 42)
    rf_withCD = RandomForestRegressor(n_estimators = 10, random_state = 42)

    # Train the model on training data
    rf_withoutCD.fit(train_features_withoutCD, train_labels)
    rf_withCD.fit(train_features_withCD, train_labels)

    # Use the forest's predict method on the test data
    predictions_withoutCD = rf_withoutCD.predict(test_features_withoutCD)
    predictions_withCD = rf_withCD.predict(test_features_withCD)

    df_pred_withoutCD = pd.DataFrame(predictions_withoutCD, columns = ['predicted'])
    df_pred_withCD = pd.DataFrame(predictions_withCD, columns = ['predicted'])

    # Store all of the predicted values to the CSV files
    file_stem = out_dir + '/RF_' + province2 + '_dist_DF_' + str(i + 1)
    withoutCD_file = file_stem + '_withoutCD_10.csv'
    withCD_file = file_stem + '_withCD_10.csv'

    df_compare_addrcode_dist_withoutCD = pd.concat([df_test_addrcode_week_year_dist, df_pred_withoutCD], axis = 1)
    df_compare_addrcode_dist_withoutCD.columns = compare_columns
    df_compare_addrcode_dist_withoutCD.to_csv(withoutCD_file, encoding = 'utf-8')

    df_compare_addrcode_dist_withCD = pd.concat([df_test_addrcode_week_year_dist, df_pred_withCD], axis = 1)
    df_compare_addrcode_dist_withCD.columns = compare_columns
    df_compare_addrcode_dist_withCD.to_csv(withCD_file, encoding = 'utf-8')

    # Calculate the evaluation values over the whole test set
    rmse_withoutCD = mean_squared_error(test_labels, predictions_withoutCD) ** 0.5
    mae_withoutCD = mean_absolute_error(test_labels, predictions_withoutCD)
    r2_withoutCD = r2_score(test_labels, predictions_withoutCD)
    smape_withoutCD = smape_fast(test_labels, predictions_withoutCD)

    rmse_withCD = mean_squared_error(test_labels, predictions_withCD) ** 0.5
    mae_withCD = mean_absolute_error(test_labels, predictions_withCD)
    r2_withCD = r2_score(test_labels, predictions_withCD)
    smape_withCD = smape_fast(test_labels, predictions_withCD)

    # Relative change (without - with) / without, stored as a fraction.
    # Computed on NumPy scalars so a zero baseline gives inf/nan instead of
    # raising ZeroDivisionError when a metric is a plain Python float.
    # NOTE(review): for R squared a higher value is better, so a positive
    # result here means the model got worse - confirm the intended sign.
    with np.errstate(divide='ignore', invalid='ignore'):
        rmse_percent_improved = np.float64(rmse_withoutCD - rmse_withCD) / np.float64(rmse_withoutCD)
        mae_percent_improved = np.float64(mae_withoutCD - mae_withCD) / np.float64(mae_withoutCD)
        smape_percent_improved = np.float64(smape_withoutCD - smape_withCD) / np.float64(smape_withoutCD)
        r2_percent_improved = np.float64(r2_withoutCD - r2_withCD) / np.float64(r2_withoutCD)

    rmse = np.append(rmse, [rmse_withoutCD, rmse_withCD, rmse_percent_improved])
    mae = np.append(mae, [mae_withoutCD, mae_withCD, mae_percent_improved])
    smape = np.append(smape, [smape_withoutCD, smape_withCD, smape_percent_improved])
    r2 = np.append(r2, [r2_withoutCD, r2_withCD, r2_percent_improved])

    # Read the stored predictions back for the per-district evaluation
    df_withoutCD = pd.read_csv(withoutCD_file, header = 0)
    df_withCD = pd.read_csv(withCD_file, header = 0)

    # For each district
    for j in dist_code:

        # Get the subset of actual and predicted values according to the district code
        subset_withoutCD = df_withoutCD.loc[df_withoutCD['addrcode'] == j]
        subset_withCD = df_withCD.loc[df_withCD['addrcode'] == j]

        # A district seen in training may have no test rows; the metrics
        # cannot be computed on empty arrays, so skip it.
        if subset_withoutCD.empty or subset_withCD.empty:
            continue

        # Pass the response values to the array for evaluation calculation
        array_true = np.array(subset_withoutCD['actual'])
        array_pred_withoutCD = np.array(subset_withoutCD['predicted'])
        array_pred_withCD = np.array(subset_withCD['predicted'])

        # Calculate the evaluation values
        rmse_withoutCD_dist = mean_squared_error(array_true, array_pred_withoutCD) ** 0.5
        mae_withoutCD_dist = mean_absolute_error(array_true, array_pred_withoutCD)
        smape_withoutCD_dist = smape_fast(array_true, array_pred_withoutCD)
        r2_withoutCD_dist = r2_score(array_true, array_pred_withoutCD)

        rmse_withCD_dist = mean_squared_error(array_true, array_pred_withCD) ** 0.5
        mae_withCD_dist = mean_absolute_error(array_true, array_pred_withCD)
        smape_withCD_dist = smape_fast(array_true, array_pred_withCD)
        r2_withCD_dist = r2_score(array_true, array_pred_withCD)

        # Same zero-baseline protection as for the overall metrics.
        with np.errstate(divide='ignore', invalid='ignore'):
            rmse_percent_improved_dist = (np.float64(rmse_withoutCD_dist - rmse_withCD_dist)
                                          / np.float64(rmse_withoutCD_dist))
            mae_percent_improved_dist = (np.float64(mae_withoutCD_dist - mae_withCD_dist)
                                         / np.float64(mae_withoutCD_dist))
            smape_percent_improved_dist = (np.float64(smape_withoutCD_dist - smape_withCD_dist)
                                           / np.float64(smape_withoutCD_dist))
            r2_percent_improved_dist = (np.float64(r2_withoutCD_dist - r2_withCD_dist)
                                        / np.float64(r2_withoutCD_dist))

        # Append
        dist_array = np.append(dist_array, [[j, rmse_withoutCD_dist, rmse_withCD_dist, rmse_percent_improved_dist,
                                            mae_withoutCD_dist, mae_withCD_dist, mae_percent_improved_dist,
                                            smape_withoutCD_dist, smape_withCD_dist, smape_percent_improved_dist,
                                            r2_withoutCD_dist, r2_withCD_dist, r2_percent_improved_dist]], axis = 0)

    pd.DataFrame(dist_array).to_csv(out_dir + '/RF_' + province2 + '_ByDistrict_DF_' + str(i + 1)
                                    + '_eval_10.csv', header = False, encoding = 'utf-8')

    # Clear the old memory to store a new one
    dist_array = np.asarray([dist_header])

# Evaluation file storing
# From RMSE DF_1 to R squared DF_6
eval_array = evaluation_print_original(eval_array, 'RMSE', rmse)
eval_array = evaluation_print_original(eval_array, 'MAE', mae)
eval_array = evaluation_print_original(eval_array, 'SMAPE', smape)
eval_array = evaluation_print_original(eval_array, 'R squared', r2)

# Store all of the evaluation values into a CSV file
pd.DataFrame(eval_array).to_csv(out_dir + '/RF_' + province2
                                + '_dist_eval_10.csv', header = False, encoding = 'utf-8')

For MAs (Normal CD)

In [65]:
# Arrays of all evaluation values
# row: head,
# RMSE (DFma_1 - DFma_6), 
# MAE (DFma_1 - DFma_6), 
# SMAPE (DFma_1 - DFma_6), 
# R-squared (DFma_1 - DFma_6)

# col: head,
# MA2 (without CD, with CD, % improved),
# MA3 (without CD, with CD, % improved),
# MA4 (without CD, with CD, % improved)

eval_array = np.asarray([['Evaluation', 'MA2 without CD', 'MA2 with CD', 'MA2 % improved', 
                         'MA3 without CD', 'MA3 with CD', 'MA3 % improved', 
                         'MA4 without CD', 'MA4 with CD', 'MA4 % improved']])
rmse = np.zeros(1)
mae = np.zeros(1)
smape = np.zeros(1)
r2 = np.zeros(1)

dist_array = np.asarray([['addrcode', 'RMSE without CD', 'RMSE with CD', '% improved RMSE', 
                         'MAE without CD', 'MAE with CD', '% improved MAE', 
                         'SMAPE without CD', 'SMAPE with CD', '% improved SMAPE', 
                         'R squared without CD', 'R squared with CD', '% improved R squared']])

# Get the input var from CSV file
# Starting from MA2 to MA4
for i in range(2, 5):
    # Get the input variables from CSV file
    train_file_dir = 'Data/' + province1 + '/Normal Lags/train_' + province2 + '_dist_cd_mavg' + str(i) + '.csv'
    test_file_dir = 'Data/' + province1 + '/Normal Lags/test_' + province2 + '_dist_cd_mavg' + str(i) + '.csv'
    
    df_train_dist =  pd.read_csv(train_file_dir, header=0, skiprows=0)
    df_test_dist = pd.read_csv(test_file_dir, header=0, skiprows=0)
    
    # Continue on DFma_1 to DFma_6
    for j in range(6):
        # Allocate the column of addrcode, week, year and actual values first
        df_test_addrcode_week_year_dist = df_test_dist.iloc[:, [1, 2, 3, 19 - j]]
        
        ## Without CD ##
    
        # Import the dataset
        # x: independent variables
        # DFma_0 [col 20],
        # DFma_wm1 [col 21],
        # DFma_wm2 [col 22],
        # DFma_wm3 [col 23],
        # RF_wm6 [col 24],
        # and LST_wm4 [col 25]
        
        ## With CD ##
    
        # Import the dataset
        # features: independent variables
        # DFma_0 [col 20],
        # DFma_wm1 [col 21], 
        # DFma_wm2 [col 22],
        # DFma_wm3 [col 23],
        # RF_wm6 [col 24],
        # LST_wm4 [col 25],
        # bin_pop9s [col 26],
        # bowl_pop9s [col 27],
        # bucket_pop9s [col 28],
        # misc_short_pop9s [col 29],
        # jar_pop9s [col 30],
        # pottedplant_pop9s [col 31],
        # tire_pop9s [col 32],
        # misc_tall_pop9s [col 33],
        # and total_pop9s [col 34]
        
        train_features_withoutCD = df_train_dist.iloc[:, [20, 21, 22, 23, 24, 25]]
        train_features_withCD = df_train_dist.iloc[:, [20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34]]
        
        test_features_withoutCD = df_test_dist.iloc[:, [20, 21, 22, 23, 24, 25]]
        test_features_withCD = df_test_dist.iloc[:, [20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34]]
        
        # labels: response (target) variables from DFma_1 to DFma_6 (col 19 -> col 14)      
        # Pass the response values to the array for evaluation calculation
        train_labels = np.array(df_train_dist['DFma_' + str(j + 1)])
        test_labels = np.array(df_test_dist['DFma_' + str(j + 1)])

        # Instantiate model with 10 decision trees
        rf_withoutCD = RandomForestRegressor(n_estimators = 10, random_state = 42)
        rf_withCD = RandomForestRegressor(n_estimators = 10, random_state = 42)

        # Train the model on training data
        rf_withoutCD.fit(train_features_withoutCD, train_labels);
        rf_withCD.fit(train_features_withCD, train_labels);

        # Use the forest's predict method on the test data
        predictions_withoutCD = rf_withoutCD.predict(test_features_withoutCD)
        predictions_withCD = rf_withCD.predict(test_features_withCD)
        
        df_pred_withoutCD = pd.DataFrame(predictions_withoutCD, columns = ['predicted'])
        df_pred_withCD = pd.DataFrame(predictions_withCD, columns = ['predicted'])
        
        # Store all of the predicted values to the CSV files
        df_compare_addrcode_dist_withoutCD = pd.concat([df_test_addrcode_week_year_dist, df_pred_withoutCD], axis = 1)
        df_compare_addrcode_dist_withoutCD.columns = [['addrcode', 'Week', 'Year', 'actual', 'predicted']]
        df_compare_addrcode_dist_withoutCD.to_csv('Random Forest/' + province1 + '/Normal Lags/Normal CD/MA' 
                                                  + str(i) + '/RF_' + province2 + '_dist_MA' + str(i) 
                                                  + '_DFma_' + str(j + 1) + '_withoutCD_10.csv', encoding = 'utf-8')

        df_compare_addrcode_dist_withCD = pd.concat([df_test_addrcode_week_year_dist, df_pred_withCD], axis = 1)
        df_compare_addrcode_dist_withCD.columns = [['addrcode', 'Week', 'Year', 'actual', 'predicted']]
        df_compare_addrcode_dist_withCD.to_csv('Random Forest/' + province1 + '/Normal Lags/Normal CD/MA' 
                                               + str(i) + '/RF_' + province2 + '_dist_MA' + str(i) 
                                               + '_DFma_' + str(j + 1) + '_withCD_10.csv', encoding = 'utf-8')

        # Calculate the evaluation values
        #print('MA' + str(i) + ' and DFma_' + str(j + 1))
        rmse_withoutCD = mean_squared_error(test_labels, predictions_withoutCD) ** 0.5
        mae_withoutCD = mean_absolute_error(test_labels, predictions_withoutCD)
        r2_withoutCD = r2_score(test_labels, predictions_withoutCD)
        smape_withoutCD = smape_fast(test_labels, predictions_withoutCD)
        #print('RMSE of the prediction without CD is:', rmse_withoutCD)
        #print('MAE of the prediction without CD is:', mae_withoutCD)
        #print('R-squared of the prediction without CD is:', r2_withoutCD)
        #print('SMAPE of the prediction without CD is:', smape_withoutCD)
        
        rmse_withCD = mean_squared_error(test_labels, predictions_withCD) ** 0.5
        mae_withCD = mean_absolute_error(test_labels, predictions_withCD)
        r2_withCD = r2_score(test_labels, predictions_withCD)
        smape_withCD = smape_fast(test_labels, predictions_withCD)
        #print('RMSE of the prediction with CD is:', rmse_withCD)
        #print('MAE of the prediction with CD is:', mae_withCD)
        #print('R-squared of the prediction with CD is:', r2_withCD)
        #print('SMAPE of the prediction with CD is:', smape_withCD)
        
        rmse_percent_improved = (rmse_withoutCD - rmse_withCD) / rmse_withoutCD
        mae_percent_improved = (mae_withoutCD - mae_withCD) / mae_withoutCD
        smape_percent_improved = (smape_withoutCD - smape_withCD) / smape_withoutCD
        r2_percent_improved = (r2_withoutCD - r2_withCD) / r2_withoutCD
        
        rmse = np.append(rmse, [rmse_withoutCD, rmse_withCD, rmse_percent_improved])
        mae = np.append(mae, [mae_withoutCD, mae_withCD, mae_percent_improved])
        smape = np.append(smape, [smape_withoutCD, smape_withCD, smape_percent_improved])
        r2 = np.append(r2, [r2_withoutCD, r2_withCD, r2_percent_improved])
        
        df_withoutCD = pd.read_csv('Random Forest/' + province1 + '/Normal Lags/Normal CD/MA' + str(i) 
                                   + '/RF_' + province2 + '_dist_MA' + str(i) + '_DFma_' + str(j + 1) 
                                   + '_withoutCD_10.csv', header = 0)
        df_withCD = pd.read_csv('Random Forest/' + province1 + '/Normal Lags/Normal CD/MA' + str(i) 
                                + '/RF_' + province2 + '_dist_MA' + str(i) + '_DFma_' + str(j + 1) 
                                + '_withCD_10.csv', header = 0)
        
        dist_code = df_train_dist['addrcode'].unique()
        
        # For each district
        for k in dist_code:
            
            # Get the subset of actual and predicted values according to the district code
            subset_withoutCD = df_withoutCD.loc[df_withoutCD['addrcode'] == k]
            subset_withCD = df_withCD.loc[df_withCD['addrcode'] == k]
            
            # Pass the response values to the array for evaluation calculation
            array_true = np.array(subset_withoutCD['actual'])
            array_pred_withoutCD = np.array(subset_withoutCD['predicted'])
            array_pred_withCD = np.array(subset_withCD['predicted'])
            
            # Calculate the evaluation values
            rmse_withoutCD_dist = mean_squared_error(array_true, array_pred_withoutCD) ** 0.5
            mae_withoutCD_dist = mean_absolute_error(array_true, array_pred_withoutCD)
            smape_withoutCD_dist = smape_fast(array_true, array_pred_withoutCD)
            r2_withoutCD_dist = r2_score(array_true, array_pred_withoutCD)
            
            rmse_withCD_dist = mean_squared_error(array_true, array_pred_withCD) ** 0.5
            mae_withCD_dist = mean_absolute_error(array_true, array_pred_withCD)
            smape_withCD_dist = smape_fast(array_true, array_pred_withCD)
            r2_withCD_dist = r2_score(array_true, array_pred_withCD)
            
            rmse_percent_improved_dist = (rmse_withoutCD_dist - rmse_withCD_dist) / rmse_withoutCD_dist
            mae_percent_improved_dist = (mae_withoutCD_dist - mae_withCD_dist) / mae_withoutCD_dist
            smape_percent_improved_dist = (smape_withoutCD_dist - smape_withCD_dist) / smape_withoutCD_dist
            r2_percent_improved_dist = (r2_withoutCD_dist - r2_withCD_dist) / r2_withoutCD_dist
            
            # Append
            dist_array = np.append(dist_array, [[k, rmse_withoutCD_dist, rmse_withCD_dist, rmse_percent_improved_dist,
                                                mae_withoutCD_dist, mae_withCD_dist, mae_percent_improved_dist,
                                                smape_withoutCD_dist, smape_withCD_dist, smape_percent_improved_dist,
                                                r2_withoutCD_dist, r2_withCD_dist, r2_percent_improved_dist]], axis = 0)

        #print(dist_array)
        pd.DataFrame(dist_array).to_csv('Random Forest/' + province1 + '/Normal Lags/Normal CD/MA' + str(i) 
                                        + '/RF_' + province2 + '_ByDistrict_MA' + str(i) + '_DFma_' 
                                        + str(j + 1) + '_eval_10.csv', header = False, encoding = 'utf-8')
        
        # Clear the old memory to store a new one
        dist_array = np.asarray([['addrcode', 'RMSE without CD', 'RMSE with CD', '% improved RMSE', 
                         'MAE without CD', 'MAE with CD', '% improved MAE', 
                         'SMAPE without CD', 'SMAPE with CD', '% improved SMAPE', 
                         'R squared without CD', 'R squared with CD', '% improved R squared']])
    
# Evaluation file storing
# Fold the accumulated values of every metric into the summary table,
# in the order RMSE, MAE, SMAPE, R squared
for metric_label, metric_values in [('RMSE', rmse), ('MAE', mae), ('SMAPE', smape), ('R squared', r2)]:
    eval_array = evaluation_print(eval_array, metric_label, metric_values)

# Store all of the evaluation values into a CSV file
# (header = False: the first row of eval_array already carries the titles)
eval_csv_path = 'Random Forest/{}/Normal Lags/Normal CD/RF_{}_dist_eval_10.csv'.format(province1, province2)
pd.DataFrame(eval_array).to_csv(eval_csv_path, header = False, encoding = 'utf-8')

For the original DF_0 series (without smoothing, normal CD), predicting DF_1 to DF_6

In [66]:
# District level, original (unsmoothed) DF series, normal CD.
#
# For every target DF_1 .. DF_6 this cell trains two Random Forests (one
# without and one with the CD features), stores the test-set predictions,
# and writes overall and per-district evaluation tables.
#
# Layout of the summary table (eval_array):
# row: head,
# RMSE (DF_1 - DF_6),
# MAE (DF_1 - DF_6),
# SMAPE (DF_1 - DF_6),
# R-squared (DF_1 - DF_6)
#
# col: head,
# DF_0 (without CD, with CD, % improved)

eval_array = np.asarray([['Evaluation', 'Without CD', 'With CD', '% improved']])

# Index 0 is a placeholder; each target appends (without CD, with CD, % improved)
rmse = np.zeros(1)
mae = np.zeros(1)
smape = np.zeros(1)
r2 = np.zeros(1)

dist_array = np.asarray([['addrcode', 'RMSE without CD', 'RMSE with CD', '% improved RMSE', 
                         'MAE without CD', 'MAE with CD', '% improved MAE', 
                         'SMAPE without CD', 'SMAPE with CD', '% improved SMAPE', 
                         'R squared without CD', 'R squared with CD', '% improved R squared']])


def _relative_improvement(baseline, candidate, higher_is_better = False):
    """Return the fractional improvement of `candidate` over `baseline`.

    For error metrics (RMSE, MAE, SMAPE) a smaller value is better, so the
    gain is `baseline - candidate`. For R squared a larger value is better,
    so pass `higher_is_better = True` and the gain is `candidate - baseline`.
    The gain is divided by |baseline| so that a negative baseline (possible
    for R squared) does not flip the sign of the result.

    A zero baseline yields inf / nan (numpy semantics) instead of raising.
    """
    baseline = np.float64(baseline)
    candidate = np.float64(candidate)
    gain = candidate - baseline if higher_is_better else baseline - candidate
    with np.errstate(divide = 'ignore', invalid = 'ignore'):
        return gain / np.abs(baseline)


def _paired_metrics(y_true, pred_withoutCD, pred_withCD):
    """Score both prediction arrays against `y_true`.

    Returns a dict with keys 'rmse', 'mae', 'smape' and 'r2'; each value is
    the list [without CD, with CD, % improved].
    """
    preds = (pred_withoutCD, pred_withCD)
    rmse_pair = [mean_squared_error(y_true, p) ** 0.5 for p in preds]
    mae_pair = [mean_absolute_error(y_true, p) for p in preds]
    smape_pair = [smape_fast(y_true, p) for p in preds]
    r2_pair = [r2_score(y_true, p) for p in preds]
    return {
        'rmse': rmse_pair + [_relative_improvement(*rmse_pair)],
        'mae': mae_pair + [_relative_improvement(*mae_pair)],
        'smape': smape_pair + [_relative_improvement(*smape_pair)],
        # R squared improves when it grows, unlike the three error metrics
        'r2': r2_pair + [_relative_improvement(*r2_pair, higher_is_better = True)],
    }


# Get the input variables from CSV file
train_file_dir = 'Data/' + province1 + '/Normal Lags/train_' + province2 + '_dist_cd_mavg2.csv'
test_file_dir = 'Data/' + province1 + '/Normal Lags/test_' + province2 + '_dist_cd_mavg2.csv'

df_train_dist = pd.read_csv(train_file_dir, header = 0, skiprows = 0)
df_test_dist = pd.read_csv(test_file_dir, header = 0, skiprows = 0)

# Feature columns (positions as listed in the original notes of this cell):
# without CD: DF_0 [col 10], DF_wm1 [col 11], DF_wm2 [col 12], DF_wm3 [col 13],
#             RF_wm6 [col 24], LST_wm4 [col 25]
# with CD:    the six above plus bin [col 26], bowl [col 27], bucket [col 28],
#             misc_short [col 29], jar [col 30], pottedplant [col 31],
#             tire [col 32], misc_tall [col 33], total [col 34]
cols_withoutCD = [10, 11, 12, 13, 24, 25]
cols_withCD = cols_withoutCD + [26, 27, 28, 29, 30, 31, 32, 33, 34]

# The features do not depend on the target, so build them once
train_features_withoutCD = df_train_dist.iloc[:, cols_withoutCD]
train_features_withCD = df_train_dist.iloc[:, cols_withCD]

test_features_withoutCD = df_test_dist.iloc[:, cols_withoutCD]
test_features_withCD = df_test_dist.iloc[:, cols_withCD]

dist_code = df_train_dist['addrcode'].unique()

# Common prefix of every output file written by this cell
out_prefix = 'Random Forest/' + province1 + '/Normal Lags/Normal CD/Original DF_0/RF_' + province2

# From DF_1 to DF_6
for i in range(6):
    target_col = 'DF_' + str(i + 1)
    
    # labels: response (target) variable, selected by name
    train_labels = np.array(df_train_dist[target_col])
    test_labels = np.array(df_test_dist[target_col])
    
    # Instantiate model with 10 decision trees
    rf_withoutCD = RandomForestRegressor(n_estimators = 10, random_state = 42)
    rf_withCD = RandomForestRegressor(n_estimators = 10, random_state = 42)

    # Train the model on training data
    rf_withoutCD.fit(train_features_withoutCD, train_labels)
    rf_withCD.fit(train_features_withCD, train_labels)

    # Use the forest's predict method on the test data
    predictions_withoutCD = rf_withoutCD.predict(test_features_withoutCD)
    predictions_withCD = rf_withCD.predict(test_features_withCD)
    
    # addrcode, week, year (cols 1 - 3) plus the actual values of the target.
    # 'actual' is taken from the same named column as test_labels; the former
    # positional pick (col 10 - i) pointed at the previous horizon
    # (DF_0 for DF_1, ...) according to the column notes above.
    df_test_addrcode_week_year_dist = df_test_dist.iloc[:, [1, 2, 3]].copy()
    df_test_addrcode_week_year_dist.columns = ['addrcode', 'Week', 'Year']
    df_test_addrcode_week_year_dist['actual'] = test_labels
    
    df_withoutCD = df_test_addrcode_week_year_dist.assign(predicted = predictions_withoutCD)
    df_withCD = df_test_addrcode_week_year_dist.assign(predicted = predictions_withCD)
        
    # Store all of the predicted values to the CSV files
    df_withoutCD.to_csv(out_prefix + '_dist_DF_' + str(i + 1) + '_withoutCD_10.csv', encoding = 'utf-8')
    df_withCD.to_csv(out_prefix + '_dist_DF_' + str(i + 1) + '_withCD_10.csv', encoding = 'utf-8')

    # Overall evaluation values for this target
    overall = _paired_metrics(test_labels, predictions_withoutCD, predictions_withCD)
    rmse = np.append(rmse, overall['rmse'])
    mae = np.append(mae, overall['mae'])
    smape = np.append(smape, overall['smape'])
    r2 = np.append(r2, overall['r2'])
    
    # For each district (the in-memory frames are used directly; there is
    # no need to read the CSV files back)
    for addrcode in dist_code:
            
        # Get the subset of actual and predicted values according to the district code
        subset_withoutCD = df_withoutCD.loc[df_withoutCD['addrcode'] == addrcode]
        subset_withCD = df_withCD.loc[df_withCD['addrcode'] == addrcode]
        
        # A district seen in training but absent from the test set has
        # nothing to score; the metric functions reject empty input
        if subset_withoutCD.empty:
            continue
            
        per_dist = _paired_metrics(np.array(subset_withoutCD['actual']),
                                   np.array(subset_withoutCD['predicted']),
                                   np.array(subset_withCD['predicted']))
            
        # Append one row: addrcode, then RMSE, MAE, SMAPE, R squared triples
        dist_row = [addrcode] + per_dist['rmse'] + per_dist['mae'] + per_dist['smape'] + per_dist['r2']
        dist_array = np.append(dist_array, [dist_row], axis = 0)

    # header = False: the first row of dist_array already carries the titles
    pd.DataFrame(dist_array).to_csv(out_prefix + '_ByDistrict_DF_' + str(i + 1) + '_eval_10.csv', 
                                    header = False, encoding = 'utf-8')
        
    # Clear the old memory to store a new one
    dist_array = np.asarray([['addrcode', 'RMSE without CD', 'RMSE with CD', '% improved RMSE', 
                              'MAE without CD', 'MAE with CD', '% improved MAE', 
                             'SMAPE without CD', 'SMAPE with CD', '% improved SMAPE', 
                             'R squared without CD', 'R squared with CD', '% improved R squared']])
    
# Evaluation file storing
# From RMSE DF_1 to R squared DF_6
eval_array = evaluation_print_original(eval_array, 'RMSE', rmse)
eval_array = evaluation_print_original(eval_array, 'MAE', mae)
eval_array = evaluation_print_original(eval_array, 'SMAPE', smape)
eval_array = evaluation_print_original(eval_array, 'R squared', r2)

# Store all of the evaluation values into a CSV file
pd.DataFrame(eval_array).to_csv(out_prefix + '_dist_eval_10.csv', header = False, encoding = 'utf-8')

<h1>Sub-district level</h1>
For MAs (adjusted CD)

In [67]:
# Sub-district level, moving-average (MA2 - MA4) series, adjusted CD.
#
# For every MA window and every target DFma_1 .. DFma_6 this cell trains two
# Random Forests (one without and one with the CD features), stores the
# test-set predictions, and writes overall and per-sub-district evaluation
# tables.
#
# Layout of the summary table (eval_array):
# row: head,
# RMSE (DFma_1 - DFma_6),
# MAE (DFma_1 - DFma_6),
# SMAPE (DFma_1 - DFma_6),
# R-squared (DFma_1 - DFma_6)
#
# col: head,
# MA2 (without CD, with CD, % improved),
# MA3 (without CD, with CD, % improved),
# MA4 (without CD, with CD, % improved)

eval_array = np.asarray([['Evaluation', 'MA2 without CD', 'MA2 with CD', 'MA2 % improved', 
                         'MA3 without CD', 'MA3 with CD', 'MA3 % improved', 
                         'MA4 without CD', 'MA4 with CD', 'MA4 % improved']])

# Index 0 is a placeholder; every (MA, target) pair appends
# (without CD, with CD, % improved), MA outermost
rmse = np.zeros(1)
mae = np.zeros(1)
smape = np.zeros(1)
r2 = np.zeros(1)

subdist_array = np.asarray([['addrcode', 'RMSE without CD', 'RMSE with CD', '% improved RMSE', 
                         'MAE without CD', 'MAE with CD', '% improved MAE', 
                         'SMAPE without CD', 'SMAPE with CD', '% improved SMAPE', 
                         'R squared without CD', 'R squared with CD', '% improved R squared']])


def _relative_improvement(baseline, candidate, higher_is_better = False):
    """Return the fractional improvement of `candidate` over `baseline`.

    For error metrics (RMSE, MAE, SMAPE) a smaller value is better, so the
    gain is `baseline - candidate`. For R squared a larger value is better,
    so pass `higher_is_better = True` and the gain is `candidate - baseline`.
    The gain is divided by |baseline| so that a negative baseline (possible
    for R squared) does not flip the sign of the result.

    A zero baseline yields inf / nan (numpy semantics) instead of raising.
    """
    baseline = np.float64(baseline)
    candidate = np.float64(candidate)
    gain = candidate - baseline if higher_is_better else baseline - candidate
    with np.errstate(divide = 'ignore', invalid = 'ignore'):
        return gain / np.abs(baseline)


def _paired_metrics(y_true, pred_withoutCD, pred_withCD):
    """Score both prediction arrays against `y_true`.

    Returns a dict with keys 'rmse', 'mae', 'smape' and 'r2'; each value is
    the list [without CD, with CD, % improved].
    """
    preds = (pred_withoutCD, pred_withCD)
    rmse_pair = [mean_squared_error(y_true, p) ** 0.5 for p in preds]
    mae_pair = [mean_absolute_error(y_true, p) for p in preds]
    smape_pair = [smape_fast(y_true, p) for p in preds]
    r2_pair = [r2_score(y_true, p) for p in preds]
    return {
        'rmse': rmse_pair + [_relative_improvement(*rmse_pair)],
        'mae': mae_pair + [_relative_improvement(*mae_pair)],
        'smape': smape_pair + [_relative_improvement(*smape_pair)],
        # R squared improves when it grows, unlike the three error metrics
        'r2': r2_pair + [_relative_improvement(*r2_pair, higher_is_better = True)],
    }


# Feature columns (positions as listed in the original notes of this cell):
# without CD: DFma_0 [col 20], DFma_wm1 [col 21], DFma_wm2 [col 22],
#             DFma_wm3 [col 23], RF_wm6 [col 24], LST_wm4 [col 25]
# with CD:    the six above plus bin_pop9s [col 26], bowl_pop9s [col 27],
#             bucket_pop9s [col 28], misc_short_pop9s [col 29],
#             jar_pop9s [col 30], pottedplant_pop9s [col 31],
#             tire_pop9s [col 32], misc_tall_pop9s [col 33],
#             total_pop9s [col 34]
cols_withoutCD = [20, 21, 22, 23, 24, 25]
cols_withCD = cols_withoutCD + [26, 27, 28, 29, 30, 31, 32, 33, 34]

# Starting from MA2 to MA4
for i in range(2, 5):
    # Get the input variables from CSV file
    train_file_dir = 'Data/' + province1 + '/Normal Lags/train_' + province2 + '_subdist_total_mavg' + str(i) + '.csv'
    test_file_dir = 'Data/' + province1 + '/Normal Lags/test_' + province2 + '_subdist_total_mavg' + str(i) + '.csv'
    
    df_train_subdist = pd.read_csv(train_file_dir, header = 0, skiprows = 0)
    df_test_subdist = pd.read_csv(test_file_dir, header = 0, skiprows = 0)
    
    # The features depend on the MA file only, not on the target
    train_features_withoutCD = df_train_subdist.iloc[:, cols_withoutCD]
    train_features_withCD = df_train_subdist.iloc[:, cols_withCD]
    
    test_features_withoutCD = df_test_subdist.iloc[:, cols_withoutCD]
    test_features_withCD = df_test_subdist.iloc[:, cols_withCD]
    
    subdist_code = df_train_subdist['addrcode'].unique()
    
    # Common prefix of the per-MA output files
    ma_prefix = ('Random Forest/' + province1 + '/Normal Lags/Adjusted CD/MA' + str(i) 
                 + '/RF_' + province2)
    
    # Continue on DFma_1 to DFma_6
    for j in range(6):
        target_col = 'DFma_' + str(j + 1)
        
        # labels: response (target) variable, selected by name
        train_labels = np.array(df_train_subdist[target_col])
        test_labels = np.array(df_test_subdist[target_col])

        # Instantiate model with 10 decision trees
        rf_withoutCD = RandomForestRegressor(n_estimators = 10, random_state = 42)
        rf_withCD = RandomForestRegressor(n_estimators = 10, random_state = 42)

        # Train the model on training data
        rf_withoutCD.fit(train_features_withoutCD, train_labels)
        rf_withCD.fit(train_features_withCD, train_labels)

        # Use the forest's predict method on the test data
        predictions_withoutCD = rf_withoutCD.predict(test_features_withoutCD)
        predictions_withCD = rf_withCD.predict(test_features_withCD)
        
        # addrcode, week, year (cols 1 - 3) plus the actual values of the
        # target, taken from the same named column as test_labels so the two
        # can never drift apart
        df_test_addrcode_week_year_subdist = df_test_subdist.iloc[:, [1, 2, 3]].copy()
        df_test_addrcode_week_year_subdist.columns = ['addrcode', 'Week', 'Year']
        df_test_addrcode_week_year_subdist['actual'] = test_labels
        
        df_withoutCD = df_test_addrcode_week_year_subdist.assign(predicted = predictions_withoutCD)
        df_withCD = df_test_addrcode_week_year_subdist.assign(predicted = predictions_withCD)
        
        # Store all of the predicted values to the CSV files
        pred_stem = ma_prefix + '_subdist_MA' + str(i) + '_DFma_' + str(j + 1)
        df_withoutCD.to_csv(pred_stem + '_withoutCD_10.csv', encoding = 'utf-8')
        df_withCD.to_csv(pred_stem + '_withCD_10.csv', encoding = 'utf-8')

        # Overall evaluation values for this MA / target pair
        overall = _paired_metrics(test_labels, predictions_withoutCD, predictions_withCD)
        rmse = np.append(rmse, overall['rmse'])
        mae = np.append(mae, overall['mae'])
        smape = np.append(smape, overall['smape'])
        r2 = np.append(r2, overall['r2'])
        
        # For each sub-district (the in-memory frames are used directly;
        # there is no need to read the CSV files back)
        for k in subdist_code:

            # Get the subset of actual and predicted values according to the sub-district code
            subset_withoutCD = df_withoutCD.loc[df_withoutCD['addrcode'] == k]
            subset_withCD = df_withCD.loc[df_withCD['addrcode'] == k]
            
            # A sub-district seen in training but absent from the test set
            # has nothing to score; the metric functions reject empty input
            if subset_withoutCD.empty:
                continue
            
            per_subdist = _paired_metrics(np.array(subset_withoutCD['actual']),
                                          np.array(subset_withoutCD['predicted']),
                                          np.array(subset_withCD['predicted']))
            
            # Append one row: addrcode, then RMSE, MAE, SMAPE, R squared triples
            subdist_row = ([k] + per_subdist['rmse'] + per_subdist['mae'] 
                           + per_subdist['smape'] + per_subdist['r2'])
            subdist_array = np.append(subdist_array, [subdist_row], axis = 0)

        # header = False: the first row of subdist_array already carries the titles
        pd.DataFrame(subdist_array).to_csv(ma_prefix + '_BySubDistrict_MA' + str(i) + '_DFma_' 
                                           + str(j + 1) + '_eval_10.csv', header = False, encoding = 'utf-8')
        
        # Clear the old memory to store a new one
        subdist_array = np.asarray([['addrcode', 'RMSE without CD', 'RMSE with CD', '% improved RMSE', 
                         'MAE without CD', 'MAE with CD', '% improved MAE', 
                         'SMAPE without CD', 'SMAPE with CD', '% improved SMAPE', 
                         'R squared without CD', 'R squared with CD', '% improved R squared']])
    
# Evaluation file storing
# From RMSE DFma_1 to R squared DFma_6
eval_array = evaluation_print(eval_array, 'RMSE', rmse)
eval_array = evaluation_print(eval_array, 'MAE', mae)
eval_array = evaluation_print(eval_array, 'SMAPE', smape)
eval_array = evaluation_print(eval_array, 'R squared', r2)

# Store all of the evaluation values into a CSV file
pd.DataFrame(eval_array).to_csv('Random Forest/' + province1 + '/Normal Lags/Adjusted CD/RF_' + province2 
                                + '_subdist_eval_10.csv', header = False, encoding = 'utf-8')



For original DF_1 to DF_6 (without smoothing, adjusted CD)

In [68]:
# Random Forest on the original (unsmoothed) targets DF_1 - DF_6 at sub-district
# level (adjusted CD). For every target one forest is trained without and one
# with the CD variables; the test predictions are stored, and RMSE, MAE, SMAPE
# and R-squared are computed for the whole test set and per sub-district.

# Arrays of all evaluation values
# row: head,
# RMSE (DF_1 - DF_6),
# MAE (DF_1 - DF_6),
# SMAPE (DF_1 - DF_6),
# R-squared (DF_1 - DF_6)

# col: head,
# DF_0 (without CD, with CD, % improved)

eval_array = np.asarray([['Evaluation', 'Without CD', 'With CD', '% improved']])

# Flat metric buffers. Index 0 is only the initialisation placeholder; every
# target then appends (without CD, with CD, % improved).
rmse = np.zeros(1)
mae = np.zeros(1)
smape = np.zeros(1)
r2 = np.zeros(1)

# Header row of the per-sub-district evaluation files
subdist_header = ['addrcode', 'RMSE without CD', 'RMSE with CD', '% improved RMSE',
                  'MAE without CD', 'MAE with CD', '% improved MAE',
                  'SMAPE without CD', 'SMAPE with CD', '% improved SMAPE',
                  'R squared without CD', 'R squared with CD', '% improved R squared']

# Column names of the stored prediction files. This must be a flat list: a
# nested list would build a one-level MultiIndex instead of plain labels.
compare_columns = ['addrcode', 'Week', 'Year', 'actual', 'predicted']

# Get the input variables from CSV file
train_file_dir = 'Data/' + province1 + '/Normal Lags/train_' + province2 + '_subdist_total_mavg2.csv'
test_file_dir = 'Data/' + province1 + '/Normal Lags/test_' + province2 + '_subdist_total_mavg2.csv'

df_train_subdist = pd.read_csv(train_file_dir, header = 0, skiprows = 0)
df_test_subdist = pd.read_csv(test_file_dir, header = 0, skiprows = 0)

# Independent variables. They are the same for every target, so they are
# selected once instead of once per loop iteration.
## Without CD ##
# DF_0 [col 10], DF_wm1 [col 11], DF_wm2 [col 12], DF_wm3 [col 13],
# RF_wm6 [col 24], and LST_wm4 [col 25]
cols_withoutCD = [10, 11, 12, 13, 24, 25]
## With CD ##
# the columns above plus bin_pop9s [col 26], bowl_pop9s [col 27],
# bucket_pop9s [col 28], misc_short_pop9s [col 29], jar_pop9s [col 30],
# pottedplant_pop9s [col 31], tire_pop9s [col 32], misc_tall_pop9s [col 33],
# and total_pop9s [col 34]
cols_withCD = cols_withoutCD + list(range(26, 35))

train_features_withoutCD = df_train_subdist.iloc[:, cols_withoutCD]
train_features_withCD = df_train_subdist.iloc[:, cols_withCD]

test_features_withoutCD = df_test_subdist.iloc[:, cols_withoutCD]
test_features_withCD = df_test_subdist.iloc[:, cols_withCD]

# addrcode, week and year of every test row (cols 1 - 3)
df_test_keys = df_test_subdist.iloc[:, [1, 2, 3]]
test_addrcode = df_test_subdist.iloc[:, 1].values

# Sub-district codes to report on (taken from the training set, as before)
subdist_code = df_train_subdist['addrcode'].unique()

# Common prefix of every output file of this cell
output_prefix = ('Random Forest/' + province1
                 + '/Normal Lags/Adjusted CD/Original DF_0/RF_' + province2)

# From DF_1 to DF_6
for i in range(6):
    target = 'DF_' + str(i + 1)

    # labels: response (target) variables from DF_1 to DF_6 (col 9 -> col 4)
    # Pass the response values to the array for evaluation calculation
    train_labels = np.array(df_train_subdist[target])
    test_labels = np.array(df_test_subdist[target])

    # Allocate the column of addrcode, week, year and actual values.
    # The actual values are selected by name so they are always the column the
    # forests were trained on. The former positional pick (col 10 - i) started
    # at DF_0 [col 10] and was therefore one week off for every target.
    df_test_addrcode_week_year_subdist = pd.concat([df_test_keys, df_test_subdist[target]], axis = 1)

    # Instantiate model with 10 decision trees
    rf_withoutCD = RandomForestRegressor(n_estimators = 10, random_state = 42)
    rf_withCD = RandomForestRegressor(n_estimators = 10, random_state = 42)

    # Train the model on training data
    rf_withoutCD.fit(train_features_withoutCD, train_labels)
    rf_withCD.fit(train_features_withCD, train_labels)

    # Use the forest's predict method on the test data
    predictions_withoutCD = rf_withoutCD.predict(test_features_withoutCD)
    predictions_withCD = rf_withCD.predict(test_features_withCD)

    # Store all of the predicted values to the CSV files
    df_compare_addrcode_subdist_withoutCD = df_test_addrcode_week_year_subdist.assign(predicted = predictions_withoutCD)
    df_compare_addrcode_subdist_withoutCD.columns = compare_columns
    df_compare_addrcode_subdist_withoutCD.to_csv(output_prefix + '_subdist_' + target + '_withoutCD_10.csv',
                                                 encoding = 'utf-8')

    df_compare_addrcode_subdist_withCD = df_test_addrcode_week_year_subdist.assign(predicted = predictions_withCD)
    df_compare_addrcode_subdist_withCD.columns = compare_columns
    df_compare_addrcode_subdist_withCD.to_csv(output_prefix + '_subdist_' + target + '_withCD_10.csv',
                                              encoding = 'utf-8')

    # Calculate the evaluation values over the whole test set
    rmse_withoutCD = mean_squared_error(test_labels, predictions_withoutCD) ** 0.5
    mae_withoutCD = mean_absolute_error(test_labels, predictions_withoutCD)
    r2_withoutCD = r2_score(test_labels, predictions_withoutCD)
    smape_withoutCD = smape_fast(test_labels, predictions_withoutCD)

    rmse_withCD = mean_squared_error(test_labels, predictions_withCD) ** 0.5
    mae_withCD = mean_absolute_error(test_labels, predictions_withCD)
    r2_withCD = r2_score(test_labels, predictions_withCD)
    smape_withCD = smape_fast(test_labels, predictions_withCD)

    # Relative change (a fraction, not multiplied by 100) against the model without CD.
    # NOTE(review): for R squared a positive value means the score DROPPED when
    # CD was added (higher R squared is better), i.e. the sign is opposite to
    # the three error metrics. Kept as is to stay comparable with the other cells.
    rmse_percent_improved = (rmse_withoutCD - rmse_withCD) / rmse_withoutCD
    mae_percent_improved = (mae_withoutCD - mae_withCD) / mae_withoutCD
    smape_percent_improved = (smape_withoutCD - smape_withCD) / smape_withoutCD
    r2_percent_improved = (r2_withoutCD - r2_withCD) / r2_withoutCD

    rmse = np.append(rmse, [rmse_withoutCD, rmse_withCD, rmse_percent_improved])
    mae = np.append(mae, [mae_withoutCD, mae_withCD, mae_percent_improved])
    smape = np.append(smape, [smape_withoutCD, smape_withCD, smape_percent_improved])
    r2 = np.append(r2, [r2_withoutCD, r2_withCD, r2_percent_improved])

    # Per-sub-district evaluation, computed directly on the in-memory
    # predictions (no need to read the files written above back in).
    # Rows are collected in a list and turned into a table once.
    subdist_rows = [subdist_header]

    # For each sub-district
    for code in subdist_code:

        # Rows of the test set that belong to this sub-district
        in_subdist = test_addrcode == code
        if not in_subdist.any():
            # No test rows for this code: the metrics are undefined, skip it
            continue

        # Pass the response values to the array for evaluation calculation
        array_true = test_labels[in_subdist]
        array_pred_withoutCD = predictions_withoutCD[in_subdist]
        array_pred_withCD = predictions_withCD[in_subdist]

        # Calculate the evaluation values
        rmse_withoutCD_subdist = mean_squared_error(array_true, array_pred_withoutCD) ** 0.5
        mae_withoutCD_subdist = mean_absolute_error(array_true, array_pred_withoutCD)
        smape_withoutCD_subdist = smape_fast(array_true, array_pred_withoutCD)
        r2_withoutCD_subdist = r2_score(array_true, array_pred_withoutCD)

        rmse_withCD_subdist = mean_squared_error(array_true, array_pred_withCD) ** 0.5
        mae_withCD_subdist = mean_absolute_error(array_true, array_pred_withCD)
        smape_withCD_subdist = smape_fast(array_true, array_pred_withCD)
        r2_withCD_subdist = r2_score(array_true, array_pred_withCD)

        rmse_percent_improved_subdist = (rmse_withoutCD_subdist - rmse_withCD_subdist) / rmse_withoutCD_subdist
        mae_percent_improved_subdist = (mae_withoutCD_subdist - mae_withCD_subdist) / mae_withoutCD_subdist
        smape_percent_improved_subdist = (smape_withoutCD_subdist - smape_withCD_subdist) / smape_withoutCD_subdist
        r2_percent_improved_subdist = (r2_withoutCD_subdist - r2_withCD_subdist) / r2_withoutCD_subdist

        # Append
        subdist_rows.append([code,
                             rmse_withoutCD_subdist, rmse_withCD_subdist, rmse_percent_improved_subdist,
                             mae_withoutCD_subdist, mae_withCD_subdist, mae_percent_improved_subdist,
                             smape_withoutCD_subdist, smape_withCD_subdist, smape_percent_improved_subdist,
                             r2_withoutCD_subdist, r2_withCD_subdist, r2_percent_improved_subdist])

    # Same file layout as before (index column, header as first data row).
    # The addrcode is now written as it is, where the old string-array append
    # turned integer codes into float text. na_rep keeps undefined values as 'nan'.
    pd.DataFrame(subdist_rows).to_csv(output_prefix + '_BySubDistrict_' + target + '_eval_10.csv',
                                      header = False, na_rep = 'nan', encoding = 'utf-8')

# Evaluation file storing
# From RMSE DF_1 to R squared DF_6
eval_array = evaluation_print_original(eval_array, 'RMSE', rmse)
eval_array = evaluation_print_original(eval_array, 'MAE', mae)
eval_array = evaluation_print_original(eval_array, 'SMAPE', smape)
eval_array = evaluation_print_original(eval_array, 'R squared', r2)

# Store all of the evaluation values into a CSV file
pd.DataFrame(eval_array).to_csv(output_prefix + '_subdist_eval_10.csv', header = False, encoding = 'utf-8')



For MAs (normal CD)

In [69]:
# Random Forest on the moving-average targets DFma_1 - DFma_6 for MA2, MA3 and
# MA4 at sub-district level (normal CD). For every MA / target pair one forest
# is trained without and one with the CD variables; the test predictions are
# stored, and RMSE, MAE, SMAPE and R-squared are computed for the whole test
# set and per sub-district.

# Arrays of all evaluation values
# row: head,
# RMSE (DFma_1 - DFma_6),
# MAE (DFma_1 - DFma_6),
# SMAPE (DFma_1 - DFma_6),
# R-squared (DFma_1 - DFma_6)

# col: head,
# MA2 (without CD, with CD, % improved),
# MA3 (without CD, with CD, % improved),
# MA4 (without CD, with CD, % improved)

eval_array = np.asarray([['Evaluation', 'MA2 without CD', 'MA2 with CD', 'MA2 % improved',
                         'MA3 without CD', 'MA3 with CD', 'MA3 % improved',
                         'MA4 without CD', 'MA4 with CD', 'MA4 % improved']])

# Flat metric buffers. Index 0 is only the initialisation placeholder; every
# MA / target pair then appends (without CD, with CD, % improved).
rmse = np.zeros(1)
mae = np.zeros(1)
smape = np.zeros(1)
r2 = np.zeros(1)

# Header row of the per-sub-district evaluation files
subdist_header = ['addrcode', 'RMSE without CD', 'RMSE with CD', '% improved RMSE',
                  'MAE without CD', 'MAE with CD', '% improved MAE',
                  'SMAPE without CD', 'SMAPE with CD', '% improved SMAPE',
                  'R squared without CD', 'R squared with CD', '% improved R squared']

# Column names of the stored prediction files. This must be a flat list: a
# nested list would build a one-level MultiIndex instead of plain labels.
compare_columns = ['addrcode', 'Week', 'Year', 'actual', 'predicted']

# Independent variables
## Without CD ##
# DFma_0 [col 20], DFma_wm1 [col 21], DFma_wm2 [col 22], DFma_wm3 [col 23],
# RF_wm6 [col 24], and LST_wm4 [col 25]
cols_withoutCD = [20, 21, 22, 23, 24, 25]
## With CD ##
# the columns above plus bin_pop9s [col 26], bowl_pop9s [col 27],
# bucket_pop9s [col 28], misc_short_pop9s [col 29], jar_pop9s [col 30],
# pottedplant_pop9s [col 31], tire_pop9s [col 32], misc_tall_pop9s [col 33],
# and total_pop9s [col 34]
cols_withCD = cols_withoutCD + list(range(26, 35))

# Starting from MA2 to MA4
for i in range(2, 5):
    # Get the input variables from CSV file
    train_file_dir = 'Data/' + province1 + '/Normal Lags/train_' + province2 + '_subdist_cd_mavg' + str(i) + '.csv'
    test_file_dir = 'Data/' + province1 + '/Normal Lags/test_' + province2 + '_subdist_cd_mavg' + str(i) + '.csv'

    df_train_subdist = pd.read_csv(train_file_dir, header = 0, skiprows = 0)
    df_test_subdist = pd.read_csv(test_file_dir, header = 0, skiprows = 0)

    # The features only depend on the MA file, not on the target, so they are
    # selected once per file instead of once per target.
    train_features_withoutCD = df_train_subdist.iloc[:, cols_withoutCD]
    train_features_withCD = df_train_subdist.iloc[:, cols_withCD]

    test_features_withoutCD = df_test_subdist.iloc[:, cols_withoutCD]
    test_features_withCD = df_test_subdist.iloc[:, cols_withCD]

    # addrcode, week and year of every test row (cols 1 - 3)
    df_test_keys = df_test_subdist.iloc[:, [1, 2, 3]]
    test_addrcode = df_test_subdist.iloc[:, 1].values

    # Sub-district codes to report on (taken from the training set, as before)
    subdist_code = df_train_subdist['addrcode'].unique()

    # Common prefix of every per-target output file of this MA
    output_prefix = ('Random Forest/' + province1 + '/Normal Lags/Normal CD/MA' + str(i)
                     + '/RF_' + province2)

    # Continue on DFma_1 to DFma_6
    for j in range(6):
        target = 'DFma_' + str(j + 1)
        file_tag = 'MA' + str(i) + '_' + target

        # labels: response (target) variables from DFma_1 to DFma_6 (col 19 -> col 14)
        # Pass the response values to the array for evaluation calculation
        train_labels = np.array(df_train_subdist[target])
        test_labels = np.array(df_test_subdist[target])

        # Allocate the column of addrcode, week, year and actual values.
        # The actual values are selected by name (col 19 - j in the documented
        # layout) so they are always the column the forests were trained on.
        df_test_addrcode_week_year_subdist = pd.concat([df_test_keys, df_test_subdist[target]], axis = 1)

        # Instantiate model with 10 decision trees
        rf_withoutCD = RandomForestRegressor(n_estimators = 10, random_state = 42)
        rf_withCD = RandomForestRegressor(n_estimators = 10, random_state = 42)

        # Train the model on training data
        rf_withoutCD.fit(train_features_withoutCD, train_labels)
        rf_withCD.fit(train_features_withCD, train_labels)

        # Use the forest's predict method on the test data
        predictions_withoutCD = rf_withoutCD.predict(test_features_withoutCD)
        predictions_withCD = rf_withCD.predict(test_features_withCD)

        # Store all of the predicted values to the CSV files
        df_compare_addrcode_subdist_withoutCD = df_test_addrcode_week_year_subdist.assign(predicted = predictions_withoutCD)
        df_compare_addrcode_subdist_withoutCD.columns = compare_columns
        df_compare_addrcode_subdist_withoutCD.to_csv(output_prefix + '_subdist_' + file_tag + '_withoutCD_10.csv',
                                                     encoding = 'utf-8')

        df_compare_addrcode_subdist_withCD = df_test_addrcode_week_year_subdist.assign(predicted = predictions_withCD)
        df_compare_addrcode_subdist_withCD.columns = compare_columns
        df_compare_addrcode_subdist_withCD.to_csv(output_prefix + '_subdist_' + file_tag + '_withCD_10.csv',
                                                  encoding = 'utf-8')

        # Calculate the evaluation values over the whole test set
        rmse_withoutCD = mean_squared_error(test_labels, predictions_withoutCD) ** 0.5
        mae_withoutCD = mean_absolute_error(test_labels, predictions_withoutCD)
        r2_withoutCD = r2_score(test_labels, predictions_withoutCD)
        smape_withoutCD = smape_fast(test_labels, predictions_withoutCD)

        rmse_withCD = mean_squared_error(test_labels, predictions_withCD) ** 0.5
        mae_withCD = mean_absolute_error(test_labels, predictions_withCD)
        r2_withCD = r2_score(test_labels, predictions_withCD)
        smape_withCD = smape_fast(test_labels, predictions_withCD)

        # Relative change (a fraction, not multiplied by 100) against the model without CD.
        # NOTE(review): for R squared a positive value means the score DROPPED
        # when CD was added (higher R squared is better), i.e. the sign is
        # opposite to the three error metrics. Kept as is to stay comparable
        # with the other cells.
        rmse_percent_improved = (rmse_withoutCD - rmse_withCD) / rmse_withoutCD
        mae_percent_improved = (mae_withoutCD - mae_withCD) / mae_withoutCD
        smape_percent_improved = (smape_withoutCD - smape_withCD) / smape_withoutCD
        r2_percent_improved = (r2_withoutCD - r2_withCD) / r2_withoutCD

        rmse = np.append(rmse, [rmse_withoutCD, rmse_withCD, rmse_percent_improved])
        mae = np.append(mae, [mae_withoutCD, mae_withCD, mae_percent_improved])
        smape = np.append(smape, [smape_withoutCD, smape_withCD, smape_percent_improved])
        r2 = np.append(r2, [r2_withoutCD, r2_withCD, r2_percent_improved])

        # Per-sub-district evaluation, computed directly on the in-memory
        # predictions (no need to read the files written above back in).
        # Rows are collected in a list and turned into a table once.
        subdist_rows = [subdist_header]

        # For each sub-district
        for k in subdist_code:

            # Rows of the test set that belong to this sub-district
            in_subdist = test_addrcode == k
            if not in_subdist.any():
                # No test rows for this code: the metrics are undefined, skip it
                continue

            # Pass the response values to the array for evaluation calculation
            array_true = test_labels[in_subdist]
            array_pred_withoutCD = predictions_withoutCD[in_subdist]
            array_pred_withCD = predictions_withCD[in_subdist]

            # Calculate the evaluation values
            rmse_withoutCD_subdist = mean_squared_error(array_true, array_pred_withoutCD) ** 0.5
            mae_withoutCD_subdist = mean_absolute_error(array_true, array_pred_withoutCD)
            smape_withoutCD_subdist = smape_fast(array_true, array_pred_withoutCD)
            r2_withoutCD_subdist = r2_score(array_true, array_pred_withoutCD)

            rmse_withCD_subdist = mean_squared_error(array_true, array_pred_withCD) ** 0.5
            mae_withCD_subdist = mean_absolute_error(array_true, array_pred_withCD)
            smape_withCD_subdist = smape_fast(array_true, array_pred_withCD)
            r2_withCD_subdist = r2_score(array_true, array_pred_withCD)

            rmse_percent_improved_subdist = (rmse_withoutCD_subdist - rmse_withCD_subdist) / rmse_withoutCD_subdist
            mae_percent_improved_subdist = (mae_withoutCD_subdist - mae_withCD_subdist) / mae_withoutCD_subdist
            smape_percent_improved_subdist = (smape_withoutCD_subdist - smape_withCD_subdist) / smape_withoutCD_subdist
            r2_percent_improved_subdist = (r2_withoutCD_subdist - r2_withCD_subdist) / r2_withoutCD_subdist

            # Append
            subdist_rows.append([k,
                                 rmse_withoutCD_subdist, rmse_withCD_subdist, rmse_percent_improved_subdist,
                                 mae_withoutCD_subdist, mae_withCD_subdist, mae_percent_improved_subdist,
                                 smape_withoutCD_subdist, smape_withCD_subdist, smape_percent_improved_subdist,
                                 r2_withoutCD_subdist, r2_withCD_subdist, r2_percent_improved_subdist])

        # Same file layout as before (index column, header as first data row).
        # The addrcode is now written as it is, where the old string-array
        # append turned integer codes into float text. na_rep keeps undefined
        # values as 'nan'.
        pd.DataFrame(subdist_rows).to_csv(output_prefix + '_BySubDistrict_' + file_tag + '_eval_10.csv',
                                          header = False, na_rep = 'nan', encoding = 'utf-8')

# Evaluation file storing
# From RMSE DFma_1 to R squared DFma_6
eval_array = evaluation_print(eval_array, 'RMSE', rmse)
eval_array = evaluation_print(eval_array, 'MAE', mae)
eval_array = evaluation_print(eval_array, 'SMAPE', smape)
eval_array = evaluation_print(eval_array, 'R squared', r2)

# Store all of the evaluation values into a CSV file
pd.DataFrame(eval_array).to_csv('Random Forest/' + province1 + '/Normal Lags/Normal CD/RF_' + province2
                                + '_subdist_eval_10.csv', header = False, encoding = 'utf-8')



For original DF_0 (without smoothing, normal CD)

In [70]:
# Random Forest on the original (unsmoothed) targets DF_1 - DF_6 at sub-district
# level (normal CD). For every target one forest is trained without and one
# with the CD variables; the test predictions are stored, and RMSE, MAE, SMAPE
# and R-squared are computed for the whole test set and per sub-district.

# Arrays of all evaluation values
# row: head,
# RMSE (DF_1 - DF_6),
# MAE (DF_1 - DF_6),
# SMAPE (DF_1 - DF_6),
# R-squared (DF_1 - DF_6)

# col: head,
# DF_0 (without CD, with CD, % improved)

eval_array = np.asarray([['Evaluation', 'Without CD', 'With CD', '% improved']])

# Flat metric buffers. Index 0 is only the initialisation placeholder; every
# target then appends (without CD, with CD, % improved).
rmse = np.zeros(1)
mae = np.zeros(1)
smape = np.zeros(1)
r2 = np.zeros(1)

# Header row of the per-sub-district evaluation files
subdist_header = ['addrcode', 'RMSE without CD', 'RMSE with CD', '% improved RMSE',
                  'MAE without CD', 'MAE with CD', '% improved MAE',
                  'SMAPE without CD', 'SMAPE with CD', '% improved SMAPE',
                  'R squared without CD', 'R squared with CD', '% improved R squared']

# Column names of the stored prediction files. This must be a flat list: a
# nested list would build a one-level MultiIndex instead of plain labels.
compare_columns = ['addrcode', 'Week', 'Year', 'actual', 'predicted']

# Get the input variables from CSV file
train_file_dir = 'Data/' + province1 + '/Normal Lags/train_' + province2 + '_subdist_cd_mavg2.csv'
test_file_dir = 'Data/' + province1 + '/Normal Lags/test_' + province2 + '_subdist_cd_mavg2.csv'

df_train_subdist = pd.read_csv(train_file_dir, header = 0, skiprows = 0)
df_test_subdist = pd.read_csv(test_file_dir, header = 0, skiprows = 0)

# Independent variables. They are the same for every target, so they are
# selected once instead of once per loop iteration.
## Without CD ##
# DF_0 [col 10], DF_wm1 [col 11], DF_wm2 [col 12], DF_wm3 [col 13],
# RF_wm6 [col 24], and LST_wm4 [col 25]
cols_withoutCD = [10, 11, 12, 13, 24, 25]
## With CD ##
# the columns above plus bin_pop9s [col 26], bowl_pop9s [col 27],
# bucket_pop9s [col 28], misc_short_pop9s [col 29], jar_pop9s [col 30],
# pottedplant_pop9s [col 31], tire_pop9s [col 32], misc_tall_pop9s [col 33],
# and total_pop9s [col 34]
cols_withCD = cols_withoutCD + list(range(26, 35))

train_features_withoutCD = df_train_subdist.iloc[:, cols_withoutCD]
train_features_withCD = df_train_subdist.iloc[:, cols_withCD]

test_features_withoutCD = df_test_subdist.iloc[:, cols_withoutCD]
test_features_withCD = df_test_subdist.iloc[:, cols_withCD]

# addrcode, week and year of every test row (cols 1 - 3)
df_test_keys = df_test_subdist.iloc[:, [1, 2, 3]]
test_addrcode = df_test_subdist.iloc[:, 1].values

# Sub-district codes to report on (taken from the training set, as before)
subdist_code = df_train_subdist['addrcode'].unique()

# Common prefix of every output file of this cell
output_prefix = ('Random Forest/' + province1
                 + '/Normal Lags/Normal CD/Original DF_0/RF_' + province2)

# From DF_1 to DF_6
for i in range(6):
    target = 'DF_' + str(i + 1)

    # labels: response (target) variables from DF_1 to DF_6 (col 9 -> col 4)
    # Pass the response values to the array for evaluation calculation
    train_labels = np.array(df_train_subdist[target])
    test_labels = np.array(df_test_subdist[target])

    # Allocate the column of addrcode, week, year and actual values.
    # The actual values are selected by name so they are always the column the
    # forests were trained on. The former positional pick (col 10 - i) started
    # at DF_0 [col 10] and was therefore one week off for every target.
    df_test_addrcode_week_year_subdist = pd.concat([df_test_keys, df_test_subdist[target]], axis = 1)

    # Instantiate model with 10 decision trees
    rf_withoutCD = RandomForestRegressor(n_estimators = 10, random_state = 42)
    rf_withCD = RandomForestRegressor(n_estimators = 10, random_state = 42)

    # Train the model on training data
    rf_withoutCD.fit(train_features_withoutCD, train_labels)
    rf_withCD.fit(train_features_withCD, train_labels)

    # Use the forest's predict method on the test data
    predictions_withoutCD = rf_withoutCD.predict(test_features_withoutCD)
    predictions_withCD = rf_withCD.predict(test_features_withCD)

    # Store all of the predicted values to the CSV files
    df_compare_addrcode_subdist_withoutCD = df_test_addrcode_week_year_subdist.assign(predicted = predictions_withoutCD)
    df_compare_addrcode_subdist_withoutCD.columns = compare_columns
    df_compare_addrcode_subdist_withoutCD.to_csv(output_prefix + '_subdist_' + target + '_withoutCD_10.csv',
                                                 encoding = 'utf-8')

    df_compare_addrcode_subdist_withCD = df_test_addrcode_week_year_subdist.assign(predicted = predictions_withCD)
    df_compare_addrcode_subdist_withCD.columns = compare_columns
    df_compare_addrcode_subdist_withCD.to_csv(output_prefix + '_subdist_' + target + '_withCD_10.csv',
                                              encoding = 'utf-8')

    # Calculate the evaluation values over the whole test set
    rmse_withoutCD = mean_squared_error(test_labels, predictions_withoutCD) ** 0.5
    mae_withoutCD = mean_absolute_error(test_labels, predictions_withoutCD)
    r2_withoutCD = r2_score(test_labels, predictions_withoutCD)
    smape_withoutCD = smape_fast(test_labels, predictions_withoutCD)

    rmse_withCD = mean_squared_error(test_labels, predictions_withCD) ** 0.5
    mae_withCD = mean_absolute_error(test_labels, predictions_withCD)
    r2_withCD = r2_score(test_labels, predictions_withCD)
    smape_withCD = smape_fast(test_labels, predictions_withCD)

    # Relative change (a fraction, not multiplied by 100) against the model without CD.
    # NOTE(review): for R squared a positive value means the score DROPPED when
    # CD was added (higher R squared is better), i.e. the sign is opposite to
    # the three error metrics. Kept as is to stay comparable with the other cells.
    rmse_percent_improved = (rmse_withoutCD - rmse_withCD) / rmse_withoutCD
    mae_percent_improved = (mae_withoutCD - mae_withCD) / mae_withoutCD
    smape_percent_improved = (smape_withoutCD - smape_withCD) / smape_withoutCD
    r2_percent_improved = (r2_withoutCD - r2_withCD) / r2_withoutCD

    rmse = np.append(rmse, [rmse_withoutCD, rmse_withCD, rmse_percent_improved])
    mae = np.append(mae, [mae_withoutCD, mae_withCD, mae_percent_improved])
    smape = np.append(smape, [smape_withoutCD, smape_withCD, smape_percent_improved])
    r2 = np.append(r2, [r2_withoutCD, r2_withCD, r2_percent_improved])

    # Per-sub-district evaluation, computed directly on the in-memory
    # predictions (no need to read the files written above back in).
    # Rows are collected in a list and turned into a table once.
    subdist_rows = [subdist_header]

    # For each sub-district
    for code in subdist_code:

        # Rows of the test set that belong to this sub-district
        in_subdist = test_addrcode == code
        if not in_subdist.any():
            # No test rows for this code: the metrics are undefined, skip it
            continue

        # Pass the response values to the array for evaluation calculation
        array_true = test_labels[in_subdist]
        array_pred_withoutCD = predictions_withoutCD[in_subdist]
        array_pred_withCD = predictions_withCD[in_subdist]

        # Calculate the evaluation values
        rmse_withoutCD_subdist = mean_squared_error(array_true, array_pred_withoutCD) ** 0.5
        mae_withoutCD_subdist = mean_absolute_error(array_true, array_pred_withoutCD)
        smape_withoutCD_subdist = smape_fast(array_true, array_pred_withoutCD)
        r2_withoutCD_subdist = r2_score(array_true, array_pred_withoutCD)

        rmse_withCD_subdist = mean_squared_error(array_true, array_pred_withCD) ** 0.5
        mae_withCD_subdist = mean_absolute_error(array_true, array_pred_withCD)
        smape_withCD_subdist = smape_fast(array_true, array_pred_withCD)
        r2_withCD_subdist = r2_score(array_true, array_pred_withCD)

        rmse_percent_improved_subdist = (rmse_withoutCD_subdist - rmse_withCD_subdist) / rmse_withoutCD_subdist
        mae_percent_improved_subdist = (mae_withoutCD_subdist - mae_withCD_subdist) / mae_withoutCD_subdist
        smape_percent_improved_subdist = (smape_withoutCD_subdist - smape_withCD_subdist) / smape_withoutCD_subdist
        r2_percent_improved_subdist = (r2_withoutCD_subdist - r2_withCD_subdist) / r2_withoutCD_subdist

        # Append
        subdist_rows.append([code,
                             rmse_withoutCD_subdist, rmse_withCD_subdist, rmse_percent_improved_subdist,
                             mae_withoutCD_subdist, mae_withCD_subdist, mae_percent_improved_subdist,
                             smape_withoutCD_subdist, smape_withCD_subdist, smape_percent_improved_subdist,
                             r2_withoutCD_subdist, r2_withCD_subdist, r2_percent_improved_subdist])

    # Same file layout as before (index column, header as first data row).
    # The addrcode is now written as it is, where the old string-array append
    # turned integer codes into float text. na_rep keeps undefined values as 'nan'.
    pd.DataFrame(subdist_rows).to_csv(output_prefix + '_BySubDistrict_' + target + '_eval_10.csv',
                                      header = False, na_rep = 'nan', encoding = 'utf-8')

# Evaluation file storing
# From RMSE DF_1 to R squared DF_6
eval_array = evaluation_print_original(eval_array, 'RMSE', rmse)
eval_array = evaluation_print_original(eval_array, 'MAE', mae)
eval_array = evaluation_print_original(eval_array, 'SMAPE', smape)
eval_array = evaluation_print_original(eval_array, 'R squared', r2)

# Store all of the evaluation values into a CSV file
pd.DataFrame(eval_array).to_csv(output_prefix + '_subdist_eval_10.csv', header = False, encoding = 'utf-8')



<h1>Modified Lags</h1>

- Predict DFma_1 as the target<br>
- Predict DF_1 as the target<br>
In both cases, the independent variables are adjusted <b>according to the time horizon</b>:<br>
 - 1 week ahead = independent variables are DFma_0, DFma_wm1, DFma_wm2, ..., and DFma_wm6<br>
 - 2 weeks ahead = independent variables are DFma_wm1, DFma_wm2, DFma_wm3, ..., and DFma_wm6<br>
 - 3 weeks ahead = independent variables are DFma_wm2, DFma_wm3, DFma_wm4, ..., and DFma_wm6<br>
Maximum time horizon = 6 weeks ahead

<h2>District level</h2>
For MAs (adjusted CD)

In [71]:
# Modified lags, district level, moving averages MA2 - MA4 (adjusted CD).
#
# For every moving-average window (MA2 - MA4) and every time horizon
# (1 - 6 weeks ahead) two random forests are fitted on DFma_1: one without and
# one with the CD columns. Predictions, overall metrics and per-district
# metrics are written to CSV files.

# Arrays of all evaluation values
# row: head,
# RMSE (DFma_1 - DFma_6, i.e. horizon 1 - 6 as used in the output file names), 
# MAE (DFma_1 - DFma_6), 
# SMAPE (DFma_1 - DFma_6), 
# R-squared (DFma_1 - DFma_6)

# col: head,
# MA2 (without CD, with CD, % improved),
# MA3 (without CD, with CD, % improved),
# MA4 (without CD, with CD, % improved)

eval_array = np.asarray([['Evaluation', 'MA2 without CD', 'MA2 with CD', 'MA2 % improved', 
                         'MA3 without CD', 'MA3 with CD', 'MA3 % improved', 
                         'MA4 without CD', 'MA4 with CD', 'MA4 % improved']])

# Index 0 of each metric array is a placeholder 0; every (MA, horizon) pair
# appends three values: without CD, with CD, improvement.
rmse = np.zeros(1)
mae = np.zeros(1)
smape = np.zeros(1)
r2 = np.zeros(1)

# Header row of the per-district evaluation table (reused for every output file)
dist_header = np.asarray([['addrcode', 'RMSE without CD', 'RMSE with CD', '% improved RMSE', 
                          'MAE without CD', 'MAE with CD', '% improved MAE', 
                          'SMAPE without CD', 'SMAPE with CD', '% improved SMAPE', 
                          'R squared without CD', 'R squared with CD', '% improved R squared']])
dist_array = dist_header.copy()

# Column names of the "actual vs predicted" files.
# A flat list is used on purpose: assigning a nested list ([[...]]) would
# build a one-level MultiIndex instead of ordinary column labels.
compare_columns = ['addrcode', 'Week', 'Year', 'actual', 'predicted']

# Get the input var from CSV file
# Starting from MA2 to MA4
for i in range(2, 5):
    # Get the input variables from CSV file
    train_file_dir = 'Data/' + province1 + '/Modified Lags/train_' + province2 + '_dist_total_mavg' + str(i) + '.csv'
    test_file_dir = 'Data/' + province1 + '/Modified Lags/test_' + province2 + '_dist_total_mavg' + str(i) + '.csv'
    
    df_train_dist = pd.read_csv(train_file_dir, header=0, skiprows=0)
    df_test_dist = pd.read_csv(test_file_dir, header=0, skiprows=0)
    
    # Everything below depends only on the MA window, not on the horizon,
    # so it is computed once per window.
    
    # Columns of addrcode, week, year and actual values (col 1, 2, 3 and 12)
    df_test_addrcode_week_year_dist = df_test_dist.iloc[:, [1, 2, 3, 12]]
    
    # labels: response (target) variable DFma_1
    # (col 12 per the column notes below: it sits right before DFma_0 [col 13])
    train_labels = np.array(df_train_dist['DFma_1'])
    test_labels = np.array(df_test_dist['DFma_1'])
    
    # District codes are taken from the training set
    dist_code = df_train_dist['addrcode'].unique()
    
    # Common prefix of every output file of this MA window
    ma_prefix = ('Random Forest/' + province1 + '/Modified Lags/Adjusted CD/MA' + str(i) 
                 + '/RF_' + province2)
    
    # Continue on DFma_1 to DFma_6 (time horizon j + 1 weeks ahead)
    for j in range(6):
        # x: independent variables (column positions as noted by the author)
        # DFma_0 [col 13], DFma_wm1 [col 14], ..., DFma_wm6 [col 19],
        # RF_wm6 [col 20], LST_wm4 [col 21]
        # With CD additionally:
        # bin [col 22], bowl [col 23], bucket [col 24], misc_short [col 25],
        # jar [col 26], pottedplant [col 27], tire [col 28], misc_tall [col 29],
        # and total [col 30]
        #
        # The slice starts at col 13 + j, so the j most recent DFma lags are
        # dropped as the horizon grows.
        train_features_withoutCD = df_train_dist.iloc[:, (13 + j): 22]
        train_features_withCD = df_train_dist.iloc[:, (13 + j): 31]
        
        test_features_withoutCD = df_test_dist.iloc[:, (13 + j): 22]
        test_features_withCD = df_test_dist.iloc[:, (13 + j): 31]

        # Instantiate model with 10 decision trees
        rf_withoutCD = RandomForestRegressor(n_estimators = 10, random_state = 42)
        rf_withCD = RandomForestRegressor(n_estimators = 10, random_state = 42)

        # Train the model on training data
        rf_withoutCD.fit(train_features_withoutCD, train_labels)
        rf_withCD.fit(train_features_withCD, train_labels)

        # Use the forest's predict method on the test data
        predictions_withoutCD = rf_withoutCD.predict(test_features_withoutCD)
        predictions_withCD = rf_withCD.predict(test_features_withCD)
        
        df_pred_withoutCD = pd.DataFrame(predictions_withoutCD, columns = ['predicted'])
        df_pred_withCD = pd.DataFrame(predictions_withCD, columns = ['predicted'])
        
        # Paths of the prediction files (written here, read back further down)
        pred_prefix = ma_prefix + '_dist_MA' + str(i) + '_DFma_' + str(j + 1)
        withoutCD_path = pred_prefix + '_withoutCD_10.csv'
        withCD_path = pred_prefix + '_withCD_10.csv'
        
        # Store all of the predicted values to the CSV files
        df_compare_addrcode_dist_withoutCD = pd.concat([df_test_addrcode_week_year_dist, df_pred_withoutCD], axis = 1)
        df_compare_addrcode_dist_withoutCD.columns = compare_columns
        df_compare_addrcode_dist_withoutCD.to_csv(withoutCD_path, encoding = 'utf-8')

        df_compare_addrcode_dist_withCD = pd.concat([df_test_addrcode_week_year_dist, df_pred_withCD], axis = 1)
        df_compare_addrcode_dist_withCD.columns = compare_columns
        df_compare_addrcode_dist_withCD.to_csv(withCD_path, encoding = 'utf-8')

        # Calculate the evaluation values over the whole test set
        rmse_withoutCD = mean_squared_error(test_labels, predictions_withoutCD) ** 0.5
        mae_withoutCD = mean_absolute_error(test_labels, predictions_withoutCD)
        r2_withoutCD = r2_score(test_labels, predictions_withoutCD)
        smape_withoutCD = smape_fast(test_labels, predictions_withoutCD)
        
        rmse_withCD = mean_squared_error(test_labels, predictions_withCD) ** 0.5
        mae_withCD = mean_absolute_error(test_labels, predictions_withCD)
        r2_withCD = r2_score(test_labels, predictions_withCD)
        smape_withCD = smape_fast(test_labels, predictions_withCD)
        
        # Relative change (a fraction, not multiplied by 100).
        # NOTE(review): for R squared a higher value is better, so with this
        # formula a positive value means the CD columns made R squared worse;
        # confirm the intended sign before interpreting '% improved R squared'.
        rmse_percent_improved = (rmse_withoutCD - rmse_withCD) / rmse_withoutCD
        mae_percent_improved = (mae_withoutCD - mae_withCD) / mae_withoutCD
        smape_percent_improved = (smape_withoutCD - smape_withCD) / smape_withoutCD
        r2_percent_improved = (r2_withoutCD - r2_withCD) / r2_withoutCD
        
        rmse = np.append(rmse, [rmse_withoutCD, rmse_withCD, rmse_percent_improved])
        mae = np.append(mae, [mae_withoutCD, mae_withCD, mae_percent_improved])
        smape = np.append(smape, [smape_withoutCD, smape_withCD, smape_percent_improved])
        r2 = np.append(r2, [r2_withoutCD, r2_withCD, r2_percent_improved])
        
        # Read the stored predictions back for the per-district evaluation
        df_withoutCD = pd.read_csv(withoutCD_path, header = 0)
        df_withCD = pd.read_csv(withCD_path, header = 0)
        
        # For each district
        for k in dist_code:

            # Get the subset of actual and predicted values according to the district code
            subset_withoutCD = df_withoutCD.loc[df_withoutCD['addrcode'] == k]
            subset_withCD = df_withCD.loc[df_withCD['addrcode'] == k]
            
            # A district that only occurs in the training set has no test rows;
            # the metric functions raise on empty input, so skip it.
            if subset_withoutCD.empty or subset_withCD.empty:
                continue
            
            # Pass the response values to the array for evaluation calculation
            array_true = np.array(subset_withoutCD['actual'])
            array_pred_withoutCD = np.array(subset_withoutCD['predicted'])
            array_pred_withCD = np.array(subset_withCD['predicted'])
            
            # Calculate the evaluation values
            rmse_withoutCD_dist = mean_squared_error(array_true, array_pred_withoutCD) ** 0.5
            mae_withoutCD_dist = mean_absolute_error(array_true, array_pred_withoutCD)
            smape_withoutCD_dist = smape_fast(array_true, array_pred_withoutCD)
            r2_withoutCD_dist = r2_score(array_true, array_pred_withoutCD)
            
            rmse_withCD_dist = mean_squared_error(array_true, array_pred_withCD) ** 0.5
            mae_withCD_dist = mean_absolute_error(array_true, array_pred_withCD)
            smape_withCD_dist = smape_fast(array_true, array_pred_withCD)
            r2_withCD_dist = r2_score(array_true, array_pred_withCD)
            
            rmse_percent_improved_dist = (rmse_withoutCD_dist - rmse_withCD_dist) / rmse_withoutCD_dist
            mae_percent_improved_dist = (mae_withoutCD_dist - mae_withCD_dist) / mae_withoutCD_dist
            smape_percent_improved_dist = (smape_withoutCD_dist - smape_withCD_dist) / smape_withoutCD_dist
            r2_percent_improved_dist = (r2_withoutCD_dist - r2_withCD_dist) / r2_withoutCD_dist
            
            # Append one row for this district
            dist_array = np.append(dist_array, [[k, rmse_withoutCD_dist, rmse_withCD_dist, rmse_percent_improved_dist,
                                                mae_withoutCD_dist, mae_withCD_dist, mae_percent_improved_dist,
                                                smape_withoutCD_dist, smape_withCD_dist, smape_percent_improved_dist,
                                                r2_withoutCD_dist, r2_withCD_dist, r2_percent_improved_dist]], axis = 0)

        # Store the per-district evaluation of this MA window and horizon
        pd.DataFrame(dist_array).to_csv(ma_prefix + '_ByDistrict_MA' + str(i) + '_DFma_' + str(j + 1) 
                                        + '_eval_10.csv', header = False, encoding = 'utf-8')
        
        # Clear the old memory to store a new one
        dist_array = dist_header.copy()
    
# Evaluation file storing
# From RMSE 1-week to R squared 6-week
eval_array = evaluation_print_modified_lag(eval_array, 'RMSE', rmse)
eval_array = evaluation_print_modified_lag(eval_array, 'MAE', mae)
eval_array = evaluation_print_modified_lag(eval_array, 'SMAPE', smape)
eval_array = evaluation_print_modified_lag(eval_array, 'R squared', r2)

# Store all of the evaluation values into a CSV file
pd.DataFrame(eval_array).to_csv('Random Forest/' + province1 + '/Modified Lags/Adjusted CD/RF_' + province2 
                                + '_dist_eval_10.csv', header = False, encoding = 'utf-8')

For original DF_0 (without smoothing, adjusted CD)

In [72]:
# Modified lags, district level, original DF (without smoothing, adjusted CD).
#
# For every time horizon (1 - 6 weeks ahead) two random forests are fitted on
# DF_1: one without and one with the CD columns. Predictions, overall metrics
# and per-district metrics are written to CSV files.

# Arrays of all evaluation values
# row: head,
# RMSE (DF_1 - DF_6, i.e. horizon 1 - 6 as used in the output file names), 
# MAE (DF_1 - DF_6), 
# SMAPE (DF_1 - DF_6), 
# R-squared (DF_1 - DF_6)

# col: head,
# DF_0 (without CD, with CD, % improved)

eval_array = np.asarray([['Evaluation', 'Without CD', 'With CD', '% improved']])

# Index 0 of each metric array is a placeholder 0; every horizon appends
# three values: without CD, with CD, improvement.
rmse = np.zeros(1)
mae = np.zeros(1)
smape = np.zeros(1)
r2 = np.zeros(1)

# Header row of the per-district evaluation table (reused for every output file)
dist_header = np.asarray([['addrcode', 'RMSE without CD', 'RMSE with CD', '% improved RMSE', 
                          'MAE without CD', 'MAE with CD', '% improved MAE', 
                          'SMAPE without CD', 'SMAPE with CD', '% improved SMAPE', 
                          'R squared without CD', 'R squared with CD', '% improved R squared']])
dist_array = dist_header.copy()

# Column names of the "actual vs predicted" files.
# A flat list is used on purpose: assigning a nested list ([[...]]) would
# build a one-level MultiIndex instead of ordinary column labels.
compare_columns = ['addrcode', 'Week', 'Year', 'actual', 'predicted']

# Get the input variables from CSV file
train_file_dir = 'Data/' + province1 + '/Modified Lags/train_' + province2 + '_dist_total_mavg2.csv'
test_file_dir = 'Data/' + province1 + '/Modified Lags/test_' + province2 + '_dist_total_mavg2.csv'

df_train_dist = pd.read_csv(train_file_dir, header = 0, skiprows = 0)
df_test_dist = pd.read_csv(test_file_dir, header = 0, skiprows = 0)

# Everything below does not depend on the horizon, so it is computed once.

# Columns of addrcode, week, year and actual values (col 1, 2, 3 and 4)
df_test_addrcode_week_year_dist = df_test_dist.iloc[:, [1, 2, 3, 4]]

# Non-DF predictors (column positions as noted by the author)
# Without CD: RF_wm6 [col 20] and LST_wm4 [col 21]
# With CD additionally:
# bin_pop9s [col 22], bowl_pop9s [col 23], bucket_pop9s [col 24],
# misc_short_pop9s [col 25], jar_pop9s [col 26], pottedplant_pop9s [col 27],
# tire_pop9s [col 28], misc_tall_pop9s [col 29], and total_pop9s [col 30]
df_train_dist_withoutCD = df_train_dist.iloc[:, [20, 21]]
df_train_dist_withCD = df_train_dist.iloc[:, 20: 31]

df_test_dist_withoutCD = df_test_dist.iloc[:, [20, 21]]
df_test_dist_withCD = df_test_dist.iloc[:, 20: 31]

# labels: response (target) variables DF_1 (col 4)
train_labels = np.array(df_train_dist['DF_1'])
test_labels = np.array(df_test_dist['DF_1'])

# District codes are taken from the training set
dist_code = df_train_dist['addrcode'].unique()

# Common prefix of every output file of this cell
out_prefix = ('Random Forest/' + province1 
              + '/Modified Lags/Adjusted CD/Original DF_0/RF_' + province2)

# From DF_1 to DF_6 (time horizon i + 1 weeks ahead)
for i in range(6):
    # DF predictors (column positions as noted by the author)
    # DF_0 [col 5], DF_wm1 [col 6], ..., DF_wm6 [col 11]
    # The slice starts at col 5 + i, so the i most recent DF lags are
    # dropped as the horizon grows.
    df_train_dist_DFinfo = df_train_dist.iloc[:, (5 + i):12]
    df_test_dist_DFinfo = df_test_dist.iloc[:, (5 + i):12]
        
    train_features_withoutCD = pd.concat([df_train_dist_DFinfo, df_train_dist_withoutCD], axis = 1)
    train_features_withCD = pd.concat([df_train_dist_DFinfo, df_train_dist_withCD], axis = 1)
    
    test_features_withoutCD = pd.concat([df_test_dist_DFinfo, df_test_dist_withoutCD], axis = 1)
    test_features_withCD = pd.concat([df_test_dist_DFinfo, df_test_dist_withCD], axis = 1)
    
    # Instantiate model with 10 decision trees
    rf_withoutCD = RandomForestRegressor(n_estimators = 10, random_state = 42)
    rf_withCD = RandomForestRegressor(n_estimators = 10, random_state = 42)

    # Train the model on training data
    rf_withoutCD.fit(train_features_withoutCD, train_labels)
    rf_withCD.fit(train_features_withCD, train_labels)

    # Use the forest's predict method on the test data
    predictions_withoutCD = rf_withoutCD.predict(test_features_withoutCD)
    predictions_withCD = rf_withCD.predict(test_features_withCD)
    
    df_pred_withoutCD = pd.DataFrame(predictions_withoutCD, columns = ['predicted'])
    df_pred_withCD = pd.DataFrame(predictions_withCD, columns = ['predicted'])
    
    # Paths of the prediction files (written here, read back further down)
    withoutCD_path = out_prefix + '_dist_DF_' + str(i + 1) + '_withoutCD_10.csv'
    withCD_path = out_prefix + '_dist_DF_' + str(i + 1) + '_withCD_10.csv'
        
    # Store all of the predicted values to the CSV files
    df_compare_addrcode_dist_withoutCD = pd.concat([df_test_addrcode_week_year_dist, df_pred_withoutCD], axis = 1)
    df_compare_addrcode_dist_withoutCD.columns = compare_columns
    df_compare_addrcode_dist_withoutCD.to_csv(withoutCD_path, encoding = 'utf-8')

    df_compare_addrcode_dist_withCD = pd.concat([df_test_addrcode_week_year_dist, df_pred_withCD], axis = 1)
    df_compare_addrcode_dist_withCD.columns = compare_columns
    df_compare_addrcode_dist_withCD.to_csv(withCD_path, encoding = 'utf-8')

    # Calculate the evaluation values over the whole test set
    rmse_withoutCD = mean_squared_error(test_labels, predictions_withoutCD) ** 0.5
    mae_withoutCD = mean_absolute_error(test_labels, predictions_withoutCD)
    r2_withoutCD = r2_score(test_labels, predictions_withoutCD)
    smape_withoutCD = smape_fast(test_labels, predictions_withoutCD)
        
    rmse_withCD = mean_squared_error(test_labels, predictions_withCD) ** 0.5
    mae_withCD = mean_absolute_error(test_labels, predictions_withCD)
    r2_withCD = r2_score(test_labels, predictions_withCD)
    smape_withCD = smape_fast(test_labels, predictions_withCD)
        
    # Relative change (a fraction, not multiplied by 100).
    # NOTE(review): for R squared a higher value is better, so with this
    # formula a positive value means the CD columns made R squared worse;
    # confirm the intended sign before interpreting '% improved R squared'.
    rmse_percent_improved = (rmse_withoutCD - rmse_withCD) / rmse_withoutCD
    mae_percent_improved = (mae_withoutCD - mae_withCD) / mae_withoutCD
    smape_percent_improved = (smape_withoutCD - smape_withCD) / smape_withoutCD
    r2_percent_improved = (r2_withoutCD - r2_withCD) / r2_withoutCD
        
    rmse = np.append(rmse, [rmse_withoutCD, rmse_withCD, rmse_percent_improved])
    mae = np.append(mae, [mae_withoutCD, mae_withCD, mae_percent_improved])
    smape = np.append(smape, [smape_withoutCD, smape_withCD, smape_percent_improved])
    r2 = np.append(r2, [r2_withoutCD, r2_withCD, r2_percent_improved])
        
    # Read the stored predictions back for the per-district evaluation
    df_withoutCD = pd.read_csv(withoutCD_path, header = 0)
    df_withCD = pd.read_csv(withCD_path, header = 0)
        
    # For each district
    for j in dist_code:
            
        # Get the subset of actual and predicted values according to the district code
        subset_withoutCD = df_withoutCD.loc[df_withoutCD['addrcode'] == j]
        subset_withCD = df_withCD.loc[df_withCD['addrcode'] == j]
        
        # A district that only occurs in the training set has no test rows;
        # the metric functions raise on empty input, so skip it.
        if subset_withoutCD.empty or subset_withCD.empty:
            continue
            
        # Pass the response values to the array for evaluation calculation
        array_true = np.array(subset_withoutCD['actual'])
        array_pred_withoutCD = np.array(subset_withoutCD['predicted'])
        array_pred_withCD = np.array(subset_withCD['predicted'])
            
        # Calculate the evaluation values
        rmse_withoutCD_dist = mean_squared_error(array_true, array_pred_withoutCD) ** 0.5
        mae_withoutCD_dist = mean_absolute_error(array_true, array_pred_withoutCD)
        smape_withoutCD_dist = smape_fast(array_true, array_pred_withoutCD)
        r2_withoutCD_dist = r2_score(array_true, array_pred_withoutCD)
            
        rmse_withCD_dist = mean_squared_error(array_true, array_pred_withCD) ** 0.5
        mae_withCD_dist = mean_absolute_error(array_true, array_pred_withCD)
        smape_withCD_dist = smape_fast(array_true, array_pred_withCD)
        r2_withCD_dist = r2_score(array_true, array_pred_withCD)
            
        rmse_percent_improved_dist = (rmse_withoutCD_dist - rmse_withCD_dist) / rmse_withoutCD_dist
        mae_percent_improved_dist = (mae_withoutCD_dist - mae_withCD_dist) / mae_withoutCD_dist
        smape_percent_improved_dist = (smape_withoutCD_dist - smape_withCD_dist) / smape_withoutCD_dist
        r2_percent_improved_dist = (r2_withoutCD_dist - r2_withCD_dist) / r2_withoutCD_dist
            
        # Append one row for this district
        dist_array = np.append(dist_array, [[j, rmse_withoutCD_dist, rmse_withCD_dist, rmse_percent_improved_dist,
                                            mae_withoutCD_dist, mae_withCD_dist, mae_percent_improved_dist,
                                            smape_withoutCD_dist, smape_withCD_dist, smape_percent_improved_dist,
                                            r2_withoutCD_dist, r2_withCD_dist, r2_percent_improved_dist]], axis = 0)

    # Store the per-district evaluation of this horizon
    pd.DataFrame(dist_array).to_csv(out_prefix + '_ByDistrict_DF_' + str(i + 1) + '_eval_10.csv', 
                                    header = False, encoding = 'utf-8')
        
    # Clear the old memory to store a new one
    dist_array = dist_header.copy()
    
# Evaluation file storing
# From RMSE DF_1 to R squared DF_6
eval_array = evaluation_print_modified_lag_original(eval_array, 'RMSE', rmse)
eval_array = evaluation_print_modified_lag_original(eval_array, 'MAE', mae)
eval_array = evaluation_print_modified_lag_original(eval_array, 'SMAPE', smape)
eval_array = evaluation_print_modified_lag_original(eval_array, 'R squared', r2)

# Store all of the evaluation values into a CSV file
pd.DataFrame(eval_array).to_csv(out_prefix + '_dist_eval_10.csv', header = False, encoding = 'utf-8')

For MAs (normal CD)

In [73]:
# Modified lags, district level, moving averages MA2 - MA4 (normal CD).
#
# For every moving-average window (MA2 - MA4) and every time horizon
# (1 - 6 weeks ahead) two random forests are fitted on DFma_1: one without and
# one with the CD columns. Predictions, overall metrics and per-district
# metrics are written to CSV files.

# Arrays of all evaluation values
# row: head,
# RMSE (DFma_1 - DFma_6, i.e. horizon 1 - 6 as used in the output file names), 
# MAE (DFma_1 - DFma_6), 
# SMAPE (DFma_1 - DFma_6), 
# R-squared (DFma_1 - DFma_6)

# col: head,
# MA2 (without CD, with CD, % improved),
# MA3 (without CD, with CD, % improved),
# MA4 (without CD, with CD, % improved)

eval_array = np.asarray([['Evaluation', 'MA2 without CD', 'MA2 with CD', 'MA2 % improved', 
                         'MA3 without CD', 'MA3 with CD', 'MA3 % improved', 
                         'MA4 without CD', 'MA4 with CD', 'MA4 % improved']])

# Index 0 of each metric array is a placeholder 0; every (MA, horizon) pair
# appends three values: without CD, with CD, improvement.
rmse = np.zeros(1)
mae = np.zeros(1)
smape = np.zeros(1)
r2 = np.zeros(1)

# Header row of the per-district evaluation table (reused for every output file)
dist_header = np.asarray([['addrcode', 'RMSE without CD', 'RMSE with CD', '% improved RMSE', 
                          'MAE without CD', 'MAE with CD', '% improved MAE', 
                          'SMAPE without CD', 'SMAPE with CD', '% improved SMAPE', 
                          'R squared without CD', 'R squared with CD', '% improved R squared']])
dist_array = dist_header.copy()

# Column names of the "actual vs predicted" files.
# A flat list is used on purpose: assigning a nested list ([[...]]) would
# build a one-level MultiIndex instead of ordinary column labels.
compare_columns = ['addrcode', 'Week', 'Year', 'actual', 'predicted']

# Get the input var from CSV file
# Starting from MA2 to MA4
for i in range(2, 5):
    # Get the input variables from CSV file
    train_file_dir = 'Data/' + province1 + '/Modified Lags/train_' + province2 + '_dist_cd_mavg' + str(i) + '.csv'
    test_file_dir = 'Data/' + province1 + '/Modified Lags/test_' + province2 + '_dist_cd_mavg' + str(i) + '.csv'
    
    df_train_dist = pd.read_csv(train_file_dir, header=0, skiprows=0)
    df_test_dist = pd.read_csv(test_file_dir, header=0, skiprows=0)
    
    # Everything below depends only on the MA window, not on the horizon,
    # so it is computed once per window.
    
    # Columns of addrcode, week, year and actual values (col 1, 2, 3 and 12)
    df_test_addrcode_week_year_dist = df_test_dist.iloc[:, [1, 2, 3, 12]]
    
    # labels: response (target) variables DFma_1 (col 12)
    train_labels = np.array(df_train_dist['DFma_1'])
    test_labels = np.array(df_test_dist['DFma_1'])
    
    # District codes are taken from the training set
    dist_code = df_train_dist['addrcode'].unique()
    
    # Common prefix of every output file of this MA window
    ma_prefix = ('Random Forest/' + province1 + '/Modified Lags/Normal CD/MA' + str(i) 
                 + '/RF_' + province2)
    
    # Continue on DFma_1 to DFma_6 (time horizon j + 1 weeks ahead)
    for j in range(6):
        # x: independent variables (column positions as noted by the author)
        # DFma_0 [col 13], DFma_wm1 [col 14], ..., DFma_wm6 [col 19],
        # RF_wm6 [col 20], LST_wm4 [col 21]
        # With CD additionally:
        # bin [col 22], bowl [col 23], bucket [col 24], misc_short [col 25],
        # jar [col 26], pottedplant [col 27], tire [col 28], misc_tall [col 29],
        # and total [col 30]
        #
        # The slice starts at col 13 + j, so the j most recent DFma lags are
        # dropped as the horizon grows.
        train_features_withoutCD = df_train_dist.iloc[:, (13 + j): 22]
        train_features_withCD = df_train_dist.iloc[:, (13 + j): 31]
        
        test_features_withoutCD = df_test_dist.iloc[:, (13 + j): 22]
        test_features_withCD = df_test_dist.iloc[:, (13 + j): 31]

        # Instantiate model with 10 decision trees
        rf_withoutCD = RandomForestRegressor(n_estimators = 10, random_state = 42)
        rf_withCD = RandomForestRegressor(n_estimators = 10, random_state = 42)

        # Train the model on training data
        rf_withoutCD.fit(train_features_withoutCD, train_labels)
        rf_withCD.fit(train_features_withCD, train_labels)

        # Use the forest's predict method on the test data
        predictions_withoutCD = rf_withoutCD.predict(test_features_withoutCD)
        predictions_withCD = rf_withCD.predict(test_features_withCD)
        
        df_pred_withoutCD = pd.DataFrame(predictions_withoutCD, columns = ['predicted'])
        df_pred_withCD = pd.DataFrame(predictions_withCD, columns = ['predicted'])
        
        # Paths of the prediction files (written here, read back further down)
        pred_prefix = ma_prefix + '_dist_MA' + str(i) + '_DFma_' + str(j + 1)
        withoutCD_path = pred_prefix + '_withoutCD_10.csv'
        withCD_path = pred_prefix + '_withCD_10.csv'
        
        # Store all of the predicted values to the CSV files
        df_compare_addrcode_dist_withoutCD = pd.concat([df_test_addrcode_week_year_dist, df_pred_withoutCD], axis = 1)
        df_compare_addrcode_dist_withoutCD.columns = compare_columns
        df_compare_addrcode_dist_withoutCD.to_csv(withoutCD_path, encoding = 'utf-8')

        df_compare_addrcode_dist_withCD = pd.concat([df_test_addrcode_week_year_dist, df_pred_withCD], axis = 1)
        df_compare_addrcode_dist_withCD.columns = compare_columns
        df_compare_addrcode_dist_withCD.to_csv(withCD_path, encoding = 'utf-8')

        # Calculate the evaluation values over the whole test set
        rmse_withoutCD = mean_squared_error(test_labels, predictions_withoutCD) ** 0.5
        mae_withoutCD = mean_absolute_error(test_labels, predictions_withoutCD)
        r2_withoutCD = r2_score(test_labels, predictions_withoutCD)
        smape_withoutCD = smape_fast(test_labels, predictions_withoutCD)
        
        rmse_withCD = mean_squared_error(test_labels, predictions_withCD) ** 0.5
        mae_withCD = mean_absolute_error(test_labels, predictions_withCD)
        r2_withCD = r2_score(test_labels, predictions_withCD)
        smape_withCD = smape_fast(test_labels, predictions_withCD)
        
        # Relative change (a fraction, not multiplied by 100).
        # NOTE(review): for R squared a higher value is better, so with this
        # formula a positive value means the CD columns made R squared worse;
        # confirm the intended sign before interpreting '% improved R squared'.
        rmse_percent_improved = (rmse_withoutCD - rmse_withCD) / rmse_withoutCD
        mae_percent_improved = (mae_withoutCD - mae_withCD) / mae_withoutCD
        smape_percent_improved = (smape_withoutCD - smape_withCD) / smape_withoutCD
        r2_percent_improved = (r2_withoutCD - r2_withCD) / r2_withoutCD
        
        rmse = np.append(rmse, [rmse_withoutCD, rmse_withCD, rmse_percent_improved])
        mae = np.append(mae, [mae_withoutCD, mae_withCD, mae_percent_improved])
        smape = np.append(smape, [smape_withoutCD, smape_withCD, smape_percent_improved])
        r2 = np.append(r2, [r2_withoutCD, r2_withCD, r2_percent_improved])
        
        # Read the stored predictions back for the per-district evaluation
        df_withoutCD = pd.read_csv(withoutCD_path, header = 0)
        df_withCD = pd.read_csv(withCD_path, header = 0)
        
        # For each district
        for k in dist_code:
            
            # Get the subset of actual and predicted values according to the district code
            subset_withoutCD = df_withoutCD.loc[df_withoutCD['addrcode'] == k]
            subset_withCD = df_withCD.loc[df_withCD['addrcode'] == k]
            
            # A district that only occurs in the training set has no test rows;
            # the metric functions raise on empty input, so skip it.
            if subset_withoutCD.empty or subset_withCD.empty:
                continue
            
            # Pass the response values to the array for evaluation calculation
            array_true = np.array(subset_withoutCD['actual'])
            array_pred_withoutCD = np.array(subset_withoutCD['predicted'])
            array_pred_withCD = np.array(subset_withCD['predicted'])
            
            # Calculate the evaluation values
            rmse_withoutCD_dist = mean_squared_error(array_true, array_pred_withoutCD) ** 0.5
            mae_withoutCD_dist = mean_absolute_error(array_true, array_pred_withoutCD)
            smape_withoutCD_dist = smape_fast(array_true, array_pred_withoutCD)
            r2_withoutCD_dist = r2_score(array_true, array_pred_withoutCD)
            
            rmse_withCD_dist = mean_squared_error(array_true, array_pred_withCD) ** 0.5
            mae_withCD_dist = mean_absolute_error(array_true, array_pred_withCD)
            smape_withCD_dist = smape_fast(array_true, array_pred_withCD)
            r2_withCD_dist = r2_score(array_true, array_pred_withCD)
            
            rmse_percent_improved_dist = (rmse_withoutCD_dist - rmse_withCD_dist) / rmse_withoutCD_dist
            mae_percent_improved_dist = (mae_withoutCD_dist - mae_withCD_dist) / mae_withoutCD_dist
            smape_percent_improved_dist = (smape_withoutCD_dist - smape_withCD_dist) / smape_withoutCD_dist
            r2_percent_improved_dist = (r2_withoutCD_dist - r2_withCD_dist) / r2_withoutCD_dist
            
            # Append one row for this district
            dist_array = np.append(dist_array, [[k, rmse_withoutCD_dist, rmse_withCD_dist, rmse_percent_improved_dist,
                                                mae_withoutCD_dist, mae_withCD_dist, mae_percent_improved_dist,
                                                smape_withoutCD_dist, smape_withCD_dist, smape_percent_improved_dist,
                                                r2_withoutCD_dist, r2_withCD_dist, r2_percent_improved_dist]], axis = 0)

        # Store the per-district evaluation of this MA window and horizon
        pd.DataFrame(dist_array).to_csv(ma_prefix + '_ByDistrict_MA' + str(i) + '_DFma_' 
                                        + str(j + 1) + '_eval_10.csv', header = False, encoding = 'utf-8')
        
        # Clear the old memory to store a new one
        dist_array = dist_header.copy()
    
# Evaluation file storing
# From RMSE DFma_1 to R squared DFma_6
eval_array = evaluation_print_modified_lag(eval_array, 'RMSE', rmse)
eval_array = evaluation_print_modified_lag(eval_array, 'MAE', mae)
eval_array = evaluation_print_modified_lag(eval_array, 'SMAPE', smape)
eval_array = evaluation_print_modified_lag(eval_array, 'R squared', r2)

# Store all of the evaluation values into a CSV file
pd.DataFrame(eval_array).to_csv('Random Forest/' + province1 + '/Modified Lags/Normal CD/RF_' + province2 
                                + '_dist_eval_10.csv', header = False, encoding = 'utf-8')

For original DF_0 (without smoothing, normal CD)

In [74]:
# Summary table of the overall evaluation values.
#   rows:    header, then RMSE, MAE, SMAPE and R-squared (DF_1 - DF_6 each)
#   columns: label, then DF_0 (without CD, with CD, % improved)
eval_array = np.array([['Evaluation', 'Without CD', 'With CD', '% improved']])

# One accumulator per metric. Index 0 is a placeholder zero; the training loop
# appends (without CD, with CD, % improved) on every pass.
rmse, mae, smape, r2 = (np.zeros(1) for _ in range(4))

# Header row of the per-district evaluation table
dist_array = np.array([[
    'addrcode',
    'RMSE without CD', 'RMSE with CD', '% improved RMSE',
    'MAE without CD', 'MAE with CD', '% improved MAE',
    'SMAPE without CD', 'SMAPE with CD', '% improved SMAPE',
    'R squared without CD', 'R squared with CD', '% improved R squared',
]])

# Load the train / test input variables from their CSV files.
# NOTE(review): the inputs are read from 'Normal Lags' while this cell writes
# its results under 'Modified Lags' - confirm this is the intended source.
input_dir = 'Data/' + province1 + '/Normal Lags/'
train_file_dir = input_dir + 'train_' + province2 + '_dist_cd_mavg2.csv'
test_file_dir = input_dir + 'test_' + province2 + '_dist_cd_mavg2.csv'

df_train_dist = pd.read_csv(train_file_dir, header = 0, skiprows = 0)
df_test_dist = pd.read_csv(test_file_dir, header = 0, skiprows = 0)

# Train and score one pair of random forests (without / with CD features) for
# each of the six feature windows, overall and per district.
# Pass i is reported as DF_(i + 1) and keeps the DF columns (5 + i)..11 of the
# input files, so every pass drops one more of the leading DF columns.

# --- Loop invariants (none of them depends on i) ---

# Columns 1-4 of the test file: addrcode, week, year and actual values
df_test_addrcode_week_year_dist = df_test_dist.iloc[:, [1, 2, 3, 4]]

# labels: response (target) variable DF_1
# Pass the response values to arrays for evaluation calculation
train_labels = np.array(df_train_dist['DF_1'])
test_labels = np.array(df_test_dist['DF_1'])

# District codes to evaluate, taken from the training file
dist_code = df_train_dist['addrcode'].unique()

# Flat list of labels: assigning a nested list ([[...]]) makes pandas build a
# one-level MultiIndex instead of plain column labels
compare_columns = ['addrcode', 'Week', 'Year', 'actual', 'predicted']

# Common prefix of every file written by this loop
output_prefix = ('Random Forest/' + province1 + '/Modified Lags/Normal CD/Original DF_0/RF_'
                 + province2)

# From DF_1 to DF_6
for i in range(6):
    # Independent variables, by column position in the input files
    # (names as listed in the original notes):
    #   DF block (both variants), from column 5 + i up to column 11:
    #     DF_0 [col 5], DF_wm1 [col 6], DF_wm2 [col 7], DF_wm3 [col 8],
    #     DF_wm4 [col 9], DF_wm5 [col 10], DF_wm6 [col 11]
    #   Without CD adds: RF_wm6 [col 20] and LST_wm4 [col 21]
    #   With CD adds columns 20-30: RF_wm6 [col 20], LST_wm4 [col 21],
    #     bin_pop9s [col 22], bowl_pop9s [col 23], bucket_pop9s [col 24],
    #     misc_short_pop9s [col 25], jar_pop9s [col 26], pottedplant_pop9s [col 27],
    #     tire_pop9s [col 28], misc_tall_pop9s [col 29] and total_pop9s [col 30]
    df_train_dist_DFinfo = df_train_dist.iloc[:, (5 + i):12]
    df_train_dist_withoutCD = df_train_dist.iloc[:, [20, 21]]
    df_train_dist_withCD = df_train_dist.iloc[:, 20: 31]

    df_test_dist_DFinfo = df_test_dist.iloc[:, (5 + i):12]
    df_test_dist_withoutCD = df_test_dist.iloc[:, [20, 21]]
    df_test_dist_withCD = df_test_dist.iloc[:, 20: 31]

    train_features_withoutCD = pd.concat([df_train_dist_DFinfo, df_train_dist_withoutCD], axis = 1)
    train_features_withCD = pd.concat([df_train_dist_DFinfo, df_train_dist_withCD], axis = 1)

    test_features_withoutCD = pd.concat([df_test_dist_DFinfo, df_test_dist_withoutCD], axis = 1)
    test_features_withCD = pd.concat([df_test_dist_DFinfo, df_test_dist_withCD], axis = 1)

    # Instantiate both models with 10 decision trees and the same seed
    rf_withoutCD = RandomForestRegressor(n_estimators = 10, random_state = 42)
    rf_withCD = RandomForestRegressor(n_estimators = 10, random_state = 42)

    # Train the models on training data
    rf_withoutCD.fit(train_features_withoutCD, train_labels)
    rf_withCD.fit(train_features_withCD, train_labels)

    # Use the forest's predict method on the test data
    predictions_withoutCD = rf_withoutCD.predict(test_features_withoutCD)
    predictions_withCD = rf_withCD.predict(test_features_withCD)

    df_pred_withoutCD = pd.DataFrame(predictions_withoutCD, columns = ['predicted'])
    df_pred_withCD = pd.DataFrame(predictions_withCD, columns = ['predicted'])

    # Store all of the predicted values to the CSV files
    df_compare_addrcode_dist_withoutCD = pd.concat([df_test_addrcode_week_year_dist, df_pred_withoutCD], axis = 1)
    df_compare_addrcode_dist_withoutCD.columns = compare_columns
    df_compare_addrcode_dist_withoutCD.to_csv(output_prefix + '_dist_DF_' + str(i + 1) + '_withoutCD_10.csv',
                                              encoding = 'utf-8')

    df_compare_addrcode_dist_withCD = pd.concat([df_test_addrcode_week_year_dist, df_pred_withCD], axis = 1)
    df_compare_addrcode_dist_withCD.columns = compare_columns
    df_compare_addrcode_dist_withCD.to_csv(output_prefix + '_dist_DF_' + str(i + 1) + '_withCD_10.csv',
                                           encoding = 'utf-8')

    # Calculate the overall evaluation values
    rmse_withoutCD = mean_squared_error(test_labels, predictions_withoutCD) ** 0.5
    mae_withoutCD = mean_absolute_error(test_labels, predictions_withoutCD)
    r2_withoutCD = r2_score(test_labels, predictions_withoutCD)
    smape_withoutCD = smape_fast(test_labels, predictions_withoutCD)

    rmse_withCD = mean_squared_error(test_labels, predictions_withCD) ** 0.5
    mae_withCD = mean_absolute_error(test_labels, predictions_withCD)
    r2_withCD = r2_score(test_labels, predictions_withCD)
    smape_withCD = smape_fast(test_labels, predictions_withCD)

    # Relative change against the without-CD baseline, stored as a fraction
    # (not multiplied by 100). The np.float64 denominator makes a zero baseline
    # yield inf / nan instead of raising ZeroDivisionError on plain Python floats.
    # NOTE(review): R squared is better when higher, so with this shared formula
    # a positive '% improved R squared' means R squared dropped after adding CD -
    # confirm the intended sign.
    with np.errstate(divide = 'ignore', invalid = 'ignore'):
        rmse_percent_improved = (rmse_withoutCD - rmse_withCD) / np.float64(rmse_withoutCD)
        mae_percent_improved = (mae_withoutCD - mae_withCD) / np.float64(mae_withoutCD)
        smape_percent_improved = (smape_withoutCD - smape_withCD) / np.float64(smape_withoutCD)
        r2_percent_improved = (r2_withoutCD - r2_withCD) / np.float64(r2_withoutCD)

    rmse = np.append(rmse, [rmse_withoutCD, rmse_withCD, rmse_percent_improved])
    mae = np.append(mae, [mae_withoutCD, mae_withCD, mae_percent_improved])
    smape = np.append(smape, [smape_withoutCD, smape_withCD, smape_percent_improved])
    r2 = np.append(r2, [r2_withoutCD, r2_withCD, r2_percent_improved])

    # Per-district evaluation works on the in-memory comparison frames; they hold
    # the same rows as the CSV files written above, so nothing is read back.
    df_withoutCD = df_compare_addrcode_dist_withoutCD
    df_withCD = df_compare_addrcode_dist_withCD

    dist_rows = []

    # For each district
    for j in dist_code:

        # Get the subset of actual and predicted values according to the district code
        subset_withoutCD = df_withoutCD.loc[df_withoutCD['addrcode'] == j]
        subset_withCD = df_withCD.loc[df_withCD['addrcode'] == j]

        # A district that has no rows in the test file cannot be scored
        # (the sklearn metrics raise on empty input), so it is left out
        if subset_withoutCD.empty:
            continue

        # Pass the response values to the array for evaluation calculation
        array_true = np.array(subset_withoutCD['actual'])
        array_pred_withoutCD = np.array(subset_withoutCD['predicted'])
        array_pred_withCD = np.array(subset_withCD['predicted'])

        # Calculate the evaluation values
        rmse_withoutCD_dist = mean_squared_error(array_true, array_pred_withoutCD) ** 0.5
        mae_withoutCD_dist = mean_absolute_error(array_true, array_pred_withoutCD)
        smape_withoutCD_dist = smape_fast(array_true, array_pred_withoutCD)
        r2_withoutCD_dist = r2_score(array_true, array_pred_withoutCD)

        rmse_withCD_dist = mean_squared_error(array_true, array_pred_withCD) ** 0.5
        mae_withCD_dist = mean_absolute_error(array_true, array_pred_withCD)
        smape_withCD_dist = smape_fast(array_true, array_pred_withCD)
        r2_withCD_dist = r2_score(array_true, array_pred_withCD)

        # Same relative-change formula as above. Zero baselines are realistic
        # here: smape_fast returns a plain Python 0.0 when every
        # actual + predicted pair of a district sums to zero.
        with np.errstate(divide = 'ignore', invalid = 'ignore'):
            rmse_percent_improved_dist = (rmse_withoutCD_dist - rmse_withCD_dist) / np.float64(rmse_withoutCD_dist)
            mae_percent_improved_dist = (mae_withoutCD_dist - mae_withCD_dist) / np.float64(mae_withoutCD_dist)
            smape_percent_improved_dist = (smape_withoutCD_dist - smape_withCD_dist) / np.float64(smape_withoutCD_dist)
            r2_percent_improved_dist = (r2_withoutCD_dist - r2_withCD_dist) / np.float64(r2_withoutCD_dist)

        # Collect the row; the table is assembled once after the loop
        dist_rows.append([j, rmse_withoutCD_dist, rmse_withCD_dist, rmse_percent_improved_dist,
                          mae_withoutCD_dist, mae_withCD_dist, mae_percent_improved_dist,
                          smape_withoutCD_dist, smape_withCD_dist, smape_percent_improved_dist,
                          r2_withoutCD_dist, r2_withCD_dist, r2_percent_improved_dist])

    # Header row (dist_array, never modified here) followed by one row per scored
    # district; a fresh table is built on every pass, so nothing has to be reset.
    dist_eval = np.append(dist_array, dist_rows, axis = 0) if dist_rows else dist_array
    pd.DataFrame(dist_eval).to_csv(output_prefix + '_ByDistrict_DF_' + str(i + 1) + '_eval_10.csv',
                                   header = False, encoding = 'utf-8')
    
# Fold each metric's accumulated values into the summary table, in the order
# RMSE, MAE, SMAPE, R squared.
for metric_label, metric_values in (('RMSE', rmse), ('MAE', mae),
                                    ('SMAPE', smape), ('R squared', r2)):
    eval_array = evaluation_print_original(eval_array, metric_label, metric_values)

# Write the summary table to a CSV file (no header row, index column kept)
summary_path = ('Random Forest/' + province1 + '/Modified Lags/Normal CD/Original DF_0/RF_'
                + province2 + '_dist_eval_10.csv')
summary_frame = pd.DataFrame(eval_array)
summary_frame.to_csv(summary_path, header = False, encoding = 'utf-8')

<h2>Sub-district level</h2>
For MAs (adjusted CD)

In [75]:
# Summary table of the overall evaluation values.
#   rows:    header, then RMSE, MAE, SMAPE and R-squared (DFma_1 - DFma_6 each)
#   columns: label, then (without CD, with CD, % improved) for MA2, MA3 and MA4
eval_array = np.array([[
    'Evaluation',
    'MA2 without CD', 'MA2 with CD', 'MA2 % improved',
    'MA3 without CD', 'MA3 with CD', 'MA3 % improved',
    'MA4 without CD', 'MA4 with CD', 'MA4 % improved',
]])

# One accumulator per metric. Index 0 is a placeholder zero; the training loop
# appends (without CD, with CD, % improved) on every pass.
rmse, mae, smape, r2 = (np.zeros(1) for _ in range(4))

# Header row of the per-sub-district evaluation table
subdist_array = np.array([[
    'addrcode',
    'RMSE without CD', 'RMSE with CD', '% improved RMSE',
    'MAE without CD', 'MAE with CD', '% improved MAE',
    'SMAPE without CD', 'SMAPE with CD', '% improved SMAPE',
    'R squared without CD', 'R squared with CD', '% improved R squared',
]])

# Train and score one pair of random forests (without / with CD features) for
# every moving-average window (MA2 - MA4) and every feature window
# (DFma_1 - DFma_6), overall and per sub-district.
# Pass j keeps the DFma columns (13 + j)..19 of the input files, so every pass
# drops one more of the leading DFma columns.

# Flat list of labels: assigning a nested list ([[...]]) makes pandas build a
# one-level MultiIndex instead of plain column labels
compare_columns = ['addrcode', 'Week', 'Year', 'actual', 'predicted']

# Starting from MA2 to MA4
for i in range(2, 5):
    # Get the input variables from CSV file
    train_file_dir = 'Data/' + province1 + '/Modified Lags/train_' + province2 + '_subdist_total_mavg' + str(i) + '.csv'
    test_file_dir = 'Data/' + province1 + '/Modified Lags/test_' + province2 + '_subdist_total_mavg' + str(i) + '.csv'

    df_train_subdist = pd.read_csv(train_file_dir, header=0, skiprows=0)
    df_test_subdist = pd.read_csv(test_file_dir, header=0, skiprows=0)

    # --- Invariants of this MA window (none of them depends on j) ---

    # Columns 1, 2, 3 and 12 of the test file: addrcode, week, year and actual values
    df_test_addrcode_week_year_subdist = df_test_subdist.iloc[:, [1, 2, 3, 12]]

    # labels: response (target) variable DFma_1
    # NOTE(review): the original comment placed DFma_1 at col 14, but the
    # 'actual' column above is read from position 12 - confirm the layout.
    # Pass the response values to arrays for evaluation calculation
    train_labels = np.array(df_train_subdist['DFma_1'])
    test_labels = np.array(df_test_subdist['DFma_1'])

    # Sub-district codes to evaluate, taken from the training file
    subdist_code = df_train_subdist['addrcode'].unique()

    # Common prefix of every file written for this MA window
    output_prefix = ('Random Forest/' + province1 + '/Modified Lags/Adjusted CD/MA' + str(i)
                     + '/RF_' + province2)

    # Continue on DFma_1 to DFma_6
    for j in range(6):
        # Independent variables, by column position in the input files
        # (names as listed in the original notes):
        #   DFma block (both variants), from column 13 + j up to column 19:
        #     DFma_0 [col 13], DFma_wm1 [col 14], DFma_wm2 [col 15], DFma_wm3 [col 16],
        #     DFma_wm4 [col 17], DFma_wm5 [col 18], DFma_wm6 [col 19]
        #   Without CD continues with: RF_wm6 [col 20] and LST_wm4 [col 21]
        #   With CD continues up to column 30: RF_wm6 [col 20], LST_wm4 [col 21],
        #     bin [col 22], bowl [col 23], bucket [col 24], misc_short [col 25],
        #     jar [col 26], pottedplant [col 27], tire [col 28], misc_tall [col 29]
        #     and total [col 30]
        train_features_withoutCD = df_train_subdist.iloc[:, (13 + j): 22]
        train_features_withCD = df_train_subdist.iloc[:, (13 + j): 31]

        test_features_withoutCD = df_test_subdist.iloc[:, (13 + j): 22]
        test_features_withCD = df_test_subdist.iloc[:, (13 + j): 31]

        # Instantiate both models with 10 decision trees and the same seed
        rf_withoutCD = RandomForestRegressor(n_estimators = 10, random_state = 42)
        rf_withCD = RandomForestRegressor(n_estimators = 10, random_state = 42)

        # Train the models on training data
        rf_withoutCD.fit(train_features_withoutCD, train_labels)
        rf_withCD.fit(train_features_withCD, train_labels)

        # Use the forest's predict method on the test data
        predictions_withoutCD = rf_withoutCD.predict(test_features_withoutCD)
        predictions_withCD = rf_withCD.predict(test_features_withCD)

        df_pred_withoutCD = pd.DataFrame(predictions_withoutCD, columns = ['predicted'])
        df_pred_withCD = pd.DataFrame(predictions_withCD, columns = ['predicted'])

        # Part of every file name that identifies this (MA, DFma) combination
        run_tag = '_MA' + str(i) + '_DFma_' + str(j + 1)

        # Store all of the predicted values to the CSV files
        df_compare_addrcode_subdist_withoutCD = pd.concat([df_test_addrcode_week_year_subdist, df_pred_withoutCD], axis = 1)
        df_compare_addrcode_subdist_withoutCD.columns = compare_columns
        df_compare_addrcode_subdist_withoutCD.to_csv(output_prefix + '_subdist' + run_tag + '_withoutCD_10.csv',
                                                     encoding = 'utf-8')

        df_compare_addrcode_subdist_withCD = pd.concat([df_test_addrcode_week_year_subdist, df_pred_withCD], axis = 1)
        df_compare_addrcode_subdist_withCD.columns = compare_columns
        df_compare_addrcode_subdist_withCD.to_csv(output_prefix + '_subdist' + run_tag + '_withCD_10.csv',
                                                  encoding = 'utf-8')

        # Calculate the overall evaluation values
        rmse_withoutCD = mean_squared_error(test_labels, predictions_withoutCD) ** 0.5
        mae_withoutCD = mean_absolute_error(test_labels, predictions_withoutCD)
        r2_withoutCD = r2_score(test_labels, predictions_withoutCD)
        smape_withoutCD = smape_fast(test_labels, predictions_withoutCD)

        rmse_withCD = mean_squared_error(test_labels, predictions_withCD) ** 0.5
        mae_withCD = mean_absolute_error(test_labels, predictions_withCD)
        r2_withCD = r2_score(test_labels, predictions_withCD)
        smape_withCD = smape_fast(test_labels, predictions_withCD)

        # Relative change against the without-CD baseline, stored as a fraction
        # (not multiplied by 100). The np.float64 denominator makes a zero
        # baseline yield inf / nan instead of raising ZeroDivisionError on plain
        # Python floats.
        # NOTE(review): R squared is better when higher, so with this shared
        # formula a positive '% improved R squared' means R squared dropped
        # after adding CD - confirm the intended sign.
        with np.errstate(divide = 'ignore', invalid = 'ignore'):
            rmse_percent_improved = (rmse_withoutCD - rmse_withCD) / np.float64(rmse_withoutCD)
            mae_percent_improved = (mae_withoutCD - mae_withCD) / np.float64(mae_withoutCD)
            smape_percent_improved = (smape_withoutCD - smape_withCD) / np.float64(smape_withoutCD)
            r2_percent_improved = (r2_withoutCD - r2_withCD) / np.float64(r2_withoutCD)

        rmse = np.append(rmse, [rmse_withoutCD, rmse_withCD, rmse_percent_improved])
        mae = np.append(mae, [mae_withoutCD, mae_withCD, mae_percent_improved])
        smape = np.append(smape, [smape_withoutCD, smape_withCD, smape_percent_improved])
        r2 = np.append(r2, [r2_withoutCD, r2_withCD, r2_percent_improved])

        # Per-sub-district evaluation works on the in-memory comparison frames;
        # they hold the same rows as the CSV files written above, so nothing is
        # read back.
        df_withoutCD = df_compare_addrcode_subdist_withoutCD
        df_withCD = df_compare_addrcode_subdist_withCD

        subdist_rows = []

        # For each sub-district
        for k in subdist_code:

            # Get the subset of actual and predicted values according to the sub-district code
            subset_withoutCD = df_withoutCD.loc[df_withoutCD['addrcode'] == k]
            subset_withCD = df_withCD.loc[df_withCD['addrcode'] == k]

            # A sub-district that has no rows in the test file cannot be scored
            # (the sklearn metrics raise on empty input), so it is left out
            if subset_withoutCD.empty:
                continue

            # Pass the response values to the array for evaluation calculation
            array_true = np.array(subset_withoutCD['actual'])
            array_pred_withoutCD = np.array(subset_withoutCD['predicted'])
            array_pred_withCD = np.array(subset_withCD['predicted'])

            # Calculate the evaluation values
            rmse_withoutCD_subdist = mean_squared_error(array_true, array_pred_withoutCD) ** 0.5
            mae_withoutCD_subdist = mean_absolute_error(array_true, array_pred_withoutCD)
            smape_withoutCD_subdist = smape_fast(array_true, array_pred_withoutCD)
            r2_withoutCD_subdist = r2_score(array_true, array_pred_withoutCD)

            rmse_withCD_subdist = mean_squared_error(array_true, array_pred_withCD) ** 0.5
            mae_withCD_subdist = mean_absolute_error(array_true, array_pred_withCD)
            smape_withCD_subdist = smape_fast(array_true, array_pred_withCD)
            r2_withCD_subdist = r2_score(array_true, array_pred_withCD)

            # Same relative-change formula as above. Zero baselines are realistic
            # here: smape_fast returns a plain Python 0.0 when every
            # actual + predicted pair of a sub-district sums to zero.
            with np.errstate(divide = 'ignore', invalid = 'ignore'):
                rmse_percent_improved_subdist = (rmse_withoutCD_subdist - rmse_withCD_subdist) / np.float64(rmse_withoutCD_subdist)
                mae_percent_improved_subdist = (mae_withoutCD_subdist - mae_withCD_subdist) / np.float64(mae_withoutCD_subdist)
                smape_percent_improved_subdist = (smape_withoutCD_subdist - smape_withCD_subdist) / np.float64(smape_withoutCD_subdist)
                r2_percent_improved_subdist = (r2_withoutCD_subdist - r2_withCD_subdist) / np.float64(r2_withoutCD_subdist)

            # Collect the row; the table is assembled once after the loop
            subdist_rows.append([k, rmse_withoutCD_subdist, rmse_withCD_subdist, rmse_percent_improved_subdist,
                                 mae_withoutCD_subdist, mae_withCD_subdist, mae_percent_improved_subdist,
                                 smape_withoutCD_subdist, smape_withCD_subdist, smape_percent_improved_subdist,
                                 r2_withoutCD_subdist, r2_withCD_subdist, r2_percent_improved_subdist])

        # Header row (subdist_array, never modified here) followed by one row per
        # scored sub-district; a fresh table is built on every pass, so nothing
        # has to be reset.
        subdist_eval = np.append(subdist_array, subdist_rows, axis = 0) if subdist_rows else subdist_array
        pd.DataFrame(subdist_eval).to_csv(output_prefix + '_BySubDistrict' + run_tag + '_eval_10.csv',
                                          header = False, encoding = 'utf-8')
    
# Fold each metric's accumulated values into the summary table, in the order
# RMSE, MAE, SMAPE, R squared.
for metric_label, metric_values in (('RMSE', rmse), ('MAE', mae),
                                    ('SMAPE', smape), ('R squared', r2)):
    eval_array = evaluation_print(eval_array, metric_label, metric_values)

# Write the summary table to a CSV file (no header row, index column kept)
summary_path = ('Random Forest/' + province1 + '/Modified Lags/Adjusted CD/RF_' + province2
                + '_subdist_eval_10.csv')
summary_frame = pd.DataFrame(eval_array)
summary_frame.to_csv(summary_path, header = False, encoding = 'utf-8')

For original DF_0 (without smoothing, adjusted CD)

In [76]:
# Summary table of the overall evaluation values.
#   rows:    header, then RMSE, MAE, SMAPE and R-squared (DF_1)
#   columns: label, then DF_0 (without CD, with CD, % improved)
eval_array = np.array([['Evaluation', 'Without CD', 'With CD', '% improved']])

# One accumulator per metric. Index 0 is a placeholder zero; the training loop
# appends (without CD, with CD, % improved) on every pass.
rmse, mae, smape, r2 = (np.zeros(1) for _ in range(4))

# Header row of the per-sub-district evaluation table
subdist_array = np.array([[
    'addrcode',
    'RMSE without CD', 'RMSE with CD', '% improved RMSE',
    'MAE without CD', 'MAE with CD', '% improved MAE',
    'SMAPE without CD', 'SMAPE with CD', '% improved SMAPE',
    'R squared without CD', 'R squared with CD', '% improved R squared',
]])

# Load the train / test input variables from their CSV files
input_dir = 'Data/' + province1 + '/Modified Lags/'
train_file_dir = input_dir + 'train_' + province2 + '_subdist_total_mavg2.csv'
test_file_dir = input_dir + 'test_' + province2 + '_subdist_total_mavg2.csv'

df_train_subdist = pd.read_csv(train_file_dir, header = 0, skiprows = 0)
df_test_subdist = pd.read_csv(test_file_dir, header = 0, skiprows = 0)

# Train and score one pair of random forests (without / with CD features) for
# each of the six feature windows, overall and per sub-district.
# Pass i is reported as DF_(i + 1) and keeps the DF columns (5 + i)..11 of the
# input files, so every pass drops one more of the leading DF columns.

# --- Loop invariants (none of them depends on i) ---

# Columns 1-4 of the test file: addrcode, week, year and actual values
df_test_addrcode_week_year_subdist = df_test_subdist.iloc[:, [1, 2, 3, 4]]

# labels: response (target) variable DF_1
# Pass the response values to arrays for evaluation calculation
train_labels = np.array(df_train_subdist['DF_1'])
test_labels = np.array(df_test_subdist['DF_1'])

# Sub-district codes to evaluate, taken from the training file
subdist_code = df_train_subdist['addrcode'].unique()

# Flat list of labels: assigning a nested list ([[...]]) makes pandas build a
# one-level MultiIndex instead of plain column labels
compare_columns = ['addrcode', 'Week', 'Year', 'actual', 'predicted']

# Common prefix of every file written by this loop
output_prefix = ('Random Forest/' + province1 + '/Modified Lags/Adjusted CD/Original DF_0/RF_'
                 + province2)

# From DF_1 to DF_6
for i in range(6):
    # Independent variables, by column position in the input files
    # (names as listed in the original notes):
    #   DF block (both variants), from column 5 + i up to column 11:
    #     DF_0 [col 5], DF_wm1 [col 6], DF_wm2 [col 7], DF_wm3 [col 8],
    #     DF_wm4 [col 9], DF_wm5 [col 10], DF_wm6 [col 11]
    #   Without CD adds: RF_wm6 [col 20] and LST_wm4 [col 21]
    #   With CD adds columns 20-30: RF_wm6 [col 20], LST_wm4 [col 21],
    #     bin_pop9s [col 22], bowl_pop9s [col 23], bucket_pop9s [col 24],
    #     misc_short_pop9s [col 25], jar_pop9s [col 26], pottedplant_pop9s [col 27],
    #     tire_pop9s [col 28], misc_tall_pop9s [col 29] and total_pop9s [col 30]
    # NOTE(review): the '_pop9s' names come from the original notes; the MA cell
    # that reads the same '_subdist_total_' files lists these columns without
    # that suffix - confirm which names apply here.
    df_train_subdist_DFinfo = df_train_subdist.iloc[:, (5 + i):12]
    df_train_subdist_withoutCD = df_train_subdist.iloc[:, [20, 21]]
    df_train_subdist_withCD = df_train_subdist.iloc[:, 20: 31]

    df_test_subdist_DFinfo = df_test_subdist.iloc[:, (5 + i):12]
    df_test_subdist_withoutCD = df_test_subdist.iloc[:, [20, 21]]
    df_test_subdist_withCD = df_test_subdist.iloc[:, 20: 31]

    train_features_withoutCD = pd.concat([df_train_subdist_DFinfo, df_train_subdist_withoutCD], axis = 1)
    train_features_withCD = pd.concat([df_train_subdist_DFinfo, df_train_subdist_withCD], axis = 1)

    test_features_withoutCD = pd.concat([df_test_subdist_DFinfo, df_test_subdist_withoutCD], axis = 1)
    test_features_withCD = pd.concat([df_test_subdist_DFinfo, df_test_subdist_withCD], axis = 1)

    # Instantiate both models with 10 decision trees and the same seed
    rf_withoutCD = RandomForestRegressor(n_estimators = 10, random_state = 42)
    rf_withCD = RandomForestRegressor(n_estimators = 10, random_state = 42)

    # Train the models on training data
    rf_withoutCD.fit(train_features_withoutCD, train_labels)
    rf_withCD.fit(train_features_withCD, train_labels)

    # Use the forest's predict method on the test data
    predictions_withoutCD = rf_withoutCD.predict(test_features_withoutCD)
    predictions_withCD = rf_withCD.predict(test_features_withCD)

    df_pred_withoutCD = pd.DataFrame(predictions_withoutCD, columns = ['predicted'])
    df_pred_withCD = pd.DataFrame(predictions_withCD, columns = ['predicted'])

    # Store all of the predicted values to the CSV files
    df_compare_addrcode_subdist_withoutCD = pd.concat([df_test_addrcode_week_year_subdist, df_pred_withoutCD], axis = 1)
    df_compare_addrcode_subdist_withoutCD.columns = compare_columns
    df_compare_addrcode_subdist_withoutCD.to_csv(output_prefix + '_subdist_DF_' + str(i + 1) + '_withoutCD_10.csv',
                                                 encoding = 'utf-8')

    df_compare_addrcode_subdist_withCD = pd.concat([df_test_addrcode_week_year_subdist, df_pred_withCD], axis = 1)
    df_compare_addrcode_subdist_withCD.columns = compare_columns
    df_compare_addrcode_subdist_withCD.to_csv(output_prefix + '_subdist_DF_' + str(i + 1) + '_withCD_10.csv',
                                              encoding = 'utf-8')

    # Calculate the overall evaluation values
    rmse_withoutCD = mean_squared_error(test_labels, predictions_withoutCD) ** 0.5
    mae_withoutCD = mean_absolute_error(test_labels, predictions_withoutCD)
    r2_withoutCD = r2_score(test_labels, predictions_withoutCD)
    smape_withoutCD = smape_fast(test_labels, predictions_withoutCD)

    rmse_withCD = mean_squared_error(test_labels, predictions_withCD) ** 0.5
    mae_withCD = mean_absolute_error(test_labels, predictions_withCD)
    r2_withCD = r2_score(test_labels, predictions_withCD)
    smape_withCD = smape_fast(test_labels, predictions_withCD)

    # Relative change against the without-CD baseline, stored as a fraction
    # (not multiplied by 100). The np.float64 denominator makes a zero baseline
    # yield inf / nan instead of raising ZeroDivisionError on plain Python floats.
    # NOTE(review): R squared is better when higher, so with this shared formula
    # a positive '% improved R squared' means R squared dropped after adding CD -
    # confirm the intended sign.
    with np.errstate(divide = 'ignore', invalid = 'ignore'):
        rmse_percent_improved = (rmse_withoutCD - rmse_withCD) / np.float64(rmse_withoutCD)
        mae_percent_improved = (mae_withoutCD - mae_withCD) / np.float64(mae_withoutCD)
        smape_percent_improved = (smape_withoutCD - smape_withCD) / np.float64(smape_withoutCD)
        r2_percent_improved = (r2_withoutCD - r2_withCD) / np.float64(r2_withoutCD)

    rmse = np.append(rmse, [rmse_withoutCD, rmse_withCD, rmse_percent_improved])
    mae = np.append(mae, [mae_withoutCD, mae_withCD, mae_percent_improved])
    smape = np.append(smape, [smape_withoutCD, smape_withCD, smape_percent_improved])
    r2 = np.append(r2, [r2_withoutCD, r2_withCD, r2_percent_improved])

    # Per-sub-district evaluation works on the in-memory comparison frames; they
    # hold the same rows as the CSV files written above, so nothing is read back.
    df_withoutCD = df_compare_addrcode_subdist_withoutCD
    df_withCD = df_compare_addrcode_subdist_withCD

    subdist_rows = []

    # For each sub-district
    for j in subdist_code:

        # Get the subset of actual and predicted values according to the sub-district code
        subset_withoutCD = df_withoutCD.loc[df_withoutCD['addrcode'] == j]
        subset_withCD = df_withCD.loc[df_withCD['addrcode'] == j]

        # A sub-district that has no rows in the test file cannot be scored
        # (the sklearn metrics raise on empty input), so it is left out
        if subset_withoutCD.empty:
            continue

        # Pass the response values to the array for evaluation calculation
        array_true = np.array(subset_withoutCD['actual'])
        array_pred_withoutCD = np.array(subset_withoutCD['predicted'])
        array_pred_withCD = np.array(subset_withCD['predicted'])

        # Calculate the evaluation values
        rmse_withoutCD_subdist = mean_squared_error(array_true, array_pred_withoutCD) ** 0.5
        mae_withoutCD_subdist = mean_absolute_error(array_true, array_pred_withoutCD)
        smape_withoutCD_subdist = smape_fast(array_true, array_pred_withoutCD)
        r2_withoutCD_subdist = r2_score(array_true, array_pred_withoutCD)

        rmse_withCD_subdist = mean_squared_error(array_true, array_pred_withCD) ** 0.5
        mae_withCD_subdist = mean_absolute_error(array_true, array_pred_withCD)
        smape_withCD_subdist = smape_fast(array_true, array_pred_withCD)
        r2_withCD_subdist = r2_score(array_true, array_pred_withCD)

        # Same relative-change formula as above. Zero baselines are realistic
        # here: smape_fast returns a plain Python 0.0 when every
        # actual + predicted pair of a sub-district sums to zero.
        with np.errstate(divide = 'ignore', invalid = 'ignore'):
            rmse_percent_improved_subdist = (rmse_withoutCD_subdist - rmse_withCD_subdist) / np.float64(rmse_withoutCD_subdist)
            mae_percent_improved_subdist = (mae_withoutCD_subdist - mae_withCD_subdist) / np.float64(mae_withoutCD_subdist)
            smape_percent_improved_subdist = (smape_withoutCD_subdist - smape_withCD_subdist) / np.float64(smape_withoutCD_subdist)
            r2_percent_improved_subdist = (r2_withoutCD_subdist - r2_withCD_subdist) / np.float64(r2_withoutCD_subdist)

        # Collect the row; the table is assembled once after the loop
        subdist_rows.append([j, rmse_withoutCD_subdist, rmse_withCD_subdist, rmse_percent_improved_subdist,
                             mae_withoutCD_subdist, mae_withCD_subdist, mae_percent_improved_subdist,
                             smape_withoutCD_subdist, smape_withCD_subdist, smape_percent_improved_subdist,
                             r2_withoutCD_subdist, r2_withCD_subdist, r2_percent_improved_subdist])

    # Header row (subdist_array, never modified here) followed by one row per
    # scored sub-district; a fresh table is built on every pass, so nothing has
    # to be reset.
    subdist_eval = np.append(subdist_array, subdist_rows, axis = 0) if subdist_rows else subdist_array
    pd.DataFrame(subdist_eval).to_csv(output_prefix + '_BySubDistrict_DF_' + str(i + 1) + '_eval_10.csv',
                                      header = False, encoding = 'utf-8')
    
# Fold each metric's accumulated values into the summary table, in the order
# RMSE, MAE, SMAPE, R squared.
for metric_label, metric_values in (('RMSE', rmse), ('MAE', mae),
                                    ('SMAPE', smape), ('R squared', r2)):
    eval_array = evaluation_print_original(eval_array, metric_label, metric_values)

# Write the summary table to a CSV file (no header row, index column kept)
summary_path = ('Random Forest/' + province1
                + '/Modified Lags/Adjusted CD/Original DF_0/RF_' + province2
                + '_subdist_eval_10.csv')
summary_frame = pd.DataFrame(eval_array)
summary_frame.to_csv(summary_path, header = False, encoding = 'utf-8')

For MAs (normal CD)

In [77]:
# Sub-district level, moving-average data (MA2 - MA4), Modified Lags, normal CD.
#
# For every MA window i and every j in 0..5 (DFma_1 - DFma_6) two random forests
# are fitted on the same rows, one without and one with the CD columns. The cell
# writes the test-set predictions, a per-sub-district metric table and, at the
# end, one summary table.
#
# Summary table (eval_array)
# row: head,
# RMSE (DFma_1 - DFma_6),
# MAE (DFma_1 - DFma_6),
# SMAPE (DFma_1 - DFma_6),
# R-squared (DFma_1 - DFma_6)
#
# col: head,
# MA2 (without CD, with CD, % improved),
# MA3 (without CD, with CD, % improved),
# MA4 (without CD, with CD, % improved)
#
# "% improved" is stored as the fraction (without - with) / without; it is not
# multiplied by 100.
# NOTE(review): that fraction is positive when the with-CD value is smaller. That
# is an improvement for RMSE / MAE / SMAPE, but for R squared (higher is better)
# it is a deterioration - confirm the sign is intended before interpreting it.

eval_array = np.asarray([['Evaluation', 'MA2 without CD', 'MA2 with CD', 'MA2 % improved', 
                         'MA3 without CD', 'MA3 with CD', 'MA3 % improved', 
                         'MA4 without CD', 'MA4 with CD', 'MA4 % improved']])

# Index 0 of every metric array is a placeholder zero; evaluation_print reads from index 1
rmse = np.zeros(1)
mae = np.zeros(1)
smape = np.zeros(1)
r2 = np.zeros(1)

# Header row of every per-sub-district table
subdist_header = ['addrcode', 'RMSE without CD', 'RMSE with CD', '% improved RMSE', 
                  'MAE without CD', 'MAE with CD', '% improved MAE', 
                  'SMAPE without CD', 'SMAPE with CD', '% improved SMAPE', 
                  'R squared without CD', 'R squared with CD', '% improved R squared']
subdist_array = np.asarray([subdist_header])

# Flat column labels for the prediction files (a nested list would build a MultiIndex)
compare_columns = ['addrcode', 'Week', 'Year', 'actual', 'predicted']

# Starting from MA2 to MA4
for i in range(2, 5):
    # Get the input variables from CSV file
    train_file_dir = 'Data/' + province1 + '/Modified Lags/train_' + province2 + '_subdist_cd_mavg' + str(i) + '.csv'
    test_file_dir = 'Data/' + province1 + '/Modified Lags/test_' + province2 + '_subdist_cd_mavg' + str(i) + '.csv'
    
    df_train_subdist = pd.read_csv(train_file_dir, header = 0, skiprows = 0)
    df_test_subdist = pd.read_csv(test_file_dir, header = 0, skiprows = 0)
    
    # Everything below depends only on the MA window, so it is computed once per i
    
    # Columns of addrcode, week, year and actual values
    df_test_addrcode_week_year_subdist = df_test_subdist.iloc[:, [1, 2, 3, 12]]
    
    # labels: response (target) variable DFma_1
    # (presumably column 12, the one exported as 'actual' - TODO confirm)
    train_labels = np.array(df_train_subdist['DFma_1'])
    test_labels = np.array(df_test_subdist['DFma_1'])
    
    # Sub-district codes are taken from the training file
    subdist_code = df_train_subdist['addrcode'].unique()
    
    # Common prefix of every output file of this MA window
    out_prefix = ('Random Forest/' + province1 + '/Modified Lags/Normal CD/MA' + str(i) 
                  + '/RF_' + province2)
    
    # Continue on DFma_1 to DFma_6
    for j in range(6):
        model_tag = '_MA' + str(i) + '_DFma_' + str(j + 1)
        
        # x: independent variables (column layout as documented for the input files)
        # DFma_0 [col 13], DFma_wm1 .. DFma_wm6 [col 14 - 19],
        # RF_wm6 [col 20], LST_wm4 [col 21],
        # CD columns bin, bowl, bucket, misc_short, jar, pottedplant, tire,
        # misc_tall and total [col 22 - 30]
        #
        # The slice starts at 13 + j, so the first j DFma columns are left out.
        # Without CD stops before col 22, with CD stops before col 31.
        train_features_withoutCD = df_train_subdist.iloc[:, (13 + j): 22]
        train_features_withCD = df_train_subdist.iloc[:, (13 + j): 31]
        
        test_features_withoutCD = df_test_subdist.iloc[:, (13 + j): 22]
        test_features_withCD = df_test_subdist.iloc[:, (13 + j): 31]

        # Instantiate model with 10 decision trees
        rf_withoutCD = RandomForestRegressor(n_estimators = 10, random_state = 42)
        rf_withCD = RandomForestRegressor(n_estimators = 10, random_state = 42)

        # Train the model on training data
        rf_withoutCD.fit(train_features_withoutCD, train_labels)
        rf_withCD.fit(train_features_withCD, train_labels)

        # Use the forest's predict method on the test data
        predictions_withoutCD = rf_withoutCD.predict(test_features_withoutCD)
        predictions_withCD = rf_withCD.predict(test_features_withCD)
        
        df_pred_withoutCD = pd.DataFrame(predictions_withoutCD, columns = ['predicted'])
        df_pred_withCD = pd.DataFrame(predictions_withCD, columns = ['predicted'])
        
        # Store all of the predicted values to the CSV files
        df_compare_addrcode_subdist_withoutCD = pd.concat([df_test_addrcode_week_year_subdist, df_pred_withoutCD], axis = 1)
        df_compare_addrcode_subdist_withoutCD.columns = compare_columns
        df_compare_addrcode_subdist_withoutCD.to_csv(out_prefix + '_subdist' + model_tag + '_withoutCD_10.csv', 
                                                     encoding = 'utf-8')

        df_compare_addrcode_subdist_withCD = pd.concat([df_test_addrcode_week_year_subdist, df_pred_withCD], axis = 1)
        df_compare_addrcode_subdist_withCD.columns = compare_columns
        df_compare_addrcode_subdist_withCD.to_csv(out_prefix + '_subdist' + model_tag + '_withCD_10.csv', 
                                                  encoding = 'utf-8')

        # Calculate the evaluation values on the whole test set
        rmse_withoutCD = mean_squared_error(test_labels, predictions_withoutCD) ** 0.5
        mae_withoutCD = mean_absolute_error(test_labels, predictions_withoutCD)
        r2_withoutCD = r2_score(test_labels, predictions_withoutCD)
        smape_withoutCD = smape_fast(test_labels, predictions_withoutCD)
        
        rmse_withCD = mean_squared_error(test_labels, predictions_withCD) ** 0.5
        mae_withCD = mean_absolute_error(test_labels, predictions_withCD)
        r2_withCD = r2_score(test_labels, predictions_withCD)
        smape_withCD = smape_fast(test_labels, predictions_withCD)
        
        # Relative change, done as NumPy float division: a baseline of exactly 0
        # gives inf / nan instead of raising ZeroDivisionError on Python floats.
        overall_without = np.array([rmse_withoutCD, mae_withoutCD, smape_withoutCD, r2_withoutCD], dtype = float)
        overall_with = np.array([rmse_withCD, mae_withCD, smape_withCD, r2_withCD], dtype = float)
        with np.errstate(divide = 'ignore', invalid = 'ignore'):
            overall_improved = (overall_without - overall_with) / overall_without
        (rmse_percent_improved, mae_percent_improved, 
         smape_percent_improved, r2_percent_improved) = overall_improved
        
        rmse = np.append(rmse, [rmse_withoutCD, rmse_withCD, rmse_percent_improved])
        mae = np.append(mae, [mae_withoutCD, mae_withCD, mae_percent_improved])
        smape = np.append(smape, [smape_withoutCD, smape_withCD, smape_percent_improved])
        r2 = np.append(r2, [r2_withoutCD, r2_withCD, r2_percent_improved])
        
        # The frames just written already hold actual and predicted values,
        # so they are used directly instead of re-reading the CSV files.
        df_withoutCD = df_compare_addrcode_subdist_withoutCD
        df_withCD = df_compare_addrcode_subdist_withCD
        
        # For each sub-district
        for k in subdist_code:
            
            # Get the subset of actual and predicted values according to the sub-district code
            subset_withoutCD = df_withoutCD.loc[df_withoutCD['addrcode'] == k]
            subset_withCD = df_withCD.loc[df_withCD['addrcode'] == k]
            
            # The codes come from the training file; a code without test rows has
            # nothing to score and the metric functions reject empty input.
            if subset_withoutCD.empty:
                continue
            
            # Pass the response values to the array for evaluation calculation
            array_true = np.array(subset_withoutCD['actual'])
            array_pred_withoutCD = np.array(subset_withoutCD['predicted'])
            array_pred_withCD = np.array(subset_withCD['predicted'])
            
            # Calculate the evaluation values
            rmse_withoutCD_subdist = mean_squared_error(array_true, array_pred_withoutCD) ** 0.5
            mae_withoutCD_subdist = mean_absolute_error(array_true, array_pred_withoutCD)
            smape_withoutCD_subdist = smape_fast(array_true, array_pred_withoutCD)
            r2_withoutCD_subdist = r2_score(array_true, array_pred_withoutCD)
            
            rmse_withCD_subdist = mean_squared_error(array_true, array_pred_withCD) ** 0.5
            mae_withCD_subdist = mean_absolute_error(array_true, array_pred_withCD)
            smape_withCD_subdist = smape_fast(array_true, array_pred_withCD)
            r2_withCD_subdist = r2_score(array_true, array_pred_withCD)
            
            # Same zero-safe relative change as for the whole test set
            subdist_without = np.array([rmse_withoutCD_subdist, mae_withoutCD_subdist, 
                                        smape_withoutCD_subdist, r2_withoutCD_subdist], dtype = float)
            subdist_with = np.array([rmse_withCD_subdist, mae_withCD_subdist, 
                                     smape_withCD_subdist, r2_withCD_subdist], dtype = float)
            with np.errstate(divide = 'ignore', invalid = 'ignore'):
                subdist_improved = (subdist_without - subdist_with) / subdist_without
            (rmse_percent_improved_subdist, mae_percent_improved_subdist, 
             smape_percent_improved_subdist, r2_percent_improved_subdist) = subdist_improved
            
            # Append
            subdist_array = np.append(subdist_array, [[k, rmse_withoutCD_subdist, rmse_withCD_subdist, rmse_percent_improved_subdist,
                                                mae_withoutCD_subdist, mae_withCD_subdist, mae_percent_improved_subdist,
                                                smape_withoutCD_subdist, smape_withCD_subdist, smape_percent_improved_subdist,
                                                r2_withoutCD_subdist, r2_withCD_subdist, r2_percent_improved_subdist]], axis = 0)

        pd.DataFrame(subdist_array).to_csv(out_prefix + '_BySubDistrict' + model_tag + '_eval_10.csv', 
                                           header = False, encoding = 'utf-8')
        
        # Start the next model with a table that holds only the header row
        subdist_array = np.asarray([subdist_header])
    
# Evaluation file storing
# From RMSE DFma_1 to R squared DFma_6
eval_array = evaluation_print(eval_array, 'RMSE', rmse)
eval_array = evaluation_print(eval_array, 'MAE', mae)
eval_array = evaluation_print(eval_array, 'SMAPE', smape)
eval_array = evaluation_print(eval_array, 'R squared', r2)

# Store all of the evaluation values into a CSV file
pd.DataFrame(eval_array).to_csv('Random Forest/' + province1 + '/Modified Lags/Normal CD/RF_' + province2 
                                + '_subdist_eval_10.csv', header = False, encoding = 'utf-8')

For original DF_0 (without smoothing, normal CD)

In [78]:
# Sub-district level, original DF values (no smoothing), Modified Lags, normal CD.
#
# For every i in 0..5 (DF_1 - DF_6) two random forests are fitted on the same
# rows, one without and one with the CD columns. The cell writes the test-set
# predictions, a per-sub-district metric table and, at the end, one summary table.
#
# Summary table (eval_array)
# row: head,
# RMSE (DF_1 - DF_6),
# MAE (DF_1 - DF_6),
# SMAPE (DF_1 - DF_6),
# R-squared (DF_1 - DF_6)
#
# col: head,
# DF_0 (without CD, with CD, % improved)
#
# "% improved" is stored as the fraction (without - with) / without; it is not
# multiplied by 100.
# NOTE(review): that fraction is positive when the with-CD value is smaller. That
# is an improvement for RMSE / MAE / SMAPE, but for R squared (higher is better)
# it is a deterioration - confirm the sign is intended before interpreting it.

eval_array = np.asarray([['Evaluation', 'Without CD', 'With CD', '% improved']])

# Index 0 of every metric array is a placeholder zero created by the initialisation
rmse = np.zeros(1)
mae = np.zeros(1)
smape = np.zeros(1)
r2 = np.zeros(1)

# Header row of every per-sub-district table
subdist_header = ['addrcode', 'RMSE without CD', 'RMSE with CD', '% improved RMSE', 
                  'MAE without CD', 'MAE with CD', '% improved MAE', 
                  'SMAPE without CD', 'SMAPE with CD', '% improved SMAPE', 
                  'R squared without CD', 'R squared with CD', '% improved R squared']
subdist_array = np.asarray([subdist_header])

# Flat column labels for the prediction files (a nested list would build a MultiIndex)
compare_columns = ['addrcode', 'Week', 'Year', 'actual', 'predicted']

# Get the input variables from CSV file
train_file_dir = 'Data/' + province1 + '/Modified Lags/train_' + province2 + '_subdist_cd_mavg2.csv'
test_file_dir = 'Data/' + province1 + '/Modified Lags/test_' + province2 + '_subdist_cd_mavg2.csv'

df_train_subdist = pd.read_csv(train_file_dir, header = 0, skiprows = 0)
df_test_subdist = pd.read_csv(test_file_dir, header = 0, skiprows = 0)

# The data is loaded once, so everything that does not depend on i is prepared here

# Columns of addrcode, week, year and actual values (DF_1, col 4)
df_test_addrcode_week_year_subdist = df_test_subdist.iloc[:, [1, 2, 3, 4]]

# Non-DF features
# without CD: RF_wm6 [col 20] and LST_wm4 [col 21]
# with CD: the same two columns plus the CD columns [col 22 - 30]
df_train_subdist_withoutCD = df_train_subdist.iloc[:, [20, 21]]
df_train_subdist_withCD = df_train_subdist.iloc[:, 20: 31]

df_test_subdist_withoutCD = df_test_subdist.iloc[:, [20, 21]]
df_test_subdist_withCD = df_test_subdist.iloc[:, 20: 31]

# labels: response (target) variable DF_1 (col 4)
train_labels = np.array(df_train_subdist['DF_1'])
test_labels = np.array(df_test_subdist['DF_1'])

# Sub-district codes are taken from the training file
subdist_code = df_train_subdist['addrcode'].unique()

# Common prefix of every output file of this cell
out_prefix = 'Random Forest/' + province1 + '/Modified Lags/Normal CD/Original DF_0/RF_' + province2

# From DF_1 to DF_6
for i in range(6):
    # DF features: DF_0 [col 5], DF_wm1 .. DF_wm6 [col 6 - 11]
    # The slice starts at 5 + i, so the first i DF columns are left out.
    df_train_subdist_DFinfo = df_train_subdist.iloc[:, (5 + i):12]
    df_test_subdist_DFinfo = df_test_subdist.iloc[:, (5 + i):12]
        
    train_features_withoutCD = pd.concat([df_train_subdist_DFinfo, df_train_subdist_withoutCD], axis = 1)
    train_features_withCD = pd.concat([df_train_subdist_DFinfo, df_train_subdist_withCD], axis = 1)
    
    test_features_withoutCD = pd.concat([df_test_subdist_DFinfo, df_test_subdist_withoutCD], axis = 1)
    test_features_withCD = pd.concat([df_test_subdist_DFinfo, df_test_subdist_withCD], axis = 1)
    
    # Instantiate model with 10 decision trees
    rf_withoutCD = RandomForestRegressor(n_estimators = 10, random_state = 42)
    rf_withCD = RandomForestRegressor(n_estimators = 10, random_state = 42)

    # Train the model on training data
    rf_withoutCD.fit(train_features_withoutCD, train_labels)
    rf_withCD.fit(train_features_withCD, train_labels)

    # Use the forest's predict method on the test data
    predictions_withoutCD = rf_withoutCD.predict(test_features_withoutCD)
    predictions_withCD = rf_withCD.predict(test_features_withCD)
    
    df_pred_withoutCD = pd.DataFrame(predictions_withoutCD, columns = ['predicted'])
    df_pred_withCD = pd.DataFrame(predictions_withCD, columns = ['predicted'])
        
    # Store all of the predicted values to the CSV files
    df_compare_addrcode_subdist_withoutCD = pd.concat([df_test_addrcode_week_year_subdist, df_pred_withoutCD], axis = 1)
    df_compare_addrcode_subdist_withoutCD.columns = compare_columns
    df_compare_addrcode_subdist_withoutCD.to_csv(out_prefix + '_subdist_DF_' + str(i + 1) + '_withoutCD_10.csv', 
                                                 encoding = 'utf-8')

    df_compare_addrcode_subdist_withCD = pd.concat([df_test_addrcode_week_year_subdist, df_pred_withCD], axis = 1)
    df_compare_addrcode_subdist_withCD.columns = compare_columns
    df_compare_addrcode_subdist_withCD.to_csv(out_prefix + '_subdist_DF_' + str(i + 1) + '_withCD_10.csv', 
                                              encoding = 'utf-8')

    # Calculate the evaluation values on the whole test set
    rmse_withoutCD = mean_squared_error(test_labels, predictions_withoutCD) ** 0.5
    mae_withoutCD = mean_absolute_error(test_labels, predictions_withoutCD)
    r2_withoutCD = r2_score(test_labels, predictions_withoutCD)
    smape_withoutCD = smape_fast(test_labels, predictions_withoutCD)
        
    rmse_withCD = mean_squared_error(test_labels, predictions_withCD) ** 0.5
    mae_withCD = mean_absolute_error(test_labels, predictions_withCD)
    r2_withCD = r2_score(test_labels, predictions_withCD)
    smape_withCD = smape_fast(test_labels, predictions_withCD)
        
    # Relative change, done as NumPy float division: a baseline of exactly 0
    # gives inf / nan instead of raising ZeroDivisionError on Python floats.
    overall_without = np.array([rmse_withoutCD, mae_withoutCD, smape_withoutCD, r2_withoutCD], dtype = float)
    overall_with = np.array([rmse_withCD, mae_withCD, smape_withCD, r2_withCD], dtype = float)
    with np.errstate(divide = 'ignore', invalid = 'ignore'):
        overall_improved = (overall_without - overall_with) / overall_without
    (rmse_percent_improved, mae_percent_improved, 
     smape_percent_improved, r2_percent_improved) = overall_improved
        
    rmse = np.append(rmse, [rmse_withoutCD, rmse_withCD, rmse_percent_improved])
    mae = np.append(mae, [mae_withoutCD, mae_withCD, mae_percent_improved])
    smape = np.append(smape, [smape_withoutCD, smape_withCD, smape_percent_improved])
    r2 = np.append(r2, [r2_withoutCD, r2_withCD, r2_percent_improved])
        
    # The frames just written already hold actual and predicted values,
    # so they are used directly instead of re-reading the CSV files.
    df_withoutCD = df_compare_addrcode_subdist_withoutCD
    df_withCD = df_compare_addrcode_subdist_withCD
        
    # For each sub-district
    for j in subdist_code:
            
        # Get the subset of actual and predicted values according to the sub-district code
        subset_withoutCD = df_withoutCD.loc[df_withoutCD['addrcode'] == j]
        subset_withCD = df_withCD.loc[df_withCD['addrcode'] == j]
        
        # The codes come from the training file; a code without test rows has
        # nothing to score and the metric functions reject empty input.
        if subset_withoutCD.empty:
            continue
            
        # Pass the response values to the array for evaluation calculation
        array_true = np.array(subset_withoutCD['actual'])
        array_pred_withoutCD = np.array(subset_withoutCD['predicted'])
        array_pred_withCD = np.array(subset_withCD['predicted'])
            
        # Calculate the evaluation values
        rmse_withoutCD_subdist = mean_squared_error(array_true, array_pred_withoutCD) ** 0.5
        mae_withoutCD_subdist = mean_absolute_error(array_true, array_pred_withoutCD)
        smape_withoutCD_subdist = smape_fast(array_true, array_pred_withoutCD)
        r2_withoutCD_subdist = r2_score(array_true, array_pred_withoutCD)
            
        rmse_withCD_subdist = mean_squared_error(array_true, array_pred_withCD) ** 0.5
        mae_withCD_subdist = mean_absolute_error(array_true, array_pred_withCD)
        smape_withCD_subdist = smape_fast(array_true, array_pred_withCD)
        r2_withCD_subdist = r2_score(array_true, array_pred_withCD)
            
        # Same zero-safe relative change as for the whole test set
        subdist_without = np.array([rmse_withoutCD_subdist, mae_withoutCD_subdist, 
                                    smape_withoutCD_subdist, r2_withoutCD_subdist], dtype = float)
        subdist_with = np.array([rmse_withCD_subdist, mae_withCD_subdist, 
                                 smape_withCD_subdist, r2_withCD_subdist], dtype = float)
        with np.errstate(divide = 'ignore', invalid = 'ignore'):
            subdist_improved = (subdist_without - subdist_with) / subdist_without
        (rmse_percent_improved_subdist, mae_percent_improved_subdist, 
         smape_percent_improved_subdist, r2_percent_improved_subdist) = subdist_improved
            
        # Append
        subdist_array = np.append(subdist_array, [[j, rmse_withoutCD_subdist, rmse_withCD_subdist, rmse_percent_improved_subdist,
                                            mae_withoutCD_subdist, mae_withCD_subdist, mae_percent_improved_subdist,
                                            smape_withoutCD_subdist, smape_withCD_subdist, smape_percent_improved_subdist,
                                            r2_withoutCD_subdist, r2_withCD_subdist, r2_percent_improved_subdist]], axis = 0)

    pd.DataFrame(subdist_array).to_csv(out_prefix + '_BySubDistrict_DF_' + str(i + 1) + '_eval_10.csv', 
                                       header = False, encoding = 'utf-8')
        
    # Start the next model with a table that holds only the header row
    subdist_array = np.asarray([subdist_header])
    
# Evaluation file storing
# From RMSE DF_1 to R squared DF_6
eval_array = evaluation_print_original(eval_array, 'RMSE', rmse)
eval_array = evaluation_print_original(eval_array, 'MAE', mae)
eval_array = evaluation_print_original(eval_array, 'SMAPE', smape)
eval_array = evaluation_print_original(eval_array, 'R squared', r2)

# Store all of the evaluation values into a CSV file
pd.DataFrame(eval_array).to_csv(out_prefix + '_subdist_eval_10.csv', header = False, encoding = 'utf-8')

<h1>Combined CD (Modified Lags)</h1>
<h2>District level</h2>
For MAs

In [79]:
# District level, moving-average data (MA2 - MA4), Combined CD.
#
# For every MA window i and every j in 0..5 (DFma_1 - DFma_6) two random forests
# are fitted on the same rows, one without and one with the CD columns. The cell
# writes the test-set predictions, a per-district metric table and, at the end,
# one summary table.
#
# Summary table (eval_array)
# row: head,
# RMSE (DFma_1 - DFma_6),
# MAE (DFma_1 - DFma_6),
# SMAPE (DFma_1 - DFma_6),
# R-squared (DFma_1 - DFma_6)
#
# col: head,
# MA2 (without CD, with CD, % improved),
# MA3 (without CD, with CD, % improved),
# MA4 (without CD, with CD, % improved)
#
# "% improved" is stored as the fraction (without - with) / without; it is not
# multiplied by 100.
# NOTE(review): that fraction is positive when the with-CD value is smaller. That
# is an improvement for RMSE / MAE / SMAPE, but for R squared (higher is better)
# it is a deterioration - confirm the sign is intended before interpreting it.

eval_array = np.asarray([['Evaluation', 'MA2 without CD', 'MA2 with CD', 'MA2 % improved', 
                         'MA3 without CD', 'MA3 with CD', 'MA3 % improved', 
                         'MA4 without CD', 'MA4 with CD', 'MA4 % improved']])

# Index 0 of every metric array is a placeholder zero created by the initialisation
rmse = np.zeros(1)
mae = np.zeros(1)
smape = np.zeros(1)
r2 = np.zeros(1)

# Header row of every per-district table
dist_header = ['addrcode', 'RMSE without CD', 'RMSE with CD', '% improved RMSE', 
               'MAE without CD', 'MAE with CD', '% improved MAE', 
               'SMAPE without CD', 'SMAPE with CD', '% improved SMAPE', 
               'R squared without CD', 'R squared with CD', '% improved R squared']
dist_array = np.asarray([dist_header])

# Flat column labels for the prediction files (a nested list would build a MultiIndex)
compare_columns = ['addrcode', 'Week', 'Year', 'actual', 'predicted']

# Starting from MA2 to MA4
for i in range(2, 5):
    # Get the input variables from CSV file
    train_file_dir = 'Data/' + province1 + '/Combined CD/train_' + province2 + '_dist_combined_mavg' + str(i) + '.csv'
    test_file_dir = 'Data/' + province1 + '/Combined CD/test_' + province2 + '_dist_combined_mavg' + str(i) + '.csv'
    
    df_train_dist = pd.read_csv(train_file_dir, header = 0, skiprows = 0)
    df_test_dist = pd.read_csv(test_file_dir, header = 0, skiprows = 0)
    
    # Everything below depends only on the MA window, so it is computed once per i
    
    # Columns of addrcode, week, year and actual values
    df_test_addrcode_week_year_dist = df_test_dist.iloc[:, [1, 2, 3, 12]]
    
    # labels: response (target) variable DFma_1
    # (presumably column 12, the one exported as 'actual' - TODO confirm)
    train_labels = np.array(df_train_dist['DFma_1'])
    test_labels = np.array(df_test_dist['DFma_1'])
    
    # District codes are taken from the training file
    dist_code = df_train_dist['addrcode'].unique()
    
    # Common prefix of every output file of this MA window
    out_prefix = 'Random Forest/' + province1 + '/Combined CD/MA' + str(i) + '/RF_' + province2
    
    # Continue on DFma_1 to DFma_6
    for j in range(6):
        model_tag = '_MA' + str(i) + '_DFma_' + str(j + 1)
        
        # x: independent variables (column layout as documented for the input files)
        # DFma_0 [col 13], DFma_wm1 .. DFma_wm6 [col 14 - 19],
        # RF_wm6 [col 20], LST_wm4 [col 21],
        # bin_pop9s, bowl_pop9s, bucket_pop9s, misc_short_pop9s, jar_pop9s,
        # pottedplant_pop9s, tire_pop9s, misc_tall_pop9s and total_pop9s [col 22 - 30],
        # bin, bowl, bucket, misc_short, jar, pottedplant, tire, misc_tall
        # and total [col 31 - 39]
        #
        # The slice starts at 13 + j, so the first j DFma columns are left out.
        # Without CD stops before col 22, with CD stops before col 40.
        train_features_withoutCD = df_train_dist.iloc[:, (13 + j): 22]
        train_features_withCD = df_train_dist.iloc[:, (13 + j): 40]
        
        test_features_withoutCD = df_test_dist.iloc[:, (13 + j): 22]
        test_features_withCD = df_test_dist.iloc[:, (13 + j): 40]

        # Instantiate model with 10 decision trees
        rf_withoutCD = RandomForestRegressor(n_estimators = 10, random_state = 42)
        rf_withCD = RandomForestRegressor(n_estimators = 10, random_state = 42)

        # Train the model on training data
        rf_withoutCD.fit(train_features_withoutCD, train_labels)
        rf_withCD.fit(train_features_withCD, train_labels)

        # Use the forest's predict method on the test data
        predictions_withoutCD = rf_withoutCD.predict(test_features_withoutCD)
        predictions_withCD = rf_withCD.predict(test_features_withCD)
        
        df_pred_withoutCD = pd.DataFrame(predictions_withoutCD, columns = ['predicted'])
        df_pred_withCD = pd.DataFrame(predictions_withCD, columns = ['predicted'])
        
        # Store all of the predicted values to the CSV files
        df_compare_addrcode_dist_withoutCD = pd.concat([df_test_addrcode_week_year_dist, df_pred_withoutCD], axis = 1)
        df_compare_addrcode_dist_withoutCD.columns = compare_columns
        df_compare_addrcode_dist_withoutCD.to_csv(out_prefix + '_dist' + model_tag + '_withoutCD_10.csv', 
                                                  encoding = 'utf-8')

        df_compare_addrcode_dist_withCD = pd.concat([df_test_addrcode_week_year_dist, df_pred_withCD], axis = 1)
        df_compare_addrcode_dist_withCD.columns = compare_columns
        df_compare_addrcode_dist_withCD.to_csv(out_prefix + '_dist' + model_tag + '_withCD_10.csv', 
                                               encoding = 'utf-8')

        # Calculate the evaluation values on the whole test set
        rmse_withoutCD = mean_squared_error(test_labels, predictions_withoutCD) ** 0.5
        mae_withoutCD = mean_absolute_error(test_labels, predictions_withoutCD)
        r2_withoutCD = r2_score(test_labels, predictions_withoutCD)
        smape_withoutCD = smape_fast(test_labels, predictions_withoutCD)
        
        rmse_withCD = mean_squared_error(test_labels, predictions_withCD) ** 0.5
        mae_withCD = mean_absolute_error(test_labels, predictions_withCD)
        r2_withCD = r2_score(test_labels, predictions_withCD)
        smape_withCD = smape_fast(test_labels, predictions_withCD)
        
        # Relative change, done as NumPy float division: a baseline of exactly 0
        # gives inf / nan instead of raising ZeroDivisionError on Python floats.
        overall_without = np.array([rmse_withoutCD, mae_withoutCD, smape_withoutCD, r2_withoutCD], dtype = float)
        overall_with = np.array([rmse_withCD, mae_withCD, smape_withCD, r2_withCD], dtype = float)
        with np.errstate(divide = 'ignore', invalid = 'ignore'):
            overall_improved = (overall_without - overall_with) / overall_without
        (rmse_percent_improved, mae_percent_improved, 
         smape_percent_improved, r2_percent_improved) = overall_improved
        
        rmse = np.append(rmse, [rmse_withoutCD, rmse_withCD, rmse_percent_improved])
        mae = np.append(mae, [mae_withoutCD, mae_withCD, mae_percent_improved])
        smape = np.append(smape, [smape_withoutCD, smape_withCD, smape_percent_improved])
        r2 = np.append(r2, [r2_withoutCD, r2_withCD, r2_percent_improved])
        
        # The frames just written already hold actual and predicted values,
        # so they are used directly instead of re-reading the CSV files.
        df_withoutCD = df_compare_addrcode_dist_withoutCD
        df_withCD = df_compare_addrcode_dist_withCD
        
        # For each district
        for k in dist_code:

            # Get the subset of actual and predicted values according to the district code
            subset_withoutCD = df_withoutCD.loc[df_withoutCD['addrcode'] == k]
            subset_withCD = df_withCD.loc[df_withCD['addrcode'] == k]
            
            # The codes come from the training file; a code without test rows has
            # nothing to score and the metric functions reject empty input.
            if subset_withoutCD.empty:
                continue
            
            # Pass the response values to the array for evaluation calculation
            array_true = np.array(subset_withoutCD['actual'])
            array_pred_withoutCD = np.array(subset_withoutCD['predicted'])
            array_pred_withCD = np.array(subset_withCD['predicted'])
            
            # Calculate the evaluation values
            rmse_withoutCD_dist = mean_squared_error(array_true, array_pred_withoutCD) ** 0.5
            mae_withoutCD_dist = mean_absolute_error(array_true, array_pred_withoutCD)
            smape_withoutCD_dist = smape_fast(array_true, array_pred_withoutCD)
            r2_withoutCD_dist = r2_score(array_true, array_pred_withoutCD)
            
            rmse_withCD_dist = mean_squared_error(array_true, array_pred_withCD) ** 0.5
            mae_withCD_dist = mean_absolute_error(array_true, array_pred_withCD)
            smape_withCD_dist = smape_fast(array_true, array_pred_withCD)
            r2_withCD_dist = r2_score(array_true, array_pred_withCD)
            
            # Same zero-safe relative change as for the whole test set
            dist_without = np.array([rmse_withoutCD_dist, mae_withoutCD_dist, 
                                     smape_withoutCD_dist, r2_withoutCD_dist], dtype = float)
            dist_with = np.array([rmse_withCD_dist, mae_withCD_dist, 
                                  smape_withCD_dist, r2_withCD_dist], dtype = float)
            with np.errstate(divide = 'ignore', invalid = 'ignore'):
                dist_improved = (dist_without - dist_with) / dist_without
            (rmse_percent_improved_dist, mae_percent_improved_dist, 
             smape_percent_improved_dist, r2_percent_improved_dist) = dist_improved
            
            # Append
            dist_array = np.append(dist_array, [[k, rmse_withoutCD_dist, rmse_withCD_dist, rmse_percent_improved_dist,
                                                mae_withoutCD_dist, mae_withCD_dist, mae_percent_improved_dist,
                                                smape_withoutCD_dist, smape_withCD_dist, smape_percent_improved_dist,
                                                r2_withoutCD_dist, r2_withCD_dist, r2_percent_improved_dist]], axis = 0)

        pd.DataFrame(dist_array).to_csv(out_prefix + '_ByDistrict' + model_tag + '_eval_10.csv', 
                                        header = False, encoding = 'utf-8')
        
        # Start the next model with a table that holds only the header row
        dist_array = np.asarray([dist_header])
    
# Evaluation file storing
# From RMSE 1-week to R squared 6-week
eval_array = evaluation_print_modified_lag(eval_array, 'RMSE', rmse)
eval_array = evaluation_print_modified_lag(eval_array, 'MAE', mae)
eval_array = evaluation_print_modified_lag(eval_array, 'SMAPE', smape)
eval_array = evaluation_print_modified_lag(eval_array, 'R squared', r2)

# Store all of the evaluation values into a CSV file
pd.DataFrame(eval_array).to_csv('Random Forest/' + province1 + '/Combined CD/RF_' + province2 
                                + '_dist_eval_10.csv', header = False, encoding = 'utf-8')

For original DF_0 (without smoothing)

In [80]:
# District level, original DF_0 (without smoothing).
#
# For six feature windows (i = 0..5) a random forest is fitted twice, once
# without and once with the CD columns. The test-set predictions are written
# to CSV, and RMSE / MAE / SMAPE / R-squared are collected for the whole test
# set and for every district.

# Arrays of all evaluation values
# row: head,
# RMSE (DF_1),
# MAE (DF_1),
# SMAPE (DF_1),
# R-squared (DF_1)

# col: head,
# DF_0 (without CD, with CD, % improved)

eval_array = np.asarray([['Evaluation', 'Without CD', 'With CD', '% improved']])

# Index 0 of every metric array is the 0 left by this initialisation; each
# pass of the loop below appends (without CD, with CD, % improved).
rmse = np.zeros(1)
mae = np.zeros(1)
smape = np.zeros(1)
r2 = np.zeros(1)

# Header row shared by every per-district evaluation file
dist_header = np.asarray([['addrcode', 'RMSE without CD', 'RMSE with CD', '% improved RMSE',
                           'MAE without CD', 'MAE with CD', '% improved MAE',
                           'SMAPE without CD', 'SMAPE with CD', '% improved SMAPE',
                           'R squared without CD', 'R squared with CD', '% improved R squared']])
dist_array = dist_header.copy()

# Flat list on purpose: a nested list ([[...]]) would turn the columns into a
# one-level MultiIndex, and df['addrcode'] would then no longer be a Series.
compare_columns = ['addrcode', 'Week', 'Year', 'actual', 'predicted']

# Common prefix of every file written by this cell
rf_output_prefix = 'Random Forest/' + province1 + '/Combined CD/Original DF_0/RF_' + province2

# Get the input variables from CSV file
train_file_dir = 'Data/' + province1 + '/Combined CD/train_' + province2 + '_dist_combined_mavg2.csv'
test_file_dir = 'Data/' + province1 + '/Combined CD/test_' + province2 + '_dist_combined_mavg2.csv'

df_train_dist = pd.read_csv(train_file_dir, header = 0, skiprows = 0)
df_test_dist = pd.read_csv(test_file_dir, header = 0, skiprows = 0)

# The values below do not depend on the feature window, so they are computed once.

# Columns of addrcode, week, year and actual values
df_test_addrcode_week_year_dist = df_test_dist.iloc[:, [1, 2, 3, 4]]

# labels: the response (target) variable is DF_1 for every window; only the
# first DF column used as a feature moves with i.
train_labels = np.array(df_train_dist['DF_1'])
test_labels = np.array(df_test_dist['DF_1'])

dist_code = df_train_dist['addrcode'].unique()

# One pass per feature window; output files are numbered DF_1 to DF_6 (i + 1)
for i in range(6):
    # x: independent variables
    # Without CD:
    #   DF window [cols (5 + i) - 11], taken from
    #   DF_0 [col 5], DF_wm1 [col 6], DF_wm2 [col 7], DF_wm3 [col 8],
    #   DF_wm4 [col 9], DF_wm5 [col 10], DF_wm6 [col 11],
    #   plus RF_wm6 [col 20] and LST_wm4 [col 21]
    # With CD: the same, plus
    #   bin_pop9s [col 22], bowl_pop9s [col 23], bucket_pop9s [col 24],
    #   misc_short_pop9s [col 25], jar_pop9s [col 26], pottedplant_pop9s [col 27],
    #   tire_pop9s [col 28], misc_tall_pop9s [col 29], total_pop9s [col 30],
    #   bin [col 31], bowl [col 32], bucket [col 33], misc_short [col 34],
    #   jar [col 35], pottedplant [col 36], tire [col 37], misc_tall [col 38],
    #   and total [col 39]
    df_train_dist_DFinfo = df_train_dist.iloc[:, (5 + i): 12]
    df_train_dist_withoutCD = df_train_dist.iloc[:, [20, 21]]
    df_train_dist_withCD = df_train_dist.iloc[:, 20: 40]

    df_test_dist_DFinfo = df_test_dist.iloc[:, (5 + i): 12]
    df_test_dist_withoutCD = df_test_dist.iloc[:, [20, 21]]
    df_test_dist_withCD = df_test_dist.iloc[:, 20: 40]

    train_features_withoutCD = pd.concat([df_train_dist_DFinfo, df_train_dist_withoutCD], axis = 1)
    train_features_withCD = pd.concat([df_train_dist_DFinfo, df_train_dist_withCD], axis = 1)

    test_features_withoutCD = pd.concat([df_test_dist_DFinfo, df_test_dist_withoutCD], axis = 1)
    test_features_withCD = pd.concat([df_test_dist_DFinfo, df_test_dist_withCD], axis = 1)

    # Instantiate model with 10 decision trees
    rf_withoutCD = RandomForestRegressor(n_estimators = 10, random_state = 42)
    rf_withCD = RandomForestRegressor(n_estimators = 10, random_state = 42)

    # Train the model on training data
    rf_withoutCD.fit(train_features_withoutCD, train_labels)
    rf_withCD.fit(train_features_withCD, train_labels)

    # Use the forest's predict method on the test data
    predictions_withoutCD = rf_withoutCD.predict(test_features_withoutCD)
    predictions_withCD = rf_withCD.predict(test_features_withCD)

    df_pred_withoutCD = pd.DataFrame(predictions_withoutCD, columns = ['predicted'])
    df_pred_withCD = pd.DataFrame(predictions_withCD, columns = ['predicted'])

    # Store all of the predicted values to the CSV files
    df_compare_addrcode_dist_withoutCD = pd.concat([df_test_addrcode_week_year_dist, df_pred_withoutCD], axis = 1)
    df_compare_addrcode_dist_withoutCD.columns = compare_columns
    df_compare_addrcode_dist_withoutCD.to_csv(rf_output_prefix + '_dist_DF_' + str(i + 1)
                                              + '_withoutCD_10.csv', encoding = 'utf-8')

    df_compare_addrcode_dist_withCD = pd.concat([df_test_addrcode_week_year_dist, df_pred_withCD], axis = 1)
    df_compare_addrcode_dist_withCD.columns = compare_columns
    df_compare_addrcode_dist_withCD.to_csv(rf_output_prefix + '_dist_DF_' + str(i + 1)
                                           + '_withCD_10.csv', encoding = 'utf-8')

    # Calculate the evaluation values on the whole test set
    rmse_withoutCD = mean_squared_error(test_labels, predictions_withoutCD) ** 0.5
    mae_withoutCD = mean_absolute_error(test_labels, predictions_withoutCD)
    r2_withoutCD = r2_score(test_labels, predictions_withoutCD)
    smape_withoutCD = smape_fast(test_labels, predictions_withoutCD)

    rmse_withCD = mean_squared_error(test_labels, predictions_withCD) ** 0.5
    mae_withCD = mean_absolute_error(test_labels, predictions_withCD)
    r2_withCD = r2_score(test_labels, predictions_withCD)
    smape_withCD = smape_fast(test_labels, predictions_withCD)

    # % improved = (without CD - with CD) / without CD, done on float64 arrays
    # so that a baseline of exactly 0 gives inf/nan instead of raising
    # ZeroDivisionError or emitting a RuntimeWarning.
    # NOTE(review): for R squared a larger value is better, so with this shared
    # formula a positive '% improved R squared' means R squared went down --
    # confirm whether the sign should be flipped for R squared.
    overall_withoutCD = np.array([rmse_withoutCD, mae_withoutCD, smape_withoutCD, r2_withoutCD], dtype = float)
    overall_withCD = np.array([rmse_withCD, mae_withCD, smape_withCD, r2_withCD], dtype = float)
    with np.errstate(divide = 'ignore', invalid = 'ignore'):
        (rmse_percent_improved, mae_percent_improved,
         smape_percent_improved, r2_percent_improved) = (overall_withoutCD - overall_withCD) / overall_withoutCD

    rmse = np.append(rmse, [rmse_withoutCD, rmse_withCD, rmse_percent_improved])
    mae = np.append(mae, [mae_withoutCD, mae_withCD, mae_percent_improved])
    smape = np.append(smape, [smape_withoutCD, smape_withCD, smape_percent_improved])
    r2 = np.append(r2, [r2_withoutCD, r2_withCD, r2_percent_improved])

    # The frames just written are sliced directly; re-reading the CSV files
    # would only add disk I/O and a float -> text -> float round trip.
    df_withoutCD = df_compare_addrcode_dist_withoutCD
    df_withCD = df_compare_addrcode_dist_withCD

    # For each district
    dist_rows = []
    for j in dist_code:

        # Get the subset of actual and predicted values according to the district code
        subset_withoutCD = df_withoutCD.loc[df_withoutCD['addrcode'] == j]
        subset_withCD = df_withCD.loc[df_withCD['addrcode'] == j]

        # District codes come from the training set; one that has no test rows
        # cannot be scored (the metrics fail on empty input), so it is skipped.
        if subset_withoutCD.empty:
            print('No test rows for addrcode ' + str(j) + '; skipped')
            continue

        # Pass the response values to the array for evaluation calculation
        array_true = np.array(subset_withoutCD['actual'])
        array_pred_withoutCD = np.array(subset_withoutCD['predicted'])
        array_pred_withCD = np.array(subset_withCD['predicted'])

        # Calculate the evaluation values, in header order: RMSE, MAE, SMAPE, R squared
        dist_withoutCD = np.array([mean_squared_error(array_true, array_pred_withoutCD) ** 0.5,
                                   mean_absolute_error(array_true, array_pred_withoutCD),
                                   smape_fast(array_true, array_pred_withoutCD),
                                   r2_score(array_true, array_pred_withoutCD)], dtype = float)
        dist_withCD = np.array([mean_squared_error(array_true, array_pred_withCD) ** 0.5,
                                mean_absolute_error(array_true, array_pred_withCD),
                                smape_fast(array_true, array_pred_withCD),
                                r2_score(array_true, array_pred_withCD)], dtype = float)
        with np.errstate(divide = 'ignore', invalid = 'ignore'):
            dist_percent_improved = (dist_withoutCD - dist_withCD) / dist_withoutCD

        # One row: addrcode followed by (without CD, with CD, % improved) per metric
        dist_row = [j]
        for value_withoutCD, value_withCD, value_improved in zip(dist_withoutCD, dist_withCD,
                                                                 dist_percent_improved):
            dist_row.extend([value_withoutCD, value_withCD, value_improved])
        dist_rows.append(dist_row)

    # Append all district rows below the header in one step
    if dist_rows:
        dist_array = np.append(dist_header, dist_rows, axis = 0)

    #print(dist_array)
    pd.DataFrame(dist_array).to_csv(rf_output_prefix + '_ByDistrict_DF_' + str(i + 1)
                                    + '_eval_10.csv', header = False, encoding = 'utf-8')

    # Clear the old memory to store a new one
    dist_array = dist_header.copy()

# Evaluation file storing
# From RMSE DF_1 to R squared DF_6
eval_array = evaluation_print_original(eval_array, 'RMSE', rmse)
eval_array = evaluation_print_original(eval_array, 'MAE', mae)
eval_array = evaluation_print_original(eval_array, 'SMAPE', smape)
eval_array = evaluation_print_original(eval_array, 'R squared', r2)

#print(eval_array)

# Store all of the evaluation values into a CSV file
pd.DataFrame(eval_array).to_csv(rf_output_prefix + '_dist_eval_10.csv', header = False, encoding = 'utf-8')

<h2>Sub-district level</h2>
For moving averages (MA2 to MA4)

In [81]:
# Sub-district level, moving averages MA2 to MA4.
#
# For every moving average (i = 2..4) and every feature window (j = 0..5) a
# random forest is fitted twice, once without and once with the CD columns.
# The test-set predictions are written to CSV, and RMSE / MAE / SMAPE /
# R-squared are collected for the whole test set and for every sub-district.

# Arrays of all evaluation values
# row: head,
# RMSE (DFma_1),
# MAE (DFma_1),
# SMAPE (DFma_1),
# R-squared (DFma_1)

# col: head,
# MA2 (without CD, with CD, % improved),
# MA3 (without CD, with CD, % improved),
# MA4 (without CD, with CD, % improved)

eval_array = np.asarray([['Evaluation', 'MA2 without CD', 'MA2 with CD', 'MA2 % improved',
                          'MA3 without CD', 'MA3 with CD', 'MA3 % improved',
                          'MA4 without CD', 'MA4 with CD', 'MA4 % improved']])

# Index 0 of every metric array is the 0 left by this initialisation; each
# (MA, window) pass appends (without CD, with CD, % improved).
rmse = np.zeros(1)
mae = np.zeros(1)
smape = np.zeros(1)
r2 = np.zeros(1)

# Header row shared by every per-sub-district evaluation file
subdist_header = np.asarray([['addrcode', 'RMSE without CD', 'RMSE with CD', '% improved RMSE',
                              'MAE without CD', 'MAE with CD', '% improved MAE',
                              'SMAPE without CD', 'SMAPE with CD', '% improved SMAPE',
                              'R squared without CD', 'R squared with CD', '% improved R squared']])
subdist_array = subdist_header.copy()

# Flat list on purpose: a nested list ([[...]]) would turn the columns into a
# one-level MultiIndex, and df['addrcode'] would then no longer be a Series.
compare_columns = ['addrcode', 'Week', 'Year', 'actual', 'predicted']

# Get the input var from CSV file
# Starting from MA2 to MA4
for i in range(2, 5):
    # Get the input variables from CSV file
    train_file_dir = 'Data/' + province1 + '/Combined CD/train_' + province2 + '_subdist_combined_mavg' + str(i) + '.csv'
    test_file_dir = 'Data/' + province1 + '/Combined CD/test_' + province2 + '_subdist_combined_mavg' + str(i) + '.csv'

    df_train_subdist = pd.read_csv(train_file_dir, header = 0, skiprows = 0)
    df_test_subdist = pd.read_csv(test_file_dir, header = 0, skiprows = 0)

    # The values below depend on the MA file only, not on the feature window,
    # so they are computed once per MA.

    # Columns of addrcode, week, year and actual values
    df_test_addrcode_week_year_subdist = df_test_subdist.iloc[:, [1, 2, 3, 12]]

    # labels: the response (target) variable is DFma_1 for every window
    # (presumably col 12, the 'actual' column selected above -- TODO confirm).
    train_labels = np.array(df_train_subdist['DFma_1'])
    test_labels = np.array(df_test_subdist['DFma_1'])

    subdist_code = df_train_subdist['addrcode'].unique()

    # Common prefix of every per-window file written for this MA
    rf_output_prefix = ('Random Forest/' + province1 + '/Combined CD/MA' + str(i)
                        + '/RF_' + province2)

    # Continue on DFma_1 to DFma_6
    for j in range(6):
        # x: independent variables
        # Without CD [cols (13 + j) - 21], taken from
        #   DFma_0 [col 13], DFma_wm1 [col 14], DFma_wm2 [col 15],
        #   DFma_wm3 [col 16], DFma_wm4 [col 17], DFma_wm5 [col 18],
        #   DFma_wm6 [col 19], RF_wm6 [col 20], and LST_wm4 [col 21]
        # With CD [cols (13 + j) - 39]: the same, plus
        #   bin_pop9s [col 22], bowl_pop9s [col 23], bucket_pop9s [col 24],
        #   misc_short_pop9s [col 25], jar_pop9s [col 26], pottedplant_pop9s [col 27],
        #   tire_pop9s [col 28], misc_tall_pop9s [col 29], total_pop9s [col 30],
        #   bin [col 31], bowl [col 32], bucket [col 33], misc_short [col 34],
        #   jar [col 35], pottedplant [col 36], tire [col 37], misc_tall [col 38],
        #   and total [col 39]
        train_features_withoutCD = df_train_subdist.iloc[:, (13 + j): 22]
        train_features_withCD = df_train_subdist.iloc[:, (13 + j): 40]

        test_features_withoutCD = df_test_subdist.iloc[:, (13 + j): 22]
        test_features_withCD = df_test_subdist.iloc[:, (13 + j): 40]

        # Instantiate model with 10 decision trees
        rf_withoutCD = RandomForestRegressor(n_estimators = 10, random_state = 42)
        rf_withCD = RandomForestRegressor(n_estimators = 10, random_state = 42)

        # Train the model on training data
        rf_withoutCD.fit(train_features_withoutCD, train_labels)
        rf_withCD.fit(train_features_withCD, train_labels)

        # Use the forest's predict method on the test data
        predictions_withoutCD = rf_withoutCD.predict(test_features_withoutCD)
        predictions_withCD = rf_withCD.predict(test_features_withCD)

        df_pred_withoutCD = pd.DataFrame(predictions_withoutCD, columns = ['predicted'])
        df_pred_withCD = pd.DataFrame(predictions_withCD, columns = ['predicted'])

        # Suffix identifying this (MA, window) pair in the output file names
        run_tag = '_MA' + str(i) + '_DFma_' + str(j + 1)

        # Store all of the predicted values to the CSV files
        df_compare_addrcode_subdist_withoutCD = pd.concat([df_test_addrcode_week_year_subdist, df_pred_withoutCD], axis = 1)
        df_compare_addrcode_subdist_withoutCD.columns = compare_columns
        df_compare_addrcode_subdist_withoutCD.to_csv(rf_output_prefix + '_subdist' + run_tag
                                                     + '_withoutCD_10.csv', encoding = 'utf-8')

        df_compare_addrcode_subdist_withCD = pd.concat([df_test_addrcode_week_year_subdist, df_pred_withCD], axis = 1)
        df_compare_addrcode_subdist_withCD.columns = compare_columns
        df_compare_addrcode_subdist_withCD.to_csv(rf_output_prefix + '_subdist' + run_tag
                                                  + '_withCD_10.csv', encoding = 'utf-8')

        # Calculate the evaluation values on the whole test set
        #print('MA' + str(i) + ' and DFma_' + str(j + 1))
        rmse_withoutCD = mean_squared_error(test_labels, predictions_withoutCD) ** 0.5
        mae_withoutCD = mean_absolute_error(test_labels, predictions_withoutCD)
        r2_withoutCD = r2_score(test_labels, predictions_withoutCD)
        smape_withoutCD = smape_fast(test_labels, predictions_withoutCD)

        rmse_withCD = mean_squared_error(test_labels, predictions_withCD) ** 0.5
        mae_withCD = mean_absolute_error(test_labels, predictions_withCD)
        r2_withCD = r2_score(test_labels, predictions_withCD)
        smape_withCD = smape_fast(test_labels, predictions_withCD)

        # % improved = (without CD - with CD) / without CD, done on float64
        # arrays so that a baseline of exactly 0 gives inf/nan instead of
        # raising ZeroDivisionError or emitting a RuntimeWarning.
        # NOTE(review): for R squared a larger value is better, so with this
        # shared formula a positive '% improved R squared' means R squared went
        # down -- confirm whether the sign should be flipped for R squared.
        overall_withoutCD = np.array([rmse_withoutCD, mae_withoutCD, smape_withoutCD, r2_withoutCD], dtype = float)
        overall_withCD = np.array([rmse_withCD, mae_withCD, smape_withCD, r2_withCD], dtype = float)
        with np.errstate(divide = 'ignore', invalid = 'ignore'):
            (rmse_percent_improved, mae_percent_improved,
             smape_percent_improved, r2_percent_improved) = (overall_withoutCD - overall_withCD) / overall_withoutCD

        rmse = np.append(rmse, [rmse_withoutCD, rmse_withCD, rmse_percent_improved])
        mae = np.append(mae, [mae_withoutCD, mae_withCD, mae_percent_improved])
        smape = np.append(smape, [smape_withoutCD, smape_withCD, smape_percent_improved])
        r2 = np.append(r2, [r2_withoutCD, r2_withCD, r2_percent_improved])

        # The frames just written are sliced directly; re-reading the CSV files
        # would only add disk I/O and a float -> text -> float round trip.
        df_withoutCD = df_compare_addrcode_subdist_withoutCD
        df_withCD = df_compare_addrcode_subdist_withCD

        # For each sub-district
        subdist_rows = []
        for k in subdist_code:

            # Get the subset of actual and predicted values according to the sub-district code
            subset_withoutCD = df_withoutCD.loc[df_withoutCD['addrcode'] == k]
            subset_withCD = df_withCD.loc[df_withCD['addrcode'] == k]

            # Codes come from the training set; one that has no test rows
            # cannot be scored (the metrics fail on empty input), so it is skipped.
            if subset_withoutCD.empty:
                print('No test rows for addrcode ' + str(k) + '; skipped')
                continue

            # Pass the response values to the array for evaluation calculation
            array_true = np.array(subset_withoutCD['actual'])
            array_pred_withoutCD = np.array(subset_withoutCD['predicted'])
            array_pred_withCD = np.array(subset_withCD['predicted'])

            # Calculate the evaluation values, in header order: RMSE, MAE, SMAPE, R squared
            subdist_withoutCD = np.array([mean_squared_error(array_true, array_pred_withoutCD) ** 0.5,
                                          mean_absolute_error(array_true, array_pred_withoutCD),
                                          smape_fast(array_true, array_pred_withoutCD),
                                          r2_score(array_true, array_pred_withoutCD)], dtype = float)
            subdist_withCD = np.array([mean_squared_error(array_true, array_pred_withCD) ** 0.5,
                                       mean_absolute_error(array_true, array_pred_withCD),
                                       smape_fast(array_true, array_pred_withCD),
                                       r2_score(array_true, array_pred_withCD)], dtype = float)
            with np.errstate(divide = 'ignore', invalid = 'ignore'):
                subdist_percent_improved = (subdist_withoutCD - subdist_withCD) / subdist_withoutCD

            # One row: addrcode followed by (without CD, with CD, % improved) per metric
            subdist_row = [k]
            for value_withoutCD, value_withCD, value_improved in zip(subdist_withoutCD, subdist_withCD,
                                                                     subdist_percent_improved):
                subdist_row.extend([value_withoutCD, value_withCD, value_improved])
            subdist_rows.append(subdist_row)

        # Append all sub-district rows below the header in one step
        if subdist_rows:
            subdist_array = np.append(subdist_header, subdist_rows, axis = 0)

        #print(subdist_array)
        pd.DataFrame(subdist_array).to_csv(rf_output_prefix + '_BySubDistrict' + run_tag
                                           + '_eval_10.csv', header = False, encoding = 'utf-8')

        # Clear the old memory to store a new one
        subdist_array = subdist_header.copy()

# Evaluation file storing
# From RMSE DFma_1 to R squared DFma_6
eval_array = evaluation_print(eval_array, 'RMSE', rmse)
eval_array = evaluation_print(eval_array, 'MAE', mae)
eval_array = evaluation_print(eval_array, 'SMAPE', smape)
eval_array = evaluation_print(eval_array, 'R squared', r2)

#print(eval_array)

# Store all of the evaluation values into a CSV file
pd.DataFrame(eval_array).to_csv('Random Forest/' + province1 + '/Combined CD/RF_' + province2
                                + '_subdist_eval_10.csv', header = False, encoding = 'utf-8')

For original DF_0 (without smoothing)

In [82]:
# Sub-district level, original DF_0 (without smoothing).
#
# For six feature windows (i = 0..5) a random forest is fitted twice, once
# without and once with the CD columns. The test-set predictions are written
# to CSV, and RMSE / MAE / SMAPE / R-squared are collected for the whole test
# set and for every sub-district.

# Arrays of all evaluation values
# row: head,
# RMSE (DF_1),
# MAE (DF_1),
# SMAPE (DF_1),
# R-squared (DF_1)

# col: head,
# DF_0 (without CD, with CD, % improved)

eval_array = np.asarray([['Evaluation', 'Without CD', 'With CD', '% improved']])

# Index 0 of every metric array is the 0 left by this initialisation; each
# pass of the loop below appends (without CD, with CD, % improved).
rmse = np.zeros(1)
mae = np.zeros(1)
smape = np.zeros(1)
r2 = np.zeros(1)

# Header row shared by every per-sub-district evaluation file
subdist_header = np.asarray([['addrcode', 'RMSE without CD', 'RMSE with CD', '% improved RMSE',
                              'MAE without CD', 'MAE with CD', '% improved MAE',
                              'SMAPE without CD', 'SMAPE with CD', '% improved SMAPE',
                              'R squared without CD', 'R squared with CD', '% improved R squared']])
subdist_array = subdist_header.copy()

# Flat list on purpose: a nested list ([[...]]) would turn the columns into a
# one-level MultiIndex, and df['addrcode'] would then no longer be a Series.
compare_columns = ['addrcode', 'Week', 'Year', 'actual', 'predicted']

# Common prefix of every file written by this cell
rf_output_prefix = 'Random Forest/' + province1 + '/Combined CD/Original DF_0/RF_' + province2

# Get the input variables from CSV file
train_file_dir = 'Data/' + province1 + '/Combined CD/train_' + province2 + '_subdist_combined_mavg2.csv'
test_file_dir = 'Data/' + province1 + '/Combined CD/test_' + province2 + '_subdist_combined_mavg2.csv'

df_train_subdist = pd.read_csv(train_file_dir, header = 0, skiprows = 0)
df_test_subdist = pd.read_csv(test_file_dir, header = 0, skiprows = 0)

# The values below do not depend on the feature window, so they are computed once.

# Columns of addrcode, week, year and actual values
df_test_addrcode_week_year_subdist = df_test_subdist.iloc[:, [1, 2, 3, 4]]

# labels: the response (target) variable is DF_1 for every window; only the
# first DF column used as a feature moves with i.
train_labels = np.array(df_train_subdist['DF_1'])
test_labels = np.array(df_test_subdist['DF_1'])

subdist_code = df_train_subdist['addrcode'].unique()

# One pass per feature window; output files are numbered DF_1 to DF_6 (i + 1)
for i in range(6):
    # x: independent variables
    # Without CD:
    #   DF window [cols (5 + i) - 11], taken from
    #   DF_0 [col 5], DF_wm1 [col 6], DF_wm2 [col 7], DF_wm3 [col 8],
    #   DF_wm4 [col 9], DF_wm5 [col 10], DF_wm6 [col 11],
    #   plus RF_wm6 [col 20] and LST_wm4 [col 21]
    # With CD: the same, plus
    #   bin_pop9s [col 22], bowl_pop9s [col 23], bucket_pop9s [col 24],
    #   misc_short_pop9s [col 25], jar_pop9s [col 26], pottedplant_pop9s [col 27],
    #   tire_pop9s [col 28], misc_tall_pop9s [col 29], total_pop9s [col 30],
    #   bin [col 31], bowl [col 32], bucket [col 33], misc_short [col 34],
    #   jar [col 35], pottedplant [col 36], tire [col 37], misc_tall [col 38],
    #   and total [col 39]
    df_train_subdist_DFinfo = df_train_subdist.iloc[:, (5 + i):12]
    df_train_subdist_withoutCD = df_train_subdist.iloc[:, [20, 21]]
    df_train_subdist_withCD = df_train_subdist.iloc[:, 20: 40]

    df_test_subdist_DFinfo = df_test_subdist.iloc[:, (5 + i):12]
    df_test_subdist_withoutCD = df_test_subdist.iloc[:, [20, 21]]
    df_test_subdist_withCD = df_test_subdist.iloc[:, 20: 40]

    train_features_withoutCD = pd.concat([df_train_subdist_DFinfo, df_train_subdist_withoutCD], axis = 1)
    train_features_withCD = pd.concat([df_train_subdist_DFinfo, df_train_subdist_withCD], axis = 1)

    test_features_withoutCD = pd.concat([df_test_subdist_DFinfo, df_test_subdist_withoutCD], axis = 1)
    test_features_withCD = pd.concat([df_test_subdist_DFinfo, df_test_subdist_withCD], axis = 1)

    # Instantiate model with 10 decision trees
    rf_withoutCD = RandomForestRegressor(n_estimators = 10, random_state = 42)
    rf_withCD = RandomForestRegressor(n_estimators = 10, random_state = 42)

    # Train the model on training data
    rf_withoutCD.fit(train_features_withoutCD, train_labels)
    rf_withCD.fit(train_features_withCD, train_labels)

    # Use the forest's predict method on the test data
    predictions_withoutCD = rf_withoutCD.predict(test_features_withoutCD)
    predictions_withCD = rf_withCD.predict(test_features_withCD)

    df_pred_withoutCD = pd.DataFrame(predictions_withoutCD, columns = ['predicted'])
    df_pred_withCD = pd.DataFrame(predictions_withCD, columns = ['predicted'])

    # Store all of the predicted values to the CSV files
    df_compare_addrcode_subdist_withoutCD = pd.concat([df_test_addrcode_week_year_subdist, df_pred_withoutCD], axis = 1)
    df_compare_addrcode_subdist_withoutCD.columns = compare_columns
    df_compare_addrcode_subdist_withoutCD.to_csv(rf_output_prefix + '_subdist_DF_' + str(i + 1)
                                                 + '_withoutCD_10.csv', encoding = 'utf-8')

    df_compare_addrcode_subdist_withCD = pd.concat([df_test_addrcode_week_year_subdist, df_pred_withCD], axis = 1)
    df_compare_addrcode_subdist_withCD.columns = compare_columns
    df_compare_addrcode_subdist_withCD.to_csv(rf_output_prefix + '_subdist_DF_' + str(i + 1)
                                              + '_withCD_10.csv', encoding = 'utf-8')

    # Calculate the evaluation values on the whole test set
    rmse_withoutCD = mean_squared_error(test_labels, predictions_withoutCD) ** 0.5
    mae_withoutCD = mean_absolute_error(test_labels, predictions_withoutCD)
    r2_withoutCD = r2_score(test_labels, predictions_withoutCD)
    smape_withoutCD = smape_fast(test_labels, predictions_withoutCD)

    rmse_withCD = mean_squared_error(test_labels, predictions_withCD) ** 0.5
    mae_withCD = mean_absolute_error(test_labels, predictions_withCD)
    r2_withCD = r2_score(test_labels, predictions_withCD)
    smape_withCD = smape_fast(test_labels, predictions_withCD)

    # % improved = (without CD - with CD) / without CD, done on float64 arrays
    # so that a baseline of exactly 0 gives inf/nan instead of raising
    # ZeroDivisionError or emitting a RuntimeWarning.
    # NOTE(review): for R squared a larger value is better, so with this shared
    # formula a positive '% improved R squared' means R squared went down --
    # confirm whether the sign should be flipped for R squared.
    overall_withoutCD = np.array([rmse_withoutCD, mae_withoutCD, smape_withoutCD, r2_withoutCD], dtype = float)
    overall_withCD = np.array([rmse_withCD, mae_withCD, smape_withCD, r2_withCD], dtype = float)
    with np.errstate(divide = 'ignore', invalid = 'ignore'):
        (rmse_percent_improved, mae_percent_improved,
         smape_percent_improved, r2_percent_improved) = (overall_withoutCD - overall_withCD) / overall_withoutCD

    rmse = np.append(rmse, [rmse_withoutCD, rmse_withCD, rmse_percent_improved])
    mae = np.append(mae, [mae_withoutCD, mae_withCD, mae_percent_improved])
    smape = np.append(smape, [smape_withoutCD, smape_withCD, smape_percent_improved])
    r2 = np.append(r2, [r2_withoutCD, r2_withCD, r2_percent_improved])

    # The frames just written are sliced directly; re-reading the CSV files
    # would only add disk I/O and a float -> text -> float round trip.
    df_withoutCD = df_compare_addrcode_subdist_withoutCD
    df_withCD = df_compare_addrcode_subdist_withCD

    # For each sub-district
    subdist_rows = []
    for j in subdist_code:

        # Get the subset of actual and predicted values according to the sub-district code
        subset_withoutCD = df_withoutCD.loc[df_withoutCD['addrcode'] == j]
        subset_withCD = df_withCD.loc[df_withCD['addrcode'] == j]

        # Codes come from the training set; one that has no test rows cannot
        # be scored (the metrics fail on empty input), so it is skipped.
        if subset_withoutCD.empty:
            print('No test rows for addrcode ' + str(j) + '; skipped')
            continue

        # Pass the response values to the array for evaluation calculation
        array_true = np.array(subset_withoutCD['actual'])
        array_pred_withoutCD = np.array(subset_withoutCD['predicted'])
        array_pred_withCD = np.array(subset_withCD['predicted'])

        # Calculate the evaluation values, in header order: RMSE, MAE, SMAPE, R squared
        subdist_withoutCD = np.array([mean_squared_error(array_true, array_pred_withoutCD) ** 0.5,
                                      mean_absolute_error(array_true, array_pred_withoutCD),
                                      smape_fast(array_true, array_pred_withoutCD),
                                      r2_score(array_true, array_pred_withoutCD)], dtype = float)
        subdist_withCD = np.array([mean_squared_error(array_true, array_pred_withCD) ** 0.5,
                                   mean_absolute_error(array_true, array_pred_withCD),
                                   smape_fast(array_true, array_pred_withCD),
                                   r2_score(array_true, array_pred_withCD)], dtype = float)
        with np.errstate(divide = 'ignore', invalid = 'ignore'):
            subdist_percent_improved = (subdist_withoutCD - subdist_withCD) / subdist_withoutCD

        # One row: addrcode followed by (without CD, with CD, % improved) per metric
        subdist_row = [j]
        for value_withoutCD, value_withCD, value_improved in zip(subdist_withoutCD, subdist_withCD,
                                                                 subdist_percent_improved):
            subdist_row.extend([value_withoutCD, value_withCD, value_improved])
        subdist_rows.append(subdist_row)

    # Append all sub-district rows below the header in one step
    if subdist_rows:
        subdist_array = np.append(subdist_header, subdist_rows, axis = 0)

    #print(subdist_array)
    pd.DataFrame(subdist_array).to_csv(rf_output_prefix + '_BySubDistrict_DF_' + str(i + 1)
                                       + '_eval_10.csv', header = False, encoding = 'utf-8')

    # Clear the old memory to store a new one
    subdist_array = subdist_header.copy()

# Evaluation file storing
# From RMSE DF_1 to R squared DF_6
eval_array = evaluation_print_original(eval_array, 'RMSE', rmse)
eval_array = evaluation_print_original(eval_array, 'MAE', mae)
eval_array = evaluation_print_original(eval_array, 'SMAPE', smape)
eval_array = evaluation_print_original(eval_array, 'R squared', r2)

#print(eval_array)

# Store all of the evaluation values into a CSV file
pd.DataFrame(eval_array).to_csv(rf_output_prefix + '_subdist_eval_10.csv', header = False, encoding = 'utf-8')

<h1>Modified IA</h1>
<h2>District level</h2>
For the moving averages (MA2 to MA4)

In [83]:
# District level - Modified IA data - moving averages MA2 to MA4
#
# For every moving-average file (MA2, MA3, MA4) and every run DFma_1 .. DFma_6
# two 10-tree random forests are trained, one without and one with the CD
# columns, and both are evaluated on the test file. The cell writes
#   * the test predictions of both models (one CSV per model and run),
#   * one per-district evaluation table per (MA, DFma) run,
#   * one summary table covering all runs.
#
# Layout of the summary table (eval_array)
# row: head,
# RMSE (DFma_1 - DFma_6),
# MAE (DFma_1 - DFma_6),
# SMAPE (DFma_1 - DFma_6),
# R-squared (DFma_1 - DFma_6)
#
# col: head,
# MA2 (without CD, with CD, % improved),
# MA3 (without CD, with CD, % improved),
# MA4 (without CD, with CD, % improved)

eval_array = np.asarray([['Evaluation', 'MA2 without CD', 'MA2 with CD', 'MA2 % improved',
                         'MA3 without CD', 'MA3 with CD', 'MA3 % improved',
                         'MA4 without CD', 'MA4 with CD', 'MA4 % improved']])

# Index 0 of every metric list is a 0 placeholder. The values follow in triples
# (without CD, with CD, % improved), one triple per (MA, DFma) run. The lists
# are converted to float arrays before they are handed to the formatter.
rmse = [0.0]
mae = [0.0]
smape = [0.0]
r2 = [0.0]

# Header row shared by every per-district evaluation file
dist_header = np.asarray([['addrcode', 'RMSE without CD', 'RMSE with CD', '% improved RMSE',
                           'MAE without CD', 'MAE with CD', '% improved MAE',
                           'SMAPE without CD', 'SMAPE with CD', '% improved SMAPE',
                           'R squared without CD', 'R squared with CD', '% improved R squared']])
dist_array = dist_header

# Flat list of labels: a nested list ([[...]]) would create a MultiIndex and
# make frame['addrcode'] return a DataFrame instead of a Series.
compare_columns = ['addrcode', 'Week', 'Year', 'actual', 'predicted']

# Starting from MA2 to MA4
for i in range(2, 5):
    # Get the input variables from CSV file
    train_file_dir = 'Data/' + province1 + '/Modified IA/train_' + province2 + '_dist_IA_mavg' + str(i) + '.csv'
    test_file_dir = 'Data/' + province1 + '/Modified IA/test_' + province2 + '_dist_IA_mavg' + str(i) + '.csv'

    df_train_dist = pd.read_csv(train_file_dir, header = 0, skiprows = 0)
    df_test_dist = pd.read_csv(test_file_dir, header = 0, skiprows = 0)

    # The following do not depend on j, so they are prepared once per MA file.
    # Columns 1, 2, 3 and 12 are stored as addrcode, Week, Year and actual.
    df_test_addrcode_week_year_dist = df_test_dist.iloc[:, [1, 2, 3, 12]]

    # labels: response (target) variable, always the DFma_1 column
    train_labels = np.array(df_train_dist['DFma_1'])
    test_labels = np.array(df_test_dist['DFma_1'])

    ma_prefix = 'Random Forest/' + province1 + '/Modified IA/MA' + str(i) + '/RF_' + province2

    # Continue on DFma_1 to DFma_6
    for j in range(6):
        # Column map of the data files (iloc positions):
        #   13-19  DFma_0, DFma_wm1, DFma_wm2, DFma_wm3, DFma_wm4, DFma_wm5, DFma_wm6
        #   20-21  RF_wm6, LST_wm4
        #   22-30  bin_pop9s, bowl_pop9s, bucket_pop9s, misc_short_pop9s, jar_pop9s,
        #          pottedplant_pop9s, tire_pop9s, misc_tall_pop9s, total_pop9s
        #   31-39  bin, bowl, bucket, misc_short, jar, pottedplant, tire, misc_tall, total
        #   40     image/area
        # Without CD: columns (13 + j) .. 21; with CD: columns (13 + j) .. 40.
        # Run j therefore drops the first j DFma columns from the feature window.
        first_feature_col = 13 + j
        train_features_withoutCD = df_train_dist.iloc[:, first_feature_col: 22]
        train_features_withCD = df_train_dist.iloc[:, first_feature_col: 41]

        test_features_withoutCD = df_test_dist.iloc[:, first_feature_col: 22]
        test_features_withCD = df_test_dist.iloc[:, first_feature_col: 41]

        # Instantiate model with 10 decision trees
        rf_withoutCD = RandomForestRegressor(n_estimators = 10, random_state = 42)
        rf_withCD = RandomForestRegressor(n_estimators = 10, random_state = 42)

        # Train the model on training data
        rf_withoutCD.fit(train_features_withoutCD, train_labels)
        rf_withCD.fit(train_features_withCD, train_labels)

        # Use the forest's predict method on the test data
        predictions_withoutCD = rf_withoutCD.predict(test_features_withoutCD)
        predictions_withCD = rf_withCD.predict(test_features_withCD)

        df_pred_withoutCD = pd.DataFrame(predictions_withoutCD, columns = ['predicted'])
        df_pred_withCD = pd.DataFrame(predictions_withCD, columns = ['predicted'])

        # Store all of the predicted values to the CSV files
        run_prefix = ma_prefix + '_dist_MA' + str(i) + '_DFma_' + str(j + 1)

        df_compare_addrcode_dist_withoutCD = pd.concat([df_test_addrcode_week_year_dist, df_pred_withoutCD], axis = 1)
        df_compare_addrcode_dist_withoutCD.columns = compare_columns
        df_compare_addrcode_dist_withoutCD.to_csv(run_prefix + '_withoutCD_10.csv', encoding = 'utf-8')

        df_compare_addrcode_dist_withCD = pd.concat([df_test_addrcode_week_year_dist, df_pred_withCD], axis = 1)
        df_compare_addrcode_dist_withCD.columns = compare_columns
        df_compare_addrcode_dist_withCD.to_csv(run_prefix + '_withCD_10.csv', encoding = 'utf-8')

        # Calculate the evaluation values on the whole test set
        rmse_withoutCD = mean_squared_error(test_labels, predictions_withoutCD) ** 0.5
        mae_withoutCD = mean_absolute_error(test_labels, predictions_withoutCD)
        r2_withoutCD = r2_score(test_labels, predictions_withoutCD)
        smape_withoutCD = smape_fast(test_labels, predictions_withoutCD)

        rmse_withCD = mean_squared_error(test_labels, predictions_withCD) ** 0.5
        mae_withCD = mean_absolute_error(test_labels, predictions_withCD)
        r2_withCD = r2_score(test_labels, predictions_withCD)
        smape_withCD = smape_fast(test_labels, predictions_withCD)

        # "% improved" = (without - with) / without. It is stored as NaN when the
        # baseline is 0, instead of raising ZeroDivisionError or producing inf.
        # NOTE(review): for R squared (higher is better) this formula is positive
        # when the model with CD is worse; kept as is to stay comparable with
        # the other result files - confirm the intended sign.
        for values, without_cd, with_cd in ((rmse, rmse_withoutCD, rmse_withCD),
                                            (mae, mae_withoutCD, mae_withCD),
                                            (smape, smape_withoutCD, smape_withCD),
                                            (r2, r2_withoutCD, r2_withCD)):
            improved = (without_cd - with_cd) / without_cd if without_cd != 0 else np.nan
            values.extend([without_cd, with_cd, improved])

        # Per-district evaluation. The districts are taken from the predictions
        # themselves (test data), so every evaluated district has at least one
        # row and no district of the test set is left out. The in-memory frames
        # are used directly instead of re-reading the CSV files written above.
        dist_code = df_compare_addrcode_dist_withoutCD['addrcode'].unique()
        dist_rows = []

        # For each district
        for k in dist_code:
            # Both comparison frames share the same rows, so one mask serves both
            in_district = df_compare_addrcode_dist_withoutCD['addrcode'] == k

            # Pass the response values to the array for evaluation calculation
            array_true = np.array(df_compare_addrcode_dist_withoutCD.loc[in_district, 'actual'])
            array_pred_withoutCD = np.array(df_compare_addrcode_dist_withoutCD.loc[in_district, 'predicted'])
            array_pred_withCD = np.array(df_compare_addrcode_dist_withCD.loc[in_district, 'predicted'])

            # Calculate the evaluation values
            rmse_withoutCD_dist = mean_squared_error(array_true, array_pred_withoutCD) ** 0.5
            mae_withoutCD_dist = mean_absolute_error(array_true, array_pred_withoutCD)
            smape_withoutCD_dist = smape_fast(array_true, array_pred_withoutCD)
            r2_withoutCD_dist = r2_score(array_true, array_pred_withoutCD)

            rmse_withCD_dist = mean_squared_error(array_true, array_pred_withCD) ** 0.5
            mae_withCD_dist = mean_absolute_error(array_true, array_pred_withCD)
            smape_withCD_dist = smape_fast(array_true, array_pred_withCD)
            r2_withCD_dist = r2_score(array_true, array_pred_withCD)

            # Row order: addrcode, then (without CD, with CD, % improved) for
            # RMSE, MAE, SMAPE and R squared - same order as dist_header.
            dist_row = [k]
            for without_cd, with_cd in ((rmse_withoutCD_dist, rmse_withCD_dist),
                                        (mae_withoutCD_dist, mae_withCD_dist),
                                        (smape_withoutCD_dist, smape_withCD_dist),
                                        (r2_withoutCD_dist, r2_withCD_dist)):
                improved = (without_cd - with_cd) / without_cd if without_cd != 0 else np.nan
                dist_row.extend([without_cd, with_cd, improved])
            dist_rows.append(dist_row)

        # Header row followed by one row per district, joined in a single step
        if dist_rows:
            dist_array = np.append(dist_header, np.asarray(dist_rows), axis = 0)
        else:
            dist_array = dist_header

        pd.DataFrame(dist_array).to_csv(ma_prefix + '_ByDistrict_MA' + str(i) + '_DFma_' + str(j + 1)
                                        + '_eval_10.csv', header = False, encoding = 'utf-8')

# Evaluation file storing
# The formatter receives float arrays, as it did before.
# NOTE(review): the sub-district MA cell formats the same MA x DFma layout with
# evaluation_print; confirm evaluation_print_modified_lag is intended here.
rmse = np.asarray(rmse)
mae = np.asarray(mae)
smape = np.asarray(smape)
r2 = np.asarray(r2)

eval_array = evaluation_print_modified_lag(eval_array, 'RMSE', rmse)
eval_array = evaluation_print_modified_lag(eval_array, 'MAE', mae)
eval_array = evaluation_print_modified_lag(eval_array, 'SMAPE', smape)
eval_array = evaluation_print_modified_lag(eval_array, 'R squared', r2)

# Store all of the evaluation values into a CSV file
pd.DataFrame(eval_array).to_csv('Random Forest/' + province1 + '/Modified IA/RF_' + province2 
                                + '_dist_eval_10.csv', header = False, encoding = 'utf-8')

For original DF_0

In [84]:
# District level - Modified IA data - original DF_0 columns
#
# For every run DF_1 .. DF_6 two 10-tree random forests are trained, one
# without and one with the CD columns, and both are evaluated on the test
# file. The cell writes
#   * the test predictions of both models (one CSV per model and run),
#   * one per-district evaluation table per run,
#   * one summary table covering all runs.
#
# Header of the summary table (eval_array)
# col: head,
# DF_0 (without CD, with CD, % improved)

eval_array = np.asarray([['Evaluation', 'Without CD', 'With CD', '% improved']])

# Index 0 of every metric list is a 0 placeholder. The values follow in triples
# (without CD, with CD, % improved), one triple per run. The lists are
# converted to float arrays before they are handed to the formatter.
rmse = [0.0]
mae = [0.0]
smape = [0.0]
r2 = [0.0]

# Header row shared by every per-district evaluation file
dist_header = np.asarray([['addrcode', 'RMSE without CD', 'RMSE with CD', '% improved RMSE',
                           'MAE without CD', 'MAE with CD', '% improved MAE',
                           'SMAPE without CD', 'SMAPE with CD', '% improved SMAPE',
                           'R squared without CD', 'R squared with CD', '% improved R squared']])
dist_array = dist_header

# Flat list of labels: a nested list ([[...]]) would create a MultiIndex and
# make frame['addrcode'] return a DataFrame instead of a Series.
compare_columns = ['addrcode', 'Week', 'Year', 'actual', 'predicted']

# Get the input variables from CSV file
train_file_dir = 'Data/' + province1 + '/Modified IA/train_' + province2 + '_dist_IA_mavg2.csv'
test_file_dir = 'Data/' + province1 + '/Modified IA/test_' + province2 + '_dist_IA_mavg2.csv'

df_train_dist = pd.read_csv(train_file_dir, header = 0, skiprows = 0)
df_test_dist = pd.read_csv(test_file_dir, header = 0, skiprows = 0)

# Column map of the data files (iloc positions):
#   5-11   DF_0, DF_wm1, DF_wm2, DF_wm3, DF_wm4, DF_wm5, DF_wm6
#   20-21  RF_wm6, LST_wm4
#   22-30  bin_pop9s, bowl_pop9s, bucket_pop9s, misc_short_pop9s, jar_pop9s,
#          pottedplant_pop9s, tire_pop9s, misc_tall_pop9s, total_pop9s
#   31-39  bin, bowl, bucket, misc_short, jar, pottedplant, tire, misc_tall, total
#   40     image/area

# The following do not depend on the run, so they are prepared once.
# Columns 1, 2, 3 and 4 are stored as addrcode, Week, Year and actual.
df_test_addrcode_week_year_dist = df_test_dist.iloc[:, [1, 2, 3, 4]]

df_train_dist_withoutCD = df_train_dist.iloc[:, [20, 21]]
df_train_dist_withCD = df_train_dist.iloc[:, 20: 41]

df_test_dist_withoutCD = df_test_dist.iloc[:, [20, 21]]
df_test_dist_withCD = df_test_dist.iloc[:, 20: 41]

# labels: response (target) variable, always the DF_1 column
train_labels = np.array(df_train_dist['DF_1'])
test_labels = np.array(df_test_dist['DF_1'])

out_prefix = 'Random Forest/' + province1 + '/Modified IA/Original DF_0/RF_' + province2

# From DF_1 to DF_6
for i in range(6):
    # DF columns (5 + i) .. 11: run i drops the first i DF columns
    df_train_dist_DFinfo = df_train_dist.iloc[:, (5 + i): 12]
    df_test_dist_DFinfo = df_test_dist.iloc[:, (5 + i): 12]

    train_features_withoutCD = pd.concat([df_train_dist_DFinfo, df_train_dist_withoutCD], axis = 1)
    train_features_withCD = pd.concat([df_train_dist_DFinfo, df_train_dist_withCD], axis = 1)

    test_features_withoutCD = pd.concat([df_test_dist_DFinfo, df_test_dist_withoutCD], axis = 1)
    test_features_withCD = pd.concat([df_test_dist_DFinfo, df_test_dist_withCD], axis = 1)

    # Instantiate model with 10 decision trees
    rf_withoutCD = RandomForestRegressor(n_estimators = 10, random_state = 42)
    rf_withCD = RandomForestRegressor(n_estimators = 10, random_state = 42)

    # Train the model on training data
    rf_withoutCD.fit(train_features_withoutCD, train_labels)
    rf_withCD.fit(train_features_withCD, train_labels)

    # Use the forest's predict method on the test data
    predictions_withoutCD = rf_withoutCD.predict(test_features_withoutCD)
    predictions_withCD = rf_withCD.predict(test_features_withCD)

    df_pred_withoutCD = pd.DataFrame(predictions_withoutCD, columns = ['predicted'])
    df_pred_withCD = pd.DataFrame(predictions_withCD, columns = ['predicted'])

    # Store all of the predicted values to the CSV files
    run_prefix = out_prefix + '_dist_DF_' + str(i + 1)

    df_compare_addrcode_dist_withoutCD = pd.concat([df_test_addrcode_week_year_dist, df_pred_withoutCD], axis = 1)
    df_compare_addrcode_dist_withoutCD.columns = compare_columns
    df_compare_addrcode_dist_withoutCD.to_csv(run_prefix + '_withoutCD_10.csv', encoding = 'utf-8')

    df_compare_addrcode_dist_withCD = pd.concat([df_test_addrcode_week_year_dist, df_pred_withCD], axis = 1)
    df_compare_addrcode_dist_withCD.columns = compare_columns
    df_compare_addrcode_dist_withCD.to_csv(run_prefix + '_withCD_10.csv', encoding = 'utf-8')

    # Calculate the evaluation values on the whole test set
    rmse_withoutCD = mean_squared_error(test_labels, predictions_withoutCD) ** 0.5
    mae_withoutCD = mean_absolute_error(test_labels, predictions_withoutCD)
    r2_withoutCD = r2_score(test_labels, predictions_withoutCD)
    smape_withoutCD = smape_fast(test_labels, predictions_withoutCD)

    rmse_withCD = mean_squared_error(test_labels, predictions_withCD) ** 0.5
    mae_withCD = mean_absolute_error(test_labels, predictions_withCD)
    r2_withCD = r2_score(test_labels, predictions_withCD)
    smape_withCD = smape_fast(test_labels, predictions_withCD)

    # "% improved" = (without - with) / without. It is stored as NaN when the
    # baseline is 0, instead of raising ZeroDivisionError or producing inf.
    # NOTE(review): for R squared (higher is better) this formula is positive
    # when the model with CD is worse; kept as is to stay comparable with the
    # other result files - confirm the intended sign.
    for values, without_cd, with_cd in ((rmse, rmse_withoutCD, rmse_withCD),
                                        (mae, mae_withoutCD, mae_withCD),
                                        (smape, smape_withoutCD, smape_withCD),
                                        (r2, r2_withoutCD, r2_withCD)):
        improved = (without_cd - with_cd) / without_cd if without_cd != 0 else np.nan
        values.extend([without_cd, with_cd, improved])

    # Per-district evaluation. The districts are taken from the predictions
    # themselves (test data), so every evaluated district has at least one row
    # and no district of the test set is left out. The in-memory frames are
    # used directly instead of re-reading the CSV files written above.
    dist_code = df_compare_addrcode_dist_withoutCD['addrcode'].unique()
    dist_rows = []

    # For each district
    for j in dist_code:
        # Both comparison frames share the same rows, so one mask serves both
        in_district = df_compare_addrcode_dist_withoutCD['addrcode'] == j

        # Pass the response values to the array for evaluation calculation
        array_true = np.array(df_compare_addrcode_dist_withoutCD.loc[in_district, 'actual'])
        array_pred_withoutCD = np.array(df_compare_addrcode_dist_withoutCD.loc[in_district, 'predicted'])
        array_pred_withCD = np.array(df_compare_addrcode_dist_withCD.loc[in_district, 'predicted'])

        # Calculate the evaluation values
        rmse_withoutCD_dist = mean_squared_error(array_true, array_pred_withoutCD) ** 0.5
        mae_withoutCD_dist = mean_absolute_error(array_true, array_pred_withoutCD)
        smape_withoutCD_dist = smape_fast(array_true, array_pred_withoutCD)
        r2_withoutCD_dist = r2_score(array_true, array_pred_withoutCD)

        rmse_withCD_dist = mean_squared_error(array_true, array_pred_withCD) ** 0.5
        mae_withCD_dist = mean_absolute_error(array_true, array_pred_withCD)
        smape_withCD_dist = smape_fast(array_true, array_pred_withCD)
        r2_withCD_dist = r2_score(array_true, array_pred_withCD)

        # Row order: addrcode, then (without CD, with CD, % improved) for
        # RMSE, MAE, SMAPE and R squared - same order as dist_header.
        dist_row = [j]
        for without_cd, with_cd in ((rmse_withoutCD_dist, rmse_withCD_dist),
                                    (mae_withoutCD_dist, mae_withCD_dist),
                                    (smape_withoutCD_dist, smape_withCD_dist),
                                    (r2_withoutCD_dist, r2_withCD_dist)):
            improved = (without_cd - with_cd) / without_cd if without_cd != 0 else np.nan
            dist_row.extend([without_cd, with_cd, improved])
        dist_rows.append(dist_row)

    # Header row followed by one row per district, joined in a single step
    if dist_rows:
        dist_array = np.append(dist_header, np.asarray(dist_rows), axis = 0)
    else:
        dist_array = dist_header

    pd.DataFrame(dist_array).to_csv(out_prefix + '_ByDistrict_DF_' + str(i + 1) + '_eval_10.csv',
                                    header = False, encoding = 'utf-8')

# Evaluation file storing
# The formatter receives float arrays, as it did before; each call returns the
# updated eval_array for one metric.
rmse = np.asarray(rmse)
mae = np.asarray(mae)
smape = np.asarray(smape)
r2 = np.asarray(r2)

eval_array = evaluation_print_original(eval_array, 'RMSE', rmse)
eval_array = evaluation_print_original(eval_array, 'MAE', mae)
eval_array = evaluation_print_original(eval_array, 'SMAPE', smape)
eval_array = evaluation_print_original(eval_array, 'R squared', r2)

# Store all of the evaluation values into a CSV file
pd.DataFrame(eval_array).to_csv('Random Forest/' + province1 
                                + '/Modified IA/Original DF_0/RF_' + province2 
                                + '_dist_eval_10.csv', header = False, encoding = 'utf-8')

<h2>Sub-district level</h2>
For the moving averages (MA2 to MA4)

In [85]:
# Sub-district level - Modified IA data - moving averages MA2 to MA4
#
# For every moving-average file (MA2, MA3, MA4) and every run DFma_1 .. DFma_6
# two 10-tree random forests are trained, one without and one with the CD
# columns, and both are evaluated on the test file. The cell writes
#   * the test predictions of both models (one CSV per model and run),
#   * one per-sub-district evaluation table per (MA, DFma) run,
#   * one summary table covering all runs.
#
# Layout of the summary table (eval_array)
# row: head,
# RMSE (DFma_1 - DFma_6),
# MAE (DFma_1 - DFma_6),
# SMAPE (DFma_1 - DFma_6),
# R-squared (DFma_1 - DFma_6)
#
# col: head,
# MA2 (without CD, with CD, % improved),
# MA3 (without CD, with CD, % improved),
# MA4 (without CD, with CD, % improved)

eval_array = np.asarray([['Evaluation', 'MA2 without CD', 'MA2 with CD', 'MA2 % improved',
                         'MA3 without CD', 'MA3 with CD', 'MA3 % improved',
                         'MA4 without CD', 'MA4 with CD', 'MA4 % improved']])

# Index 0 of every metric list is a 0 placeholder. The values follow in triples
# (without CD, with CD, % improved), one triple per (MA, DFma) run. The lists
# are converted to float arrays before they are handed to the formatter.
rmse = [0.0]
mae = [0.0]
smape = [0.0]
r2 = [0.0]

# Header row shared by every per-sub-district evaluation file
subdist_header = np.asarray([['addrcode', 'RMSE without CD', 'RMSE with CD', '% improved RMSE',
                              'MAE without CD', 'MAE with CD', '% improved MAE',
                              'SMAPE without CD', 'SMAPE with CD', '% improved SMAPE',
                              'R squared without CD', 'R squared with CD', '% improved R squared']])
subdist_array = subdist_header

# Flat list of labels: a nested list ([[...]]) would create a MultiIndex and
# make frame['addrcode'] return a DataFrame instead of a Series.
compare_columns = ['addrcode', 'Week', 'Year', 'actual', 'predicted']

# Starting from MA2 to MA4
for i in range(2, 5):
    # Get the input variables from CSV file
    train_file_dir = 'Data/' + province1 + '/Modified IA/train_' + province2 + '_subdist_IA_mavg' + str(i) + '.csv'
    test_file_dir = 'Data/' + province1 + '/Modified IA/test_' + province2 + '_subdist_IA_mavg' + str(i) + '.csv'

    df_train_subdist = pd.read_csv(train_file_dir, header=0, skiprows=0)
    df_test_subdist = pd.read_csv(test_file_dir, header=0, skiprows=0)

    # The following do not depend on j, so they are prepared once per MA file.
    # Columns 1, 2, 3 and 12 are stored as addrcode, Week, Year and actual.
    df_test_addrcode_week_year_subdist = df_test_subdist.iloc[:, [1, 2, 3, 12]]

    # labels: response (target) variable, always the DFma_1 column
    train_labels = np.array(df_train_subdist['DFma_1'])
    test_labels = np.array(df_test_subdist['DFma_1'])

    ma_prefix = 'Random Forest/' + province1 + '/Modified IA/MA' + str(i) + '/RF_' + province2

    # Continue on DFma_1 to DFma_6
    for j in range(6):
        # Column map of the data files (iloc positions):
        #   13-19  DFma_0, DFma_wm1, DFma_wm2, DFma_wm3, DFma_wm4, DFma_wm5, DFma_wm6
        #   20-21  RF_wm6, LST_wm4
        #   22-30  bin_pop9s, bowl_pop9s, bucket_pop9s, misc_short_pop9s, jar_pop9s,
        #          pottedplant_pop9s, tire_pop9s, misc_tall_pop9s, total_pop9s
        #   31-39  bin, bowl, bucket, misc_short, jar, pottedplant, tire, misc_tall, total
        # Without CD: columns (13 + j) .. 21; with CD: columns (13 + j) .. 39.
        # Run j therefore drops the first j DFma columns from the feature window.
        first_feature_col = 13 + j
        train_features_withoutCD = df_train_subdist.iloc[:, first_feature_col: 22]
        train_features_withCD = df_train_subdist.iloc[:, first_feature_col: 40]

        test_features_withoutCD = df_test_subdist.iloc[:, first_feature_col: 22]
        test_features_withCD = df_test_subdist.iloc[:, first_feature_col: 40]

        # Instantiate model with 10 decision trees
        rf_withoutCD = RandomForestRegressor(n_estimators = 10, random_state = 42)
        rf_withCD = RandomForestRegressor(n_estimators = 10, random_state = 42)

        # Train the model on training data
        rf_withoutCD.fit(train_features_withoutCD, train_labels)
        rf_withCD.fit(train_features_withCD, train_labels)

        # Use the forest's predict method on the test data
        predictions_withoutCD = rf_withoutCD.predict(test_features_withoutCD)
        predictions_withCD = rf_withCD.predict(test_features_withCD)

        df_pred_withoutCD = pd.DataFrame(predictions_withoutCD, columns = ['predicted'])
        df_pred_withCD = pd.DataFrame(predictions_withCD, columns = ['predicted'])

        # Store all of the predicted values to the CSV files
        run_prefix = ma_prefix + '_subdist_MA' + str(i) + '_DFma_' + str(j + 1)

        df_compare_addrcode_subdist_withoutCD = pd.concat([df_test_addrcode_week_year_subdist, df_pred_withoutCD], axis = 1)
        df_compare_addrcode_subdist_withoutCD.columns = compare_columns
        df_compare_addrcode_subdist_withoutCD.to_csv(run_prefix + '_withoutCD_10.csv', encoding = 'utf-8')

        df_compare_addrcode_subdist_withCD = pd.concat([df_test_addrcode_week_year_subdist, df_pred_withCD], axis = 1)
        df_compare_addrcode_subdist_withCD.columns = compare_columns
        df_compare_addrcode_subdist_withCD.to_csv(run_prefix + '_withCD_10.csv', encoding = 'utf-8')

        # Calculate the evaluation values on the whole test set
        rmse_withoutCD = mean_squared_error(test_labels, predictions_withoutCD) ** 0.5
        mae_withoutCD = mean_absolute_error(test_labels, predictions_withoutCD)
        r2_withoutCD = r2_score(test_labels, predictions_withoutCD)
        smape_withoutCD = smape_fast(test_labels, predictions_withoutCD)

        rmse_withCD = mean_squared_error(test_labels, predictions_withCD) ** 0.5
        mae_withCD = mean_absolute_error(test_labels, predictions_withCD)
        r2_withCD = r2_score(test_labels, predictions_withCD)
        smape_withCD = smape_fast(test_labels, predictions_withCD)

        # "% improved" = (without - with) / without. It is stored as NaN when the
        # baseline is 0, instead of raising ZeroDivisionError or producing inf.
        # NOTE(review): for R squared (higher is better) this formula is positive
        # when the model with CD is worse; kept as is to stay comparable with
        # the other result files - confirm the intended sign.
        for values, without_cd, with_cd in ((rmse, rmse_withoutCD, rmse_withCD),
                                            (mae, mae_withoutCD, mae_withCD),
                                            (smape, smape_withoutCD, smape_withCD),
                                            (r2, r2_withoutCD, r2_withCD)):
            improved = (without_cd - with_cd) / without_cd if without_cd != 0 else np.nan
            values.extend([without_cd, with_cd, improved])

        # Per-sub-district evaluation. The sub-districts are taken from the
        # predictions themselves (test data), so every evaluated sub-district has
        # at least one row and none of the test set is left out. The in-memory
        # frames are used directly instead of re-reading the CSV files written above.
        subdist_code = df_compare_addrcode_subdist_withoutCD['addrcode'].unique()
        subdist_rows = []

        # For each sub-district
        for k in subdist_code:
            # Both comparison frames share the same rows, so one mask serves both
            in_subdist = df_compare_addrcode_subdist_withoutCD['addrcode'] == k

            # Pass the response values to the array for evaluation calculation
            array_true = np.array(df_compare_addrcode_subdist_withoutCD.loc[in_subdist, 'actual'])
            array_pred_withoutCD = np.array(df_compare_addrcode_subdist_withoutCD.loc[in_subdist, 'predicted'])
            array_pred_withCD = np.array(df_compare_addrcode_subdist_withCD.loc[in_subdist, 'predicted'])

            # Calculate the evaluation values
            rmse_withoutCD_subdist = mean_squared_error(array_true, array_pred_withoutCD) ** 0.5
            mae_withoutCD_subdist = mean_absolute_error(array_true, array_pred_withoutCD)
            smape_withoutCD_subdist = smape_fast(array_true, array_pred_withoutCD)
            r2_withoutCD_subdist = r2_score(array_true, array_pred_withoutCD)

            rmse_withCD_subdist = mean_squared_error(array_true, array_pred_withCD) ** 0.5
            mae_withCD_subdist = mean_absolute_error(array_true, array_pred_withCD)
            smape_withCD_subdist = smape_fast(array_true, array_pred_withCD)
            r2_withCD_subdist = r2_score(array_true, array_pred_withCD)

            # Row order: addrcode, then (without CD, with CD, % improved) for
            # RMSE, MAE, SMAPE and R squared - same order as subdist_header.
            subdist_row = [k]
            for without_cd, with_cd in ((rmse_withoutCD_subdist, rmse_withCD_subdist),
                                        (mae_withoutCD_subdist, mae_withCD_subdist),
                                        (smape_withoutCD_subdist, smape_withCD_subdist),
                                        (r2_withoutCD_subdist, r2_withCD_subdist)):
                improved = (without_cd - with_cd) / without_cd if without_cd != 0 else np.nan
                subdist_row.extend([without_cd, with_cd, improved])
            subdist_rows.append(subdist_row)

        # Header row followed by one row per sub-district, joined in a single step
        if subdist_rows:
            subdist_array = np.append(subdist_header, np.asarray(subdist_rows), axis = 0)
        else:
            subdist_array = subdist_header

        pd.DataFrame(subdist_array).to_csv(ma_prefix + '_BySubDistrict_MA' + str(i) + '_DFma_'
                                           + str(j + 1) + '_eval_10.csv', header = False, encoding = 'utf-8')

# Evaluation file storing
# From RMSE DFma_1 to R squared DFma_6
# The formatter receives float arrays, as it did before.
rmse = np.asarray(rmse)
mae = np.asarray(mae)
smape = np.asarray(smape)
r2 = np.asarray(r2)

eval_array = evaluation_print(eval_array, 'RMSE', rmse)
eval_array = evaluation_print(eval_array, 'MAE', mae)
eval_array = evaluation_print(eval_array, 'SMAPE', smape)
eval_array = evaluation_print(eval_array, 'R squared', r2)

# Store all of the evaluation values into a CSV file
pd.DataFrame(eval_array).to_csv('Random Forest/' + province1 + '/Modified IA/RF_' + province2 
                                + '_subdist_eval_10.csv', header = False, encoding = 'utf-8')

For original DF_0

In [86]:
# Sub-district level, original DF_0 data: train one random forest without CD
# attributes and one with all CD attributes, then compare them overall and per
# sub-district.
#
# Arrays of all evaluation values
# row: head,
# RMSE (DF_1), 
# MAE (DF_1), 
# SMAPE (DF_1), 
# R-squared (DF_1)

# col: head,
# DF_0 (without CD, with CD, % improved)

eval_array = np.asarray([['Evaluation', 'Without CD', 'With CD', '% improved']])

# Index 0 of each metric array is a placeholder zero; every pass of the loop
# below appends three values: without CD, with CD, % improved.
rmse = np.zeros(1)
mae = np.zeros(1)
smape = np.zeros(1)
r2 = np.zeros(1)

# Header row of the per-sub-district evaluation table
subdist_header = np.asarray([['addrcode', 'RMSE without CD', 'RMSE with CD', '% improved RMSE', 
                              'MAE without CD', 'MAE with CD', '% improved MAE', 
                              'SMAPE without CD', 'SMAPE with CD', '% improved SMAPE', 
                              'R squared without CD', 'R squared with CD', '% improved R squared']])
subdist_array = subdist_header

# Get the input variables from CSV file
train_file_dir = 'Data/' + province1 + '/Modified IA/train_' + province2 + '_subdist_IA_mavg2.csv'
test_file_dir = 'Data/' + province1 + '/Modified IA/test_' + province2 + '_subdist_IA_mavg2.csv'

df_train_subdist = pd.read_csv(train_file_dir, header = 0, skiprows = 0)
df_test_subdist = pd.read_csv(test_file_dir, header = 0, skiprows = 0)

# Column map of the input files (positions used with iloc below)
# DF_0 [col 5], DF_wm1 [col 6], DF_wm2 [col 7], DF_wm3 [col 8],
# DF_wm4 [col 9], DF_wm5 [col 10], DF_wm6 [col 11],
# RF_wm6 [col 20], LST_wm4 [col 21],
# bin_pop9s [col 22], bowl_pop9s [col 23], bucket_pop9s [col 24],
# misc_short_pop9s [col 25], jar_pop9s [col 26], pottedplant_pop9s [col 27],
# tire_pop9s [col 28], misc_tall_pop9s [col 29], total_pop9s [col 30],
# bin [col 31], bowl [col 32], bucket [col 33], misc_short [col 34],
# jar [col 35], pottedplant [col 36], tire [col 37], misc_tall [col 38],
# and total [col 39]

# ---- Pieces that do not depend on the loop index ----

# addrcode, week, year and actual values of the test set
df_test_addrcode_week_year_subdist = df_test_subdist.iloc[:, [1, 2, 3, 4]]

# Without CD: RF_wm6 and LST_wm4 only; with CD: cols 20 - 39
df_train_subdist_withoutCD = df_train_subdist.iloc[:, [20, 21]]
df_train_subdist_withCD = df_train_subdist.iloc[:, 20: 40]
df_test_subdist_withoutCD = df_test_subdist.iloc[:, [20, 21]]
df_test_subdist_withCD = df_test_subdist.iloc[:, 20: 40]

# labels: the response (target) variable is always the 'DF_1' column
train_labels = np.array(df_train_subdist['DF_1'])
test_labels = np.array(df_test_subdist['DF_1'])

# Sub-district codes, in order of first appearance in the training data
subdist_code = df_train_subdist['addrcode'].unique()

# A flat list keeps ordinary single-level column labels (a nested list would
# build a MultiIndex), so the frames can be filtered in memory further down.
compare_columns = ['addrcode', 'Week', 'Year', 'actual', 'predicted']

output_prefix = 'Random Forest/' + province1 + '/Modified IA/Original DF_0/RF_' + province2

# Pass i drops the first i DF columns (cols 5 .. 5 + i - 1) from the features
# and stores its results under the label DF_(i + 1).
for i in range(6):
    df_train_subdist_DFinfo = df_train_subdist.iloc[:, (5 + i):12]
    df_test_subdist_DFinfo = df_test_subdist.iloc[:, (5 + i):12]
        
    train_features_withoutCD = pd.concat([df_train_subdist_DFinfo, df_train_subdist_withoutCD], axis = 1)
    train_features_withCD = pd.concat([df_train_subdist_DFinfo, df_train_subdist_withCD], axis = 1)
    
    test_features_withoutCD = pd.concat([df_test_subdist_DFinfo, df_test_subdist_withoutCD], axis = 1)
    test_features_withCD = pd.concat([df_test_subdist_DFinfo, df_test_subdist_withCD], axis = 1)
    
    # Instantiate model with 10 decision trees
    rf_withoutCD = RandomForestRegressor(n_estimators = 10, random_state = 42)
    rf_withCD = RandomForestRegressor(n_estimators = 10, random_state = 42)

    # Train the model on training data
    rf_withoutCD.fit(train_features_withoutCD, train_labels)
    rf_withCD.fit(train_features_withCD, train_labels)

    # Use the forest's predict method on the test data
    predictions_withoutCD = rf_withoutCD.predict(test_features_withoutCD)
    predictions_withCD = rf_withCD.predict(test_features_withCD)
    
    df_pred_withoutCD = pd.DataFrame(predictions_withoutCD, columns = ['predicted'])
    df_pred_withCD = pd.DataFrame(predictions_withCD, columns = ['predicted'])
        
    # Store all of the predicted values to the CSV files
    df_compare_addrcode_subdist_withoutCD = pd.concat([df_test_addrcode_week_year_subdist, df_pred_withoutCD], axis = 1)
    df_compare_addrcode_subdist_withoutCD.columns = compare_columns
    df_compare_addrcode_subdist_withoutCD.to_csv(output_prefix + '_subdist_DF_' + str(i + 1) 
                                                 + '_withoutCD_10.csv', encoding = 'utf-8')

    df_compare_addrcode_subdist_withCD = pd.concat([df_test_addrcode_week_year_subdist, df_pred_withCD], axis = 1)
    df_compare_addrcode_subdist_withCD.columns = compare_columns
    df_compare_addrcode_subdist_withCD.to_csv(output_prefix + '_subdist_DF_' + str(i + 1) 
                                              + '_withCD_10.csv', encoding = 'utf-8')

    # Calculate the evaluation values over the whole test set
    rmse_withoutCD = mean_squared_error(test_labels, predictions_withoutCD) ** 0.5
    mae_withoutCD = mean_absolute_error(test_labels, predictions_withoutCD)
    r2_withoutCD = r2_score(test_labels, predictions_withoutCD)
    smape_withoutCD = smape_fast(test_labels, predictions_withoutCD)
        
    rmse_withCD = mean_squared_error(test_labels, predictions_withCD) ** 0.5
    mae_withCD = mean_absolute_error(test_labels, predictions_withCD)
    r2_withCD = r2_score(test_labels, predictions_withCD)
    smape_withCD = smape_fast(test_labels, predictions_withCD)
        
    # "% improved" is stored as a fraction of the without-CD value (it is not
    # multiplied by 100).
    # NOTE(review): R squared is better when larger, so with this formula a
    # positive value means R squared dropped when CD was added (for a positive
    # baseline) - confirm this sign convention is intended.
    rmse_percent_improved = (rmse_withoutCD - rmse_withCD) / rmse_withoutCD
    mae_percent_improved = (mae_withoutCD - mae_withCD) / mae_withoutCD
    smape_percent_improved = (smape_withoutCD - smape_withCD) / smape_withoutCD
    r2_percent_improved = (r2_withoutCD - r2_withCD) / r2_withoutCD
        
    rmse = np.append(rmse, [rmse_withoutCD, rmse_withCD, rmse_percent_improved])
    mae = np.append(mae, [mae_withoutCD, mae_withCD, mae_percent_improved])
    smape = np.append(smape, [smape_withoutCD, smape_withCD, smape_percent_improved])
    r2 = np.append(r2, [r2_withoutCD, r2_withCD, r2_percent_improved])
        
    # Per-sub-district table for this pass: header first, one row per code
    subdist_rows = [subdist_header]
    
    # For each sub-district
    for j in subdist_code:
            
        # Get the subset of actual and predicted values according to the sub-district code
        subset_withoutCD = df_compare_addrcode_subdist_withoutCD.loc[
            df_compare_addrcode_subdist_withoutCD['addrcode'] == j]
        subset_withCD = df_compare_addrcode_subdist_withCD.loc[
            df_compare_addrcode_subdist_withCD['addrcode'] == j]
        
        # The codes come from the training data; a code with no test rows has
        # nothing to score (the metric functions reject empty input).
        if subset_withoutCD.empty:
            continue
            
        # Pass the response values to the array for evaluation calculation
        array_true = np.array(subset_withoutCD['actual'])
        array_pred_withoutCD = np.array(subset_withoutCD['predicted'])
        array_pred_withCD = np.array(subset_withCD['predicted'])
            
        # Calculate the evaluation values, ordered RMSE, MAE, SMAPE, R squared
        metrics_withoutCD = np.array([mean_squared_error(array_true, array_pred_withoutCD) ** 0.5,
                                      mean_absolute_error(array_true, array_pred_withoutCD),
                                      smape_fast(array_true, array_pred_withoutCD),
                                      r2_score(array_true, array_pred_withoutCD)], dtype = float)
        metrics_withCD = np.array([mean_squared_error(array_true, array_pred_withCD) ** 0.5,
                                   mean_absolute_error(array_true, array_pred_withCD),
                                   smape_fast(array_true, array_pred_withCD),
                                   r2_score(array_true, array_pred_withCD)], dtype = float)
        
        # Array division yields nan / inf for a zero baseline instead of
        # raising ZeroDivisionError on plain Python floats.
        with np.errstate(divide = 'ignore', invalid = 'ignore'):
            metrics_improved = (metrics_withoutCD - metrics_withCD) / metrics_withoutCD
            
        # Row order: for each metric -> without CD, with CD, % improved
        row_values = np.column_stack([metrics_withoutCD, metrics_withCD, metrics_improved]).ravel()
        
        # Convert to text explicitly rather than relying on implicit
        # number-to-string promotion when joining with the header row.
        subdist_rows.append(np.asarray([[j] + row_values.tolist()]).astype(str))

    subdist_array = np.concatenate(subdist_rows, axis = 0)
    pd.DataFrame(subdist_array).to_csv(output_prefix + '_BySubDistrict_DF_' + str(i + 1) + '_eval_10.csv', 
                                       header = False, encoding = 'utf-8')
    
# Evaluation file storing
eval_array = evaluation_print_original(eval_array, 'RMSE', rmse)
eval_array = evaluation_print_original(eval_array, 'MAE', mae)
eval_array = evaluation_print_original(eval_array, 'SMAPE', smape)
eval_array = evaluation_print_original(eval_array, 'R squared', r2)

# Store all of the evaluation values into a CSV file
pd.DataFrame(eval_array).to_csv(output_prefix + '_subdist_eval_10.csv', header = False, encoding = 'utf-8')

<h1>2 Top rank CD</h1>
Uses the datasets in the <code>Combined CD</code> folder, keeping only the two top-ranked CD attributes for each province.

<h2>District level</h2>
For MAs

In [41]:
# District level, moving averages MA2 - MA4: train one random forest without
# CD attributes and one with the two selected CD attributes of the province,
# then compare them overall and per district.
#
# Arrays of all evaluation values
# row: head,
# RMSE (DFma_1 - DFma_6), 
# MAE (DFma_1 - DFma_6), 
# SMAPE (DFma_1 - DFma_6), 
# R-squared (DFma_1 - DFma_6)

# col: head,
# MA2 (without CD, with CD, % improved),
# MA3 (without CD, with CD, % improved),
# MA4 (without CD, with CD, % improved)

eval_array = np.asarray([['Evaluation', 'MA2 without CD', 'MA2 with CD', 'MA2 % improved', 
                         'MA3 without CD', 'MA3 with CD', 'MA3 % improved', 
                         'MA4 without CD', 'MA4 with CD', 'MA4 % improved']])

# Index 0 of each metric array is a placeholder zero; every (MA, DFma) pass
# below appends three values: without CD, with CD, % improved.
rmse = np.zeros(1)
mae = np.zeros(1)
smape = np.zeros(1)
r2 = np.zeros(1)

# Header row of the per-district evaluation table
dist_header = np.asarray([['addrcode', 'RMSE without CD', 'RMSE with CD', '% improved RMSE', 
                           'MAE without CD', 'MAE with CD', '% improved MAE', 
                           'SMAPE without CD', 'SMAPE with CD', '% improved SMAPE', 
                           'R squared without CD', 'R squared with CD', '% improved R squared']])
dist_array = dist_header

# Column map of the input files (positions used with iloc below)
# DFma_0 [col 13], DFma_wm1 [col 14], DFma_wm2 [col 15], DFma_wm3 [col 16],
# DFma_wm4 [col 17], DFma_wm5 [col 18], DFma_wm6 [col 19],
# RF_wm6 [col 20], LST_wm4 [col 21],
# bin_pop9s [col 22], bowl_pop9s [col 23], bucket_pop9s [col 24],
# misc_short_pop9s [col 25], jar_pop9s [col 26], pottedplant_pop9s [col 27],
# tire_pop9s [col 28], misc_tall_pop9s [col 29], total_pop9s [col 30],
# bin [col 31], bowl [col 32], bucket [col 33], misc_short [col 34],
# jar [col 35], pottedplant [col 36], tire [col 37], misc_tall [col 38],
# and total [col 39]

# The two CD columns used per province, in the order they are appended to the
# feature matrix.
top_cd_columns = {'NST': [26, 22], 'Krabi': [35, 33], 'Bangkok': [26, 33]}
if province1 not in top_cd_columns:
    # Without this check an unknown province would either fail later with a
    # NameError or silently reuse CD columns left over from an earlier run.
    raise ValueError('No selected CD columns defined for province: ' + str(province1))
cd_columns = top_cd_columns[province1]

# A flat list keeps ordinary single-level column labels (a nested list would
# build a MultiIndex), so the frames can be filtered in memory further down.
compare_columns = ['addrcode', 'Week', 'Year', 'actual', 'predicted']

# Starting from MA2 to MA4
for i in range(2, 5):
    # Get the input variables from CSV file
    train_file_dir = 'Data/' + province1 + '/Combined CD/train_' + province2 + '_dist_combined_mavg' + str(i) + '.csv'
    test_file_dir = 'Data/' + province1 + '/Combined CD/test_' + province2 + '_dist_combined_mavg' + str(i) + '.csv'
    
    df_train_dist = pd.read_csv(train_file_dir, header = 0, skiprows = 0)
    df_test_dist = pd.read_csv(test_file_dir, header = 0, skiprows = 0)
    
    # ---- Pieces that depend on the MA window only ----
    
    # addrcode, week, year and actual values of the test set
    df_test_addrcode_week_year_dist = df_test_dist.iloc[:, [1, 2, 3, 12]]
    
    # The two selected CD attributes
    train_CD_attr = df_train_dist.iloc[:, cd_columns]
    test_CD_attr = df_test_dist.iloc[:, cd_columns]
    
    # labels: the response (target) variable is always the 'DFma_1' column
    train_labels = np.array(df_train_dist['DFma_1'])
    test_labels = np.array(df_test_dist['DFma_1'])
    
    # District codes, in order of first appearance in the training data
    dist_code = df_train_dist['addrcode'].unique()
    
    output_prefix = 'Random Forest/' + province1 + '/Selected CD/MA' + str(i) + '/RF_' + province2
    
    # Pass j drops the first j DFma columns (cols 13 .. 13 + j - 1) from the
    # features and stores its results under the label DFma_(j + 1).
    for j in range(6):
        run_tag = '_MA' + str(i) + '_DFma_' + str(j + 1)
        
        # Without CD: remaining DFma columns plus RF_wm6 and LST_wm4
        train_features_withoutCD = df_train_dist.iloc[:, (13 + j): 22]
        test_features_withoutCD = df_test_dist.iloc[:, (13 + j): 22]
        
        # With CD: the same columns plus the two selected CD attributes
        train_features_withCD = pd.concat([train_features_withoutCD, train_CD_attr], axis = 1)
        test_features_withCD = pd.concat([test_features_withoutCD, test_CD_attr], axis = 1)

        # Instantiate model with 10 decision trees
        rf_withoutCD = RandomForestRegressor(n_estimators = 10, random_state = 42)
        rf_withCD = RandomForestRegressor(n_estimators = 10, random_state = 42)

        # Train the model on training data
        rf_withoutCD.fit(train_features_withoutCD, train_labels)
        rf_withCD.fit(train_features_withCD, train_labels)

        # Use the forest's predict method on the test data
        predictions_withoutCD = rf_withoutCD.predict(test_features_withoutCD)
        predictions_withCD = rf_withCD.predict(test_features_withCD)
        
        df_pred_withoutCD = pd.DataFrame(predictions_withoutCD, columns = ['predicted'])
        df_pred_withCD = pd.DataFrame(predictions_withCD, columns = ['predicted'])
        
        # Store all of the predicted values to the CSV files
        df_compare_addrcode_dist_withoutCD = pd.concat([df_test_addrcode_week_year_dist, df_pred_withoutCD], axis = 1)
        df_compare_addrcode_dist_withoutCD.columns = compare_columns
        df_compare_addrcode_dist_withoutCD.to_csv(output_prefix + '_dist' + run_tag + '_withoutCD_10.csv', 
                                                  encoding = 'utf-8')

        df_compare_addrcode_dist_withCD = pd.concat([df_test_addrcode_week_year_dist, df_pred_withCD], axis = 1)
        df_compare_addrcode_dist_withCD.columns = compare_columns
        df_compare_addrcode_dist_withCD.to_csv(output_prefix + '_dist' + run_tag + '_withCD_10.csv', 
                                               encoding = 'utf-8')

        # Calculate the evaluation values over the whole test set
        rmse_withoutCD = mean_squared_error(test_labels, predictions_withoutCD) ** 0.5
        mae_withoutCD = mean_absolute_error(test_labels, predictions_withoutCD)
        r2_withoutCD = r2_score(test_labels, predictions_withoutCD)
        smape_withoutCD = smape_fast(test_labels, predictions_withoutCD)
        
        rmse_withCD = mean_squared_error(test_labels, predictions_withCD) ** 0.5
        mae_withCD = mean_absolute_error(test_labels, predictions_withCD)
        r2_withCD = r2_score(test_labels, predictions_withCD)
        smape_withCD = smape_fast(test_labels, predictions_withCD)
        
        # "% improved" is stored as a fraction of the without-CD value (it is
        # not multiplied by 100).
        # NOTE(review): R squared is better when larger, so with this formula
        # a positive value means R squared dropped when CD was added (for a
        # positive baseline) - confirm this sign convention is intended.
        rmse_percent_improved = (rmse_withoutCD - rmse_withCD) / rmse_withoutCD
        mae_percent_improved = (mae_withoutCD - mae_withCD) / mae_withoutCD
        smape_percent_improved = (smape_withoutCD - smape_withCD) / smape_withoutCD
        r2_percent_improved = (r2_withoutCD - r2_withCD) / r2_withoutCD
        
        rmse = np.append(rmse, [rmse_withoutCD, rmse_withCD, rmse_percent_improved])
        mae = np.append(mae, [mae_withoutCD, mae_withCD, mae_percent_improved])
        smape = np.append(smape, [smape_withoutCD, smape_withCD, smape_percent_improved])
        r2 = np.append(r2, [r2_withoutCD, r2_withCD, r2_percent_improved])
        
        # Per-district table for this pass: header first, one row per code
        dist_rows = [dist_header]
        
        # For each district
        for k in dist_code:

            # Get the subset of actual and predicted values according to the district code
            subset_withoutCD = df_compare_addrcode_dist_withoutCD.loc[
                df_compare_addrcode_dist_withoutCD['addrcode'] == k]
            subset_withCD = df_compare_addrcode_dist_withCD.loc[
                df_compare_addrcode_dist_withCD['addrcode'] == k]
            
            # The codes come from the training data; a code with no test rows
            # has nothing to score (the metric functions reject empty input).
            if subset_withoutCD.empty:
                continue
            
            # Pass the response values to the array for evaluation calculation
            array_true = np.array(subset_withoutCD['actual'])
            array_pred_withoutCD = np.array(subset_withoutCD['predicted'])
            array_pred_withCD = np.array(subset_withCD['predicted'])
            
            # Calculate the evaluation values, ordered RMSE, MAE, SMAPE, R squared
            metrics_withoutCD = np.array([mean_squared_error(array_true, array_pred_withoutCD) ** 0.5,
                                          mean_absolute_error(array_true, array_pred_withoutCD),
                                          smape_fast(array_true, array_pred_withoutCD),
                                          r2_score(array_true, array_pred_withoutCD)], dtype = float)
            metrics_withCD = np.array([mean_squared_error(array_true, array_pred_withCD) ** 0.5,
                                       mean_absolute_error(array_true, array_pred_withCD),
                                       smape_fast(array_true, array_pred_withCD),
                                       r2_score(array_true, array_pred_withCD)], dtype = float)
            
            # Array division yields nan / inf for a zero baseline instead of
            # raising ZeroDivisionError on plain Python floats.
            with np.errstate(divide = 'ignore', invalid = 'ignore'):
                metrics_improved = (metrics_withoutCD - metrics_withCD) / metrics_withoutCD
            
            # Row order: for each metric -> without CD, with CD, % improved
            row_values = np.column_stack([metrics_withoutCD, metrics_withCD, metrics_improved]).ravel()
            
            # Convert to text explicitly rather than relying on implicit
            # number-to-string promotion when joining with the header row.
            dist_rows.append(np.asarray([[k] + row_values.tolist()]).astype(str))

        dist_array = np.concatenate(dist_rows, axis = 0)
        pd.DataFrame(dist_array).to_csv(output_prefix + '_ByDistrict' + run_tag + '_eval_10.csv', 
                                        header = False, encoding = 'utf-8')
    
# Evaluation file storing
# From RMSE 1-week to R squared 6-week
eval_array = evaluation_print_modified_lag(eval_array, 'RMSE', rmse)
eval_array = evaluation_print_modified_lag(eval_array, 'MAE', mae)
eval_array = evaluation_print_modified_lag(eval_array, 'SMAPE', smape)
eval_array = evaluation_print_modified_lag(eval_array, 'R squared', r2)

# Store all of the evaluation values into a CSV file
pd.DataFrame(eval_array).to_csv('Random Forest/' + province1 + '/Selected CD/RF_' + province2 
                                + '_dist_eval_10.csv', header = False, encoding = 'utf-8')

For original DF_0

In [42]:
# District level, original DF_0 data: train one random forest without CD
# attributes and one with the two selected CD attributes of the province,
# then compare them overall and per district.
#
# Arrays of all evaluation values
# row: head,
# RMSE (DF_1), 
# MAE (DF_1), 
# SMAPE (DF_1), 
# R-squared (DF_1)

# col: head,
# DF_0 (without CD, with CD, % improved)

eval_array = np.asarray([['Evaluation', 'Without CD', 'With CD', '% improved']])

# Index 0 of each metric array is a placeholder zero; every pass of the loop
# below appends three values: without CD, with CD, % improved.
rmse = np.zeros(1)
mae = np.zeros(1)
smape = np.zeros(1)
r2 = np.zeros(1)

# Header row of the per-district evaluation table
dist_header = np.asarray([['addrcode', 'RMSE without CD', 'RMSE with CD', '% improved RMSE', 
                           'MAE without CD', 'MAE with CD', '% improved MAE', 
                           'SMAPE without CD', 'SMAPE with CD', '% improved SMAPE', 
                           'R squared without CD', 'R squared with CD', '% improved R squared']])
dist_array = dist_header

# Get the input variables from CSV file
train_file_dir = 'Data/' + province1 + '/Combined CD/train_' + province2 + '_dist_combined_mavg2.csv'
test_file_dir = 'Data/' + province1 + '/Combined CD/test_' + province2 + '_dist_combined_mavg2.csv'

df_train_dist = pd.read_csv(train_file_dir, header = 0, skiprows = 0)
df_test_dist = pd.read_csv(test_file_dir, header = 0, skiprows = 0)

# Column map of the input files (positions used with iloc below)
# DF_0 [col 5], DF_wm1 [col 6], DF_wm2 [col 7], DF_wm3 [col 8],
# DF_wm4 [col 9], DF_wm5 [col 10], DF_wm6 [col 11],
# RF_wm6 [col 20], LST_wm4 [col 21],
# bin_pop9s [col 22], bowl_pop9s [col 23], bucket_pop9s [col 24],
# misc_short_pop9s [col 25], jar_pop9s [col 26], pottedplant_pop9s [col 27],
# tire_pop9s [col 28], misc_tall_pop9s [col 29], total_pop9s [col 30],
# bin [col 31], bowl [col 32], bucket [col 33], misc_short [col 34],
# jar [col 35], pottedplant [col 36], tire [col 37], misc_tall [col 38],
# and total [col 39]

# The two CD columns used per province, in the order they are appended to the
# feature matrix.
top_cd_columns = {'NST': [26, 22], 'Krabi': [35, 33], 'Bangkok': [26, 33]}
if province1 not in top_cd_columns:
    # Without this check an unknown province would either fail later with a
    # NameError or silently reuse CD columns left over from an earlier run.
    raise ValueError('No selected CD columns defined for province: ' + str(province1))
cd_columns = top_cd_columns[province1]

# ---- Pieces that do not depend on the loop index ----

# addrcode, week, year and actual values of the test set
df_test_addrcode_week_year_dist = df_test_dist.iloc[:, [1, 2, 3, 4]]

# Rainfall and LST
df_train_dist_withoutCD = df_train_dist.iloc[:, [20, 21]]
df_test_dist_withoutCD = df_test_dist.iloc[:, [20, 21]]

# The two selected CD attributes
train_CD_attr = df_train_dist.iloc[:, cd_columns]
test_CD_attr = df_test_dist.iloc[:, cd_columns]

# labels: the response (target) variable is always the 'DF_1' column
train_labels = np.array(df_train_dist['DF_1'])
test_labels = np.array(df_test_dist['DF_1'])

# District codes, in order of first appearance in the training data
dist_code = df_train_dist['addrcode'].unique()

# A flat list keeps ordinary single-level column labels (a nested list would
# build a MultiIndex), so the frames can be filtered in memory further down.
compare_columns = ['addrcode', 'Week', 'Year', 'actual', 'predicted']

output_prefix = 'Random Forest/' + province1 + '/Selected CD/Original DF_0/RF_' + province2

# Pass i drops the first i DF columns (cols 5 .. 5 + i - 1) from the features
# and stores its results under the label DF_(i + 1).
for i in range(6):
    # DF incidence
    df_train_dist_DFinfo = df_train_dist.iloc[:, (5 + i): 12]
    df_test_dist_DFinfo = df_test_dist.iloc[:, (5 + i): 12]
    
    # Combine DF with RF and LST first
    train_features_withoutCD = pd.concat([df_train_dist_DFinfo, df_train_dist_withoutCD], axis = 1)
    test_features_withoutCD = pd.concat([df_test_dist_DFinfo, df_test_dist_withoutCD], axis = 1)
    
    # Then add the two selected CD attributes
    train_features_withCD = pd.concat([train_features_withoutCD, train_CD_attr], axis = 1)
    test_features_withCD = pd.concat([test_features_withoutCD, test_CD_attr], axis = 1)
    
    # Instantiate model with 10 decision trees
    rf_withoutCD = RandomForestRegressor(n_estimators = 10, random_state = 42)
    rf_withCD = RandomForestRegressor(n_estimators = 10, random_state = 42)

    # Train the model on training data
    rf_withoutCD.fit(train_features_withoutCD, train_labels)
    rf_withCD.fit(train_features_withCD, train_labels)

    # Use the forest's predict method on the test data
    predictions_withoutCD = rf_withoutCD.predict(test_features_withoutCD)
    predictions_withCD = rf_withCD.predict(test_features_withCD)
    
    df_pred_withoutCD = pd.DataFrame(predictions_withoutCD, columns = ['predicted'])
    df_pred_withCD = pd.DataFrame(predictions_withCD, columns = ['predicted'])
        
    # Store all of the predicted values to the CSV files
    df_compare_addrcode_dist_withoutCD = pd.concat([df_test_addrcode_week_year_dist, df_pred_withoutCD], axis = 1)
    df_compare_addrcode_dist_withoutCD.columns = compare_columns
    df_compare_addrcode_dist_withoutCD.to_csv(output_prefix + '_dist_DF_' + str(i + 1) 
                                              + '_withoutCD_10.csv', encoding = 'utf-8')

    df_compare_addrcode_dist_withCD = pd.concat([df_test_addrcode_week_year_dist, df_pred_withCD], axis = 1)
    df_compare_addrcode_dist_withCD.columns = compare_columns
    df_compare_addrcode_dist_withCD.to_csv(output_prefix + '_dist_DF_' + str(i + 1) 
                                           + '_withCD_10.csv', encoding = 'utf-8')

    # Calculate the evaluation values over the whole test set
    rmse_withoutCD = mean_squared_error(test_labels, predictions_withoutCD) ** 0.5
    mae_withoutCD = mean_absolute_error(test_labels, predictions_withoutCD)
    r2_withoutCD = r2_score(test_labels, predictions_withoutCD)
    smape_withoutCD = smape_fast(test_labels, predictions_withoutCD)
        
    rmse_withCD = mean_squared_error(test_labels, predictions_withCD) ** 0.5
    mae_withCD = mean_absolute_error(test_labels, predictions_withCD)
    r2_withCD = r2_score(test_labels, predictions_withCD)
    smape_withCD = smape_fast(test_labels, predictions_withCD)
        
    # "% improved" is stored as a fraction of the without-CD value (it is not
    # multiplied by 100).
    # NOTE(review): R squared is better when larger, so with this formula a
    # positive value means R squared dropped when CD was added (for a positive
    # baseline) - confirm this sign convention is intended.
    rmse_percent_improved = (rmse_withoutCD - rmse_withCD) / rmse_withoutCD
    mae_percent_improved = (mae_withoutCD - mae_withCD) / mae_withoutCD
    smape_percent_improved = (smape_withoutCD - smape_withCD) / smape_withoutCD
    r2_percent_improved = (r2_withoutCD - r2_withCD) / r2_withoutCD
        
    rmse = np.append(rmse, [rmse_withoutCD, rmse_withCD, rmse_percent_improved])
    mae = np.append(mae, [mae_withoutCD, mae_withCD, mae_percent_improved])
    smape = np.append(smape, [smape_withoutCD, smape_withCD, smape_percent_improved])
    r2 = np.append(r2, [r2_withoutCD, r2_withCD, r2_percent_improved])
        
    # Per-district table for this pass: header first, one row per code
    dist_rows = [dist_header]
        
    # For each district
    for j in dist_code:
            
        # Get the subset of actual and predicted values according to the district code
        subset_withoutCD = df_compare_addrcode_dist_withoutCD.loc[
            df_compare_addrcode_dist_withoutCD['addrcode'] == j]
        subset_withCD = df_compare_addrcode_dist_withCD.loc[
            df_compare_addrcode_dist_withCD['addrcode'] == j]
        
        # The codes come from the training data; a code with no test rows has
        # nothing to score (the metric functions reject empty input).
        if subset_withoutCD.empty:
            continue
            
        # Pass the response values to the array for evaluation calculation
        array_true = np.array(subset_withoutCD['actual'])
        array_pred_withoutCD = np.array(subset_withoutCD['predicted'])
        array_pred_withCD = np.array(subset_withCD['predicted'])
            
        # Calculate the evaluation values, ordered RMSE, MAE, SMAPE, R squared
        metrics_withoutCD = np.array([mean_squared_error(array_true, array_pred_withoutCD) ** 0.5,
                                      mean_absolute_error(array_true, array_pred_withoutCD),
                                      smape_fast(array_true, array_pred_withoutCD),
                                      r2_score(array_true, array_pred_withoutCD)], dtype = float)
        metrics_withCD = np.array([mean_squared_error(array_true, array_pred_withCD) ** 0.5,
                                   mean_absolute_error(array_true, array_pred_withCD),
                                   smape_fast(array_true, array_pred_withCD),
                                   r2_score(array_true, array_pred_withCD)], dtype = float)
        
        # Array division yields nan / inf for a zero baseline instead of
        # raising ZeroDivisionError on plain Python floats.
        with np.errstate(divide = 'ignore', invalid = 'ignore'):
            metrics_improved = (metrics_withoutCD - metrics_withCD) / metrics_withoutCD
            
        # Row order: for each metric -> without CD, with CD, % improved
        row_values = np.column_stack([metrics_withoutCD, metrics_withCD, metrics_improved]).ravel()
        
        # Convert to text explicitly rather than relying on implicit
        # number-to-string promotion when joining with the header row.
        dist_rows.append(np.asarray([[j] + row_values.tolist()]).astype(str))

    dist_array = np.concatenate(dist_rows, axis = 0)
    pd.DataFrame(dist_array).to_csv(output_prefix + '_ByDistrict_DF_' + str(i + 1) + '_eval_10.csv', 
                                    header = False, encoding = 'utf-8')
    
# Evaluation file storing
eval_array = evaluation_print_original(eval_array, 'RMSE', rmse)
eval_array = evaluation_print_original(eval_array, 'MAE', mae)
eval_array = evaluation_print_original(eval_array, 'SMAPE', smape)
eval_array = evaluation_print_original(eval_array, 'R squared', r2)

# Store all of the evaluation values into a CSV file
pd.DataFrame(eval_array).to_csv(output_prefix + '_dist_eval_10.csv', header = False, encoding = 'utf-8')

<h1>Sub-district level</h1>
For MAs (moving-average inputs MA2, MA3 and MA4)

In [43]:
# Arrays of all evaluation values
# row: head,
# RMSE (DFma_1 - DFma_6), 
# MAE (DFma_1 - DFma_6), 
# SMAPE (DFma_1 - DFma_6), 
# R-squared (DFma_1 - DFma_6)

# col: head,
# MA2 (without CD, with CD, % improved),
# MA3 (without CD, with CD, % improved),
# MA4 (without CD, with CD, % improved)

eval_array = np.asarray([['Evaluation', 'MA2 without CD', 'MA2 with CD', 'MA2 % improved', 
                         'MA3 without CD', 'MA3 with CD', 'MA3 % improved', 
                         'MA4 without CD', 'MA4 with CD', 'MA4 % improved']])
rmse = np.zeros(1)
mae = np.zeros(1)
smape = np.zeros(1)
r2 = np.zeros(1)

subdist_array = np.asarray([['addrcode', 'RMSE without CD', 'RMSE with CD', '% improved RMSE', 
                         'MAE without CD', 'MAE with CD', '% improved MAE', 
                         'SMAPE without CD', 'SMAPE with CD', '% improved SMAPE', 
                         'R squared without CD', 'R squared with CD', '% improved R squared']])

# Get the input var from CSV file
# Starting from MA2 to MA4
for i in range(2, 5):
    # Get the input variables from CSV file
    train_file_dir = 'Data/' + province1 + '/Combined CD/train_' + province2 + '_subdist_combined_mavg' + str(i) + '.csv'
    test_file_dir = 'Data/' + province1 + '/Combined CD/test_' + province2 + '_subdist_combined_mavg' + str(i) + '.csv'
    
    df_train_subdist =  pd.read_csv(train_file_dir, header = 0, skiprows = 0)
    df_test_subdist = pd.read_csv(test_file_dir, header = 0, skiprows = 0)
    
    # Continue on DFma_1 to DFma_6
    for j in range(6):
        # Allocate the column of addrcode, week, year and actual values first
        df_test_addrcode_week_year_subdist = df_test_subdist.iloc[:, [1, 2, 3, 12]]
        
        ## Without CD ##
    
        # Import the dataset
        # x: independent variables
        # DFma_0 [col 13],
        # DFma_wm1 [col 14],
        # DFma_wm2 [col 15],
        # DFma_wm3 [col 16],
        # DFma_wm4 [col 17],
        # DFma_wm5 [col 18],
        # DFma_wm6 [col 19],
        # RF_wm6 [col 20],
        # and LST_wm4 [col 21]
        
        ## With CD ##
    
        # Import the dataset
        # x: independent variables
        # DFma_0 [col 13],
        # DFma_wm1 [col 14],
        # DFma_wm2 [col 15],
        # DFma_wm3 [col 16],
        # DFma_wm4 [col 17],
        # DFma_wm5 [col 18],
        # DFma_wm6 [col 19],
        # RF_wm6 [col 20],
        # LST_wm4 [col 21]
        # bin_pop9s [col 22],
        # bowl_pop9s [col 23],
        # bucket_pop9s [col 24],
        # misc_short_pop9s [col 25],
        # jar_pop9s [col 26],
        # pottedplant_pop9s [col 27],
        # tire_pop9s [col 28],
        # misc_tall_pop9s [col 29],
        # total_pop9s [col 30]
        # bin [col 31],
        # bowl [col 32],
        # bucket [col 33],
        # misc_short [col 34],
        # jar [col 35],
        # pottedplant [col 36],
        # tire [col 37],
        # misc_tall [col 38],
        # and total [col 39]
        
        train_features_withoutCD = df_train_subdist.iloc[:, (13 + j): 22]
        test_features_withoutCD = df_test_subdist.iloc[:, (13 + j): 22]
        
        if province1 == 'NST':
            train_CD1 = df_train_subdist.iloc[:, [22]]
            train_CD2 = df_train_subdist.iloc[:, [24]]
            test_CD1 = df_test_subdist.iloc[:, [22]]
            test_CD2 = df_test_subdist.iloc[:, [24]]
        elif province1 == 'Krabi':
            train_CD1 = df_train_subdist.iloc[:, [22]]
            train_CD2 = df_train_subdist.iloc[:, [28]]
            test_CD1 = df_test_subdist.iloc[:, [22]]
            test_CD2 = df_test_subdist.iloc[:, [28]]
        elif province1 == 'Bangkok':
            train_CD1 = df_train_subdist.iloc[:, [23]]
            train_CD2 = df_train_subdist.iloc[:, [34]]
            test_CD1 = df_test_subdist.iloc[:, [23]]
            test_CD2 = df_test_subdist.iloc[:, [34]]
        
        train_CD_attr = pd.concat([train_CD1, train_CD2], axis = 1)
        test_CD_attr = pd.concat([test_CD1, test_CD2], axis = 1)
        train_features_withCD = pd.concat([train_features_withoutCD, train_CD_attr], axis = 1)
        test_features_withCD = pd.concat([test_features_withoutCD, test_CD_attr], axis = 1)
        
        # labels: response (target) variables from DFma_1 (col 14)      
        # Pass the response values to the array for evaluation calculation
        train_labels = np.array(df_train_subdist['DFma_1'])
        test_labels = np.array(df_test_subdist['DFma_1'])

        # Instantiate model with 10 decision trees
        rf_withoutCD = RandomForestRegressor(n_estimators = 10, random_state = 42)
        rf_withCD = RandomForestRegressor(n_estimators = 10, random_state = 42)

        # Train the model on training data
        rf_withoutCD.fit(train_features_withoutCD, train_labels);
        rf_withCD.fit(train_features_withCD, train_labels);

        # Use the forest's predict method on the test data
        predictions_withoutCD = rf_withoutCD.predict(test_features_withoutCD)
        predictions_withCD = rf_withCD.predict(test_features_withCD)
        
        df_pred_withoutCD = pd.DataFrame(predictions_withoutCD, columns = ['predicted'])
        df_pred_withCD = pd.DataFrame(predictions_withCD, columns = ['predicted'])
        
        # Store all of the predicted values to the CSV files
        df_compare_addrcode_subdist_withoutCD = pd.concat([df_test_addrcode_week_year_subdist, df_pred_withoutCD], axis = 1)
        df_compare_addrcode_subdist_withoutCD.columns = [['addrcode', 'Week', 'Year', 'actual', 'predicted']]
        df_compare_addrcode_subdist_withoutCD.to_csv('Random Forest/' + province1 + '/Selected CD/MA' + str(i) 
                                                  + '/RF_' + province2 + '_subdist_MA' + str(i) + '_DFma_' 
                                                  + str(j + 1) + '_withoutCD_10.csv', encoding = 'utf-8')

        df_compare_addrcode_subdist_withCD = pd.concat([df_test_addrcode_week_year_subdist, df_pred_withCD], axis = 1)
        df_compare_addrcode_subdist_withCD.columns = [['addrcode', 'Week', 'Year', 'actual', 'predicted']]
        df_compare_addrcode_subdist_withCD.to_csv('Random Forest/' + province1 + '/Selected CD/MA' + str(i) 
                                               + '/RF_' + province2 + '_subdist_MA' + str(i) + '_DFma_' 
                                               + str(j + 1) + '_withCD_10.csv', encoding = 'utf-8')

        # Calculate the evaluation values
        #print('MA' + str(i) + ' and DFma_' + str(j + 1))
        rmse_withoutCD = mean_squared_error(test_labels, predictions_withoutCD) ** 0.5
        mae_withoutCD = mean_absolute_error(test_labels, predictions_withoutCD)
        r2_withoutCD = r2_score(test_labels, predictions_withoutCD)
        smape_withoutCD = smape_fast(test_labels, predictions_withoutCD)
        #print('RMSE of the prediction without CD is:', rmse_withoutCD)
        #print('MAE of the prediction without CD is:', mae_withoutCD)
        #print('R-squared of the prediction without CD is:', r2_withoutCD)
        #print('SMAPE of the prediction without CD is:', smape_withoutCD)
        
        rmse_withCD = mean_squared_error(test_labels, predictions_withCD) ** 0.5
        mae_withCD = mean_absolute_error(test_labels, predictions_withCD)
        r2_withCD = r2_score(test_labels, predictions_withCD)
        smape_withCD = smape_fast(test_labels, predictions_withCD)
        #print('RMSE of the prediction with CD is:', rmse_withCD)
        #print('MAE of the prediction with CD is:', mae_withCD)
        #print('R-squared of the prediction with CD is:', r2_withCD)
        #print('SMAPE of the prediction with CD is:', smape_withCD)
        
        rmse_percent_improved = (rmse_withoutCD - rmse_withCD) / rmse_withoutCD
        mae_percent_improved = (mae_withoutCD - mae_withCD) / mae_withoutCD
        smape_percent_improved = (smape_withoutCD - smape_withCD) / smape_withoutCD
        r2_percent_improved = (r2_withoutCD - r2_withCD) / r2_withoutCD
        
        rmse = np.append(rmse, [rmse_withoutCD, rmse_withCD, rmse_percent_improved])
        mae = np.append(mae, [mae_withoutCD, mae_withCD, mae_percent_improved])
        smape = np.append(smape, [smape_withoutCD, smape_withCD, smape_percent_improved])
        r2 = np.append(r2, [r2_withoutCD, r2_withCD, r2_percent_improved])
        
        df_withoutCD = pd.read_csv('Random Forest/' + province1 + '/Selected CD/MA' + str(i) 
                                   + '/RF_' + province2 + '_subdist_MA' + str(i) + '_DFma_' + str(j + 1) 
                                   + '_withoutCD_10.csv', header = 0)
        df_withCD = pd.read_csv('Random Forest/' + province1 + '/Selected CD/MA' + str(i) 
                                + '/RF_' + province2 + '_subdist_MA' + str(i) + '_DFma_' + str(j + 1) 
                                + '_withCD_10.csv', header = 0)
        
        subdist_code = df_train_subdist['addrcode'].unique()
        
        # For each subdistrict
        for k in subdist_code:

            # Get the subset of actual and predicted values according to the subdistrict code
            subset_withoutCD = df_withoutCD.loc[df_withoutCD['addrcode'] == k]
            subset_withCD = df_withCD.loc[df_withCD['addrcode'] == k]
            
            # Pass the response values to the array for evaluation calculation
            array_true = np.array(subset_withoutCD['actual'])
            array_pred_withoutCD = np.array(subset_withoutCD['predicted'])
            array_pred_withCD = np.array(subset_withCD['predicted'])
            
            # Calculate the evaluation values
            rmse_withoutCD_subdist = mean_squared_error(array_true, array_pred_withoutCD) ** 0.5
            mae_withoutCD_subdist = mean_absolute_error(array_true, array_pred_withoutCD)
            smape_withoutCD_subdist = smape_fast(array_true, array_pred_withoutCD)
            r2_withoutCD_subdist = r2_score(array_true, array_pred_withoutCD)
            
            rmse_withCD_subdist = mean_squared_error(array_true, array_pred_withCD) ** 0.5
            mae_withCD_subdist = mean_absolute_error(array_true, array_pred_withCD)
            smape_withCD_subdist = smape_fast(array_true, array_pred_withCD)
            r2_withCD_subdist = r2_score(array_true, array_pred_withCD)
            
            rmse_percent_improved_subdist = (rmse_withoutCD_subdist - rmse_withCD_subdist) / rmse_withoutCD_subdist
            mae_percent_improved_subdist = (mae_withoutCD_subdist - mae_withCD_subdist) / mae_withoutCD_subdist
            smape_percent_improved_subdist = (smape_withoutCD_subdist - smape_withCD_subdist) / smape_withoutCD_subdist
            r2_percent_improved_subdist = (r2_withoutCD_subdist - r2_withCD_subdist) / r2_withoutCD_subdist
            
            # Append
            subdist_array = np.append(subdist_array, [[k, rmse_withoutCD_subdist, rmse_withCD_subdist, rmse_percent_improved_subdist,
                                                mae_withoutCD_subdist, mae_withCD_subdist, mae_percent_improved_subdist,
                                                smape_withoutCD_subdist, smape_withCD_subdist, smape_percent_improved_subdist,
                                                r2_withoutCD_subdist, r2_withCD_subdist, r2_percent_improved_subdist]], axis = 0)

        #print(subdist_array)
        pd.DataFrame(subdist_array).to_csv('Random Forest/' + province1 + '/Selected CD/MA' + str(i) 
                                        + '/RF_' + province2 + '_BySubdistrict_MA' + str(i) + '_DFma_' + str(j + 1) 
                                        + '_eval_10.csv', header = False, encoding = 'utf-8')
        
        # Clear the old memory to store a new one
        subdist_array = np.asarray([['addrcode', 'RMSE without CD', 'RMSE with CD', '% improved RMSE', 
                         'MAE without CD', 'MAE with CD', '% improved MAE', 
                         'SMAPE without CD', 'SMAPE with CD', '% improved SMAPE', 
                         'R squared without CD', 'R squared with CD', '% improved R squared']])
    
# Evaluation file storing
# From RMSE 1-week to R squared 6-week
eval_array = evaluation_print_modified_lag(eval_array, 'RMSE', rmse)
eval_array = evaluation_print_modified_lag(eval_array, 'MAE', mae)
eval_array = evaluation_print_modified_lag(eval_array, 'SMAPE', smape)
eval_array = evaluation_print_modified_lag(eval_array, 'R squared', r2)

#print(eval_array)

# Store all of the evaluation values into a CSV file
pd.DataFrame(eval_array).to_csv('Random Forest/' + province1 + '/Selected CD/RF_' + province2 
                                + '_subdist_eval_10.csv', header = False, encoding = 'utf-8')

In [44]:
# Arrays of all evaluation values
# row: head,
# RMSE (DF_1), 
# MAE (DF_1), 
# SMAPE (DF_1), 
# R-squared (DF_1)

# col: head,
# DF_0 (without CD, with CD, % improved)

eval_array = np.asarray([['Evaluation', 'Without CD', 'With CD', '% improved']])

rmse = np.zeros(1)
mae = np.zeros(1)
smape = np.zeros(1)
r2 = np.zeros(1)

subdist_array = np.asarray([['addrcode', 'RMSE without CD', 'RMSE with CD', '% improved RMSE', 
                         'MAE without CD', 'MAE with CD', '% improved MAE', 
                         'SMAPE without CD', 'SMAPE with CD', '% improved SMAPE', 
                         'R squared without CD', 'R squared with CD', '% improved R squared']])

# Get the input variables from CSV file
train_file_dir = 'Data/' + province1 + '/Combined CD/train_' + province2 + '_subdist_combined_mavg2.csv'
test_file_dir = 'Data/' + province1 + '/Combined CD/test_' + province2 + '_subdist_combined_mavg2.csv'

df_train_subdist =  pd.read_csv(train_file_dir, header = 0, skiprows = 0)
df_test_subdist = pd.read_csv(test_file_dir, header = 0, skiprows = 0)

# Get the input var from CSV file
# From DF_1 to DF_6
for i in range(6):
    # Allocate the column of addrcode, week, year and actual values first
    df_test_addrcode_week_year_subdist = df_test_subdist.iloc[:,[1, 2, 3, 4]]
    
    ## Without CD ##
    
    # Import the dataset
    # x: independent variables
    # DF_0 [col 5],
    # DF_wm1 [col 6], 
    # DF_wm2 [col 7],
    # DF_wm3 [col 8],
    # DF_wm4 [col 9], 
    # DF_wm5 [col 10],
    # DF_wm6 [col 11],
    # RF_wm6 [col 20],
    # and LST_wm4 [col 21]
        
    ## With CD ##
    
    # Import the dataset
    # x: independent variables
    # DF_0 [col 5],
    # DF_wm1 [col 6], 
    # DF_wm2 [col 7],
    # DF_wm3 [col 8],
    # DF_wm4 [col 9], 
    # DF_wm5 [col 10],
    # DF_wm6 [col 11],
    # RF_wm6 [col 20],
    # LST_wm4 [col 21],
    # bin_pop9s [col 22],
    # bowl_pop9s [col 23],
    # bucket_pop9s [col 24],
    # misc_short_pop9s [col 25],
    # jar_pop9s [col 26],
    # pottedplant_pop9s [col 27],
    # tire_pop9s [col 28],
    # misc_tall_pop9s [col 29],
    # total_pop9s [col 30]
    # bin [col 31],
    # bowl [col 32],
    # bucket [col 33],
    # misc_short [col 34],
    # jar [col 35],
    # pottedplant [col 36],
    # tire [col 37],
    # misc_tall [col 38],
    # and total [col 39]
    
    # DF incidence
    df_train_subdist_DFinfo = df_train_subdist.iloc[:, (5 + i): 12]
    df_test_subdist_DFinfo = df_test_subdist.iloc[:, (5 + i): 12]
    
    # Rainfall and LST
    df_train_subdist_withoutCD = df_train_subdist.iloc[:, [20, 21]]
    df_test_subdist_withoutCD = df_test_subdist.iloc[:, [20, 21]]
    
    # Combine DF and RF and lST first
    train_features_withoutCD = pd.concat([df_train_subdist_DFinfo, df_train_subdist_withoutCD], axis = 1)
    test_features_withoutCD = pd.concat([df_test_subdist_DFinfo, df_test_subdist_withoutCD], axis = 1)

    if province1 == 'NST':
        train_CD1 = df_train_subdist.iloc[:, [22]]
        train_CD2 = df_train_subdist.iloc[:, [24]]
        test_CD1 = df_test_subdist.iloc[:, [22]]
        test_CD2 = df_test_subdist.iloc[:, [24]]
    elif province1 == 'Krabi':
        train_CD1 = df_train_subdist.iloc[:, [22]]
        train_CD2 = df_train_subdist.iloc[:, [28]]
        test_CD1 = df_test_subdist.iloc[:, [22]]
        test_CD2 = df_test_subdist.iloc[:, [28]]
    elif province1 == 'Bangkok':
        train_CD1 = df_train_subdist.iloc[:, [23]]
        train_CD2 = df_train_subdist.iloc[:, [34]]
        test_CD1 = df_test_subdist.iloc[:, [23]]
        test_CD2 = df_test_subdist.iloc[:, [34]]
        
    train_CD_attr = pd.concat([train_CD1, train_CD2], axis = 1)
    test_CD_attr = pd.concat([test_CD1, test_CD2], axis = 1)
    
    train_features_withCD = pd.concat([train_features_withoutCD, train_CD_attr], axis = 1)
    test_features_withCD = pd.concat([test_features_withoutCD, test_CD_attr], axis = 1)
    
    # labels: response (target) variables from DF_1 to DF_6 (col 9 -> col 4)
    # Pass the response values to the array for evaluation calculation
    train_labels = np.array(df_train_subdist['DF_1'])
    test_labels = np.array(df_test_subdist['DF_1'])
    
    # Instantiate model with 10 decision trees
    rf_withoutCD = RandomForestRegressor(n_estimators = 10, random_state = 42)
    rf_withCD = RandomForestRegressor(n_estimators = 10, random_state = 42)

    # Train the model on training data
    rf_withoutCD.fit(train_features_withoutCD, train_labels);
    rf_withCD.fit(train_features_withCD, train_labels);

    # Use the forest's predict method on the test data
    predictions_withoutCD = rf_withoutCD.predict(test_features_withoutCD)
    predictions_withCD = rf_withCD.predict(test_features_withCD)
    
    df_pred_withoutCD = pd.DataFrame(predictions_withoutCD, columns = ['predicted'])
    df_pred_withCD = pd.DataFrame(predictions_withCD, columns = ['predicted'])
        
    # Store all of the predicted values to the CSV files
    df_compare_addrcode_subdist_withoutCD = pd.concat([df_test_addrcode_week_year_subdist, df_pred_withoutCD], axis = 1)
    df_compare_addrcode_subdist_withoutCD.columns = [['addrcode', 'Week', 'Year', 'actual', 'predicted']]
    df_compare_addrcode_subdist_withoutCD.to_csv('Random Forest/' + province1 
                                              + '/Selected CD/Original DF_0/RF_' + province2 
                                              + '_subdist_DF_' + str(i + 1) + '_withoutCD_10.csv', encoding = 'utf-8')

    df_compare_addrcode_subdist_withCD = pd.concat([df_test_addrcode_week_year_subdist, df_pred_withCD], axis = 1)
    df_compare_addrcode_subdist_withCD.columns = [['addrcode', 'Week', 'Year', 'actual', 'predicted']]
    df_compare_addrcode_subdist_withCD.to_csv('Random Forest/' + province1 
                                           + '/Selected CD/Original DF_0/RF_' + province2 
                                           + '_subdist_DF_' + str(i + 1) + '_withCD_10.csv', encoding = 'utf-8')

    # Calculate the evaluation values
    rmse_withoutCD = mean_squared_error(test_labels, predictions_withoutCD) ** 0.5
    mae_withoutCD = mean_absolute_error(test_labels, predictions_withoutCD)
    r2_withoutCD = r2_score(test_labels, predictions_withoutCD)
    smape_withoutCD = smape_fast(test_labels, predictions_withoutCD)
    #print('RMSE of the prediction without CD is:', rmse_withoutCD)
    #print('MAE of the prediction without CD is:', mae_withoutCD)
    #print('R-squared of the prediction without CD is:', r2_withoutCD)
    #print('SMAPE of the prediction without CD is:', smape_withoutCD)
        
    rmse_withCD = mean_squared_error(test_labels, predictions_withCD) ** 0.5
    mae_withCD = mean_absolute_error(test_labels, predictions_withCD)
    r2_withCD = r2_score(test_labels, predictions_withCD)
    smape_withCD = smape_fast(test_labels, predictions_withCD)
    #print('RMSE of the prediction with CD is:', rmse_withCD)
    #print('MAE of the prediction with CD is:', mae_withCD)
    #print('R-squared of the prediction with CD is:', r2_withCD)
    #print('SMAPE of the prediction with CD is:', smape_withCD)
        
    rmse_percent_improved = (rmse_withoutCD - rmse_withCD) / rmse_withoutCD
    mae_percent_improved = (mae_withoutCD - mae_withCD) / mae_withoutCD
    smape_percent_improved = (smape_withoutCD - smape_withCD) / smape_withoutCD
    r2_percent_improved = (r2_withoutCD - r2_withCD) / r2_withoutCD
        
    rmse = np.append(rmse, [rmse_withoutCD, rmse_withCD, rmse_percent_improved])
    mae = np.append(mae, [mae_withoutCD, mae_withCD, mae_percent_improved])
    smape = np.append(smape, [smape_withoutCD, smape_withCD, smape_percent_improved])
    r2 = np.append(r2, [r2_withoutCD, r2_withCD, r2_percent_improved])
        
    df_withoutCD = pd.read_csv('Random Forest/' + province1 
                               + '/Selected CD/Original DF_0/RF_' + province2 
                               + '_subdist_DF_' + str(i + 1) + '_withoutCD_10.csv', header = 0)
    df_withCD = pd.read_csv('Random Forest/' + province1 
                            + '/Selected CD/Original DF_0/RF_' + province2 
                            + '_subdist_DF_' + str(i + 1) + '_withCD_10.csv', header = 0)
    
    subdist_code = df_train_subdist['addrcode'].unique()
        
    # For each subdistrict
    for j in subdist_code:
            
        # Get the subset of actual and predicted values according to the subdistrict code
        subset_withoutCD = df_withoutCD.loc[df_withoutCD['addrcode'] == j]
        subset_withCD = df_withCD.loc[df_withCD['addrcode'] == j]
            
        # Pass the response values to the array for evaluation calculation
        array_true = np.array(subset_withoutCD['actual'])
        array_pred_withoutCD = np.array(subset_withoutCD['predicted'])
        array_pred_withCD = np.array(subset_withCD['predicted'])
            
        # Calculate the evaluation values
        rmse_withoutCD_subdist = mean_squared_error(array_true, array_pred_withoutCD) ** 0.5
        mae_withoutCD_subdist = mean_absolute_error(array_true, array_pred_withoutCD)
        smape_withoutCD_subdist = smape_fast(array_true, array_pred_withoutCD)
        r2_withoutCD_subdist = r2_score(array_true, array_pred_withoutCD)
            
        rmse_withCD_subdist = mean_squared_error(array_true, array_pred_withCD) ** 0.5
        mae_withCD_subdist = mean_absolute_error(array_true, array_pred_withCD)
        smape_withCD_subdist = smape_fast(array_true, array_pred_withCD)
        r2_withCD_subdist = r2_score(array_true, array_pred_withCD)
            
        rmse_percent_improved_subdist = (rmse_withoutCD_subdist - rmse_withCD_subdist) / rmse_withoutCD_subdist
        mae_percent_improved_subdist = (mae_withoutCD_subdist - mae_withCD_subdist) / mae_withoutCD_subdist
        smape_percent_improved_subdist = (smape_withoutCD_subdist - smape_withCD_subdist) / smape_withoutCD_subdist
        r2_percent_improved_subdist = (r2_withoutCD_subdist - r2_withCD_subdist) / r2_withoutCD_subdist
            
        # Append
        subdist_array = np.append(subdist_array, [[j, rmse_withoutCD_subdist, rmse_withCD_subdist, rmse_percent_improved_subdist,
                                            mae_withoutCD_subdist, mae_withCD_subdist, mae_percent_improved_subdist,
                                            smape_withoutCD_subdist, smape_withCD_subdist, smape_percent_improved_subdist,
                                            r2_withoutCD_subdist, r2_withCD_subdist, r2_percent_improved_subdist]], axis = 0)

    #print(subdist_array)
    pd.DataFrame(subdist_array).to_csv('Random Forest/' + province1 
                                    + '/Selected CD/Original DF_0/RF_' + province2 
                                    + '_BySubdistrict_DF_' + str(i + 1) + '_eval_10.csv', header = False, encoding = 'utf-8')
        
    # Clear the old memory to store a new one
    subdist_array = np.asarray([['addrcode', 'RMSE without CD', 'RMSE with CD', '% improved RMSE', 
                              'MAE without CD', 'MAE with CD', '% improved MAE', 
                             'SMAPE without CD', 'SMAPE with CD', '% improved SMAPE', 
                             'R squared without CD', 'R squared with CD', '% improved R squared']])
    
# Evaluation file storing
# From RMSE DFma_1 to R squared DFma_6
eval_array = evaluation_print_original(eval_array, 'RMSE', rmse)
eval_array = evaluation_print_original(eval_array, 'MAE', mae)
eval_array = evaluation_print_original(eval_array, 'SMAPE', smape)
eval_array = evaluation_print_original(eval_array, 'R squared', r2)

#print(eval_array)

# Store all of the evaluation values into a CSV file
pd.DataFrame(eval_array).to_csv('Random Forest/' + province1 
                                + '/Selected CD/Original DF_0/RF_' + province2 
                                + '_subdist_eval_10.csv', header = False, encoding = 'utf-8')

In [45]:
# Preview the first five rows of the most recently built with-CD training feature table
train_features_withCD.head(5)

Unnamed: 0,DF_wm5,DF_wm6,RF_wm6,LST_wm4,bowl_pop9s,misc_short
0,0.0,0.0,0.011162,4.406998,0.018843,7.796
1,0.0,0.0,0.074986,4.343433,0.018843,7.796
2,0.0,0.0,0.0,4.396857,0.018843,7.796
3,0.0,0.0,0.841729,4.582279,0.018843,7.796
4,0.0,0.0,2.83858,4.686003,0.018843,7.796
