# Random Forest Regressor

In [614]:
import pandas as pd
import numpy as np
import os
from sklearn.ensemble import RandomForestRegressor
import glob
from numba import jit
import math

In [615]:
def smape_fast(y_true, y_pred):
    """Symmetric mean absolute percentage error, on a 0-200 scale.

    Computes ``200 / n * sum(|a - b| / (|a| + |b|))`` over paired elements,
    where pairs with a zero denominator contribute nothing to the sum but
    still count towards ``n``.

    Parameters
    ----------
    y_true, y_pred : 1-D array-like (numpy array, pandas Series, ...)
        Actual and predicted values; both must have the same shape.

    Returns
    -------
    float
        The SMAPE value, or NaN when the inputs are empty.

    Raises
    ------
    ValueError
        If the two inputs do not have the same shape.
    """
    # Work on plain arrays so pandas inputs are read by position rather than
    # by index label.
    actual = np.asarray(y_true, dtype=float)
    forecast = np.asarray(y_pred, dtype=float)
    if actual.shape != forecast.shape:
        raise ValueError('y_true and y_pred must have the same shape')

    n = actual.shape[0] if actual.ndim else 0
    if n == 0:
        # No observations: the metric is undefined, so do not divide by zero.
        return float('nan')

    # Absolute values in the denominator keep every term non-negative even
    # when inputs are negative or have opposite signs.
    denom = np.abs(actual) + np.abs(forecast)
    nonzero = denom != 0
    ratios = np.abs(actual - forecast)[nonzero] / denom[nonzero]
    return float(200.0 * ratios.sum() / n)

## Subdistrict

### [Without CD]

In [616]:
# Read the sub-district train and test CSVs (in that order).  For this
# "Without CD" run each frame loses the 'Unnamed: 0' column and the eight
# columns named in the second drop.
df_train, df_test = (
    pd.read_csv(os.path.join('Data', 'Modeling', 'Training&Testing', fname), header=0, skiprows=0)
    .drop('Unnamed: 0', axis=1)
    .drop(['bin', 'bowl', 'bucket', 'misc_short', 'jar', 'pottedplant', 'tire', 'misc_tall'], axis=1)
    for fname in ('train_subdist.csv', 'test_subdist.csv')
)

# The first four test columns are joined to the predictions when exporting.
df_test_week_addrcode = df_test.iloc[:, list(range(4))]

In [617]:
# Labels are the values we want to predict
# Targets: the 'DF_1' column of each split, as numpy arrays.
train_labels, test_labels = (np.array(frame['DF_1']) for frame in (df_train, df_test))

# Model inputs are picked by column position: 12, 18 and 24..32 (11 columns).
# NOTE(review): positional selection depends on the CSV column order - confirm.
train_features, test_features = (
    frame.iloc[:, [12, 18, *range(24, 33)]] for frame in (df_train, df_test)
)

# Every column name of the training frame (not only the selected inputs).
feature_list = df_train.columns.tolist()

# Whole training frame as a numpy array.
features = np.array(df_train)

In [618]:
# Report the dimensions of the model inputs and targets.
for caption, data in (
    ('Training Features Shape:', train_features),
    ('Training Labels Shape:', train_labels),
    ('Testing Features Shape:', test_features),
    ('Testing Labels Shape:', test_labels),
):
    print(caption, data.shape)

Training Features Shape: (23045, 11)
Training Labels Shape: (23045,)
Testing Features Shape: (6846, 11)
Testing Labels Shape: (6846,)


In [619]:
# Instantiate model with 1000 decision trees
rf = RandomForestRegressor(n_estimators = 1000, random_state = 42)

# Train the model on training data
rf.fit(train_features, train_labels);

In [620]:
# Use the forest's predict method on the test data
predictions = rf.predict(test_features)

# Calculate the absolute errors
errors = abs(predictions - test_labels)

# Print out the mean absolute error (mae)
print('Mean Absolute Error:', round(np.mean(errors), 5), 'degrees.')

Mean Absolute Error: 0.54464 degrees.


In [621]:
# SMAPE of the test predictions; the bare name displays the value.
smape = smape_fast(y_true=test_labels, y_pred=predictions)
smape

176.31632243874967

In [622]:
# Put the predictions next to the first four test columns and export them.
df_predicted = pd.DataFrame(predictions, columns=['predicted'])
df_compare_addrcode = pd.concat([df_test_week_addrcode, df_predicted], axis=1)

# A flat list gives ordinary column labels; a nested list would silently
# build a one-level MultiIndex.
# NOTE(review): these names are assigned by position - confirm they match the
# actual order of the first four test columns.
df_compare_addrcode.columns = ['Week', 'Year', 'addrcode', 'actual', 'predicted']

# Extra summary row: the label goes in the first column, the SMAPE value in
# the second, and the remaining cells stay empty.
df_compare_addrcode.loc[len(df_predicted)] = ['SMAPE', smape, None, None, None]

# Build the path with os.path.join like the reads do, and make sure the
# target directory exists before writing.
out_path = os.path.join('Data', 'Modeling', 'Random Forest', 'DF_1', 'Province', 'Subdistrict',
                        'RF_DFw1_subdist_withoutCD.csv')
os.makedirs(os.path.dirname(out_path), exist_ok=True)
df_compare_addrcode.to_csv(out_path, encoding='utf-8')

### [With CD]

In [623]:
# Read the sub-district train and test CSVs (in that order).  For this
# "With CD" run only the 'Unnamed: 0' column is removed; all other columns
# are kept.
df_train, df_test = (
    pd.read_csv(os.path.join('Data', 'Modeling', 'Training&Testing', fname), header=0, skiprows=0)
    .drop('Unnamed: 0', axis=1)
    for fname in ('train_subdist.csv', 'test_subdist.csv')
)

# The first four test columns are joined to the predictions when exporting.
df_test_week_addrcode = df_test.iloc[:, list(range(4))]

In [624]:
# Labels are the values we want to predict
# Targets: the 'DF_1' column of each split, as numpy arrays.
train_labels, test_labels = (np.array(frame['DF_1']) for frame in (df_train, df_test))

# Model inputs are picked by column position: 12, 18 and 24..40 (19 columns).
# NOTE(review): positional selection depends on the CSV column order - confirm.
train_features, test_features = (
    frame.iloc[:, [12, 18, *range(24, 41)]] for frame in (df_train, df_test)
)

# Every column name of the training frame (not only the selected inputs).
feature_list = df_train.columns.tolist()

# Whole training frame as a numpy array.
features = np.array(df_train)

In [625]:
# Report the dimensions of the model inputs and targets.
for caption, data in (
    ('Training Features Shape:', train_features),
    ('Training Labels Shape:', train_labels),
    ('Testing Features Shape:', test_features),
    ('Testing Labels Shape:', test_labels),
):
    print(caption, data.shape)

Training Features Shape: (23045, 19)
Training Labels Shape: (23045,)
Testing Features Shape: (6846, 19)
Testing Labels Shape: (6846,)


In [626]:
# Instantiate model with 1000 decision trees
rf = RandomForestRegressor(n_estimators = 1000, random_state = 42)

# Train the model on training data
rf.fit(train_features, train_labels);

In [627]:
# Use the forest's predict method on the test data
predictions = rf.predict(test_features)

# Calculate the absolute errors
errors = abs(predictions - test_labels)

# Print out the mean absolute error (mae)
print('Mean Absolute Error:', round(np.mean(errors), 5), 'degrees.')

Mean Absolute Error: 0.53821 degrees.


In [628]:
# SMAPE of the test predictions; the bare name displays the value.
smape = smape_fast(y_true=test_labels, y_pred=predictions)
smape

176.97108582599532

In [629]:
# Put the predictions next to the first four test columns and export them.
df_predicted = pd.DataFrame(predictions, columns=['predicted'])
df_compare_addrcode = pd.concat([df_test_week_addrcode, df_predicted], axis=1)

# A flat list gives ordinary column labels; a nested list would silently
# build a one-level MultiIndex.
# NOTE(review): these names are assigned by position - confirm they match the
# actual order of the first four test columns.
df_compare_addrcode.columns = ['Week', 'Year', 'addrcode', 'actual', 'predicted']

# Extra summary row: the label goes in the first column, the SMAPE value in
# the second, and the remaining cells stay empty.
df_compare_addrcode.loc[len(df_predicted)] = ['SMAPE', smape, None, None, None]

# Build the path with os.path.join like the reads do, and make sure the
# target directory exists before writing.
out_path = os.path.join('Data', 'Modeling', 'Random Forest', 'DF_1', 'Province', 'Subdistrict',
                        'RF_DFw1_subdist_withCD.csv')
os.makedirs(os.path.dirname(out_path), exist_ok=True)
df_compare_addrcode.to_csv(out_path, encoding='utf-8')

## District

### [Without CD]

In [630]:
# Read the district train and test CSVs (in that order).  For this
# "Without CD" run the eight columns named in the drop are removed; unlike
# the sub-district cells, no 'Unnamed: 0' column is dropped here.
df_train_dist, df_test_dist = (
    pd.read_csv(os.path.join('Data', 'Modeling', 'Training&Testing', fname), header=0, skiprows=0)
    .drop(['bin', 'bowl', 'bucket', 'misc_short', 'jar', 'pottedplant', 'tire', 'misc_tall'], axis=1)
    for fname in ('train_dist.csv', 'test_dist.csv')
)

# The first four test columns are joined to the predictions when exporting.
df_test_week_addrcode_dist = df_test_dist.iloc[:, list(range(4))]

In [631]:
# Labels are the values we want to predict
# Targets: the 'DF_1' column of each split, as numpy arrays.
train_labels, test_labels = (np.array(frame['DF_1']) for frame in (df_train_dist, df_test_dist))

# Model inputs are picked by column position: 12, 18 and 24..32 (11 columns).
# NOTE(review): the same positions are used for the sub-district frames, which
# additionally drop 'Unnamed: 0' - confirm the district CSV layout matches.
train_features, test_features = (
    frame.iloc[:, [12, 18, *range(24, 33)]] for frame in (df_train_dist, df_test_dist)
)

# Every column name of the training frame (not only the selected inputs).
feature_list = df_train_dist.columns.tolist()

# Whole training frame as a numpy array.
features = np.array(df_train_dist)

In [632]:
# Report the dimensions of the model inputs and targets.
for caption, data in (
    ('Training Features Shape:', train_features),
    ('Training Labels Shape:', train_labels),
    ('Testing Features Shape:', test_features),
    ('Testing Labels Shape:', test_labels),
):
    print(caption, data.shape)

Training Features Shape: (3174, 11)
Training Labels Shape: (3174,)
Testing Features Shape: (943, 11)
Testing Labels Shape: (943,)


In [633]:
# Instantiate model with 1000 decision trees
rf = RandomForestRegressor(n_estimators = 1000, random_state = 42)

# Train the model on training data
rf.fit(train_features, train_labels);

In [634]:
# Use the forest's predict method on the test data
predictions = rf.predict(test_features)

# Calculate the absolute errors
errors = abs(predictions - test_labels)

# Print out the mean absolute error (mae)
print('Mean Absolute Error:', round(np.mean(errors), 5), 'degrees.')

Mean Absolute Error: 1.86351 degrees.


In [635]:
# SMAPE of the test predictions; the bare name displays the value.
smape = smape_fast(y_true=test_labels, y_pred=predictions)
smape

97.78971583250102

In [636]:
# Put the predictions next to the first four test columns and export them.
df_predicted = pd.DataFrame(predictions, columns=['predicted'])
df_compare_addrcode_dist = pd.concat([df_test_week_addrcode_dist, df_predicted], axis=1)

# A flat list gives ordinary column labels; a nested list would silently
# build a one-level MultiIndex.
# NOTE(review): these names are assigned by position - confirm they match the
# actual order of the first four test columns.
df_compare_addrcode_dist.columns = ['addrcode', 'Week', 'Year', 'actual', 'predicted']

# Extra summary row: the label goes in the first column, the SMAPE value in
# the second, and the remaining cells stay empty.
df_compare_addrcode_dist.loc[len(df_predicted)] = ['SMAPE', smape, None, None, None]

# Build the path with os.path.join like the reads do, and make sure the
# target directory exists before writing.
out_path = os.path.join('Data', 'Modeling', 'Random Forest', 'DF_1', 'Province', 'District',
                        'RF_DFw1_dist_withoutCD.csv')
os.makedirs(os.path.dirname(out_path), exist_ok=True)
df_compare_addrcode_dist.to_csv(out_path, encoding='utf-8')

### [With CD]

In [637]:
# Read the district train and test CSVs (in that order).  For this "With CD"
# run no columns are dropped.
df_train_dist, df_test_dist = (
    pd.read_csv(os.path.join('Data', 'Modeling', 'Training&Testing', fname), header=0, skiprows=0)
    for fname in ('train_dist.csv', 'test_dist.csv')
)

# The first four test columns are joined to the predictions when exporting.
df_test_week_addrcode_dist = df_test_dist.iloc[:, list(range(4))]

In [638]:
# Labels are the values we want to predict
# Targets: the 'DF_1' column of each split, as numpy arrays.
train_labels, test_labels = (np.array(frame['DF_1']) for frame in (df_train_dist, df_test_dist))

# Model inputs are picked by column position: 12, 18 and 24..40 (19 columns).
# NOTE(review): the same positions are used for the sub-district frames, which
# additionally drop 'Unnamed: 0' - confirm the district CSV layout matches.
train_features, test_features = (
    frame.iloc[:, [12, 18, *range(24, 41)]] for frame in (df_train_dist, df_test_dist)
)

# Every column name of the training frame (not only the selected inputs).
feature_list = df_train_dist.columns.tolist()

# Whole training frame as a numpy array.
features = np.array(df_train_dist)

In [639]:
# Report the dimensions of the model inputs and targets.
for caption, data in (
    ('Training Features Shape:', train_features),
    ('Training Labels Shape:', train_labels),
    ('Testing Features Shape:', test_features),
    ('Testing Labels Shape:', test_labels),
):
    print(caption, data.shape)

Training Features Shape: (3174, 19)
Training Labels Shape: (3174,)
Testing Features Shape: (943, 19)
Testing Labels Shape: (943,)


In [640]:
# Instantiate model with 1000 decision trees
rf = RandomForestRegressor(n_estimators = 1000, random_state = 42)

# Train the model on training data
rf.fit(train_features, train_labels);

In [641]:
# Use the forest's predict method on the test data
predictions = rf.predict(test_features)

# Calculate the absolute errors
errors = abs(predictions - test_labels)

# Print out the mean absolute error (mae)
print('Mean Absolute Error:', round(np.mean(errors), 5), 'degrees.')

Mean Absolute Error: 1.86899 degrees.


In [642]:
# SMAPE of the test predictions; the bare name displays the value.
smape = smape_fast(y_true=test_labels, y_pred=predictions)
smape

97.95663178972144

In [643]:
# Put the predictions next to the first four test columns and export them.
df_predicted = pd.DataFrame(predictions, columns=['predicted'])
df_compare_addrcode_dist = pd.concat([df_test_week_addrcode_dist, df_predicted], axis=1)

# A flat list gives ordinary column labels; a nested list would silently
# build a one-level MultiIndex.
# NOTE(review): these names are assigned by position - confirm they match the
# actual order of the first four test columns.
df_compare_addrcode_dist.columns = ['addrcode', 'Week', 'Year', 'actual', 'predicted']

# Extra summary row: the label goes in the first column, the SMAPE value in
# the second, and the remaining cells stay empty.
df_compare_addrcode_dist.loc[len(df_predicted)] = ['SMAPE', smape, None, None, None]

# Build the path with os.path.join like the reads do, and make sure the
# target directory exists before writing.
out_path = os.path.join('Data', 'Modeling', 'Random Forest', 'DF_1', 'Province', 'District',
                        'RF_DFw1_dist_withCD.csv')
os.makedirs(os.path.dirname(out_path), exist_ok=True)
df_compare_addrcode_dist.to_csv(out_path, encoding='utf-8')

## Separate files

### [Subdistrict] 

In [644]:
# Province-level sub-district result files.  A later cell writes '*_diff.csv'
# files into this same directory, so those are filtered out here; otherwise
# a second run of the notebook would try to split them as well.
list_sub = [
    path
    for path in glob.glob(os.path.join('Data', 'Modeling', 'Random Forest', 'DF_1', 'Province', 'Subdistrict', '*'))
    if not path.endswith('_diff.csv')
]
list_sub

['Data/Modeling/Random Forest/DF_1/Province/Subdistrict/RF_DFw1_subdist_withoutCD.csv',
 'Data/Modeling/Random Forest/DF_1/Province/Subdistrict/RF_DFw1_subdist_withCD.csv']

In [645]:
# File name without directory or extension, derived from the path itself
# instead of hard-coded character offsets.
os.path.splitext(os.path.basename(list_sub[0]))[0]

'RF_DFw1_subdist_withoutCD'

In [646]:
# Available sub-district address codes, converted to strings so they can be
# matched by prefix.
df_available = pd.read_csv(os.path.join('Data', 'Data Statistics', 'available_addrcode_subdistrict.csv'))
df_available['addrcode'] = df_available['addrcode'].astype(str)
addrcode_list = df_available['addrcode']

# Keep only the codes that begin with '80'.
addrcode_nakhon_sub = [code for code in addrcode_list if code.startswith('80')]

In [647]:
# Split each province-level result file into one CSV per sub-district code,
# each ending with its own SMAPE summary row.
separated_sub_dir = os.path.join('Data', 'Modeling', 'Random Forest', 'DF_1', 'Separated', 'Subdistrict')
os.makedirs(separated_sub_dir, exist_ok=True)

for result_path in list_sub:
    # Output prefix taken from the file name rather than fixed string offsets.
    stem = os.path.splitext(os.path.basename(result_path))[0]

    # Read and normalise the file once per input, not once per address code.
    df_province = pd.read_csv(result_path).drop('Unnamed: 0', axis=1)
    # Truncate the stringified code to six characters (drops any trailing
    # '.0' left when the column was parsed as float).
    df_province['addrcode'] = df_province['addrcode'].astype(str).str[:6]

    for addrcode in addrcode_nakhon_sub:
        df_result = df_province.loc[df_province['addrcode'] == addrcode].reset_index(drop=True)
        smape = smape_fast(df_result['actual'], df_result['predicted'])
        # Summary row: label in the first column, value in the second.
        df_result.loc[len(df_result)] = ['SMAPE', smape, None, None, None]
        df_result.to_csv(os.path.join(separated_sub_dir, stem + '_' + addrcode + '.csv'), encoding='utf-8')
        

### [District]

In [648]:
# Province-level district result files.  A later cell writes '*_diff.csv'
# files into this same directory, so those are filtered out here; otherwise
# a second run of the notebook would try to split them as well.
list_dist = [
    path
    for path in glob.glob(os.path.join('Data', 'Modeling', 'Random Forest', 'DF_1', 'Province', 'District', '*'))
    if not path.endswith('_diff.csv')
]
list_dist

['Data/Modeling/Random Forest/DF_1/Province/District/RF_DFw1_dist_withoutCD.csv',
 'Data/Modeling/Random Forest/DF_1/Province/District/RF_DFw1_dist_withCD.csv']

In [649]:
# File name without directory or extension, derived from the path itself
# instead of hard-coded character offsets.
os.path.splitext(os.path.basename(list_dist[0]))[0]

'RF_DFw1_dist_withoutCD'

In [650]:
# District codes are the sub-district codes without their last two characters.
# The sub-district list itself is left untouched so this cell can be re-run
# (and the sub-district cells still work afterwards); sorting makes the
# de-duplicated result deterministic.
addrcode_nakhon_dist = sorted({code[:-2] for code in addrcode_nakhon_sub})

In [651]:
# Split each province-level result file into one CSV per district code, each
# ending with its own SMAPE summary row.
separated_dist_dir = os.path.join('Data', 'Modeling', 'Random Forest', 'DF_1', 'Separated', 'District')
os.makedirs(separated_dist_dir, exist_ok=True)

for result_path in list_dist:
    # Output prefix taken from the file name rather than fixed string offsets.
    stem = os.path.splitext(os.path.basename(result_path))[0]

    # Read and normalise the file once per input, not once per address code.
    df_province = pd.read_csv(result_path).drop('Unnamed: 0', axis=1)
    # Compare codes as strings of at most six characters.
    df_province['addrcode'] = df_province['addrcode'].astype(str).str[:6]

    for addrcode in addrcode_nakhon_dist:
        df_result = df_province.loc[df_province['addrcode'] == addrcode].reset_index(drop=True)
        smape = smape_fast(df_result['actual'], df_result['predicted'])
        # Summary row: label in the first column, value in the second.
        df_result.loc[len(df_result)] = ['SMAPE', smape, None, None, None]
        df_result.to_csv(os.path.join(separated_dist_dir, stem + '_' + addrcode + '.csv'), encoding='utf-8')
        

## Compute Difference and MAE

## Separated Level

### [Subdistrict]

In [652]:
# Every per-sub-district file in the 'Separated' output folder; the cell
# displays how many were found.
file_sub = glob.glob(
    os.path.join('Data', 'Modeling', 'Random Forest', 'DF_1', 'Separated', 'Subdistrict', '*')
)
len(file_sub)

334

In [653]:
# File name of the first match, taken from the path itself instead of a
# hard-coded character offset.
os.path.basename(file_sub[0])

'RF_DFw1_subdist_withoutCD_802102.csv'

In [654]:
# Add an absolute-difference column and an MAE summary row to every
# per-sub-district file, rewriting each file in place.
for result_path in file_sub:
    df_diff = pd.read_csv(result_path).drop('Unnamed: 0', axis=1)

    # The files are overwritten in place, so drop an 'MAE' row left by an
    # earlier run to keep this cell idempotent.
    df_diff = df_diff.loc[df_diff.iloc[:, 0].astype(str) != 'MAE'].reset_index(drop=True)

    df_diff['different'] = (df_diff['predicted'] - df_diff['actual']).abs()

    # mean() skips the NaN produced by the 'SMAPE' summary row, so the MAE is
    # averaged over the data rows only (dividing by len(df_diff) would count
    # the summary row as an observation).
    mae = df_diff['different'].mean()
    df_diff.loc[len(df_diff)] = ['MAE', mae, None, None, None, None]
    df_diff.to_csv(result_path)

### [District]

In [655]:
# Every per-district file in the 'Separated' output folder; the cell displays
# how many were found.
file_dist = glob.glob(
    os.path.join('Data', 'Modeling', 'Random Forest', 'DF_1', 'Separated', 'District', '*')
)
len(file_dist)

46

In [656]:
# File name of the first match, taken from the path itself instead of a
# hard-coded character offset.
os.path.basename(file_dist[0])

'RF_DFw1_dist_withoutCD_8007.csv'

In [657]:
# Add an absolute-difference column and an MAE summary row to every
# per-district file, rewriting each file in place.
for result_path in file_dist:
    df_diff = pd.read_csv(result_path).drop('Unnamed: 0', axis=1)

    # The files are overwritten in place, so drop an 'MAE' row left by an
    # earlier run to keep this cell idempotent.
    df_diff = df_diff.loc[df_diff.iloc[:, 0].astype(str) != 'MAE'].reset_index(drop=True)

    df_diff['different'] = (df_diff['predicted'] - df_diff['actual']).abs()

    # mean() skips the NaN produced by the 'SMAPE' summary row, so the MAE is
    # averaged over the data rows only (dividing by len(df_diff) would count
    # the summary row as an observation).
    mae = df_diff['different'].mean()
    df_diff.loc[len(df_diff)] = ['MAE', mae, None, None, None, None]
    df_diff.to_csv(result_path)

## Province Level

### [Subdistrict]

In [658]:
# Province-level sub-district result files.  The '*_diff.csv' files written
# by the cell below land in this same directory, so they are excluded to keep
# a re-run from picking up its own output.
file_pro_sub = [
    path
    for path in glob.glob(os.path.join('Data', 'Modeling', 'Random Forest', 'DF_1', 'Province', 'Subdistrict', '*'))
    if not path.endswith('_diff.csv')
]
len(file_pro_sub)

2

In [659]:
# File name without directory or extension, derived from the path itself
# instead of hard-coded character offsets.
os.path.splitext(os.path.basename(file_pro_sub[0]))[0]

'RF_DFw1_subdist_withoutCD'

In [660]:
# For each province-level sub-district result file, write a '<name>_diff.csv'
# copy with an absolute-difference column and an MAE summary row.
for result_path in file_pro_sub:
    # Outputs of a previous run live in the same folder; never re-process them.
    if result_path.endswith('_diff.csv'):
        continue

    # Output name taken from the file name rather than fixed string offsets.
    stem = os.path.splitext(os.path.basename(result_path))[0]

    df_diff = pd.read_csv(result_path).drop('Unnamed: 0', axis=1)
    df_diff['different'] = (df_diff['predicted'] - df_diff['actual']).abs()

    # mean() skips the NaN produced by the 'SMAPE' summary row, so the MAE is
    # averaged over the data rows only (dividing by len(df_diff) would count
    # the summary row as an observation).
    mae = df_diff['different'].mean()
    df_diff.loc[len(df_diff)] = ['MAE', mae, None, None, None, None]
    df_diff.to_csv(os.path.join('Data', 'Modeling', 'Random Forest', 'DF_1', 'Province', 'Subdistrict',
                                stem + '_diff.csv'))

### [District]

In [661]:
# Province-level district result files.  The '*_diff.csv' files written by
# the cell below land in this same directory, so they are excluded to keep a
# re-run from picking up its own output.
file_pro_dist = [
    path
    for path in glob.glob(os.path.join('Data', 'Modeling', 'Random Forest', 'DF_1', 'Province', 'District', '*'))
    if not path.endswith('_diff.csv')
]
len(file_pro_dist)

2

In [662]:
# File name without directory or extension, derived from the path itself
# instead of hard-coded character offsets.
os.path.splitext(os.path.basename(file_pro_dist[0]))[0]

'RF_DFw1_dist_withoutCD'

In [663]:
# Display the full path of the first province-level district result file.
file_pro_dist[0]

'Data/Modeling/Random Forest/DF_1/Province/District/RF_DFw1_dist_withoutCD.csv'

In [664]:
# For each province-level district result file, write a '<name>_diff.csv'
# copy with an absolute-difference column and an MAE summary row.
for result_path in file_pro_dist:
    # Outputs of a previous run live in the same folder; never re-process them.
    if result_path.endswith('_diff.csv'):
        continue

    # Output name taken from the file name rather than fixed string offsets.
    stem = os.path.splitext(os.path.basename(result_path))[0]

    df_diff = pd.read_csv(result_path).drop('Unnamed: 0', axis=1)
    df_diff['different'] = (df_diff['predicted'] - df_diff['actual']).abs()

    # mean() skips the NaN produced by the 'SMAPE' summary row, so the MAE is
    # averaged over the data rows only (dividing by len(df_diff) would count
    # the summary row as an observation).
    mae = df_diff['different'].mean()
    df_diff.loc[len(df_diff)] = ['MAE', mae, None, None, None, None]
    df_diff.to_csv(os.path.join('Data', 'Modeling', 'Random Forest', 'DF_1', 'Province', 'District',
                                stem + '_diff.csv'))