# Importing libraries

In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2
from basic_imports import *
import pandas

from math import radians, cos, sin, asin, sqrt

import folium
import datetime
from constants import *
from load_files import *
from scipy.stats import *
from scipy import spatial
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
import statsmodels.api as sm
import numpy as np

In [2]:
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import RandomForestRegressor

In [3]:
from tabulate import tabulate

In [4]:
def mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean absolute percentage error (MAPE), expressed in percent.

    Parameters
    ----------
    y_true : array-like
        Observed values. Entries equal to zero make the corresponding ratio
        infinite or NaN, exactly as plain NumPy division does.
    y_pred : array-like
        Predicted values, same length as ``y_true``.

    Returns
    -------
    float
        ``mean(|y_true - y_pred| / |y_true|) * 100``.
    """
    # Coerce both inputs to arrays so that plain lists/tuples are accepted;
    # without this, subtracting two Python lists raises a TypeError.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

## RoadTypes

In [5]:
import pandas as pd
# Read the road-type lookup CSV; to_dict('records') yields one dict per CSV row.
rd = pd.read_csv('road_dict.csv', header=0).to_dict('records')
# Only the first row is used as the mapping that is later applied to the
# `roadType` column of each dataset.
road_dict = rd[0]

In [6]:
# Same assignment as in the previous cell; re-running it leaves `road_dict` unchanged.
road_dict = rd[0]

In [7]:
road_dict

{'residential': 4,
 'living_street': 4,
 'tertiary': 3,
 'trunk': 0,
 'secondary': 2,
 'primary': 1,
 'pedestrian': 4,
 'tertiary_link': 3,
 'trunk_link': 0,
 'primary_link': 1,
 'secondary_link': 2}

# Loading Datasets

In [8]:
# Peeps dataset: read the CSV, discard the 'Unnamed: 0' column (presumably a
# saved index — confirm), then map road-type labels through `road_dict`.
peeps = (
    pd.read_csv("../preprocessing/peeps_spatial_T.csv")
    .drop(columns=['Unnamed: 0'])
    .replace({"roadType": road_dict})
)

In [9]:
# León and Guadalajara datasets: same preparation for both — read the CSV,
# discard the 'Unnamed: 0' and 'walk' columns, then map road-type labels
# through `road_dict`.
leon = (
    pd.read_csv("../preprocessing/leon_spatial_T.csv")
    .drop(columns=['Unnamed: 0', 'walk'])
    .replace({"roadType": road_dict})
)
guadalajara = (
    pd.read_csv("../preprocessing/guadalajara_spatial_T.csv")
    .drop(columns=['Unnamed: 0', 'walk'])
    .replace({"roadType": road_dict})
)

In [10]:
# Stack the León and Guadalajara frames row-wise into one frame
# (each frame keeps its own index values, since ignore_index is not set).
mexico = pd.concat([leon, guadalajara])

In [11]:
# Frames evaluated by et_results; its result keys 0, 1, 2 follow this order.
datasets = [peeps, leon, guadalajara]

# Extra Trees

In [12]:
from sklearn import preprocessing

In [91]:
def et_results(combinations, window_size):
    """Fit and score an Extra Trees regressor on each frame in ``combinations``.

    For every frame, ``pm2_5`` is the regression target. The columns
    ``timestamp``, ``closest_pm_id`` and ``hour_of_day`` are removed from the
    features, and when ``window_size`` is 15, 30 or 60 the ``*min_avg`` columns
    of the other two windows are removed as well, so only the matching
    ``<window_size>min_avg`` column is kept. Any other ``window_size`` leaves
    all three average columns in place.

    Each frame is split 75/25 into train/test with a fixed seed, so repeated
    calls on the same data give the same numbers.

    Parameters
    ----------
    combinations : sequence of pandas.DataFrame
        Frames to evaluate independently.
    window_size : int
        Which averaging window (15, 30 or 60) to keep as a feature.

    Returns
    -------
    dict
        Maps the position of each frame in ``combinations`` to the tuple
        ``(size, mean, rmse, mae, mape)``, where ``size`` is the number of rows,
        ``mean`` is the mean of ``pm2_5`` over the whole frame, and the three
        errors are computed on the test split.
    """
    # Average columns to discard for each supported window size.
    unused_avg_columns = {
        15: ['30min_avg', '60min_avg'],
        30: ['15min_avg', '60min_avg'],
        60: ['15min_avg', '30min_avg'],
    }

    results = {}
    for i, frame in enumerate(combinations):
        target = frame['pm2_5']
        features = frame.drop(['timestamp', 'pm2_5', 'closest_pm_id', 'hour_of_day'], axis=1)
        # Unsupported window sizes drop nothing (empty list), as before.
        features = features.drop(unused_avg_columns.get(window_size, []), axis=1)

        X_train, X_test, y_train, y_test = train_test_split(
            features, target, test_size=0.25, random_state=0
        )

        reg = ExtraTreesRegressor(
            max_depth=100, n_estimators=90, max_features=5, random_state=0
        )
        reg.fit(np.asarray(X_train), np.ravel(y_train))

        # Predict on a plain array too: the model was fitted without feature
        # names, and passing a DataFrame here makes scikit-learn warn about the
        # mismatch between fit-time and predict-time inputs.
        pred = reg.predict(np.asarray(X_test))

        y_true = np.asarray(y_test)  # 1-D, because `target` is a Series
        rmse = np.sqrt(mean_squared_error(y_true, pred))
        mae = mean_absolute_error(y_true, pred)
        mape = mean_absolute_percentage_error(y_true, pred)
        results[i] = (len(frame), target.mean(), rmse, mae, mape)

    return results

## 15 min

In [30]:
# Score Extra Trees on every frame in `datasets`, keeping `15min_avg` as the only average feature.
extra_trees = et_results(datasets, 15)

In [31]:
# One table row per dataset: its index followed by the metric tuple.
print(tabulate(
    [(idx, *metrics) for idx, metrics in extra_trees.items()],
    headers=["combination", "size", "mean", "rmse", "mae", "mape"],
))

  combination    size     mean     rmse      mae     mape
-------------  ------  -------  -------  -------  -------
            0   56226  78.4768  12.1211  4.80005  11.5409
            1    5169  25.4675  10.2977  6.60234  31.8927
            2    2944  22.6717  10.2732  6.26155  40.1893


## 30 min

In [92]:
# Score Extra Trees on every frame in `datasets`, keeping `30min_avg` as the only average feature.
extra_trees_30 = et_results(datasets, 30)

In [93]:
# One table row per dataset: its index followed by the metric tuple.
print(tabulate(
    [(idx, *metrics) for idx, metrics in extra_trees_30.items()],
    headers=["combination", "size", "mean", "rmse", "mae", "mape"],
))

  combination    size     mean     rmse      mae     mape
-------------  ------  -------  -------  -------  -------
            0   56226  78.4768  12.9591  5.28743  13.1386
            1    5169  25.4675  10.4354  6.75571  32.941
            2    2944  22.6717  10.638   6.29745  41.145


## 60 min

In [94]:
# Score Extra Trees on every frame in `datasets`, keeping `60min_avg` as the only average feature.
# NOTE(review): the saved output for this run lists a single row (size 5169)
# although `datasets` is defined with three frames — likely stale kernel state;
# re-run from a fresh kernel to confirm.
extra_trees_60 = et_results(datasets, 60)

In [95]:
# One table row per dataset: its index followed by the metric tuple.
print(tabulate(
    [(idx, *metrics) for idx, metrics in extra_trees_60.items()],
    headers=["combination", "size", "mean", "rmse", "mae", "mape"],
))

  combination    size     mean     rmse      mae    mape
-------------  ------  -------  -------  -------  ------
            0    5169  25.4675  10.2779  6.61914  31.625
