# PM2.5 AQI Prediction - Beijing

### As a part of this project, we will try to scrape climate data and merge it with PM2.5 AQI of beijing and try to predict the AQI given climatic conditions of Beijing.

#### We will be using different web scraping techniques, machine learning algorithms such as Decision Tress, Linear Regression and Rnadom Forest and compare their performance with different architectures of Artificial Neural Networks (ANNs)

In [1]:
# import relevant libraries
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
import os
import sys

In [2]:
# Scrape data from the website for all days between Jan 2010 and Dec 2014
base_url = 'https://en.tutiempo.net/climate/' # URL of the website
city ='ws-545110.html' # City Code:: Beijing 
dates = []
for i in range(2010,2015):
    for j in range(1,13):
        month = str(j).rjust(2,'0')
        date = month+ '-' + str(i)
        dates.append(date)

        

In [3]:
# the url looks something like this -------> https://en.tutiempo.net/climate/01-2000/ws-430630.html
# Get all webpages and store them in a list

webpages = []

for i,date in enumerate(dates):
    url = base_url+date+ '/' + city
    temp = requests.get(url)
    temp_tex = temp.text
    webpages.append(temp)
    

In [4]:
# Use bs4 to extract relevant information

for page,date in zip(webpages,dates):
    soup = BeautifulSoup(page.text.encode('utf-8'),'lxml')
    table = soup.findAll('table',{'class': 'medias mensuales numspan'})
    extract = []
    for body in table:
        for row in body:
            temp = []
            for element in (row):
                a =  element.get_text()
                temp.append(a)
            extract.append(temp)
        extract = pd.DataFrame(extract)
        extract.columns = extract.iloc[0,:]
        extract = extract.iloc[1:extract.shape[0]-2,:]
        
    ind = list(extract['Day'])
    for i in range(len(ind)):
        ind[i] = str(ind[i]).rjust(2,'0') + '-' + date
    extract.index = ind
    if date == '01-2010':
        climate_df = extract.copy()
    else:
        climate_df = climate_df.append(extract)
    
    
        
       
            
    

In [5]:
climate_df.head()

Unnamed: 0,Day,T,TM,Tm,SLP,H,PP,VV,V,VM,VG,RA,SN,TS,FG
01-01-2010,1,-4.7,-1.0,-10.2,-,45.0,0.51,8.0,7.6,14.4,-,,o,,
02-01-2010,2,-5.7,-3.8,-9.0,-,83.0,3.3,2.7,10.6,18.0,-,,o,,
03-01-2010,3,-9.6,-6.4,-13.0,-,76.0,6.6,5.0,17.8,39.6,-,,o,,
04-01-2010,4,,,,,,,,,,,,,,
05-01-2010,5,,,,,,,,,,,,,,


In [6]:
for i in climate_df.columns:
    print(i)
    print(climate_df[i].unique())
    print('................................................................................')

Day
['1' '2' '3' '4' '5' '6' '7' '8' '9' '10' '11' '12' '13' '14' '15' '16'
 '17' '18' '19' '20' '21' '22' '23' '24' '25' '26' '27' '28' '29' '30'
 '31']
................................................................................
T
['-4.7' '-5.7' '-9.6' '' '-12.4' '-10.9' '-8.7' '-11.4' '-6.9' '-7.9'
 '0.7' '-3.4' '-5.5' '-4.4' '-3.5' '1.9' '0.3' '-4.6' '-5.9' '-4.9' '-4'
 '-1.2' '0.8' '-2.9' '-5.2' '-6.2' '0.6' '1.8' '4.4' '3.5' '1.1' '-1.1'
 '-2.7' '2.5' '1.7' '-2' '-4.1' '-3.2' '3.6' '0.4' '0.2' '5.3' '5.7' '8.6'
 '4.1' '8.3' '10.8' '7.6' '5.5' '10.7' '14.2' '15.7' '8.9' '12.8' '14.7'
 '10.9' '11.8' '8.2' '9.2' '23.4' '23' '20.1' '20.6' '19.9' '16.2' '19.4'
 '18.3' '22.6' '25' '26.7' '21.9' '26.2' '25.7' '19.5' '21.6' '21.2'
 '25.9' '25.5' '22.3' '25.2' '27.7' '27.1' '26' '23.1' '29.3' '31.6'
 '27.2' '30.1' '24.3' '24.7' '23.9' '24.1' '30.3' '30.4' '27.9' '26.5'
 '26.3' '24.4' '26.6' '27.3' '29.5' '27.5' '25.1' '26.4' '23.6' '25.4'
 '24.5' '23.7' '23.3' '24.6' '25.3' '25.6' '19

In [7]:
# Remove VG,FG,TS,SN,RA, SLP
emp_cols = ['VG','FG','TS','SN','RA','SLP','Day']
climate_df2 = climate_df.drop(emp_cols,axis=1)
climate_df2.head()


Unnamed: 0,T,TM,Tm,H,PP,VV,V,VM
01-01-2010,-4.7,-1.0,-10.2,45.0,0.51,8.0,7.6,14.4
02-01-2010,-5.7,-3.8,-9.0,83.0,3.3,2.7,10.6,18.0
03-01-2010,-9.6,-6.4,-13.0,76.0,6.6,5.0,17.8,39.6
04-01-2010,,,,,,,,
05-01-2010,,,,,,,,


In [8]:
# Preparing AQI Data
# The Data for AQI has been downloaded from UCI Machine Learning Repository
## Link ------> https://archive.ics.uci.edu/ml/datasets/Beijing+PM2.5+Data
os.chdir('C:/Users/Aditya Kapoor/Desktop/Data Science Upskilling/Project 3 -- AQI Prediction/Data')


In [9]:
# Importing Datasets
aqi = pd.read_csv('PRSA_data.csv')
aqi.head()
aqi = aqi[['year','month','day','hour','pm2.5']]
for i in aqi.columns:
    print(i)
    print(list(aqi[i].unique()))
    print('.........................................................................................')

year
[2010, 2011, 2012, 2013, 2014]
.........................................................................................
month
[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
.........................................................................................
day
[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31]
.........................................................................................
hour
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23]
.........................................................................................
pm2.5
[nan, 129.0, 148.0, 159.0, 181.0, 138.0, 109.0, 105.0, 124.0, 120.0, 132.0, 140.0, 152.0, 164.0, 158.0, 154.0, 170.0, 149.0, 156.0, 126.0, 90.0, 63.0, 65.0, 55.0, 83.0, 91.0, 86.0, 82.0, 78.0, 98.0, 107.0, 96.0, 95.0, 70.0, 61.0, 53.0, 71.0, 72.0, 76.0, 73.0, 79.0, 58.0, 25.0, 26.0, 28.0, 20.0, 29.0, 27.0, 32.0, 30.0, 31.0, 33.0, 34.0, 36

In [10]:
for i in ('month','day','hour'):
    aqi[i] = aqi[i].astype(str)
    aqi[i] = aqi[i].str.rjust(2,'0')

In [11]:
for i in aqi.columns:
    print(i)
    print(list(aqi[i].unique()))
    print('.........................................................................................')

year
[2010, 2011, 2012, 2013, 2014]
.........................................................................................
month
['01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12']
.........................................................................................
day
['01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29', '30', '31']
.........................................................................................
hour
['00', '01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23']
.........................................................................................
pm2.5
[nan, 129.0, 148.0, 159.0, 181.0, 138.0, 109.0, 105.0, 124.0, 120.0, 132.0, 140.0, 152.0, 164.0, 158.0, 154.0, 170.0, 149.0, 156.0, 126.0, 90.0, 63.0, 65.0, 55.0, 83.0, 91.0, 86.

In [12]:
aqi['year'] = aqi['year'].astype(str)
aqi['Date'] = aqi['day'] + '-' + aqi['month'] + '-' + aqi['year']
aqi.head()

Unnamed: 0,year,month,day,hour,pm2.5,Date
0,2010,1,1,0,,01-01-2010
1,2010,1,1,1,,01-01-2010
2,2010,1,1,2,,01-01-2010
3,2010,1,1,3,,01-01-2010
4,2010,1,1,4,,01-01-2010


In [13]:
aqi['pm2.5'] = aqi['pm2.5'].astype(float)
aqi_day = pd.DataFrame(aqi.groupby(['Date'])['pm2.5'].mean())
aqi_day.reset_index(inplace=True)
aqi_day.head(3)

Unnamed: 0,Date,pm2.5
0,01-01-2010,
1,01-01-2011,
2,01-01-2012,78.958333


In [14]:
climate_df3 = climate_df2.copy()
climate_df3.reset_index(inplace=True)
climate_df3 = climate_df3.rename(columns = {'index':'Date'})
# Merge Dataset with AQI
climate_aqi = climate_df3.merge(aqi_day,on=['Date'],how='outer')


In [15]:
for i in climate_aqi.columns:
    print(i)
    print(pd.DataFrame(climate_aqi[i].value_counts()))
    print('......................................................')

Date
            Date
13-02-2011     1
14-12-2012     1
18-04-2012     1
26-01-2014     1
16-11-2014     1
...          ...
17-07-2010     1
15-10-2011     1
23-02-2012     1
05-09-2013     1
07-10-2012     1

[1826 rows x 1 columns]
......................................................
T
        T
      891
26.2   10
-1.2    9
26.3    9
25.4    9
...   ...
-5.9    1
8       1
23.4    1
29.9    1
-5.6    1

[349 rows x 1 columns]
......................................................
TM
       TM
      891
32     31
31     31
28     25
27     22
...   ...
25.8    1
-2.9    1
5.4     1
32.9    1
-1.2    1

[299 rows x 1 columns]
......................................................
Tm
       Tm
      891
21     30
20     28
-9     27
-3     25
...   ...
21.3    1
9.6     1
-9.2    1
5.4     1
17.7    1

[206 rows x 1 columns]
......................................................
H
      H
    891
34   21
29   21
45   19
72   19
..  ...
12    2
97    1
8     1
95    1
7     1

[88 row

In [16]:
climate_aqi = climate_aqi.replace(['-','',' '], np.nan)

In [17]:
climate_aqi2 = climate_aqi.dropna()
print(climate_aqi.shape)
print(climate_aqi2.shape)

(1826, 10)
(809, 10)


In [18]:
climate_aqi2.head()
# Convert all cols to numeric. Extract Month and Year. Convert year into a continuous varibale. Scale down. Regress

Unnamed: 0,Date,T,TM,Tm,H,PP,VV,V,VM,pm2.5
1,02-01-2010,-5.7,-3.8,-9,83,3.3,2.7,10.6,18.0,145.958333
2,03-01-2010,-9.6,-6.4,-13,76,6.6,5.0,17.8,39.6,78.833333
6,07-01-2010,-12.4,-4.4,-17,52,0.0,9.0,6.9,21.7,69.0
7,08-01-2010,-10.9,-7.0,-16,64,0.0,4.8,5.7,14.4,176.208333
8,09-01-2010,-8.7,-2.8,-15,55,0.0,9.7,11.5,25.2,88.5


In [19]:
# Clealry, pm2.5 concentration is our dependent variable while all other variables are independent variables. 
# Lets break up date to extract month and year. 
date_br = (climate_aqi2['Date'].str.split('-',expand=True))
climate_aqi3 = pd.concat([climate_aqi2,date_br],axis=1)
climate_aqi3.head()

Unnamed: 0,Date,T,TM,Tm,H,PP,VV,V,VM,pm2.5,0,1,2
1,02-01-2010,-5.7,-3.8,-9,83,3.3,2.7,10.6,18.0,145.958333,2,1,2010
2,03-01-2010,-9.6,-6.4,-13,76,6.6,5.0,17.8,39.6,78.833333,3,1,2010
6,07-01-2010,-12.4,-4.4,-17,52,0.0,9.0,6.9,21.7,69.0,7,1,2010
7,08-01-2010,-10.9,-7.0,-16,64,0.0,4.8,5.7,14.4,176.208333,8,1,2010
8,09-01-2010,-8.7,-2.8,-15,55,0.0,9.7,11.5,25.2,88.5,9,1,2010


### Note that::
1. T: Average Temperature in C
2. TM: Max Temp
3. Tm: Min Temp
4. H: Average Relative Humidity
5. PP: Total Rainfall/Snow Melt
6. VV: Visibility
7. V: Average Wind Speed
8. VM: Max Wind Speed

In [20]:
x = list(climate_aqi2.columns)
x.extend(['Day', 'Month', 'Year'])
climate_aqi3.columns = x
climate_aqi3.head()
num_cols = ['T','TM','Tm','H','PP','VV','V','VM','pm2.5']
for i in num_cols:
    climate_aqi3[i] = climate_aqi3[i].astype(float)

climate_aqi3.corr()

# We see that TM and Tm are highly corelated with each other and the column T. Hence, we drop these columns.


Unnamed: 0,T,TM,Tm,H,PP,VV,V,VM,pm2.5
T,1.0,0.987047,0.982146,0.335939,0.176769,-0.102042,-0.170312,-0.116175,-0.066941
TM,0.987047,1.0,0.949051,0.256045,0.152456,-0.053139,-0.15726,-0.08532,-0.075993
Tm,0.982146,0.949051,1.0,0.456223,0.213545,-0.179405,-0.210083,-0.164535,-0.02874
H,0.335939,0.256045,0.456223,1.0,0.309121,-0.645557,-0.558377,-0.491445,0.40942
PP,0.176769,0.152456,0.213545,0.309121,1.0,-0.12178,-0.019579,0.062112,0.008309
VV,-0.102042,-0.053139,-0.179405,-0.645557,-0.12178,1.0,0.446488,0.370986,-0.771454
V,-0.170312,-0.15726,-0.210083,-0.558377,-0.019579,0.446488,1.0,0.833223,-0.346405
VM,-0.116175,-0.08532,-0.164535,-0.491445,0.062112,0.370986,0.833223,1.0,-0.270282
pm2.5,-0.066941,-0.075993,-0.02874,0.40942,0.008309,-0.771454,-0.346405,-0.270282,1.0


In [21]:
# Now that we have prepared our data well, let us now preprocess it
# Drop Date, Day
# All continuous variables except month. We get dummies for month. We will treat year as a continuous variable (why???)
# Year will be treated as a continuous variable as AQI follows a general upward trend over a period.
# Scale all variables, split data (Train test split) and then use the following models:
# Linear Regression, Random Forest Regressor, Lasso Regression, ANN

# Get Dummies for month column
mon_dumm = pd.get_dummies(climate_aqi3['Month'],prefix='month_')
climate_aqi4 = pd.concat([climate_aqi3,mon_dumm],axis=1)
climate_aqi4.head()
# Drop Unnecessary Columns and Create Dummies
climate_aqi4.drop(['Date','Month','Day'],inplace=True,axis=1)


In [22]:
# Map year with labels
yr_map = {'2010':1,'2011':2,'2012':3,'2013':4,'2014':5}
climate_aqi4['Year'] = climate_aqi4['Year'].map(yr_map)


In [23]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

In [24]:
# Lets also define Dependant and Independent features
climate_aqi4.columns
X = ['T', 'H', 'PP', 'VV', 'V', 'VM','Year',
       'month__01', 'month__02', 'month__03', 'month__04', 'month__05',
       'month__06', 'month__07', 'month__08', 'month__09', 'month__10',
       'month__11', 'month__12']
Y = ['pm2.5']

In [25]:
# Convert all values between 0 and 1
scaler = MinMaxScaler()
climate_aqi_transformed = pd.DataFrame(scaler.fit_transform(climate_aqi4),columns = climate_aqi4.columns)

# Get Train and Test data

train,test = train_test_split(climate_aqi_transformed,test_size=0.2,random_state=1024)
print(train.shape)
print(test.shape)

(647, 22)
(162, 22)


In [26]:
# Import all machine learning models 
from statsmodels.tools.tools import add_constant
from statsmodels.regression.linear_model import OLS
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

In [27]:
# 1. Linear Regression Model
lin_reg = OLS(train[Y],add_constant(train[X])).fit()
lin_reg.summary()

0,1,2,3
Dep. Variable:,pm2.5,R-squared:,0.685
Model:,OLS,Adj. R-squared:,0.676
Method:,Least Squares,F-statistic:,76.0
Date:,"Fri, 07 Aug 2020",Prob (F-statistic):,2.6400000000000004e-144
Time:,02:52:24,Log-Likelihood:,669.22
No. Observations:,647,AIC:,-1300.0
Df Residuals:,628,BIC:,-1215.0
Df Model:,18,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,0.2785,0.041,6.856,0.000,0.199,0.358
T,0.3136,0.051,6.151,0.000,0.214,0.414
H,0.0262,0.027,0.961,0.337,-0.027,0.080
PP,-0.0479,0.058,-0.829,0.407,-0.161,0.065
VV,-0.5719,0.026,-21.931,0.000,-0.623,-0.521
V,-0.0961,0.042,-2.301,0.022,-0.178,-0.014
VM,0.0558,0.035,1.588,0.113,-0.013,0.125
Year,-0.0633,0.011,-5.918,0.000,-0.084,-0.042
month__01,0.1738,0.021,8.408,0.000,0.133,0.214

0,1,2,3
Omnibus:,154.795,Durbin-Watson:,1.984
Prob(Omnibus):,0.0,Jarque-Bera (JB):,548.63
Skew:,1.09,Prob(JB):,7.35e-120
Kurtosis:,6.949,Cond. No.,1.21e+16


In [45]:
# We see that H and PP have high p values
# Clearly, we have columns that might need to be eliminated completly. Hence, in addition to Linear Regression, we will use Lasso regression
# defining models and their parameters
models = [LinearRegression(),Lasso(),DecisionTreeRegressor(random_state=1024,criterion='mse'),RandomForestRegressor(random_state=1024,criterion='mse')]
grid = [{},{'alpha':[0.5,1,2,5,10]},{'min_samples_split':[2,10,20],'min_samples_leaf':[3,7]},
        {'min_samples_split':[2,10,15],'min_samples_leaf':[2,4,5],'n_estimators':[10,50,100]}]
result = pd.DataFrame

In [46]:
# Use Grid Search CV for hyperparameter tuning
# Use RMSE as cost function. We have used RMSE because we want to penalise large deviations more. A ballpark figure works.
for i,j in enumerate(models):
    clf = GridSearchCV(j, grid[i],scoring='neg_root_mean_squared_error',cv=5)
    clf.fit(train[X],np.array(train[Y]))
    df_temp = pd.DataFrame(clf.cv_results_)
    if i==0:
        result = df_temp.copy()
    else:
        result = result.append(df_temp,ignore_index=True)
    print(i)
    print(clf.best_params_)

0
{}
1
{'alpha': 0.5}
2
{'min_samples_leaf': 7, 'min_samples_split': 20}


  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_

3
{'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 100}


In [47]:
result_sort = result.sort_values('mean_test_score',ascending=False)
result_sort.to_csv('HP Tuning Results ML.csv',index=False)
result_sort.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score,param_alpha,param_min_samples_leaf,param_min_samples_split,param_n_estimators
14,0.393767,0.02073,0.012502,0.006251,"{'min_samples_leaf': 2, 'min_samples_split': 2...",-0.080582,-0.072837,-0.07837,-0.08995,-0.070645,-0.078477,0.006771,1,,2,2,100
17,0.335689,0.020165,0.011978,0.006074,"{'min_samples_leaf': 2, 'min_samples_split': 1...",-0.080701,-0.073275,-0.079203,-0.090432,-0.070576,-0.078838,0.006888,2,,2,10,100
20,0.328141,0.026146,0.021876,0.007655,"{'min_samples_leaf': 2, 'min_samples_split': 1...",-0.081447,-0.072942,-0.079329,-0.091223,-0.070269,-0.079042,0.007327,3,,2,15,100
23,0.371892,0.030299,0.018751,0.006251,"{'min_samples_leaf': 4, 'min_samples_split': 2...",-0.081514,-0.074084,-0.078102,-0.09043,-0.071505,-0.079127,0.006604,4,,4,2,100
26,0.315641,0.011693,0.015625,1e-06,"{'min_samples_leaf': 4, 'min_samples_split': 1...",-0.081733,-0.074232,-0.07814,-0.09081,-0.070942,-0.079171,0.006859,5,,4,10,100


In [31]:
# We see that Random Forest Regressor has the best performance.
# Lets build an ANN model

from tensorflow import keras
from keras.layers import Dense,Dropout
from keras.models import Sequential

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
Using TensorFlow backend.


In [32]:
regressor = Sequential()
regressor.add(Dense(units = 15,kernel_initializer = 'he_normal',activation = 'relu',input_dim = 19 ))
regressor.add(Dense(units = 10,kernel_initializer = 'he_normal',activation = 'relu'))
regressor.add(Dense(units = 10,kernel_initializer = 'he_normal',activation = 'relu'))
regressor.add(Dense(units = 6,kernel_initializer = 'he_normal',activation = 'relu'))
regressor.add(Dense(units = 1,kernel_initializer = 'glorot_uniform',activation = 'sigmoid')) # used sigmoid because we need a bounded output

In [33]:
# Compile the model
regressor.compile(optimizer='sgd',loss= 'mean_squared_error')
regressor.fit(train[X],train[Y],epochs = 30,validation_split=0.2)


Train on 517 samples, validate on 130 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.callbacks.History at 0x6820a7d1c8>

In [34]:
# Evaluate the model
pred = regressor.predict(test[X])

In [35]:
# Define Functions for MAE, MAPE and RMSE
# mean absoute error
def mae(actual,predict):
    actual = np.array(actual)
    predict = np.array(predict)
    a = actual.shape
    actual.shape = a[0]
    b = predict.shape
    predict.shape = b[0]
    
    mae = (sum(abs(actual-predict))/len(actual))
    return mae

def rmse(actual,predict):
    actual = np.array(actual)
    predict = np.array(predict)
    a = actual.shape
    actual.shape = a[0]
    b = predict.shape
    predict.shape = b[0]
    
    rms = np.sqrt(sum((actual-predict)**2)/len(actual))
    return rms

In [36]:
print(rmse(test[Y],pred))
print(mae(test[Y],pred))

0.16353865697325623
0.1204043756686256


In [41]:
# Lets try different architectures for this model
def model_builder_and_evaluator(n,neurons,dropout=0):
    neurons = list(neurons)
    if n == len(neurons):
        if n==1:
            regressor = Sequential()
            regressor.add(Dense(units = neurons[0],kernel_initializer = 'he_normal',activation = 'relu',input_dim = 19 ))
        else:
            regressor = Sequential()
            regressor.add(Dense(units = neurons[0],kernel_initializer = 'he_normal',activation = 'relu',input_dim = 19 ))
            for i in (range(n-1)):
                regressor.add(Dense(units = neurons[i+1],kernel_initializer = 'he_normal',activation = 'relu' ))
                if i%2==0:
                    regressor.add(Dropout(dropout))
            
        regressor.add(Dense(units = 1,kernel_initializer = 'glorot_uniform',activation = 'sigmoid'))        
        regressor.compile(optimizer='sgd',loss= 'mean_squared_error')
        regressor.fit(train[X],train[Y],epochs = 100,validation_split=0.2)
        
        pred = regressor.predict(test[X])
        return rmse(test[Y],pred),mae(test[Y],pred)
    else:
        return 'ERROR: Number of layers should match with the list of count of neurons passed'
        
            

In [42]:
n_ = [3,5,7,9]
neurons_ = [[200,200,100],[250,100,100,100,50],[200,100,100,100,100,50,50],[250,250,100,100,100,100,50,50,10]]
dropout_ = [0,0.2,0.5]

In [43]:
result= []

for n,neurons in zip(n_,neurons_):
    for dropout in dropout_:
        rms,ma = model_builder_and_evaluator(n,neurons,dropout)
        strin = 'Layers:  ' + str(n) + '\nDropout:  '+ str(dropout) +'\n RMSE:  ' + str(rms) +'\n MAE:  ' + str(ma) 
        result.append(strin)

Train on 517 samples, validate on 130 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
E

Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100
Train on 517 samples, validate on 130 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100


Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100
Train on 517 samples, validate on 130 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100


Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100
Train on 517 samples, validate on 130 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100


Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100

Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100
Train on 517 samples, validate on 130 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100


Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100
Train on 517 samples, validate on 130 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100


Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100
Train on 517 samples, validate on 130 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100


Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100
Train on 517 samples, validate on 130

Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoc

Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100
Train on 517 samples, validate on 130 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100


Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100
Train on 517 samples, validate on 130 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100


Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100
Train on 517 samples, validate on 130 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100


Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100

Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100
Train on 517 samples, validate on 130 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100


Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


In [44]:
result

['Layers:  3\nDropout:  0\n RMSE:  0.13023125094896906\n MAE:  0.09426075795174196',
 'Layers:  3\nDropout:  0.2\n RMSE:  0.13546242172230563\n MAE:  0.09346731696002811',
 'Layers:  3\nDropout:  0.5\n RMSE:  0.14880084315855555\n MAE:  0.10533005673420383',
 'Layers:  5\nDropout:  0\n RMSE:  0.12102322492923284\n MAE:  0.08440197301795563',
 'Layers:  5\nDropout:  0.2\n RMSE:  0.12279424331745166\n MAE:  0.0855975351878028',
 'Layers:  5\nDropout:  0.5\n RMSE:  0.16119852381230274\n MAE:  0.11480108834453603',
 'Layers:  7\nDropout:  0\n RMSE:  0.12045417036246088\n MAE:  0.08747075106296016',
 'Layers:  7\nDropout:  0.2\n RMSE:  0.1532530104123326\n MAE:  0.12023993507815002',
 'Layers:  7\nDropout:  0.5\n RMSE:  0.20170859207206687\n MAE:  0.1752648232649218',
 'Layers:  9\nDropout:  0\n RMSE:  0.10711302444112807\n MAE:  0.07772608337282066',
 'Layers:  9\nDropout:  0.2\n RMSE:  0.13886680469570375\n MAE:  0.09482135966368223',
 'Layers:  9\nDropout:  0.5\n RMSE:  0.199949285326460

In [50]:
# We get the best performance (RMSE = 0.10) with layers = 9 and dropout =0 Clearly, our model is not overfiting, so it is okay to have dropout =0
# Lets revisit our RandomForest model with n_estimators =100 and min_sample_leaf = 2
aqi_prediction_model = RandomForestRegressor(n_estimators=100,min_samples_leaf = 2, random_state=1024,criterion='mse')
reg.fit(train[X],train[Y])
pred = reg.predict(test[X])
rmse(test[Y],pred)





  after removing the cwd from sys.path.


0.08546553029185722

In [49]:
import pickle

In [51]:
file = open('aqi_prediction_model.pickle','wb')
pickle.dump(aqi_prediction_model,file)

## We see that Random Forest model performs much better than the ANN model. 
### Hence, we'll go ahead with the Random Forest Regressor with min_sample_leaf as 2 and 100 trees