# Model Building

## Importing the libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.linear_model import LinearRegression
from numpy import mean
from numpy import absolute
from numpy import sqrt
import pandas as pd

## Importing the dataset

In [2]:
# Load the cleaned air-pollution data (the normally / Gaussian distributed version)
# and preview the first rows.
csv_path = 'air_pollution_gauss_cleaned.csv'
poll_df = pd.read_csv(csv_path)
poll_df.head()

Unnamed: 0,Date,State Name,New Covid Cases,Population Staying at Home,Population Not Staying at Home,Number of Trips,pm10_mean,pm25_mean,co_mean,no2_mean,o3_mean,so2_mean,press_mean,rh_mean,temp_mean,wind_mean,AQI,Category
0,2020-01-01,Alabama,0.0,859.830216,1710.371305,3245.716331,3.535534,2.868094,0.543782,4.04647,0.033676,0.951766,3.799426e+36,223.283219,48.435555,92.363889,6.055301,Good
1,2020-01-01,California,0.0,1161.114551,2234.236335,4629.817437,4.197069,3.454464,0.639386,3.185928,0.022262,0.755213,2.6898990000000003e+36,318.449161,49.710225,93.809963,7.440329,Moderate
2,2020-01-01,Colorado,0.0,766.009791,1408.948899,2977.793038,3.201562,2.187598,0.570414,3.518695,0.03829,1.050116,5.285797999999999e+35,327.386469,31.375,113.99858,5.524357,Good
3,2020-01-01,Connecticut,0.0,499.490741,975.321998,1965.252528,2.280351,1.762456,0.439804,2.559934,0.024059,0.325265,3.405333e+36,195.157135,37.482143,135.210606,4.708149,Good
4,2020-01-01,Florida,0.0,1160.936691,2275.216913,4589.595448,4.427189,3.655956,0.556034,3.329852,0.030658,1.010231,4.687015e+36,343.042315,61.408333,102.174306,6.786109,Good


## Normalization and Scaling of Data

In [3]:
# Show the column labels of the loaded frame.
poll_df.columns

Index(['Date', 'State Name', 'New Covid Cases', 'Population Staying at Home',
       'Population Not Staying at Home', 'Number of Trips', 'pm10_mean',
       'pm25_mean', 'co_mean', 'no2_mean', 'o3_mean', 'so2_mean', 'press_mean',
       'rh_mean', 'temp_mean', 'wind_mean', 'AQI', 'Category'],
      dtype='object')

In [4]:
# NOTE: disabled experiment, kept for reference only. It standardised every
# continuous column of poll_df in place (including the AQI target); the active
# code further below scales only the feature matrix X instead.
# from sklearn.preprocessing import StandardScaler
# continuous_features = ['New Covid Cases', 'Population Staying at Home',
#        'Population Not Staying at Home', 'Number of Trips', 'pm10_mean',
#        'pm25_mean', 'co_mean', 'no2_mean', 'o3_mean', 'so2_mean', 'press_mean',
#        'rh_mean', 'temp_mean', 'wind_mean', 'AQI']

# stdScale = StandardScaler()
# for col_name in continuous_features:
#     poll_df[col_name] = stdScale.fit_transform(poll_df[[col_name]])

In [5]:
# Preview the frame again; it is unchanged at this point because the scaling
# code in the previous cell is commented out.
poll_df.head()

Unnamed: 0,Date,State Name,New Covid Cases,Population Staying at Home,Population Not Staying at Home,Number of Trips,pm10_mean,pm25_mean,co_mean,no2_mean,o3_mean,so2_mean,press_mean,rh_mean,temp_mean,wind_mean,AQI,Category
0,2020-01-01,Alabama,0.0,859.830216,1710.371305,3245.716331,3.535534,2.868094,0.543782,4.04647,0.033676,0.951766,3.799426e+36,223.283219,48.435555,92.363889,6.055301,Good
1,2020-01-01,California,0.0,1161.114551,2234.236335,4629.817437,4.197069,3.454464,0.639386,3.185928,0.022262,0.755213,2.6898990000000003e+36,318.449161,49.710225,93.809963,7.440329,Moderate
2,2020-01-01,Colorado,0.0,766.009791,1408.948899,2977.793038,3.201562,2.187598,0.570414,3.518695,0.03829,1.050116,5.285797999999999e+35,327.386469,31.375,113.99858,5.524357,Good
3,2020-01-01,Connecticut,0.0,499.490741,975.321998,1965.252528,2.280351,1.762456,0.439804,2.559934,0.024059,0.325265,3.405333e+36,195.157135,37.482143,135.210606,4.708149,Good
4,2020-01-01,Florida,0.0,1160.936691,2275.216913,4589.595448,4.427189,3.655956,0.556034,3.329852,0.030658,1.010231,4.687015e+36,343.042315,61.408333,102.174306,6.786109,Good


## Splitting into Dependent and Independent Features

In [6]:
# Columns used as model inputs: every column except Date, State Name,
# the AQI target and the Category column.
feature_columns = [
    'New Covid Cases',
    'Population Staying at Home',
    'Population Not Staying at Home',
    'Number of Trips',
    'pm10_mean',
    'pm25_mean',
    'co_mean',
    'no2_mean',
    'o3_mean',
    'so2_mean',
    'press_mean',
    'rh_mean',
    'temp_mean',
    'wind_mean',
]
target_column = 'AQI'

# Plain NumPy arrays: X is the 2-D feature matrix, y the 1-D target vector.
X = poll_df[feature_columns].values
y = poll_df[target_column].values

## Regression Model Building on the Original (Unbalanced) Data

### Normalization and Scaling of Data

In [7]:
# Two independent standard scalers. sc_X is fitted on the feature matrix in the
# next cell; sc_y is created here but not used in the cells shown in this notebook.
sc_X, sc_y = StandardScaler(), StandardScaler()

In [8]:
# Standardise every feature column with sc_X and overwrite X with the scaled matrix.
# NOTE(review): the scaler is fitted on the full dataset before the train/test
# split below, so statistics of the test rows leak into the scaling. Consider
# splitting first and fitting the scaler on the training rows only.
X = sc_X.fit_transform(X)

### Splitting the dataset into the Training set and Test set

In [9]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

### SVR model

#### Training

In [11]:
from sklearn.svm import SVR

# RBF-kernel support vector regressor; all other hyper-parameters stay at their defaults.
regressor = SVR(kernel='rbf')
# Train on the training split; the fitted estimator is the cell's displayed result.
regressor.fit(X=X_train, y=y_train)

#### Predicting the Test set results

In [14]:
# Predict the target for the held-out rows with the fitted SVR.
y_pred = regressor.predict(X_test)
# Two decimals are enough for eyeballing. This changes NumPy's print precision
# for the rest of the session, not only for this cell.
np.set_printoptions(precision=2)
# Side-by-side view: column 0 = predicted value, column 1 = actual value.
print(np.column_stack((y_pred, y_test)))

[[7.82 8.01]
 [5.63 5.56]
 [5.45 5.55]
 ...
 [7.06 6.97]
 [4.21 4.09]
 [4.51 4.98]]


#### Model Evaluation

In [15]:
from sklearn.metrics import r2_score

# Error metrics on the held-out split; the MSE is computed once and reused for the RMSE.
mae = metrics.mean_absolute_error(y_test, y_pred)
mse = metrics.mean_squared_error(y_test, y_pred)
print('MAE:', mae)
print('MSE:', mse)
print('RMSE:', np.sqrt(mse))
print("R squared value: ", r2_score(y_test, y_pred))

MAE: 0.20325719516221816
MSE: 0.07530044492196411
RMSE: 0.2744092653719333
R squared value:  0.8989251807240534


### SVR Using K-Fold Cross-Validation and Grid Search

#### Training

In [16]:
from sklearn.svm import SVR

# Rebind `regressor` to a new, unfitted RBF-kernel SVR; this replaces the model
# trained earlier and is the estimator handed to cross-validation and grid search.
regressor = SVR(kernel='rbf')

#### K-fold cross-validation

In [17]:
# 3-fold cross-validation of the SVR on the full (already scaled) feature matrix.
# The result is a dict; its 'test_score' entry holds one score per fold.
cv = cross_validate(estimator=regressor, X=X, y=y, cv=3)

In [18]:
# cross_validate was called without `scoring`, so each entry of cv['test_score']
# is the estimator's default score. For a regressor such as SVR that is R^2
# (coefficient of determination), not a classification accuracy, so the labels
# say R^2. Values are multiplied by 100 to show them as percentages.
print('3 folds R^2 (%):', cv['test_score']*100)
print('mean R^2 of 3 folds:', cv['test_score'].mean()*100, '%')

3 folds accuracy: [82.34 83.11 87.17]
mean accuracy of 3 folds: 84.20424236954955 %


#### SVR Grid Search

In [None]:
from sklearn.model_selection import GridSearchCV

# Search space: 4 values of C x 4 values of gamma x 3 kernels = 48 candidates.
param_grid = {
    'C': [0.1, 1, 10, 100],
    'gamma': [1, 0.1, 0.01, 0.001],
    'kernel': ['rbf', 'poly', 'sigmoid'],
}

# 3-fold CV per candidate (144 fits) spread over all cores. refit=True retrains
# the best candidate on the whole training split so `grid` can predict directly.
# NOTE: this is slow; the recorded log shows single fits taking from ~30 s
# up to more than 20 minutes.
grid = GridSearchCV(
    estimator=regressor,
    param_grid=param_grid,
    cv=3,
    refit=True,
    n_jobs=-1,
    verbose=2,
)
grid.fit(X_train, y_train)

Fitting 3 folds for each of 48 candidates, totalling 144 fits
[CV] END .................C=0.1, gamma=0.001, kernel=sigmoid; total time=  39.2s
[CV] END ...................C=10, gamma=0.01, kernel=sigmoid; total time=  47.3s
[CV] END .........................C=1, gamma=0.1, kernel=rbf; total time=  36.3s
[CV] END ....................C=10, gamma=0.1, kernel=sigmoid; total time=  49.5s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=  39.5s
[CV] END ...................C=10, gamma=0.01, kernel=sigmoid; total time=  48.5s
[CV] END ...........................C=1, gamma=1, kernel=rbf; total time= 1.5min
[CV] END .......................C=0.1, gamma=0.1, kernel=rbf; total time=  35.6s
[CV] END ......................C=10, gamma=1, kernel=sigmoid; total time=  54.8s
[CV] END ...........................C=1, gamma=1, kernel=rbf; total time= 1.5min
[CV] END ......................C=0.1, gamma=0.1, kernel=poly; total time=  34.1s
[CV] END ......................C=10, gamma=1, k

[CV] END ....................C=0.1, gamma=0.001, kernel=poly; total time=  32.3s
[CV] END ...................C=1, gamma=0.001, kernel=sigmoid; total time=  37.1s
[CV] END ......................C=100, gamma=0.01, kernel=rbf; total time= 1.6min
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=  36.3s
[CV] END .......................C=10, gamma=0.1, kernel=poly; total time= 4.8min
[CV] END .......................C=0.1, gamma=0.1, kernel=rbf; total time=  36.6s
[CV] END .......................C=10, gamma=0.1, kernel=poly; total time= 4.9min
[CV] END .................C=0.1, gamma=0.001, kernel=sigmoid; total time=  37.1s
[CV] END .......................C=10, gamma=0.1, kernel=poly; total time= 5.0min
[CV] END .....................C=1, gamma=0.1, kernel=sigmoid; total time= 1.0min
[CV] END .......................C=100, gamma=0.1, kernel=rbf; total time=23.2min
[CV] END .......................C=0.1, gamma=0.1, kernel=rbf; total time=  32.2s
[CV] END ...................

#### Predicting Results

In [None]:
# Predict the target for the held-out rows with the refitted best estimator
# found by the grid search.
y_pred = grid.predict(X_test)
# Two decimals are enough for eyeballing. This changes NumPy's print precision
# for the rest of the session, not only for this cell.
np.set_printoptions(precision=2)
# Side-by-side view: column 0 = predicted value, column 1 = actual value.
print(np.column_stack((y_pred, y_test)))

#### Evaluating Model

In [None]:
from sklearn.metrics import r2_score

# Error metrics for the grid-search predictions on the held-out split;
# the MSE is computed once and reused for the RMSE.
mae = metrics.mean_absolute_error(y_test, y_pred)
mse = metrics.mean_squared_error(y_test, y_pred)
print('MAE:', mae)
print('MSE:', mse)
print('RMSE:', np.sqrt(mse))
print("R squared value: ", r2_score(y_test, y_pred))

#### Plotting the results

In [None]:
import seaborn as sns

# Predicted vs. actual values as a scatter with a fitted regression line
# (ci=None: no confidence band is drawn).
fig, ax = plt.subplots(figsize=(7, 5))
sns.regplot(x=y_test, y=y_pred, ci=None, color='red', ax=ax)
ax.set_title('Support Vector Regression with Grid Search')
ax.set_xlabel('Actual Values')
ax.set_ylabel('Predicted Values')