In [1]:
#Import libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy as sp
from math import sqrt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import datetime

In [2]:
# Split the full dataset into train/test partitions (30% held out, fixed seed)
# and persist each partition to its own CSV file.

data = pd.read_csv('data.csv')
train, test = train_test_split(data, random_state=42, test_size=0.3)
for partition, csv_path in ((train, 'train.csv'), (test, 'test.csv')):
    # index=False keeps the shuffled row index out of the written files
    partition.to_csv(csv_path, index=False)


In [3]:
# Load the persisted partitions and pull out model inputs / targets

FEATURE_COLUMNS = ['PM2.5_LCS', 'RH', 'Temp']
TARGET_COLUMN = 'PM2.5_Ref'

# Training partition: feature matrix and target vector as NumPy arrays
data_train = pd.read_csv('train.csv')
X_train, Y_train = data_train[FEATURE_COLUMNS].values, data_train[TARGET_COLUMN].values

# Test partition, extracted the same way
data_test = pd.read_csv('test.csv')
X_test, Y_test = data_test[FEATURE_COLUMNS].values, data_test[TARGET_COLUMN].values

# Separate frame used by the model cells when exporting their predictions
data_new = pd.read_csv('data_new.csv')

# Run Different Models

# LINEAR REGRESSION

In [4]:
from sklearn.linear_model import LinearRegression

# Fit a linear regression on the training features, then report the model's
# score on both partitions and R2 / RMSE / MAE on the test partition.
LR = LinearRegression()
LR.fit(X_train, Y_train.ravel())
print('LR Train score: ', LR.score(X_train,Y_train))
print('LR Test score: ', LR.score(X_test,Y_test))
y_pred_LR = LR.predict(X_test)

print('LR R2: ', r2_score(Y_test, y_pred_LR))
print('LR RMSE: ', sqrt(mean_squared_error(Y_test, y_pred_LR)))
print('LR MAE: ', mean_absolute_error(Y_test, y_pred_LR))

# Export the predictions alongside data_new WITHOUT mutating the shared frame:
# several cells write a 'pred' column, so assigning in place would leave
# data_new holding whichever model happened to run last.
# NOTE(review): the predictions come from X_test, so this assumes data_new.csv
# has the same rows, in the same order, as test.csv -- confirm.
data_new.assign(pred=y_pred_LR).to_csv('LR.csv')

LR Train score:  0.5521730624305887
LR Test score:  0.5572842746432795
LR R2:  0.5572842746432795
LR RMSE:  15.731027924690409
LR MAE:  12.090095588182086


# SUPPORT VECTOR REGRESSION

In [7]:
from sklearn.svm import SVR

# Fit a support vector regressor with a linear kernel, then report the model's
# score on both partitions and R2 / RMSE / MAE on the test partition.
svr_lin = SVR(kernel = 'linear')
svr_lin.fit(X_train, Y_train.ravel())
print('SVR Train score: ', svr_lin.score(X_train,Y_train))
print('SVR Test score: ', svr_lin.score(X_test,Y_test))
y_pred_SVR_Line = svr_lin.predict(X_test)

print('SVR R2: ', r2_score(Y_test, y_pred_SVR_Line))
print('SVR RMSE: ', sqrt(mean_squared_error(Y_test, y_pred_SVR_Line)))
print('SVR MAE: ', mean_absolute_error(Y_test, y_pred_SVR_Line))

# Export the predictions alongside data_new WITHOUT mutating the shared frame:
# several cells write a 'pred' column, so assigning in place would leave
# data_new holding whichever model happened to run last.
# NOTE(review): the predictions come from X_test, so this assumes data_new.csv
# has the same rows, in the same order, as test.csv -- confirm.
data_new.assign(pred=y_pred_SVR_Line).to_csv('SVR_Line.csv')

SVR Train score:  0.5419372886908592
SVR Test score:  0.5462972823397332
SVR R2:  0.5462972823397332
SVR RMSE:  15.925032161372261
SVR MAE:  11.903928633479136


# DECISION TREE REGRESSION

In [8]:
from sklearn.tree import DecisionTreeRegressor

# Fit a decision tree regressor, then report the model's score on both
# partitions and R2 / RMSE / MAE on the test partition.
# random_state pins the estimator's internal randomness so a re-run reproduces
# the same metrics; 42 matches the seed used for the train/test split.
# Metrics recorded from earlier unseeded runs may differ from a seeded re-run.
DT = DecisionTreeRegressor(random_state=42)
DT.fit(X_train, Y_train.ravel())
print('Decision Tree Train score: ', DT.score(X_train,Y_train))
print('Decision Tree Test score: ', DT.score(X_test,Y_test))
y_pred_DT = DT.predict(X_test)

print('DT R2: ', r2_score(Y_test, y_pred_DT))
print('DT RMSE: ', sqrt(mean_squared_error(Y_test, y_pred_DT)))
print('DT MAE: ', mean_absolute_error(Y_test, y_pred_DT))

# Export the predictions alongside data_new WITHOUT mutating the shared frame:
# several cells write a 'pred' column, so assigning in place would leave
# data_new holding whichever model happened to run last.
# NOTE(review): the predictions come from X_test, so this assumes data_new.csv
# has the same rows, in the same order, as test.csv -- confirm.
data_new.assign(pred=y_pred_DT).to_csv('DT.csv')

Decision Tree Train score:  1.0
Decision Tree Test score:  0.2533134584672483
DT R2:  0.2533134584672483
DT RMSE:  20.429784685923565
DT MAE:  14.832652084942202


# RANDOM FOREST

In [9]:
from sklearn.ensemble import RandomForestRegressor

# Fit a random forest regressor, then report the model's score on both
# partitions and R2 / RMSE / MAE on the test partition.
# random_state pins the estimator's internal randomness so a re-run reproduces
# the same metrics; 42 matches the seed used for the train/test split.
# Metrics recorded from earlier unseeded runs may differ from a seeded re-run.
RF = RandomForestRegressor(random_state=42)
RF.fit(X_train, Y_train.ravel())
print('RF Train score: ', RF.score(X_train,Y_train))
print('RF Test score: ', RF.score(X_test,Y_test))
y_pred_RF = RF.predict(X_test)

print('RF R2: ', r2_score(Y_test, y_pred_RF))
print('RF RMSE: ', sqrt(mean_squared_error(Y_test, y_pred_RF)))
print('RF MAE: ', mean_absolute_error(Y_test, y_pred_RF))

# Export the predictions alongside data_new WITHOUT mutating the shared frame:
# several cells write a 'pred' column, so assigning in place would leave
# data_new holding whichever model happened to run last.
# NOTE(review): the predictions come from X_test, so this assumes data_new.csv
# has the same rows, in the same order, as test.csv -- confirm.
data_new.assign(pred=y_pred_RF).to_csv('RF.csv')

RF Train score:  0.9418668236437204
RF Test score:  0.5998666340528541
RF R2:  0.5998666340528541
RF RMSE:  14.955364813825014
RF MAE:  11.186482409584794


# GRADIENT BOOSTING

In [10]:
from sklearn.ensemble import GradientBoostingRegressor

# Fit a gradient boosting regressor, then report the model's score on both
# partitions and R2 / RMSE / MAE on the test partition.
# random_state pins the estimator's internal randomness so a re-run reproduces
# the same metrics; 42 matches the seed used for the train/test split.
# Metrics recorded from earlier unseeded runs may differ from a seeded re-run.
GB = GradientBoostingRegressor(random_state=42)
GB.fit(X_train, Y_train.ravel())
print('GB Train score: ', GB.score(X_train,Y_train))
print('GB Test score: ', GB.score(X_test,Y_test))
y_pred_GB = GB.predict(X_test)

print('GB R2: ', r2_score(Y_test, y_pred_GB))
print('GB RMSE: ', sqrt(mean_squared_error(Y_test, y_pred_GB)))
print('GB MAE: ', mean_absolute_error(Y_test, y_pred_GB))

# Export the predictions alongside data_new WITHOUT mutating the shared frame:
# several cells write a 'pred' column, so assigning in place would leave
# data_new holding whichever model happened to run last.
# NOTE(review): the predictions come from X_test, so this assumes data_new.csv
# has the same rows, in the same order, as test.csv -- confirm.
data_new.assign(pred=y_pred_GB).to_csv('GB.csv')

GB Train score:  0.6382377271617767
GB Test score:  0.6068112138631943
GB R2:  0.6068112138631943
GB RMSE:  14.825016633516261
GB MAE:  11.281554312940766


# K - NEAREST NEIGHBOR

In [11]:
from sklearn.neighbors import KNeighborsRegressor

# Fit a k-nearest-neighbours regressor with the library's default settings,
# then report the model's score on both partitions and R2 / RMSE / MAE on the
# test partition.
kN = KNeighborsRegressor()
kN.fit(X_train, Y_train.ravel())
print('kNN Train score: ', kN.score(X_train,Y_train))
print('kNN Test score: ', kN.score(X_test,Y_test))
y_pred_kN = kN.predict(X_test)

print('KNN R2: ', r2_score(Y_test, y_pred_kN))
print('KNN RMSE: ', sqrt(mean_squared_error(Y_test, y_pred_kN)))
print('KNN MAE: ', mean_absolute_error(Y_test, y_pred_kN))

# Export the predictions alongside data_new WITHOUT mutating the shared frame:
# several cells write a 'pred' column, so assigning in place would leave
# data_new holding whichever model happened to run last.
# NOTE(review): the predictions come from X_test, so this assumes data_new.csv
# has the same rows, in the same order, as test.csv -- confirm.
data_new.assign(pred=y_pred_kN).to_csv('kN.csv')

kNN Train score:  0.7026344022924671
kNN Test score:  0.5654470922932394
KNN R2:  0.5654470922932394
KNN RMSE:  15.58532840356
KNN MAE:  11.757401293192398
