In [41]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.tree import DecisionTreeRegressor
import math
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

import pickle 
# For serializing and de-serializing a python object structure. Any object in python can be pickled so that it can
# be saved on disk. Pickling is a way to convert a python object ( list, dict, etc. ) into a character stream which
# contains all the information necessary to reconstruct the object in another python script. 

ds1 = pd.read_csv( "gt_2011.csv" )
ds2 = pd.read_csv( "gt_2012.csv" )
ds3 = pd.read_csv( "gt_2013.csv" )
ds4 = pd.read_csv( "gt_2014.csv" )
ds5 = pd.read_csv( "gt_2015.csv" )

# Concatenate all dataframes
frames = [ ds1, ds2, ds3, ds4, ds5 ]
merged_ds = pd.concat( frames )

# Saving merged ds into a separate .csv file for ease
pd.DataFrame.to_csv( merged_ds, "CombinedDataSet.csv", index=False )

# Split the data into training ( 80 % ) and test set ( 20 % )
x = merged_ds.iloc[ :, [ 0, 1, 2, 3, 4, 5, 6, 8 ] ].values
y = merged_ds.iloc[ :, [ 7, 9, 10 ] ].values

x_train, x_test, y_train, y_test = train_test_split( x, y, test_size=0.2 )
merged_ds.head( )

Unnamed: 0,AT,AP,AH,AFDP,GTEP,TIT,TAT,TEY,CDP,CO,NOX
0,4.5878,1018.7,83.675,3.5758,23.979,1086.2,549.83,134.67,11.898,0.32663,81.952
1,4.2932,1018.3,84.235,3.5709,23.951,1086.1,550.05,134.67,11.892,0.44784,82.377
2,3.9045,1018.4,84.858,3.5828,23.99,1086.5,550.19,135.1,12.042,0.45144,83.776
3,3.7436,1018.3,85.434,3.5808,23.911,1086.5,550.17,135.03,11.99,0.23107,82.505
4,3.7516,1017.8,85.182,3.5781,23.917,1085.9,550.0,134.67,11.91,0.26747,82.028


In [42]:
poly2_transformer = PolynomialFeatures(degree=2, include_bias=False).fit(x_train)
poly3_transformer = PolynomialFeatures(degree=3, include_bias=False).fit(x_train)
poly5_transformer = PolynomialFeatures(degree=5, include_bias=False).fit(x_train)
x_train_transformed_poly2 = poly2_transformer.transform(x_train)
x_train_transformed_poly3 = poly3_transformer.transform(x_train)
x_train_transformed_poly5 = poly5_transformer.transform(x_train)

x_test_transformed_poly2 = poly2_transformer.transform(x_test)
x_test_transformed_poly3 = poly3_transformer.transform(x_test)
x_test_transformed_poly5 = poly5_transformer.transform(x_test)

LR2 = LinearRegression()
LR3 = LinearRegression()
LR5 = LinearRegression()

poly2_reg_NOX = LR2.fit(x_train_transformed_poly2, y_train[:,2])
poly3_reg_NOX = LR3.fit(x_train_transformed_poly3, y_train[:,2])
poly5_reg_NOX = LR5.fit(x_train_transformed_poly5, y_train[:,2])

poly2_pred_NOX = poly2_reg_NOX.predict(x_test_transformed_poly2)
poly3_pred_NOX = poly3_reg_NOX.predict(x_test_transformed_poly3)
poly5_pred_NOX = poly5_reg_NOX.predict(x_test_transformed_poly5)

r_sq_poly2 = r2_score(poly2_pred_NOX, y_test[:,2])
r_sq_poly3 = r2_score(poly3_pred_NOX, y_test[:,2])
r_sq_poly5 = r2_score(poly5_pred_NOX, y_test[:,2])

rsme_poly2 = math.sqrt(mean_squared_error(y_test[:,2], poly2_pred_NOX))
rsme_poly3 = math.sqrt(mean_squared_error(y_test[:,2], poly3_pred_NOX))
rsme_poly5 = math.sqrt(mean_squared_error(y_test[:,2], poly5_pred_NOX))

print("R^2 for poly2: {:.4f}".format(r_sq_poly2))
print("R^2 for poly3: {:.4f}".format(r_sq_poly3))
print("R^2 for poly5: {:.4f}".format(r_sq_poly5))

print("RMSE for poly2: {:.4f}".format(rsme_poly2))
print("RMSE for poly3: {:.4f}".format(rsme_poly3))
print("RMSE for poly5: {:.4f}".format(rsme_poly5))



R^2 for poly2: 0.6059
R^2 for poly3: 0.7279
R^2 for poly5: 0.6851
RMSE for poly2: 6.1705
RMSE for poly3: 5.3860
RMSE for poly5: 6.6280


In [44]:
model_tree = DecisionTreeRegressor(random_state=32).fit(x_train, y_train[:, 2])

NOX_pred_train = model_tree.predict(x_train)
NOX_pred_test = model_tree.predict(x_test)

r_sq_train = r2_score(y_train[:,2], NOX_pred_train)
rmse_train = math.sqrt(mean_squared_error(y_train[:,2], NOX_pred_train))
r_sq_test = r2_score(y_test[:,2], NOX_pred_test)
rmse_test = math.sqrt(mean_squared_error(y_test[:,2], NOX_pred_test))

print(r_sq_train)
print(rmse_train)
print(r_sq_test)
print(rmse_test)

1.0
9.268404274438724e-17
0.7122619500591143
6.244192519155599
