In [None]:
#Data analysis libraries
import numpy as np
import pandas as pd
# libraries for models
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from pandas.core.common import random_state
from sklearn.model_selection import train_test_split
#preprocess libraries
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
#visulization libraries
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.offline as py
from plotly.offline import iplot, init_notebook_mode
from plotly.graph_objs import  Scatter
py.init_notebook_mode(connected=True)
from matplotlib import legend
import random
%matplotlib inline
#General modules
import numpy as np
import pandas as pd
#For model evaluation
from sklearn import metrics
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import cross_val_predict
#For q-q plot
import scipy.stats as stats
!pip install keras
!pip install scikeras
!pip install keras-tuner --upgrade
!pip install tensorflow --upgrade
import tensorflow as tf
from tensorflow import keras
import keras_tuner as kt
from tensorflow.keras import optimizers
# metrics evaluation libraries
from sklearn.metrics import accuracy_score
from sklearn.model_selection import  RandomizedSearchCV
#neural
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam, RMSprop
from keras.models import Sequential

In [None]:
# Location of the raw air-quality CSV (absolute path; adjust when running elsewhere).
DATA_PATH = '/content/AirQualityUCI.csv'
data = pd.read_csv(DATA_PATH)

In [None]:
# Preview the first five rows of the freshly loaded frame.
data.head(n=5)

In [None]:
# Summary statistics for the frame's columns (pandas defaults).
data.describe()

In [None]:
# Drop rows in which every column is NaN; rows with at least one value are kept.
# The frame is modified in place.
data.dropna(how='all',inplace=True)

In [None]:
# Count the missing values in each column.
data.isna().sum()

In [None]:
# Number of fully duplicated rows. A single count is shown rather than the
# per-row boolean Series, which is too long to read and mirrors how the
# missing-value check above is summarised.
data.duplicated().sum()

In [None]:
# (rows, columns) of the frame after the all-NaN rows were dropped.
data.shape

In [None]:
# Column dtypes and non-null counts.
data.info()

In [None]:
# Remove the two 'Unnamed' columns if they exist; missing labels are ignored.
unnamed_columns = ['Unnamed: 15', 'Unnamed: 16']
data = data.drop(columns=unnamed_columns, errors='ignore')

In [None]:
# Percentage of NaN entries per column: NaN count divided by the total number
# of rows, times 100, listed from the most to the least incomplete column.
columns = data.columns
percent_NaN = (data[columns].isna().sum() / data.shape[0] * 100).tolist()
nan_percent_df = (
    pd.DataFrame({'%_NaN_in_Column': percent_NaN}, index=columns)
    .sort_values('%_NaN_in_Column', ascending=False)
)
nan_percent_df

# Cleaning Up Time Features

In [None]:
import pandas as pd
import datetime

# Derive MONTH from the Date column (parsed as day-month-year).
parsed_dates = pd.to_datetime(data['Date'], format='%d-%m-%Y')
data['MONTH'] = parsed_dates.dt.month

# Derive HOUR as an integer from the leading field of the colon-separated Time string.
data['HOUR'] = data['Time'].map(lambda stamp: int(stamp.split(':')[0]))

# The raw Date/Time columns are no longer needed once the features exist.
# NOTE: this cell is not re-runnable on its own, because it removes the
# columns it reads from.
data.drop(columns=['Date', 'Time'], inplace=True)
data.head(10)


In [None]:
# Preview the first five rows after the time-feature step.
data.head(n=5)

# Removing Outliers From Dataframe

In [None]:
from scipy.stats import norm


def _box_and_dist(frame, column):
    """Draw a box plot (left) and a distribution plot with a fitted normal
    curve (right) for one column of ``frame``, then show the figure."""
    plt.figure(figsize=(15, 6))

    # Left panel: box plot.
    plt.subplot(1, 2, 1)
    box_ax = sns.boxplot(data=frame[column], color='pink')
    box_ax.set_title(f'{column}')
    box_ax.set_ylabel(column)

    # Right panel: distribution with normal fit; the title reports the skewness.
    plt.subplot(1, 2, 2)
    dist_ax = sns.distplot(frame[column], fit=norm, color='pink')
    dist_ax.set_title(f'skewness of {column} : {frame[column].skew()}')
    dist_ax.set_xlabel(column)

    # Text separator between consecutive figures in the cell output.
    print('__' * 50)
    plt.show()


# One figure per column of the frame.
for var in data.columns:
    _box_and_dist(data, var)

In [None]:
# Remove outliers with the interquartile range (IQR) method.
Q1 = data.quantile(0.25, numeric_only=True)  # First quartile (25%)
Q3 = data.quantile(0.75, numeric_only=True)  # Third quartile (75%)
IQR = Q3 - Q1  # InterQuartile Range

# Width of the acceptance band in IQR units; lower values flag more rows.
scale = 1.4
lower_lim = Q1 - scale*IQR
upper_lim = Q3 + scale*IQR

# Columns that must not take part in outlier screening. The time features are
# created as upper-case 'HOUR' / 'MONTH', so the comparison is case-insensitive;
# the previous 'Hour' / 'Month' spelling never matched and those columns were
# screened by mistake.
excluded = {"HOUR", "MONTH", "RH"}
# Iterate over the quantile index (numeric columns only) so that selecting the
# limits below cannot fail on a non-numeric column.
cols = [col for col in Q1.index if str(col).upper() not in excluded]

# Align lower_lim and upper_lim with the screened columns.
lower_lim = lower_lim[cols]
upper_lim = upper_lim[cols]

# True for rows whose screened values all lie inside [lower_lim, upper_lim].
condition = ~((data[cols] < lower_lim) | (data[cols] > upper_lim)).any(axis=1)

# New DataFrame with the outlier rows removed; `data` itself is left untouched.
# NOTE(review): apart from the diagnostic plots in the next cell, the later
# cells visible in this notebook keep using `data`, not `data_filtered` —
# confirm whether modelling was meant to use the filtered frame.
data_filtered = data[condition]

In [None]:
# Same box / distribution diagnostics as before, now on the outlier-filtered frame.
for var in data_filtered.columns:
    series = data_filtered[var]
    fig, (box_ax, dist_ax) = plt.subplots(1, 2, figsize=(15, 6))

    # Left panel: box plot.
    sns.boxplot(data=series, color='pink', ax=box_ax)
    box_ax.set_title(f'{var}')
    box_ax.set_ylabel(var)

    # Right panel: distribution with normal fit; the title reports the skewness.
    sns.distplot(series, fit=norm, color='pink', ax=dist_ax)
    dist_ax.set_title(f'skewness of {var} : {series.skew()}')
    dist_ax.set_xlabel(var)

    # Text separator between consecutive figures in the cell output.
    print('__' * 50)
    plt.show()

In [None]:
# Scatter plots of the first eight columns against RH, laid out on a 3x4 grid.
cols = data.columns.tolist()
plt.figure(figsize=(20, 20))
for idx in range(8):
    feature = cols[idx]
    plt.subplot(3, 4, idx + 1)
    sns.scatterplot(data=data, x=feature, y=data['RH'], palette="husl")

In [None]:
# Pairwise relationships between all columns of the air-quality frame.
# No `hue` is passed: RH is a continuous regression target, and using it as a
# hue would be treated as one category per distinct value (slow and unreadable).
pairplot1 = sns.pairplot(data)
# Title corrected: the previous "Water Potability" text was from another dataset.
pairplot1.fig.suptitle("Air Quality Pairwise Plots", fontsize=26, y=1.01);

In [None]:
# Pairwise correlation between columns, also kept as a plain NumPy array.
correlation = data.corr()
matrix_cols = list(correlation.columns)
corr_array = correlation.to_numpy(copy=True)
# Display the raw matrix with positional (integer) labels.
pd.DataFrame(corr_array)

In [None]:
# Heatmap of the correlation matrix with the coefficient printed in each cell.
fig, ax = plt.subplots(figsize=(15, 5))
sns.heatmap(correlation, annot=True, ax=ax)
plt.show()

# Preparing the data

In [None]:
# Target is RH; every other column is a feature.
y = data['RH']
feature_frame = data.drop(columns='RH')

# Standardise the features.
# NOTE(review): the scaler is fitted on the full dataset before the train/test
# split in the next cell, so test-set statistics influence the scaling —
# consider fitting on the training portion only.
scalar = StandardScaler()
x = scalar.fit(feature_frame).transform(feature_frame)
x.shape, y.shape

In [None]:
# Hold out 40% of the rows for testing; the fixed seed keeps the split repeatable.
split_parts = train_test_split(x, y, test_size=0.4, random_state=18)
x_train, x_test, y_train, y_test = split_parts
x_train.shape, y_train.shape, x_test.shape, y_test.shape

In [None]:
# Report the shapes of the training and test feature matrices.
print(f'Training Data Size: {x_train.shape}')
print(f'Test Data Size: {x_test.shape}')

# Build and Train Model Function

# Linear Regression

In [None]:
# Fit ordinary least squares on the training split and score it on the test split.
model = LinearRegression().fit(x_train, y_train)
prediction = model.predict(x_test)

# Error metrics for linear regression (suffix _L); RMSE is the square root of MSE.
MAE_L = metrics.mean_absolute_error(y_test, prediction)
MSE_L = metrics.mean_squared_error(y_test, prediction)
RMSE_L = np.sqrt(MSE_L)
for label, value in (('MAE:', MAE_L), ('MSE:', MSE_L), ('RMSE:', RMSE_L)):
    print(label, value)

In [None]:
# Actual vs predicted values for the linear regression model.
fig = plt.figure(figsize=(14, 6))
# Only the left cell of the 1x2 grid is populated.
ax = fig.add_subplot(1, 2, 1)
ax.scatter(y_test, prediction, alpha=0.7, color='blue', edgecolors='k')
ax.set_xlabel('Actual Values', fontsize=12)
ax.set_ylabel('Predicted Values', fontsize=12)
ax.set_title('Actual vs Predicted Values', fontsize=14)

# Dashed diagonal marks perfect predictions; both axes share the range of y_test.
low, high = y_test.min(), y_test.max()
ax.plot([low, high], [low, high], 'r--', linewidth=2)
ax.grid(True)
ax.set_xlim(low - 1, high + 1)
ax.set_ylim(low - 1, high + 1)

# Annotate metrics in the upper-left corner of the axes.
ax.annotate(f'MSE: {MSE_L:.2f}\nMAE: {MAE_L:.2f}\nRMSE: {RMSE_L:.2f}',
            xy=(0.05, 0.95), xycoords='axes fraction',
            fontsize=12, bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5))
fig.tight_layout()
plt.show()

# Support Vector Machine

In [None]:
from sklearn.svm import SVR
from sklearn import metrics

# Fit a support vector regressor with default settings and score it on the test split.
model = SVR().fit(x_train, y_train)
prediction = model.predict(x_test)

# Error metrics for SVR (suffix _S); RMSE is the square root of MSE.
MAE_S = metrics.mean_absolute_error(y_test, prediction)
MSE_S = metrics.mean_squared_error(y_test, prediction)
RMSE_S = np.sqrt(MSE_S)
for label, value in (('MAE:', MAE_S), ('MSE:', MSE_S), ('RMSE:', RMSE_S)):
    print(label, value)

In [None]:
# Actual vs predicted values for the support vector regressor.
fig = plt.figure(figsize=(14, 6))

# Only the left cell of the 1x2 grid is populated.
ax = fig.add_subplot(1, 2, 1)
ax.scatter(y_test, prediction, alpha=0.7, color='pink', edgecolors='k')
ax.set_xlabel('Actual Values', fontsize=12)
ax.set_ylabel('Predicted Values', fontsize=12)
ax.set_title('Actual vs Predicted Values', fontsize=14)

# Dashed diagonal marks perfect predictions; both axes share the range of y_test.
low, high = y_test.min(), y_test.max()
ax.plot([low, high], [low, high], 'r--', linewidth=2)
ax.grid(True)
ax.set_xlim(low - 1, high + 1)
ax.set_ylim(low - 1, high + 1)

# Annotate metrics in the upper-left corner of the axes.
ax.annotate(f'MSE: {MSE_S:.2f}\nMAE: {MAE_S:.2f}\nRMSE: {RMSE_S:.2f}',
            xy=(0.05, 0.95), xycoords='axes fraction',
            fontsize=12, bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5))
fig.tight_layout()
plt.show()

# Random Forest Regression

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Fit a random forest and score it on the test split.
# random_state is fixed (same seed as the train/test split) so that the metrics
# below are reproducible across runs; without it every run trains a different forest.
model = RandomForestRegressor(random_state=18)
model.fit(x_train, y_train)
prediction = model.predict(x_test)

# Error metrics for the random forest (suffix _R).
MAE_R = metrics.mean_absolute_error(y_test, prediction)
print('MAE:', MAE_R)
MSE_R = metrics.mean_squared_error(y_test, prediction)
print('MSE:', MSE_R)
RMSE_R = np.sqrt(MSE_R)  # RMSE is the square root of MSE
print('RMSE:', RMSE_R)

In [None]:
# Actual vs predicted values for the random forest model.
fig = plt.figure(figsize=(14, 6))
# Only the left cell of the 1x2 grid is populated.
ax = fig.add_subplot(1, 2, 1)
ax.scatter(y_test, prediction, alpha=0.7, color='magenta', edgecolors='k')
ax.set_xlabel('Actual Values', fontsize=12)
ax.set_ylabel('Predicted Values', fontsize=12)
ax.set_title('Actual vs Predicted Values', fontsize=14)

# Dashed diagonal marks perfect predictions; both axes share the range of y_test.
low, high = y_test.min(), y_test.max()
ax.plot([low, high], [low, high], 'r--', linewidth=2)
ax.grid(True)
ax.set_xlim(low - 1, high + 1)
ax.set_ylim(low - 1, high + 1)

# Annotate metrics in the upper-left corner of the axes.
ax.annotate(f'MSE: {MSE_R:.2f}\nMAE: {MAE_R:.2f}\nRMSE: {RMSE_R:.2f}',
            xy=(0.05, 0.95), xycoords='axes fraction',
            fontsize=12, bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5))
fig.tight_layout()
plt.show()

# Decision Tree Regression

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Fit a single decision tree and score it on the test split.
# random_state is fixed (same seed as the train/test split) so the fitted tree,
# and therefore the metrics below, are reproducible across runs.
model = DecisionTreeRegressor(random_state=18)
model.fit(x_train, y_train)
prediction = model.predict(x_test)

# Error metrics for the decision tree (suffix _D).
MAE_D = metrics.mean_absolute_error(y_test, prediction)
print('MAE:', MAE_D)
MSE_D = metrics.mean_squared_error(y_test, prediction)
print('MSE:', MSE_D)
RMSE_D = np.sqrt(MSE_D)  # RMSE is the square root of MSE
print('RMSE:', RMSE_D)

In [None]:
# Actual vs predicted values for the decision tree model.
fig = plt.figure(figsize=(14, 6))
# Only the left cell of the 1x2 grid is populated.
ax = fig.add_subplot(1, 2, 1)
ax.scatter(y_test, prediction, alpha=0.7, color='orange', edgecolors='k')
ax.set_xlabel('Actual Values', fontsize=12)
ax.set_ylabel('Predicted Values', fontsize=12)
ax.set_title('Actual vs Predicted Values', fontsize=14)

# Dashed diagonal marks perfect predictions; both axes share the range of y_test.
low, high = y_test.min(), y_test.max()
ax.plot([low, high], [low, high], 'r--', linewidth=2)
ax.grid(True)
ax.set_xlim(low - 1, high + 1)
ax.set_ylim(low - 1, high + 1)

# Annotate metrics in the upper-left corner of the axes.
ax.annotate(f'MSE: {MSE_D:.2f}\nMAE: {MAE_D:.2f}\nRMSE: {RMSE_D:.2f}',
            xy=(0.05, 0.95), xycoords='axes fraction',
            fontsize=12, bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5))
fig.tight_layout()
plt.show()


Evaluation Metrics

Three common evaluation metrics for regression problems:

    Mean Absolute Error (MAE) is the mean of the absolute value of the errors:
    $$\frac{1}{n}\sum_{i=1}^{n}\left|y_i-\hat{y}_i\right|$$

    Mean Squared Error (MSE) is the mean of the squared errors. MSE "punishes" larger errors, which tends to be useful in the real world:
    $$\frac{1}{n}\sum_{i=1}^{n}\left(y_i-\hat{y}_i\right)^2$$

    Root Mean Squared Error (RMSE) is the square root of the mean of the squared errors. RMSE is interpretable in the "y" units:
    $$\sqrt{\frac{1}{n}\sum_{i=1}^{n}\left(y_i-\hat{y}_i\right)^2}$$



# Neural Network

In [None]:
# Define, train and evaluate a small feed-forward regression network.
from keras.callbacks import ModelCheckpoint, EarlyStopping
from keras.callbacks import ReduceLROnPlateau

# Seed Python, NumPy and TensorFlow so weight initialisation, dropout and batch
# shuffling (and therefore the reported metrics) are repeatable across runs.
tf.keras.utils.set_random_seed(18)

# Two hidden ReLU layers with dropout, followed by a single linear output unit.
# The input width is declared with an explicit Input layer instead of the
# `input_dim` argument on the first Dense layer.
model = Sequential()
model.add(keras.Input(shape=(x_train.shape[1],)))
model.add(Dense(64, activation = 'relu'))
model.add(Dropout(0.3))
model.add(Dense(32, activation = 'relu'))
model.add(Dropout(0.3))
model.add(Dense(1, activation = 'linear'))  # Linear activation for regression

# Compile with MSE as the loss and MAE as an additional reported metric.
model.compile(optimizer = Adam(learning_rate=0.001), loss = 'mean_squared_error', metrics = ['mae'])

# Stop when validation loss has not improved for 10 epochs (restoring the best
# weights), and halve the learning rate after 5 stagnant epochs, down to 1e-6.
early_stopping = EarlyStopping(monitor = 'val_loss', patience = 10, restore_best_weights = True)
reduce_lr = ReduceLROnPlateau(monitor = 'val_loss', factor = 0.5, patience = 5, min_lr = 1e-6)

# Train the model; 20% of the training split is held out for validation.
history = model.fit(x_train, y_train, epochs = 100, batch_size = 32,
                    validation_split = 0.2, callbacks=[early_stopping, reduce_lr])

# Evaluate on the test split: `loss` is the MSE, `mae` the compiled metric.
loss, mae = model.evaluate(x_test, y_test)
rmse = np.sqrt(loss)  # RMSE derived from the MSE loss
print(f'Test Loss (MSE): {loss}')
print(f'Test Mean Absolute Error (MAE): {mae}')
print(f'Test Root Mean Squared Error (RMSE): {rmse}')

# Make predictions
predictions = model.predict(x_test)

# Visualization of Actual vs Predicted values
plt.figure(figsize = (12, 6))

# Plot 1: Actual vs Predicted, with a dashed diagonal marking perfect predictions
plt.subplot(1, 2, 1)
plt.scatter(y_test, predictions)
plt.xlabel('Actual RH')
plt.ylabel('Predicted RH')
plt.title('Actual vs Predicted RH')
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--')  # Diagonal line

# Plot 2: Training history (loss per epoch on the training and validation data)
plt.subplot(1, 2, 2)
plt.plot(history.history['loss'], label='Train Loss')
plt.plot(history.history['val_loss'], label='Validation Loss')
plt.title('Training and Validation Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss (MSE)')
plt.legend()

plt.tight_layout()
plt.show()


# Model Evaluation

In [None]:
# Collect the test-set metrics of every model trained above.
rsl = {'MAE': MAE_L, 'MSE': MSE_L, 'RMSE': RMSE_L}  # Linear Regression
rsd = {'MAE': MAE_D, 'MSE': MSE_D, 'RMSE': RMSE_D}  # Decision Tree Regression
rsr = {'MAE': MAE_R, 'MSE': MSE_R, 'RMSE': RMSE_R}  # Random Forest Regression
rss = {'MAE': MAE_S, 'MSE': MSE_S, 'RMSE': RMSE_S}  # Support Vector Machine Regression
rsn = {'MAE': mae, 'MSE': loss, 'RMSE': rmse}  # Neural Network

# Model name -> {metric name -> value}
results_dict = {
    "Linear Regression": rsl,
    "Decision Tree Regression": rsd,
    "Random Forest Regression": rsr,
    "Support Vector Machine Regression": rss,
    "Neural Network" : rsn
}

# For each metric, record the lowest value and every model that attains it.
# The value is stored per metric: previously one shared `best_value` variable
# was reused when printing, so the MAE and MSE lines showed the RMSE value.
best_models = {}
best_values = {}
for metric in ['MAE', 'MSE', 'RMSE']:
    best_values[metric] = min(scores[metric] for scores in results_dict.values())
    best_models[metric] = [name for name, scores in results_dict.items()
                           if scores[metric] == best_values[metric]]

# Print the best results
print("So we achieve best results from:")
for metric, models in best_models.items():
    print(f"{metric}: {', '.join(models)} with value: {best_values[metric]}")

# Overall winner: the model with the lowest RMSE.
overall_best_model = min(results_dict, key=lambda x: results_dict[x]['RMSE'])
print(f"\nOverall Best Model based on RMSE: {overall_best_model} with RMSE: {results_dict[overall_best_model]['RMSE']}")
