# Assignment 2

## Importing the Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

### Importing the Data Set

In [None]:
# Load the raw air-quality data from a CSV located relative to the working directory.
air_quality = pd.read_csv('AirQuality_Data.csv')

## Exploring the Data Set

- `air_quality.head()` shows the first 5 rows of the data set.
- `air_quality.info()` shows the index range (the total number of rows), the number of columns, the data type of each column, and the number of non-null entries in each column.
- `air_quality.describe()` shows summary statistics of the data set, such as the mean, standard deviation, minimum, and maximum of each column.
- `air_quality.hist()` creates one histogram per column to visualize how the values in that column are distributed.
- `sns.pairplot` draws pairwise plots between the columns of the data set to show the relationships between the variables.

In [None]:
# Preview the first rows of the freshly loaded data.
air_quality.head()

Checking null values

In [None]:
# Count the missing values in each column.
air_quality.isnull().sum()

## Data Preprocessing

### Dropping unneeded data


In [None]:
# Drop, in a single call, every column that is not used by the models below.
# One drop with a list replaces seven separate in-place drops.
air_quality.drop(
    columns=["No", "year", "month", "day", "hour", "station", "wd"],
    inplace=True,
)

In [None]:
# Save the reduced data set to a new CSV, without writing the DataFrame index.
air_quality.to_csv('updated_AirQuality_Data.csv', index=False)

In [None]:
# Preview the data again to confirm which columns remain after the drop.
air_quality.head()

### Imputing missing values using Sklearn

Sklearn SimpleImputer was imported so that the null data entries can be imputed, making every entry a non-null entry.
- `air_quality['SO2']`, `air_quality['NO2']`, `air_quality['CO']`, and `air_quality['O3']`, along with every other remaining column, were imputed using the median strategy.

In [None]:
from sklearn.impute import SimpleImputer
# Imputer configured to fill missing entries with the column median.
imputer = SimpleImputer(strategy = 'median')

In [None]:

# Median-impute all remaining columns in one pass. SimpleImputer computes its
# statistic per column, so this gives the same result as fitting the imputer
# on each column separately, without eleven copy-pasted lines.
# NOTE(review): the imputer is fitted on the full data set before the
# train/test split, so test rows influence the medians; consider fitting on
# the training split only.
columns_to_impute = [
    'PM2.5', 'PM10', 'SO2', 'NO2', 'CO', 'O3',
    'TEMP', 'PRES', 'DEWP', 'RAIN', 'WSPM',
]
air_quality[columns_to_impute] = imputer.fit_transform(
    air_quality[columns_to_impute]
)

In [None]:
# Re-count missing values per column; every count should now be zero.
air_quality.isnull().sum()

We now have no null values, so we can proceed with the regression models.

In [None]:
# Overwrite the saved CSV with the imputed data, without writing the DataFrame index.
air_quality.to_csv('updated_AirQuality_Data.csv', index=False)

In [None]:
# Pairwise correlation matrix of the remaining columns.
air_quality.corr()

## Splitting the data into a training and test set using Sklearn

Selecting independent and dependent variables

In [None]:
# Summarise the columns, their dtypes, and non-null counts before choosing features.
air_quality.info()


In [None]:
# PM2.5 is the dependent variable; every other column is a feature.
target_column = 'PM2.5'
y = air_quality[target_column]
x = air_quality.drop(columns=[target_column])

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

## Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression

# Fit ordinary least squares on the training split. `fit` returns the
# estimator itself, so both names refer to the same fitted model.
regressor = LinearRegression()
regressor.fit(x_train, y_train)
Linear_regressor_model = regressor

In [None]:
# R^2 of the linear model on the training split.
Linear_regressor_model.score(x_train, y_train)

In [None]:
# R^2 of the linear model on the held-out test split.
Linear_regressor_model.score(x_test, y_test)

### Using scikit-learn to predict on the test set and calculate the root mean squared error (RMSE)

In [None]:
# Predict PM2.5 for the held-out test rows with the fitted linear model.
y_pred=regressor.predict(x_test)

In [None]:
# Report the fitted parameters of the linear model, one per line.
for label, value in (
    ("Coefficients", regressor.coef_),
    ("Intercept", regressor.intercept_),
):
    print(label, value)

In [None]:
from sklearn.metrics import mean_squared_error

# Mean squared error of the linear model on the test split.
# The `squared` keyword is deprecated in scikit-learn 1.4 and removed in 1.6;
# `squared=True` was the default, so omitting it gives the same value on
# every version.
MSE = mean_squared_error(y_true=y_test, y_pred=y_pred)
print(MSE)

In [None]:
# Root mean squared error of the linear model on the test split.
# Taking the square root of the MSE avoids `squared=False`, which is
# deprecated in scikit-learn 1.4 and removed in 1.6.
RMSE = np.sqrt(mean_squared_error(y_true=y_test, y_pred=y_pred))
print(RMSE)

## Stochastic Gradient Descent

In [None]:
from sklearn.linear_model import SGDRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

In [None]:
# Standardise the features and fit a linear model by stochastic gradient
# descent. The seed is fixed (same value as the train/test split) so repeated
# runs give the same fit; without it the result changes on every run.
reg = make_pipeline(
    StandardScaler(),
    SGDRegressor(max_iter=1000, tol=1e-3, eta0=.07, random_state=42),
)
reg.fit(x_train, y_train)

In [None]:
# Predict on the test split with the SGD pipeline and report its RMSE.
# Taking the square root of the MSE avoids `squared=False`, which is
# deprecated in scikit-learn 1.4 and removed in 1.6.
y_pred_SGD = reg.predict(x_test)
RMSE = np.sqrt(mean_squared_error(y_true=y_test, y_pred=y_pred_SGD))
print(RMSE)

## Ridge Regression

In [None]:
from sklearn.linear_model import Ridge

In [None]:
# Fit an L2-regularised linear model on the training split.
# NOTE(review): max_iter and tol may have no effect depending on the solver
# Ridge selects; confirm these settings are intended.
Ridge_model = Ridge(
    alpha=2,
    max_iter=100,
    tol=0.5,
)
Ridge_model.fit(x_train, y_train)

In [None]:
# Display the fitted estimator with its actual hyperparameters. The previous
# statement built a second, unused Ridge whose max_iter (1000) disagreed with
# the model that was actually trained (max_iter=100).
Ridge_model

In [None]:
# R^2 of the ridge model on the training split.
Ridge_model.score(x_train, y_train)

In [None]:
# R^2 of the ridge model on the held-out test split.
Ridge_model.score(x_test, y_test)

## Lasso Regression

In [None]:
from sklearn.linear_model import Lasso

# Fit an L1-regularised linear model on the training split.
# NOTE(review): alpha=100 and tol=0.5 are a strong penalty and a loose
# stopping tolerance; confirm both are intended.
Lasso_model = Lasso(
    alpha=100,
    max_iter=1000,
    tol=0.5,
)
Lasso_model.fit(x_train, y_train)

In [None]:
# R^2 of the lasso model on the training split.
Lasso_model.score(x_train, y_train)

In [None]:
# R^2 of the lasso model on the held-out test split.
Lasso_model.score(x_test, y_test)

## ElasticNet

In [None]:
from sklearn.linear_model import ElasticNet

# Fit a linear model with a combined L1/L2 penalty; l1_ratio=0.5 weights the
# two penalties equally.
ElasticNet_model = ElasticNet(
    alpha=.01,
    l1_ratio=0.5,
)
ElasticNet_model.fit(x_train, y_train)

In [None]:
# R^2 of the elastic-net model on the training split.
ElasticNet_model.score(x_train, y_train)

In [None]:
# R^2 of the elastic-net model on the held-out test split.
ElasticNet_model.score(x_test, y_test)

## Visualize the different Regression Models

In [None]:

from prettytable import PrettyTable

In [None]:
# Empty comparison table with one column for the model name and one for each score.
myTable = PrettyTable()
myTable.field_names = ["Model", "Training_score", "Test_score"]

In [None]:
# Add one row per fitted model: its label, training R^2, and test R^2.
for label, model in (
    ("Linear Regression", Linear_regressor_model),
    ("Ridge", Ridge_model),
    ("Lasso", Lasso_model),
    ("Elastic Net", ElasticNet_model),
):
    myTable.add_row(
        [label, model.score(x_train, y_train), model.score(x_test, y_test)]
    )

In [None]:
# Render the model comparison table as text.
print(myTable)

## Conclusion

All scores for `Linear Regression`, `Ridge`, `Lasso`, and `Elastic Net` were between .83 and .86, which indicates a good fit to the data.

