In [None]:
#Provisional Machine Learning Model by Claire Golden
#Goal) input: pollutant levels and air quality index (AQI) values
#Goal) output: predicted air quality index values based on existing patterns of pollutants
#Model) Linear Regression: assuming that pollutants steadily or drastically increase with time, and that these 4 main pollutants directly affect air quality
#Training) will use data on the 4 main pollutant levels from 2000-2016 and the corresponding air quality index values to train the model

In [2]:
# Load libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from matplotlib import pyplot
from pandas import read_csv
from pandas.plotting import scatter_matrix
from sklearn import metrics
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

In [10]:
# Read the 2000-2016 US pollution measurements from the project's resources
# folder, then preview the first five rows.
pollution_df = read_csv(r'resources/us_pollution_2000_2016.csv')
pollution_df.head(5)
                           

Unnamed: 0,usstate,county,city,local_date,no2avg,no2maxvalue,no2maxhour,no2aqi,o3avg,o3maxvalue,o3maxhour,o3aqi,so2avg,so2maxvalue,so2maxhour,so2aqi,coavg,comaxvalue,comaxhour,coaqi
0,Arizona,Maricopa,Phoenix,2000-01-01,19.041667,49.0,19,46,0.0225,0.04,10,34,3.0,9.0,21,13.0,1.145833,4.2,21,
1,Arizona,Maricopa,Phoenix,2000-01-01,19.041667,49.0,19,46,0.0225,0.04,10,34,3.0,9.0,21,13.0,0.878947,2.2,23,25.0
2,Arizona,Maricopa,Phoenix,2000-01-01,19.041667,49.0,19,46,0.0225,0.04,10,34,2.975,6.6,23,,1.145833,4.2,21,
3,Arizona,Maricopa,Phoenix,2000-01-01,19.041667,49.0,19,46,0.0225,0.04,10,34,2.975,6.6,23,,0.878947,2.2,23,25.0
4,Arizona,Maricopa,Phoenix,2000-01-02,22.958333,36.0,19,34,0.013375,0.032,10,27,1.958333,3.0,22,4.0,0.85,1.6,23,


In [11]:
# Show the dtype pandas inferred for every column. In the recorded output
# local_date is `object` (plain strings), not a datetime dtype.
pollution_df.dtypes

usstate         object
county          object
city            object
local_date      object
no2avg         float64
no2maxvalue    float64
no2maxhour       int64
no2aqi           int64
o3avg          float64
o3maxvalue     float64
o3maxhour        int64
o3aqi            int64
so2avg         float64
so2maxvalue    float64
so2maxhour       int64
so2aqi         float64
coavg          float64
comaxvalue     float64
comaxhour        int64
coaqi          float64
dtype: object

In [12]:
# Print the frame's dimensions as (rows, columns).
print(pollution_df.shape)

(1746661, 20)


In [13]:
# Summary statistics for the numeric columns. Two things stand out in the
# recorded output and matter for cleaning: so2aqi and coaqi have roughly half
# the row count of the other columns (missing values), and several
# concentration columns have negative minima.
pollution_df.describe()

Unnamed: 0,no2avg,no2maxvalue,no2maxhour,no2aqi,o3avg,o3maxvalue,o3maxhour,o3aqi,so2avg,so2maxvalue,so2maxhour,so2aqi,coavg,comaxvalue,comaxhour,coaqi
count,1746661.0,1746661.0,1746661.0,1746661.0,1746661.0,1746661.0,1746661.0,1746661.0,1746661.0,1746661.0,1746661.0,873754.0,1746661.0,1746661.0,1746661.0,873338.0
mean,12.82193,25.41485,11.73102,23.89822,0.02612485,0.03920331,10.17053,36.05012,1.870364,4.492185,9.664906,7.115945,0.3682177,0.6201067,7.875026,5.996595
std,9.504814,15.99963,7.877501,15.1628,0.01136974,0.01534362,4.003144,19.78042,2.760435,7.679866,6.731228,11.937473,0.3140231,0.6439361,7.978844,5.851836
min,-2.0,-2.0,0.0,0.0,0.0,0.0,0.0,0.0,-2.0,-2.0,0.0,0.0,-0.4375,-0.4,0.0,0.0
25%,5.75,13.0,5.0,12.0,0.017875,0.029,9.0,25.0,0.256522,0.8,5.0,1.0,0.183458,0.292,0.0,2.0
50%,10.73913,24.0,9.0,23.0,0.025875,0.038,10.0,33.0,0.9875,2.0,8.0,3.0,0.292625,0.4,6.0,5.0
75%,17.71364,35.7,20.0,33.0,0.033917,0.048,11.0,42.0,2.325,5.0,14.0,9.0,0.466667,0.8,13.0,8.0
max,139.5417,267.0,23.0,132.0,0.095083,0.141,23.0,218.0,321.625,351.0,23.0,200.0,7.508333,19.9,23.0,201.0


In [14]:
# Column names as a plain Python list, for easy copy/paste into later cells.
pollution_df.columns.tolist()

['usstate',
 'county',
 'city',
 'local_date',
 'no2avg',
 'no2maxvalue',
 'no2maxhour',
 'no2aqi',
 'o3avg',
 'o3maxvalue',
 'o3maxhour',
 'o3aqi',
 'so2avg',
 'so2maxvalue',
 'so2maxhour',
 'so2aqi',
 'coavg',
 'comaxvalue',
 'comaxhour',
 'coaqi']

In [None]:
#Data cleaning: build `cleandf`, the frame every later cell reads from.
# Issues visible in the exploration cells above:
#   * head() shows the same city/date repeated (apparently one row per
#     SO2 x CO reading combination), with so2aqi / coaqi empty on some rows;
#   * describe() shows so2aqi / coaqi present on only about half of the rows;
#   * describe() shows negative minimum concentrations.
# NOTE(review): dropping negative readings and averaging duplicates are
# cleaning choices made here - confirm they match the project's intent.
SITE_DAY_KEYS = ['usstate', 'county', 'city', 'local_date']
POLLUTANT_AVG_COLS = ['no2avg', 'o3avg', 'so2avg', 'coavg']
POLLUTANT_AQI_COLS = ['no2aqi', 'o3aqi', 'so2aqi', 'coaqi']

cleandf = (
    pollution_df
    # keep only rows whose four mean concentrations are all non-negative
    .loc[lambda df: (df[POLLUTANT_AVG_COLS] >= 0).all(axis=1)]
    # collapse repeated city/date rows; mean() skips NaN, so a city/date keeps
    # an AQI value as long as at least one of its rows carried one
    .groupby(SITE_DAY_KEYS, as_index=False)[POLLUTANT_AVG_COLS + POLLUTANT_AQI_COLS]
    .mean()
    # discard any city/date that still lacks one of the four AQI values
    .dropna(subset=POLLUTANT_AQI_COLS)
    .reset_index(drop=True)
)
cleandf.shape

In [None]:
#Separate clean dataframe into features and target variable
# The original `cleandf.iloc[]` had an empty indexer, which is a SyntaxError.
# Columns are now selected by name:
#   features - mean concentration of the four pollutants
#   target   - overall AQI, taken as the highest of the four pollutant-specific
#              AQI values (the EPA convention for reporting a single AQI)
# NOTE(review): assumes cleandf keeps the raw pollution_df column names -
# confirm against the cleaning step.
FEATURE_COLS = ['no2avg', 'o3avg', 'so2avg', 'coavg']
AQI_COLS = ['no2aqi', 'o3aqi', 'so2aqi', 'coaqi']
X = cleandf[FEATURE_COLS].values
y = cleandf[AQI_COLS].max(axis=1).values  # row-wise max; NaN entries are skipped by default

In [None]:
#Split into training and testing sets
# One third of the rows are held out for testing; the fixed random_state keeps
# the split repeatable. train_test_split is already imported in the library
# cell at the top of the notebook.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=1/3,
    random_state=1,
)

In [None]:
#Linear regression model
# Create the estimator and fit it on the training split; the fitted estimator
# is the cell's displayed result.
from sklearn.linear_model import LinearRegression

regressor = LinearRegression()
regressor.fit(X=X_train, y=y_train)

In [None]:
#Visualizing Linear Regression model, training set
# Predicted-vs-actual plot. The model is meant to be trained on four pollutant
# features (see the notebook header), so a scatter of X against y is not
# possible (one x-value per point is required); comparing predictions with the
# true target works for any number of feature columns.
lin_viz_train = plt
train_predictions = regressor.predict(X_train)
lin_viz_train.scatter(y_train, train_predictions, color='blue', s=5, alpha=0.3)
# Green diagonal marks a perfect prediction (predicted == actual).
train_diagonal = [np.min(y_train), np.max(y_train)]
lin_viz_train.plot(train_diagonal, train_diagonal, color='green')
lin_viz_train.title('Linear Regression: predicted vs. actual AQI (training set)')
lin_viz_train.xlabel('Actual AQI')
lin_viz_train.ylabel('Predicted AQI')
lin_viz_train.show()

In [None]:
#Visualizing Linear Regression model, testing set
# Predicted-vs-actual plot for the held-out rows. The original drew the
# training-set predictions on this figure; the test predictions are used
# instead so the plot reflects performance on unseen data.
lin_viz_test = plt
test_predictions = regressor.predict(X_test)
lin_viz_test.scatter(y_test, test_predictions, color='blue', s=5, alpha=0.3)
# Green diagonal marks a perfect prediction (predicted == actual).
test_diagonal = [np.min(y_test), np.max(y_test)]
lin_viz_test.plot(test_diagonal, test_diagonal, color='green')
lin_viz_test.title('Linear Regression: predicted vs. actual AQI (testing set)')
lin_viz_test.xlabel('Actual AQI')
lin_viz_test.ylabel('Predicted AQI')
lin_viz_test.show()

In [None]:
#Analyze Linear Regression model
# Accuracy / precision / recall are classification metrics and do not apply
# to a continuous prediction, and `y_pred` is only created later by the
# logistic-regression cell. The linear model is therefore scored on its own
# test-set predictions with regression metrics.
lin_test_pred = regressor.predict(X_test)
print("R^2:", metrics.r2_score(y_test, lin_test_pred))
print("MAE:", metrics.mean_absolute_error(y_test, lin_test_pred))
# RMSE is the square root of the mean squared error, in the target's own units.
print("RMSE:", np.sqrt(metrics.mean_squared_error(y_test, lin_test_pred)))

In [None]:
#Logistic Regression Model
# LogisticRegression is a classifier and rejects a continuous target, so the
# AQI values are first mapped onto the six EPA AQI categories:
#   0 = Good (0-50), 1 = Moderate (51-100),
#   2 = Unhealthy for Sensitive Groups (101-150), 3 = Unhealthy (151-200),
#   4 = Very Unhealthy (201-300), 5 = Hazardous (301+)
# NOTE(review): assumes y_train holds AQI values - confirm against the
# feature/target cell.
AQI_CATEGORY_EDGES = [51, 101, 151, 201, 301]
y_train_category = np.digitize(y_train, AQI_CATEGORY_EDGES)
# max_iter is raised above the library default to give the solver more room
# to converge on unscaled pollutant features.
logreg = LogisticRegression(max_iter=1000)
logreg.fit(X_train, y_train_category)
y_pred=logreg.predict(X_test)

In [None]:
#Visualizing Logistic Regression model, training set
# The original plotted the *linear* model's predictions (`regressor`) in this
# logistic-regression figure; `logreg` is used instead. Each point shows the
# class predicted for a training row against that row's actual target value.
log_viz_train = plt
train_class_predictions = logreg.predict(X_train)
log_viz_train.scatter(y_train, train_class_predictions, color='blue', s=5, alpha=0.3)
log_viz_train.title('Logistic Regression: predicted class vs. actual target (training set)')
log_viz_train.xlabel('Actual target value')
log_viz_train.ylabel('Predicted class')
log_viz_train.show()

In [None]:
#Visualizing Logistic Regression model, testing set
# The original plotted the *linear* model's training predictions (`regressor`)
# here; the logistic model's predictions for the held-out rows are used
# instead. Each point shows the class predicted for a test row against that
# row's actual target value.
log_viz_test = plt
test_class_predictions = logreg.predict(X_test)
log_viz_test.scatter(y_test, test_class_predictions, color='blue', s=5, alpha=0.3)
log_viz_test.title('Logistic Regression: predicted class vs. actual target (testing set)')
log_viz_test.xlabel('Actual target value')
log_viz_test.ylabel('Predicted class')
log_viz_test.show()

In [None]:
#Analyze Logistic Regression Model
# Classification metrics need true labels in the same label space as the
# predictions, so the AQI values in y_test are mapped onto the six EPA AQI
# categories (0 = Good ... 5 = Hazardous) before scoring.
# NOTE(review): assumes logreg was fit on this same category encoding and that
# y_pred holds its predictions for X_test - confirm against the fitting cell.
AQI_CATEGORY_EDGES = [51, 101, 151, 201, 301]
y_test_category = np.digitize(y_test, AQI_CATEGORY_EDGES)
print("Accuracy:", metrics.accuracy_score(y_test_category, y_pred))
# More than two classes: the default average='binary' would raise, so the
# per-class scores are averaged weighted by class frequency.
print("Precision:", metrics.precision_score(y_test_category, y_pred, average='weighted'))
print("Recall:", metrics.recall_score(y_test_category, y_pred, average='weighted'))