Includes four major pollutants (Nitrogen Dioxide, Sulphur Dioxide, Carbon Monoxide and Ozone).

- State Code : The code allocated by US EPA to each state
- County Code : The code allocated by US EPA to each county within a specific state
- Site Num : The site number in a specific county allocated by US EPA
- Address: Address of the monitoring site
- State : State of monitoring site
- County : County of monitoring site
- City : City of the monitoring site
- Date Local : Date of monitoring


The four pollutants (NO2, O3, SO2 and CO) each have 5 specific columns. For instance, for NO2:

- NO2 Units : The units measured for NO2
- NO2 Mean : The arithmetic mean of concentration of NO2 within a given day
- NO2 AQI : The calculated air quality index of NO2 within a given day
- NO2 1st Max Value : The maximum value obtained for NO2 concentration in a given day
- NO2 1st Max Hour : The hour when the maximum NO2 concentration was recorded in a given day

In [1]:
# Import libraries
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the 2000-2016 US pollution measurements from the local data folder
pollution_csv = 'data/pollution_us_2000_2016.csv'
df = pd.read_csv(pollution_csv)

In [3]:
# Peek at one randomly chosen row (no random_state is passed, so the row differs per run)
df.sample()

Unnamed: 0.1,Unnamed: 0,State Code,County Code,Site Num,Address,State,County,City,Date Local,NO2 Units,...,SO2 Units,SO2 Mean,SO2 1st Max Value,SO2 1st Max Hour,SO2 AQI,CO Units,CO Mean,CO 1st Max Value,CO 1st Max Hour,CO AQI
1219059,17429,6,23,1004,717 SOUTH AVENUE,California,Humboldt,Eureka,2012-09-28,Parts per billion,...,Parts per billion,-0.057143,0.0,2,,Parts per million,0.3625,0.4,1,5.0


In [4]:
# Rows whose county name is 'Los Angeles' (the state is not checked here)
in_la_county = df['County'].eq('Los Angeles')
df[in_la_county]

Unnamed: 0.1,Unnamed: 0,State Code,County Code,Site Num,Address,State,County,City,Date Local,NO2 Units,...,SO2 Units,SO2 Mean,SO2 1st Max Value,SO2 1st Max Hour,SO2 AQI,CO Units,CO Mean,CO 1st Max Value,CO 1st Max Hour,CO AQI
10666,10666,6,37,1002,"228 W. PALM AVE., BURBANK",California,Los Angeles,Burbank,2000-01-01,Parts per billion,...,Parts per billion,0.000000,0.0,0,0.0,Parts per million,1.360870,3.7,2,
10667,10667,6,37,1002,"228 W. PALM AVE., BURBANK",California,Los Angeles,Burbank,2000-01-01,Parts per billion,...,Parts per billion,0.000000,0.0,0,0.0,Parts per million,1.300000,3.0,6,34.0
10668,10668,6,37,1002,"228 W. PALM AVE., BURBANK",California,Los Angeles,Burbank,2000-01-01,Parts per billion,...,Parts per billion,0.000000,0.0,2,,Parts per million,1.360870,3.7,2,
10669,10669,6,37,1002,"228 W. PALM AVE., BURBANK",California,Los Angeles,Burbank,2000-01-01,Parts per billion,...,Parts per billion,0.000000,0.0,2,,Parts per million,1.300000,3.0,6,34.0
10670,10670,6,37,1002,"228 W. PALM AVE., BURBANK",California,Los Angeles,Burbank,2000-01-02,Parts per billion,...,Parts per billion,0.000000,0.0,0,0.0,Parts per million,0.373913,1.0,1,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1619794,23793,6,37,5005,7201 W. WESTCHESTER PARKWAY,California,Los Angeles,Los Angeles,2015-12-30,Parts per billion,...,Parts per billion,0.785714,1.2,11,,Parts per million,0.483333,0.8,4,9.0
1619795,23794,6,37,5005,7201 W. WESTCHESTER PARKWAY,California,Los Angeles,Los Angeles,2015-12-31,Parts per billion,...,Parts per billion,0.565217,0.9,0,0.0,Parts per million,0.382609,1.1,0,
1619796,23795,6,37,5005,7201 W. WESTCHESTER PARKWAY,California,Los Angeles,Los Angeles,2015-12-31,Parts per billion,...,Parts per billion,0.565217,0.9,0,0.0,Parts per million,0.425000,0.7,3,8.0
1619797,23796,6,37,5005,7201 W. WESTCHESTER PARKWAY,California,Los Angeles,Los Angeles,2015-12-31,Parts per billion,...,Parts per billion,0.557143,0.8,2,,Parts per million,0.382609,1.1,0,


In [5]:
# Same county filter, additionally restricted to the state of California
in_la_county = df['County'].eq('Los Angeles')
in_california = df['State'].eq('California')
df[in_la_county & in_california]

Unnamed: 0.1,Unnamed: 0,State Code,County Code,Site Num,Address,State,County,City,Date Local,NO2 Units,...,SO2 Units,SO2 Mean,SO2 1st Max Value,SO2 1st Max Hour,SO2 AQI,CO Units,CO Mean,CO 1st Max Value,CO 1st Max Hour,CO AQI
10666,10666,6,37,1002,"228 W. PALM AVE., BURBANK",California,Los Angeles,Burbank,2000-01-01,Parts per billion,...,Parts per billion,0.000000,0.0,0,0.0,Parts per million,1.360870,3.7,2,
10667,10667,6,37,1002,"228 W. PALM AVE., BURBANK",California,Los Angeles,Burbank,2000-01-01,Parts per billion,...,Parts per billion,0.000000,0.0,0,0.0,Parts per million,1.300000,3.0,6,34.0
10668,10668,6,37,1002,"228 W. PALM AVE., BURBANK",California,Los Angeles,Burbank,2000-01-01,Parts per billion,...,Parts per billion,0.000000,0.0,2,,Parts per million,1.360870,3.7,2,
10669,10669,6,37,1002,"228 W. PALM AVE., BURBANK",California,Los Angeles,Burbank,2000-01-01,Parts per billion,...,Parts per billion,0.000000,0.0,2,,Parts per million,1.300000,3.0,6,34.0
10670,10670,6,37,1002,"228 W. PALM AVE., BURBANK",California,Los Angeles,Burbank,2000-01-02,Parts per billion,...,Parts per billion,0.000000,0.0,0,0.0,Parts per million,0.373913,1.0,1,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1619794,23793,6,37,5005,7201 W. WESTCHESTER PARKWAY,California,Los Angeles,Los Angeles,2015-12-30,Parts per billion,...,Parts per billion,0.785714,1.2,11,,Parts per million,0.483333,0.8,4,9.0
1619795,23794,6,37,5005,7201 W. WESTCHESTER PARKWAY,California,Los Angeles,Los Angeles,2015-12-31,Parts per billion,...,Parts per billion,0.565217,0.9,0,0.0,Parts per million,0.382609,1.1,0,
1619796,23795,6,37,5005,7201 W. WESTCHESTER PARKWAY,California,Los Angeles,Los Angeles,2015-12-31,Parts per billion,...,Parts per billion,0.565217,0.9,0,0.0,Parts per million,0.425000,0.7,3,8.0
1619797,23796,6,37,5005,7201 W. WESTCHESTER PARKWAY,California,Los Angeles,Los Angeles,2015-12-31,Parts per billion,...,Parts per billion,0.557143,0.8,2,,Parts per million,0.382609,1.1,0,


In [6]:
# Number of rows per county name, most frequent first
df['County'].value_counts()

Los Angeles      93381
Contra Costa     84010
Santa Barbara    82998
San Diego        51110
Maricopa         46586
                 ...  
Athens             908
Haywood            904
Kern               434
Richland           116
Duchesne           104
Name: County, Length: 133, dtype: int64

In [7]:
# Row count, column names and dtypes of the freshly loaded frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1746661 entries, 0 to 1746660
Data columns (total 29 columns):
 #   Column             Dtype  
---  ------             -----  
 0   Unnamed: 0         int64  
 1   State Code         int64  
 2   County Code        int64  
 3   Site Num           int64  
 4   Address            object 
 5   State              object 
 6   County             object 
 7   City               object 
 8   Date Local         object 
 9   NO2 Units          object 
 10  NO2 Mean           float64
 11  NO2 1st Max Value  float64
 12  NO2 1st Max Hour   int64  
 13  NO2 AQI            int64  
 14  O3 Units           object 
 15  O3 Mean            float64
 16  O3 1st Max Value   float64
 17  O3 1st Max Hour    int64  
 18  O3 AQI             int64  
 19  SO2 Units          object 
 20  SO2 Mean           float64
 21  SO2 1st Max Value  float64
 22  SO2 1st Max Hour   int64  
 23  SO2 AQI            float64
 24  CO Units           object 
 25  CO Mean           

In [8]:
# Full list of column labels (the info() summary only shows part of it)
df.columns

Index(['Unnamed: 0', 'State Code', 'County Code', 'Site Num', 'Address',
       'State', 'County', 'City', 'Date Local', 'NO2 Units', 'NO2 Mean',
       'NO2 1st Max Value', 'NO2 1st Max Hour', 'NO2 AQI', 'O3 Units',
       'O3 Mean', 'O3 1st Max Value', 'O3 1st Max Hour', 'O3 AQI', 'SO2 Units',
       'SO2 Mean', 'SO2 1st Max Value', 'SO2 1st Max Hour', 'SO2 AQI',
       'CO Units', 'CO Mean', 'CO 1st Max Value', 'CO 1st Max Hour', 'CO AQI'],
      dtype='object')

In [9]:
# Check whether 'Unnamed: 0' is a unique row id: counts above 1 mean it is not
df['Unnamed: 0'].value_counts()

2047      17
11834     17
1600      17
23115     17
21066     17
          ..
133524     1
132366     1
134415     1
133448     1
132120     1
Name: Unnamed: 0, Length: 134576, dtype: int64

In [10]:
# Parse the 'Date Local' strings into datetime64 values so the .dt accessor can be used later
df['Date Local'] = pd.to_datetime(df['Date Local'])

In [11]:
# Re-check the dtypes after the datetime conversion of 'Date Local'
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1746661 entries, 0 to 1746660
Data columns (total 29 columns):
 #   Column             Dtype         
---  ------             -----         
 0   Unnamed: 0         int64         
 1   State Code         int64         
 2   County Code        int64         
 3   Site Num           int64         
 4   Address            object        
 5   State              object        
 6   County             object        
 7   City               object        
 8   Date Local         datetime64[ns]
 9   NO2 Units          object        
 10  NO2 Mean           float64       
 11  NO2 1st Max Value  float64       
 12  NO2 1st Max Hour   int64         
 13  NO2 AQI            int64         
 14  O3 Units           object        
 15  O3 Mean            float64       
 16  O3 1st Max Value   float64       
 17  O3 1st Max Hour    int64         
 18  O3 AQI             int64         
 19  SO2 Units          object        
 20  SO2 Mean           float

In [12]:
# Discard rows that exactly repeat an earlier row, then display the result
df = df.drop_duplicates()
df

Unnamed: 0.1,Unnamed: 0,State Code,County Code,Site Num,Address,State,County,City,Date Local,NO2 Units,...,SO2 Units,SO2 Mean,SO2 1st Max Value,SO2 1st Max Hour,SO2 AQI,CO Units,CO Mean,CO 1st Max Value,CO 1st Max Hour,CO AQI
0,0,4,13,3002,1645 E ROOSEVELT ST-CENTRAL PHOENIX STN,Arizona,Maricopa,Phoenix,2000-01-01,Parts per billion,...,Parts per billion,3.000000,9.0,21,13.0,Parts per million,1.145833,4.200,21,
1,1,4,13,3002,1645 E ROOSEVELT ST-CENTRAL PHOENIX STN,Arizona,Maricopa,Phoenix,2000-01-01,Parts per billion,...,Parts per billion,3.000000,9.0,21,13.0,Parts per million,0.878947,2.200,23,25.0
2,2,4,13,3002,1645 E ROOSEVELT ST-CENTRAL PHOENIX STN,Arizona,Maricopa,Phoenix,2000-01-01,Parts per billion,...,Parts per billion,2.975000,6.6,23,,Parts per million,1.145833,4.200,21,
3,3,4,13,3002,1645 E ROOSEVELT ST-CENTRAL PHOENIX STN,Arizona,Maricopa,Phoenix,2000-01-01,Parts per billion,...,Parts per billion,2.975000,6.6,23,,Parts per million,0.878947,2.200,23,25.0
4,4,4,13,3002,1645 E ROOSEVELT ST-CENTRAL PHOENIX STN,Arizona,Maricopa,Phoenix,2000-01-02,Parts per billion,...,Parts per billion,1.958333,3.0,22,4.0,Parts per million,0.850000,1.600,23,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1746656,24599,56,21,100,NCore - North Cheyenne Soccer Complex,Wyoming,Laramie,Not in a city,2016-03-30,Parts per billion,...,Parts per billion,0.000000,0.0,2,,Parts per million,0.091667,0.100,2,1.0
1746657,24600,56,21,100,NCore - North Cheyenne Soccer Complex,Wyoming,Laramie,Not in a city,2016-03-31,Parts per billion,...,Parts per billion,-0.022727,0.0,0,0.0,Parts per million,0.067714,0.127,0,
1746658,24601,56,21,100,NCore - North Cheyenne Soccer Complex,Wyoming,Laramie,Not in a city,2016-03-31,Parts per billion,...,Parts per billion,-0.022727,0.0,0,0.0,Parts per million,0.100000,0.100,0,1.0
1746659,24602,56,21,100,NCore - North Cheyenne Soccer Complex,Wyoming,Laramie,Not in a city,2016-03-31,Parts per billion,...,Parts per billion,0.000000,0.0,5,,Parts per million,0.067714,0.127,0,


In [13]:
# Keep State Code, County, City, Date Local and each pollutant's Mean / AQI;
# everything listed below is removed.
unused_cols = [
    'Unnamed: 0', 'County Code', 'Site Num', 'Address', 'State',
    'NO2 Units', 'NO2 1st Max Value', 'NO2 1st Max Hour',
    'O3 Units', 'O3 1st Max Value', 'O3 1st Max Hour',
    'SO2 Units', 'SO2 1st Max Value', 'SO2 1st Max Hour',
    'CO Units', 'CO 1st Max Value', 'CO 1st Max Hour',
]
df2 = df.drop(columns=unused_cols)
df2

Unnamed: 0,State Code,County,City,Date Local,NO2 Mean,NO2 AQI,O3 Mean,O3 AQI,SO2 Mean,SO2 AQI,CO Mean,CO AQI
0,4,Maricopa,Phoenix,2000-01-01,19.041667,46,0.022500,34,3.000000,13.0,1.145833,
1,4,Maricopa,Phoenix,2000-01-01,19.041667,46,0.022500,34,3.000000,13.0,0.878947,25.0
2,4,Maricopa,Phoenix,2000-01-01,19.041667,46,0.022500,34,2.975000,,1.145833,
3,4,Maricopa,Phoenix,2000-01-01,19.041667,46,0.022500,34,2.975000,,0.878947,25.0
4,4,Maricopa,Phoenix,2000-01-02,22.958333,34,0.013375,27,1.958333,4.0,0.850000,
...,...,...,...,...,...,...,...,...,...,...,...,...
1746656,56,Laramie,Not in a city,2016-03-30,1.083333,1,0.043917,44,0.000000,,0.091667,1.0
1746657,56,Laramie,Not in a city,2016-03-31,0.939130,1,0.045263,44,-0.022727,0.0,0.067714,
1746658,56,Laramie,Not in a city,2016-03-31,0.939130,1,0.045263,44,-0.022727,0.0,0.100000,1.0
1746659,56,Laramie,Not in a city,2016-03-31,0.939130,1,0.045263,44,0.000000,,0.067714,


In [14]:
# Number of rows per county code, most frequent first
df['County Code'].value_counts()

13     149210
37      95231
83      82998
1       79528
3       68119
        ...  
145      1426
127      1404
55       1164
121       964
191       946
Name: County Code, Length: 73, dtype: int64

In [15]:
# Number of rows per city label, most frequent first
df['City'].value_counts()

Not in a city         138411
New York               46887
Los Angeles            42241
Phoenix                37912
El Paso                36908
                       ...  
St. Ann                  860
Kenner                   544
Bakersfield              434
Dentsville (Dents)       116
Roosevelt                104
Name: City, Length: 144, dtype: int64

In [16]:
# All rows carrying county code 13
dfcc13 = df[df['County Code'].eq(13)]

In [17]:
# Cities found under county code 13, to see whether one code spans several places
dfcc13['City'].value_counts()

Phoenix          37912
Concord          23686
Bethel Island    23396
San Pablo        21960
Pittsburg        13076
Newark            9534
Scottsdale        8674
Altoona           7580
Crockett          1892
Riverton          1396
Roosevelt          104
Name: City, dtype: int64

In [18]:
# NOTE(review): identical to the earlier df['City'].value_counts() cell — likely a leftover that can be removed
df['City'].value_counts()

Not in a city         138411
New York               46887
Los Angeles            42241
Phoenix                37912
El Paso                36908
                       ...  
St. Ann                  860
Kenner                   544
Bakersfield              434
Dentsville (Dents)       116
Roosevelt                104
Name: City, Length: 144, dtype: int64

In [19]:
# Leave out the SO2 and CO AQI columns (both show missing values in the previews)
dfx = df2.drop(columns=['SO2 AQI', 'CO AQI'])

In [20]:
# Target is the NO2 AQI; every other remaining column is a feature
target_col = 'NO2 AQI'
y = dfx[target_col]
X = dfx.drop(columns=[target_col])

# Default split proportions, fixed seed so the partition is repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [21]:
# One-hot encode the categorical location columns, fitting on the training split only.
cols_encode = ['County', 'City']
try:
    # scikit-learn >= 1.2 names the dense-output switch `sparse_output`
    ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
except TypeError:
    # older releases only accept the original `sparse` keyword
    ohe = OneHotEncoder(sparse=False, handle_unknown='ignore')
ohe_fit = ohe.fit_transform(X_train[cols_encode])

# `get_feature_names` was removed in scikit-learn 1.2; use its replacement when available
if hasattr(ohe, 'get_feature_names_out'):
    ohe_cols = ohe.get_feature_names_out(cols_encode)
else:
    ohe_cols = ohe.get_feature_names(cols_encode)

# Keep the training index so the indicator columns can be concatenated row-for-row
ohe_df = pd.DataFrame(ohe_fit, columns=ohe_cols, index=X_train.index)

In [22]:
# Display the one-hot indicator frame (one column per County / City category)
ohe_df

Unnamed: 0,County_Ada,County_Adair,County_Adams,County_Alameda,County_Alexandria City,County_Allegheny,County_Anoka,County_Aroostook,County_Athens,County_BAJA CALIFORNIA NORTE,...,City_Waco,City_Washington,City_Welby,City_West Los Angeles,City_Westport,City_Wilkes-Barre,City_Wilmington,City_Winston-Salem,City_Winter Park,City_York
913604,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1472046,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
662103,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1667661,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
413828,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
259178,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1414414,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
131932,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
671155,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [23]:
# Swap the raw County / City columns for their one-hot indicator columns
X_train_int = X_train.drop(columns=cols_encode)
X_train_fin = pd.concat([X_train_int, ohe_df], axis=1)
X_train_fin

Unnamed: 0,State Code,Date Local,NO2 Mean,O3 Mean,O3 AQI,SO2 Mean,CO Mean,County_Ada,County_Adair,County_Adams,...,City_Waco,City_Washington,City_Welby,City_West Los Angeles,City_Westport,City_Wilkes-Barre,City_Wilmington,City_Winston-Salem,City_Winter Park,City_York
913604,16,2009-06-02,4.250000,0.026458,31,0.137500,0.100000,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1472046,5,2014-04-20,10.625000,0.040417,64,1.637500,0.250833,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
662103,6,2007-04-19,12.090909,0.030833,29,0.454545,0.429167,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1667661,23,2015-12-27,1.595833,0.027684,26,0.400000,0.154083,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
413828,29,2004-09-15,11.750000,0.020708,28,6.041667,0.537500,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
259178,42,2002-10-07,9.666667,0.024208,26,5.125000,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1414414,35,2013-07-27,5.250000,0.051667,71,0.600000,0.105125,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
131932,6,2001-08-12,7.173913,0.021500,31,9.214286,0.087500,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
671155,6,2007-07-08,3.826087,0.025917,26,0.657143,0.300000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [24]:
from sklearn.dummy import DummyRegressor
from sklearn.metrics import mean_squared_error as mse


# Baseline: a DummyRegressor with default settings, used as the score to beat
dummy = DummyRegressor()
dummy.fit(X_train_fin, y_train)
yb_hat_train = dummy.predict(X_train_fin)
# NOTE: X_test has not been encoded like X_train_fin. This only works because the
# dummy model ignores feature values; a real model needs the same transforms first.
yb_hat_test = dummy.predict(X_test)
# RMSE as sqrt(MSE): the `squared=False` shortcut is deprecated/removed in newer scikit-learn
trainb_rmse = np.sqrt(mse(y_train, yb_hat_train))
testb_rmse = np.sqrt(mse(y_test, yb_hat_test))
print(trainb_rmse, testb_rmse)

15.16506747704698 15.156006443352808


In [25]:
# Work on a copy so the date feature engineering below leaves X_train_fin untouched
X_train_temp = X_train_fin.copy()

In [27]:
# Break the timestamp into numeric day-of-month / month / year columns
date_parts = X_train_temp['Date Local'].dt
X_train_temp['Date'] = date_parts.day
X_train_temp['Month'] = date_parts.month
X_train_temp['Year'] = date_parts.year

In [28]:
# The raw timestamp is no longer needed once its parts have been extracted
X_train_temp = X_train_temp.drop(columns='Date Local')

In [29]:
# Display the training features after the date columns were added
X_train_temp

Unnamed: 0,State Code,NO2 Mean,O3 Mean,O3 AQI,SO2 Mean,CO Mean,County_Ada,County_Adair,County_Adams,County_Alameda,...,City_West Los Angeles,City_Westport,City_Wilkes-Barre,City_Wilmington,City_Winston-Salem,City_Winter Park,City_York,Date,Month,Year
913604,16,4.250000,0.026458,31,0.137500,0.100000,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2,6,2009
1472046,5,10.625000,0.040417,64,1.637500,0.250833,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,20,4,2014
662103,6,12.090909,0.030833,29,0.454545,0.429167,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,19,4,2007
1667661,23,1.595833,0.027684,26,0.400000,0.154083,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,27,12,2015
413828,29,11.750000,0.020708,28,6.041667,0.537500,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,15,9,2004
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
259178,42,9.666667,0.024208,26,5.125000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7,10,2002
1414414,35,5.250000,0.051667,71,0.600000,0.105125,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,27,7,2013
131932,6,7.173913,0.021500,31,9.214286,0.087500,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,12,8,2001
671155,6,3.826087,0.025917,26,0.657143,0.300000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8,7,2007


In [38]:
# Confirm every remaining column is numeric and check the memory footprint
X_train_temp.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1309995 entries, 913604 to 121958
Columns: 286 entries, State Code to Year
dtypes: float64(281), int64(5)
memory usage: 2.8 GB


In [39]:
from sklearn.linear_model import LinearRegression

# First real model: ordinary least squares on the encoded training features
lr = LinearRegression()
lr.fit(X_train_temp, y_train)
y1_hat_train = lr.predict(X_train_temp)
# RMSE as sqrt(MSE): the `squared=False` shortcut is deprecated/removed in newer scikit-learn
train1_rmse = np.sqrt(mse(y_train, y1_hat_train))
# TODO: evaluate on the test split. X_test must first receive the same one-hot
# encoding (ohe.transform) and Date/Month/Year expansion as X_train_temp;
# until then lr.predict(X_test) cannot run.
print(train1_rmse)

5.7405018231916545


In [None]:
# Distinct state names, in order of first appearance. Iterating the column itself
# would print one line per row (over a million lines).
df['State'].unique()

In [None]:
# Map each state name to its EPA state code, keeping the first pairing seen per state.
# Taken row-wise so each key gets a single code, not the whole 'State Code' column.
state_dict = (
    df[['State', 'State Code']]
    .drop_duplicates(subset='State')
    .set_index('State')['State Code']
    .to_dict()
)

In [None]:
# Unique (state code, state name) pairs in order of first appearance, built from
# per-row values rather than from whole columns.
# NOTE: despite its name this is a list, and it rebinds `state_dict`.
state_dict = list(
    df[['State Code', 'State']]
    .drop_duplicates()
    .itertuples(index=False, name=None)
)

In [None]:
# Summary statistics of the numeric columns (count, mean, spread, quartiles)
df.describe()

In [None]:
# Missing-value count per column
df.isna().sum()

In [None]:
# Inspect the rows sharing 'Unnamed: 0' == 2047, one of the values that occurs 17 times
df.loc[df['Unnamed: 0'] == 2047]

In [None]:
# Load the 2016 daily AQI file and display it
aqi_2016_csv = 'data/daily_aqi_by_cbsa_2016.csv'
dfaqi2016 = pd.read_csv(aqi_2016_csv)
dfaqi2016

In [None]:
# Load the 2016 daily file that this notebook treats as the ozone data set
ozone_2016_csv = 'data/daily_44201_2016.csv'
dfozone2016 = pd.read_csv(ozone_2016_csv)

In [None]:
# Display the loaded 2016 frame
dfozone2016

In [None]:
# List the column labels to decide which ones to drop
dfozone2016.columns

In [None]:
# First pruning pass: remove seven columns from the 2016 frame, then display it
ozone_drop_cols = [
    'State Code', 'POC', 'Latitude', 'Longitude',
    'Event Type', 'Units of Measure', 'Parameter Code',
]
dfozone2016 = dfozone2016.drop(columns=ozone_drop_cols)
dfozone2016

In [None]:
# Column labels remaining after the previous drop
dfozone2016.columns

In [None]:
# Second pruning pass: remove four more columns, then display the frame
ozone_drop_cols = ['Datum', 'County Code', 'Site Num', 'Parameter Name']
dfozone2016 = dfozone2016.drop(columns=ozone_drop_cols)
dfozone2016

In [None]:
# Third pruning pass: remove the 'Method Name' column, then display the frame
dfozone2016 = dfozone2016.drop(columns='Method Name')
dfozone2016

In [None]:
# Correlation heatmap of the numeric columns.
plt.figure(figsize=(20,10))
# Select numeric columns explicitly: pandas >= 2.0 raises if corr() meets
# string/object columns, while older versions dropped them silently.
c = df.select_dtypes(include='number').corr()
sns.heatmap(c, cmap='BrBG',annot=True);

In [None]:
from sklearn.preprocessing import OneHotEncoder

In [None]:
# One-hot encode only the categorical columns. Encoding every column of X would
# treat each distinct numeric or date value as its own category.
# NOTE(review): this rebinds `ohe` from the fitted encoder to the encoded matrix and
# fits on all of X rather than X_train — confirm that is intended.
ohe = OneHotEncoder().fit_transform(X[cols_encode])