In [33]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
import statsmodels.api as sm

## Data Cleaning

In [34]:
Beijing_df = pd.read_csv('https://static.bc-edx.com/ai/ail-v-1-0/m12/lesson_3/datasets/beijing-pm2-5.csv')
Beijing_df.head()

Unnamed: 0,No,year,month,day,hour,pm2.5,DEWP,TEMP,PRES,cbwd,Iws,Is,Ir
0,1,2010,1,1,0,,-21,-11.0,1021.0,NW,1.79,0,0
1,2,2010,1,1,1,,-21,-12.0,1020.0,NW,4.92,0,0
2,3,2010,1,1,2,,-21,-11.0,1019.0,NW,6.71,0,0
3,4,2010,1,1,3,,-21,-14.0,1019.0,NW,9.84,0,0
4,5,2010,1,1,4,,-20,-12.0,1018.0,NW,12.97,0,0


In [35]:
#drop na values?
Beijing_df.dropna(inplace=True)
Beijing_df.drop(columns=['No'], inplace=True)
Beijing_df

Unnamed: 0,year,month,day,hour,pm2.5,DEWP,TEMP,PRES,cbwd,Iws,Is,Ir
24,2010,1,2,0,129.0,-16,-4.0,1020.0,SE,1.79,0,0
25,2010,1,2,1,148.0,-15,-4.0,1020.0,SE,2.68,0,0
26,2010,1,2,2,159.0,-11,-5.0,1021.0,SE,3.57,0,0
27,2010,1,2,3,181.0,-7,-5.0,1022.0,SE,5.36,1,0
28,2010,1,2,4,138.0,-7,-5.0,1022.0,SE,6.25,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...
43819,2014,12,31,19,8.0,-23,-2.0,1034.0,NW,231.97,0,0
43820,2014,12,31,20,10.0,-22,-3.0,1034.0,NW,237.78,0,0
43821,2014,12,31,21,10.0,-22,-3.0,1034.0,NW,242.70,0,0
43822,2014,12,31,22,8.0,-22,-4.0,1034.0,NW,246.72,0,0


In [36]:
# .getdummies?
dummies_df = pd.get_dummies(Beijing_df, dtype='int')
dummies_df.head()

Unnamed: 0,year,month,day,hour,pm2.5,DEWP,TEMP,PRES,Iws,Is,Ir,cbwd_NE,cbwd_NW,cbwd_SE,cbwd_cv
24,2010,1,2,0,129.0,-16,-4.0,1020.0,1.79,0,0,0,0,1,0
25,2010,1,2,1,148.0,-15,-4.0,1020.0,2.68,0,0,0,0,1,0
26,2010,1,2,2,159.0,-11,-5.0,1021.0,3.57,0,0,0,0,1,0
27,2010,1,2,3,181.0,-7,-5.0,1022.0,5.36,1,0,0,0,1,0
28,2010,1,2,4,138.0,-7,-5.0,1022.0,6.25,2,0,0,0,1,0


In [37]:
scaled_data = StandardScaler().fit_transform(dummies_df)
scaled_data

array([[-1.44335511, -1.59626706, -1.55775043, ..., -0.69059499,
         1.36578456, -0.52208706],
       [-1.44335511, -1.59626706, -1.55775043, ..., -0.69059499,
         1.36578456, -0.52208706],
       [-1.44335511, -1.59626706, -1.55775043, ..., -0.69059499,
         1.36578456, -0.52208706],
       ...,
       [ 1.38291355,  1.58830091,  1.74316783, ...,  1.44802672,
        -0.7321799 , -0.52208706],
       [ 1.38291355,  1.58830091,  1.74316783, ...,  1.44802672,
        -0.7321799 , -0.52208706],
       [ 1.38291355,  1.58830091,  1.74316783, ...,  1.44802672,
        -0.7321799 , -0.52208706]])

In [38]:
scaled_df= pd.DataFrame(scaled_data, columns=dummies_df.columns)
scaled_df

Unnamed: 0,year,month,day,hour,pm2.5,DEWP,TEMP,PRES,Iws,Is,Ir,cbwd_NE,cbwd_NW,cbwd_SE,cbwd_cv
0,-1.443355,-1.596267,-1.557750,-1.661040,0.330114,-1.229791,-1.347143,0.345329,-0.444944,-0.071057,-0.137408,-0.358521,-0.690595,1.365785,-0.522087
1,-1.443355,-1.596267,-1.557750,-1.516631,0.536525,-1.160508,-1.347143,0.345329,-0.427007,-0.071057,-0.137408,-0.358521,-0.690595,1.365785,-0.522087
2,-1.443355,-1.596267,-1.557750,-1.372221,0.656027,-0.883375,-1.429278,0.442411,-0.409069,-0.071057,-0.137408,-0.358521,-0.690595,1.365785,-0.522087
3,-1.443355,-1.596267,-1.557750,-1.227812,0.895029,-0.606241,-1.429278,0.539493,-0.372993,1.212862,-0.137408,-0.358521,-0.690595,1.365785,-0.522087
4,-1.443355,-1.596267,-1.557750,-1.083403,0.427888,-0.606241,-1.429278,0.539493,-0.355055,2.496781,-0.137408,-0.358521,-0.690595,1.365785,-0.522087
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41752,1.382914,1.588301,1.743168,1.082736,-0.984399,-1.714775,-1.182873,1.704472,4.194201,-0.071057,-0.137408,-0.358521,1.448027,-0.732180,-0.522087
41753,1.382914,1.588301,1.743168,1.227145,-0.962671,-1.645491,-1.265008,1.704472,4.311298,-0.071057,-0.137408,-0.358521,1.448027,-0.732180,-0.522087
41754,1.382914,1.588301,1.743168,1.371554,-0.962671,-1.645491,-1.265008,1.704472,4.410458,-0.071057,-0.137408,-0.358521,1.448027,-0.732180,-0.522087
41755,1.382914,1.588301,1.743168,1.515963,-0.984399,-1.645491,-1.347143,1.704472,4.491479,-0.071057,-0.137408,-0.358521,1.448027,-0.732180,-0.522087


## Split into training and testing sets

In [39]:
# Make an X variable with all columns except ___
X_full = scaled_df.drop(columns = ['pm2.5'])
X_full.columns

Index(['year', 'month', 'day', 'hour', 'DEWP', 'TEMP', 'PRES', 'Iws', 'Is',
       'Ir', 'cbwd_NE', 'cbwd_NW', 'cbwd_SE', 'cbwd_cv'],
      dtype='object')

In [40]:
select_features = ['DEWP', 'TEMP', 'PRES', 'Iws', 'Is', 'Ir', 'month',	'day',	'hour']

# Create another variable X_sel with only the columns
# in the "select_features" list
X_sel = scaled_df[select_features]
X_sel.head()

Unnamed: 0,DEWP,TEMP,PRES,Iws,Is,Ir,month,day,hour
0,-1.229791,-1.347143,0.345329,-0.444944,-0.071057,-0.137408,-1.596267,-1.55775,-1.66104
1,-1.160508,-1.347143,0.345329,-0.427007,-0.071057,-0.137408,-1.596267,-1.55775,-1.516631
2,-0.883375,-1.429278,0.442411,-0.409069,-0.071057,-0.137408,-1.596267,-1.55775,-1.372221
3,-0.606241,-1.429278,0.539493,-0.372993,1.212862,-0.137408,-1.596267,-1.55775,-1.227812
4,-0.606241,-1.429278,0.539493,-0.355055,2.496781,-0.137408,-1.596267,-1.55775,-1.083403


In [41]:
y = scaled_df["pm2.5"].values.reshape(-1, 1)

In [42]:
X_full_train, X_full_test, X_sel_train, X_sel_test, y_train, y_test = train_test_split(X_full, X_sel, y, random_state=42)

## Train the models

In [43]:
# Create the models
lr1 = LinearRegression()
lr2 = LinearRegression()

# Fit the first model to the full training data. 
lr1.fit(X_full_train, y_train)

# Fit the second model to the select training data.
lr2.fit(X_sel_train, y_train)

In [44]:
lr1.coef_

array([[ 0.00812852, -0.0414113 ,  0.06806361,  0.09550738,  0.69692267,
        -0.87179755, -0.17224458, -0.10610593, -0.02893383, -0.10023634,
        -0.0417115 , -0.07992031,  0.06321907,  0.04993816]])

In [45]:
lr2.coef_

array([[ 0.74868767, -0.86573929, -0.16150138, -0.13414599, -0.02373184,
        -0.11264697, -0.05727719,  0.07026255,  0.12134789]])

In [46]:
lr3 = sm.OLS(y_train, X_full_train).fit()
lr3.pvalues.sort_values()

DEWP       0.000000e+00
TEMP       0.000000e+00
cbwd_NW    4.115681e-98
Ir         8.102103e-92
Iws        4.515049e-86
PRES       5.408518e-78
hour       3.003968e-76
cbwd_SE    1.671694e-69
day        9.895313e-46
cbwd_cv    2.204149e-39
cbwd_NE    1.285455e-23
month      8.039967e-16
Is         3.130102e-09
year       9.362489e-02
dtype: float64

In [47]:
lr4 = sm.OLS(y_train, X_sel_train).fit()
lr4.pvalues.sort_values()

DEWP      0.000000e+00
TEMP      0.000000e+00
Iws      1.505054e-147
hour     1.407457e-124
Ir       9.750167e-114
PRES      9.044123e-68
day       1.392573e-47
month     6.421009e-29
Is        1.474330e-06
dtype: float64

In [48]:
# Calculate the mean_squared_error and the r-squared value
# for the testing data

# Use our models to make predictions

predicted1 = lr1.predict(X_full_test)
predicted2 = lr2.predict(X_sel_test)

# Score the predictions with mse and r2
mse1 = mean_squared_error(y_test, predicted1)
r21 = r2_score(y_test, predicted1)
mse2 = mean_squared_error(y_test, predicted2)
r22 = r2_score(y_test, predicted2)

print(f"All Features:")
print(f"mean squared error (MSE): {mse1}")
print(f"R-squared (R2): {r21}")
print("---------------------")
print(f"Select Features:")
print(f"mean squared error (MSE): {mse2}")
print(f"R-squared (R2): {r22}")

All Features:
mean squared error (MSE): 0.7465322739444719
R-squared (R2): 0.27279444774759387
---------------------
Select Features:
mean squared error (MSE): 0.7650062911083578
R-squared (R2): 0.25479869816934675


In [49]:
# Provided code to create the adjusted r-squared function
def r2_adj(x, y, model):
    r2 = model.score(x,y)
    n_cols = x.shape[1]
    return 1 - (1 - r2) * (len(y) - 1) / (len(y) - n_cols - 1)

In [50]:
# Calculate the adjusted r-squared value of the model
adj_score1 = r2_adj(X_full_test, y_test, lr1)
adj_score2 = r2_adj(X_sel_test, y_test, lr2)
print(f"All Features Adjusted R2: {adj_score1}")
print(f"Select Features Adjusted R2: {adj_score2}")

All Features Adjusted R2: 0.2718178647517633
Select Features Adjusted R2: 0.25415566732404704


In [51]:
# Examine linear regression on the better training data using cross validation
cv_scores = cross_val_score(LinearRegression(), X_full_train, y_train, scoring = "r2")
print(f"All scores: {cv_scores}")
print(f"Mean score: {cv_scores.mean()}")
print(f"Standard Deviation: {cv_scores.std()}")

All scores: [0.27714359 0.26220975 0.28403888 0.27984968 0.27369142]
Mean score: 0.2753866651441383
Standard Deviation: 0.007407712970151173


## Model Validation

In [52]:
import pipeline_utilities as p_util
from pipeline_utilities import pollution_model_generator

In [53]:
p_util.pollution_model_generator(Beijing_df)

Mean Squared Error: 5886.246707210137
R-squared: 0.2750560578010697
Adjusted R-squared: 0.27498660542109277


In [54]:
NE_Beijing_df = pd.read_csv('https://static.bc-edx.com/ai/ail-v-1-0/m12/lesson_3/datasets/beijing-pm2-5-NE.csv')

NW_Beijing_df = pd.read_csv('https://static.bc-edx.com/ai/ail-v-1-0/m12/lesson_3/datasets/beijing-pm2-5-NW.csv')

SE_Beijing_df = pd.read_csv('https://static.bc-edx.com/ai/ail-v-1-0/m12/lesson_3/datasets/beijing-pm2-5-SE.csv')

Calm_Beijing_df = pd.read_csv('https://static.bc-edx.com/ai/ail-v-1-0/m12/lesson_3/datasets/beijing-pm2-5-cv.csv')

In [55]:
p_util.pollution_model_generator(NE_Beijing_df)
print('-'*50)
p_util.pollution_model_generator(NW_Beijing_df)
print('-'*50)
p_util.pollution_model_generator(SE_Beijing_df)
print('-'*50)
p_util.pollution_model_generator(Calm_Beijing_df)

Mean Squared Error: 6865.704701681553
R-squared: 0.28629819531366063
Adjusted R-squared: 0.2856969301033099
--------------------------------------------------
Mean Squared Error: 5272.706604741196
R-squared: 0.29921083156913697
Adjusted R-squared: 0.29900282053665517
--------------------------------------------------
Mean Squared Error: 5131.758988772725
R-squared: 0.188610444087009
Adjusted R-squared: 0.1883876572786859
--------------------------------------------------
Mean Squared Error: 7617.255019256973
R-squared: 0.1894396158688194
Adjusted R-squared: 0.1890767866906049
