In [1]:
# Aim: To predict Particulate Matter2.5(PM) through meteorological, fire, and geographical based features...
# for Diwali and Non-Diwali day

In [2]:
import pandas as pd
from sklearn.linear_model import LinearRegression # Method1
from statsmodels.formula.api import ols # Method 2

#### Data Loading and Preparation

In [3]:
df=pd.read_excel(r'C:/Users/LENOVO/Documents/Diwali_Impact_coding/Modelling/ModellingData_Type2.xlsx', usecols=['PM','Temp','RH','WS','Type','FRP','Popu','Region','Traffic_SitesB'])

In [4]:
#df.shape#(548, 9)
df.head()

Unnamed: 0,PM,Temp,RH,WS,Type,FRP,Popu,Region,Traffic_SitesB
0,7.0,24.41,82.12,0.96,Diwali,181.18,SemiUrban,NorthEast,1
1,14.142857,26.281429,75.231905,0.398571,NoNDiwali,181.18,SemiUrban,NorthEast,1
2,5.375,20.78625,95.135417,1.943333,Diwali,129.83,Urban,NorthEast,1
3,44.872917,24.365208,89.651875,0.682292,NoNDiwali,129.83,Urban,NorthEast,1
4,10.081818,20.78625,94.906667,1.506429,Diwali,129.83,Urban,NorthEast,1


In [5]:
#Features Details:

# PM-Particulate Matter (PM2.5), Temp-Temperature, Ws- Wind Speed,RH-Relative Humidity
# FRP- Fire radiative Power (from MODIS NASA)
# Popu- Location/City Characteristics based on population: Metropolitan, Urban, and Semi-Urban
# Region- Different parts of India: North, Central, West, Northeast, East and South
# Traffic_SitesB- 1: Traffic affected site, 0: Non-Traffic site

In [6]:
df_predict=pd.read_excel('C:/Users/LENOVO/Documents/Machine_Learning_coding/Data.xlsx') 

In [7]:
#df_predict.shape #(16, 8)
df_predict.head()

Unnamed: 0,Temp,RH,WS,Type,FRP,Popu,Region,Traffic_SitesB
0,24.365208,89.651875,0.682292,NoNDiwali,129.83,Urban,NorthEast,1
1,28.33875,53.098958,0.364583,NoNDiwali,62685.3,Urban,North,0
2,22.795833,54.392126,0.848118,Diwali,64874.91,Metropolitan,North,1
3,22.795833,57.59125,1.214167,Diwali,64874.91,Metropolitan,North,0
4,22.801708,59.742322,0.640521,NoNDiwali,64874.91,Metropolitan,North,0


In [8]:
df.dropna(inplace=True)

In [9]:
df.corr() # Checking weather any high correlation exists between features or not.

Unnamed: 0,PM,Temp,RH,WS,FRP,Traffic_SitesB
PM,1.0,-0.189354,-0.437362,-0.01422,0.58776,-0.121305
Temp,-0.189354,1.0,-0.083454,0.022858,-0.413304,-0.029372
RH,-0.437362,-0.083454,1.0,0.025037,-0.326871,0.154137
WS,-0.01422,0.022858,0.025037,1.0,-0.106175,0.077805
FRP,0.58776,-0.413304,-0.326871,-0.106175,1.0,-0.243627
Traffic_SitesB,-0.121305,-0.029372,0.154137,0.077805,-0.243627,1.0


In [10]:
## Converting all non-numerical variable through creating dummies into encoding(0,1).
df=pd.get_dummies(df, columns = ['Type','Popu','Region']) 

## 1: scikit-learn

###### Fitting to model

In [11]:
model_LR=LinearRegression() # defining Linearregression model

In [12]:
X=df.loc[:,df.columns!='PM'] # Indepenedent/Predictor Variables
y=df.loc[:,'PM'] # Dependent Variable

In [13]:
model_LR.fit(X,y)

LinearRegression()

In [14]:
model_LR.coef_ 

array([ 1.11655012e+00, -9.83372830e-01,  1.58621777e+00,  8.02892344e-04,
        3.48880916e+00,  1.66409472e+01, -1.66409472e+01,  2.01481435e+01,
       -2.13558523e+01,  1.20770879e+00,  1.39851140e+01, -8.52917297e+00,
        2.12491630e+01, -2.35777254e+01, -1.40564562e+01,  1.09290776e+01])

In [15]:
model_LR.intercept_

99.03845743956944

In [16]:
model_LR.score(X,y) # R-square

0.5255375732548951

In [17]:
def Adj_Rsqr(model,X,y):
    yp = model.predict(X) # yp=y predicted
    SSR = sum((y-yp)**2) # Sum of squares due to regression  
    SST = sum((y-np.mean(y))**2) # Total sum of squares    
    r_squared = 1 - (float(SSR))/SST
    Adj_r_squared = 1 - (1-r_squared)*(len(y)-1)/(len(y)-X.shape[1]-1)
    return  Adj_r_squared

In [18]:
Adj_Rsqr(model_LR,X,y)

0.5070219663575246

##### Prediction

In [19]:
df_predict=pd.get_dummies(df_predict, columns = ['Type','Region','Popu'])

In [20]:
df_predict.shape

(16, 16)

In [21]:
model_LR.predict(df_predict)

array([ 51.0307719 , 124.86771441, 122.19601641, 116.14190925,
        79.84134078, 143.98169082,  76.91967373,  25.2743914 ,
        95.43017987, 102.1306858 ,  83.31991849,  56.99706497,
        -8.69078615,  54.28253302, 132.69237654,  32.86162709])

## 2: statsmodels

#### Modelling

In [22]:
# Dependent/Predicted Variable=PM
model_ols=ols('PM~Temp+RH+WS+FRP+Traffic_SitesB+Type_Diwali+Type_NoNDiwali+Region_Central+Region_East+Region_North+Region_NorthEast+Region_South+Region_West+Popu_Metropolitan+Popu_SemiUrban+Popu_Urban',data=df).fit()

In [23]:
print(model_ols.summary())

                            OLS Regression Results                            
Dep. Variable:                     PM   R-squared:                       0.526
Model:                            OLS   Adj. R-squared:                  0.511
Method:                 Least Squares   F-statistic:                     35.19
Date:                Sun, 02 Apr 2023   Prob (F-statistic):           8.31e-59
Time:                        13:18:34   Log-Likelihood:                -2233.5
No. Observations:                 427   AIC:                             4495.
Df Residuals:                     413   BIC:                             4552.
Df Model:                          13                                         
Covariance Type:            nonrobust                                         
                        coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------
Intercept            49.5192     15.40

In [24]:
model_ols.pvalues

Intercept            1.412296e-03
Temp                 2.142646e-01
RH                   1.927635e-07
WS                   2.065949e-01
FRP                  2.136865e-06
Traffic_SitesB       5.082478e-01
Type_Diwali          1.773617e-07
Type_NoNDiwali       3.254884e-01
Region_Central       3.719143e-02
Region_East          9.703829e-01
Region_North         6.004814e-04
Region_NorthEast     1.521124e-01
Region_South         3.639460e-01
Region_West          3.348838e-03
Popu_Metropolitan    5.223263e-07
Popu_SemiUrban       7.290023e-01
Popu_Urban           1.450331e-02
dtype: float64

In [25]:
model_ols.params

Intercept            49.519229
Temp                  1.116550
RH                   -0.983373
WS                    1.586218
FRP                   0.000803
Traffic_SitesB        3.488809
Type_Diwali          41.400562
Type_NoNDiwali        8.118667
Region_Central       22.238319
Region_East          -0.275968
Region_North         29.502368
Region_NorthEast    -15.324521
Region_South         -5.803251
Region_West          19.182282
Popu_Metropolitan    36.654553
Popu_SemiUrban       -4.849443
Popu_Urban           17.714118
dtype: float64

#### Prediction

In [26]:
model_ols.predict(df_predict)

0       3.746564
1     135.187800
2     185.963340
3     179.909232
4     143.608664
5     154.301776
6      54.885981
7      63.472977
8     133.628765
9      86.882034
10    116.725702
11     90.402848
12     47.861762
13     57.387844
14    116.807978
15     38.388910
dtype: float64