### Importing necessary libraries

In [13]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline 

from sklearn.preprocessing import StandardScaler,LabelEncoder

from sklearn.model_selection import train_test_split,GridSearchCV,cross_validate
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import GradientBoostingClassifier,AdaBoostClassifier
#importing metrics 
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import accuracy_score,f1_score,precision_score,recall_score,roc_auc_score

# hyperparameter search utilities
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform, randint

In [53]:
# Read the pollution measurements from a CSV located relative to the working directory.
data = pd.read_csv(filepath_or_buffer='updated_pollution_dataset.csv')
# Preview the first five rows to sanity-check the columns.
data.head(n=5)

Unnamed: 0,Temperature,Humidity,PM2.5,PM10,NO2,SO2,CO,Proximity_to_Industrial_Areas,Population_Density,Air Quality
0,29.8,59.1,5.2,17.9,18.9,9.2,1.72,6.3,319,Moderate
1,28.3,75.6,2.3,12.2,30.8,9.7,1.64,6.0,611,Moderate
2,23.1,74.7,26.7,33.8,24.4,12.6,1.63,5.2,619,Moderate
3,27.1,39.1,6.1,6.3,13.5,5.3,1.15,11.1,551,Good
4,26.5,70.7,6.9,16.0,21.9,5.6,1.01,12.7,303,Good


#### Preprocessing 

In [54]:
# Replace the string labels in 'Air Quality' with integer class ids.
# The fitted encoder is kept in `lb` so the ids can be mapped back to labels later.
lb = LabelEncoder()
encoded_air_quality = lb.fit_transform(data['Air Quality'])
data['Air Quality'] = encoded_air_quality  # overwrites the original column in place
data.head()

Unnamed: 0,Temperature,Humidity,PM2.5,PM10,NO2,SO2,CO,Proximity_to_Industrial_Areas,Population_Density,Air Quality
0,29.8,59.1,5.2,17.9,18.9,9.2,1.72,6.3,319,2
1,28.3,75.6,2.3,12.2,30.8,9.7,1.64,6.0,611,2
2,23.1,74.7,26.7,33.8,24.4,12.6,1.63,5.2,619,2
3,27.1,39.1,6.1,6.3,13.5,5.3,1.15,11.1,551,0
4,26.5,70.7,6.9,16.0,21.9,5.6,1.01,12.7,303,0


In [55]:
# Print column names, non-null counts, dtypes and memory usage (quick check for missing values).
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 10 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   Temperature                    5000 non-null   float64
 1   Humidity                       5000 non-null   float64
 2   PM2.5                          5000 non-null   float64
 3   PM10                           5000 non-null   float64
 4   NO2                            5000 non-null   float64
 5   SO2                            5000 non-null   float64
 6   CO                             5000 non-null   float64
 7   Proximity_to_Industrial_Areas  5000 non-null   float64
 8   Population_Density             5000 non-null   int64  
 9   Air Quality                    5000 non-null   int32  
dtypes: float64(8), int32(1), int64(1)
memory usage: 371.2 KB


In [56]:
# Summary statistics (count, mean, std, min, quartiles, max) for every numeric column.
# NOTE(review): the output shows negative minima for PM10 and SO2 — confirm whether these
# are measurement artefacts that should be cleaned before modelling.
data.describe()

Unnamed: 0,Temperature,Humidity,PM2.5,PM10,NO2,SO2,CO,Proximity_to_Industrial_Areas,Population_Density,Air Quality
count,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0
mean,30.02902,70.05612,20.14214,30.21836,26.4121,10.01482,1.500354,8.4254,497.4238,1.3
std,6.720661,15.863577,24.554546,27.349199,8.895356,6.750303,0.546027,3.610944,152.754084,1.187553
min,13.4,36.0,0.0,-0.2,7.4,-6.2,0.65,2.5,188.0,0.0
25%,25.1,58.3,4.6,12.3,20.1,5.1,1.03,5.4,381.0,0.0
50%,29.0,69.8,12.0,21.7,25.3,8.0,1.41,7.9,494.0,1.5
75%,34.0,80.3,26.1,38.1,31.9,13.725,1.84,11.1,600.0,2.0
max,58.6,128.1,295.0,315.8,64.9,44.9,3.72,25.8,957.0,3.0


In [57]:
# Features are every column except the last; the last column ('Air Quality') is the target.
X, y = data.iloc[:, :-1].values, data.iloc[:, -1].values

In [58]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

#### Random Forest 

In [38]:
# Baseline random forest with default hyperparameters.
# random_state is fixed so the reported accuracy is reproducible after a kernel restart.
rf1 = RandomForestClassifier(random_state=42)
rf1.fit(X_train, y_train)
y_pred = rf1.predict(X_test)

# accuracy_score's signature is (y_true, y_pred); keep that order so the call stays
# correct if it is later swapped for an asymmetric metric such as precision or recall.
accuracy_score(y_test, y_pred)

0.953

#### Decision Tree 

In [39]:
# Baseline decision tree with default hyperparameters.
# random_state is fixed because split selection is randomized when candidates tie.
dc = DecisionTreeClassifier(random_state=42)
dc.fit(X_train, y_train)
y_pred = dc.predict(X_test)
# accuracy_score's signature is (y_true, y_pred).
accuracy_score(y_test, y_pred)

0.92

#### Gaussian naive Bayes

In [40]:
# Baseline Gaussian naive Bayes classifier with default settings.
nb = GaussianNB()

nb.fit(X_train, y_train)
y_pred = nb.predict(X_test)
# accuracy_score's signature is (y_true, y_pred); the value is the same either way for
# accuracy, but the conventional order avoids mistakes with asymmetric metrics.
accuracy_score(y_test, y_pred)

0.926

#### Hyperparameter Optimization

In [41]:
# Candidate hyperparameter values sampled by the randomized search for the random forest.
# "sqrt" replaces "auto": for classifiers "auto" meant sqrt(n_features), and it was
# deprecated in scikit-learn 1.1 and removed in 1.3, where it raises a parameter error.
rf_params = {
    "max_depth": [5, 8, 15, None, 10],
    "max_features": [5, 7, "sqrt", 8],
    "min_samples_split": [2, 8, 15, 20],
    "n_estimators": [100, 200, 500, 1000],
}

# (name, estimator, parameter grid) triples consumed by the randomized search loop.
# The estimator is seeded so that score differences between candidates come from the
# hyperparameters rather than from random forest construction noise.
randomcv_models = [
    ('RF', RandomForestClassifier(random_state=42), rf_params),
]


In [42]:
# Run a randomized hyperparameter search for every (name, estimator, grid) entry in
# randomcv_models and record the best parameter set found for each model.
# NOTE: 100 candidates x 3 folds = 300 fits, some with 1000 trees, so this cell is slow;
# lower n_iter for a quick exploratory run.
model_param = {}

for name, model, params in randomcv_models:
    random = RandomizedSearchCV(
        estimator=model,
        param_distributions=params,
        n_iter=100,
        cv=3,
        verbose=2,
        n_jobs=-1,
        random_state=42,  # makes the sampled candidates reproducible between runs
    )
    random.fit(X_train, y_train)
    model_param[name] = random.best_params_

# Report the winning parameters per model.
for model_name, best_params in model_param.items():
    print(f"---------------- Best Params for {model_name} -------------------")
    print(best_params)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


KeyboardInterrupt: 

#### Random forest with optimized parameters 

In [43]:
# Random forest with hand-entered tuned hyperparameters.
# NOTE(review): the search output shown in this notebook was interrupted, so these values
# presumably come from an earlier completed run — confirm.
# random_state is fixed so the reported accuracy is reproducible.
rf = RandomForestClassifier(
    n_estimators=500,
    min_samples_split=2,
    max_features=5,
    max_depth=None,
    random_state=42,
)
rf.fit(X_train, y_train)

In [44]:
# Predict class ids for the held-out test rows with the fitted forest.
y_pred=rf.predict(X_test)

In [45]:
# Fraction of test rows classified correctly; accuracy_score's signature is (y_true, y_pred).
accuracy_score(y_test, y_pred)

0.956

In [59]:
# Standardize the features: statistics are learned from the training rows only and then
# applied to both splits, so no information from the test set leaks into the scaler.
# NOTE: X_train / X_test are overwritten here, so any cell run after this one sees the
# scaled arrays; re-run the train/test split cell to get the raw values back.
sc = StandardScaler()
X_train, X_test = sc.fit_transform(X_train), sc.transform(X_test)


In [60]:
# Refit the tuned random forest on the current X_train (standardized if the scaling cell
# has been run). random_state is fixed so that any accuracy difference from the earlier
# fit reflects the preprocessing rather than random forest construction noise.
rf = RandomForestClassifier(
    n_estimators=500,
    min_samples_split=2,
    max_features=5,
    max_depth=None,
    random_state=42,
)
rf.fit(X_train, y_train)

In [61]:
# Predict class ids for the test rows using the forest refit in the previous cell.
y_pred=rf.predict(X_test)

In [62]:
# Fraction of test rows classified correctly; accuracy_score's signature is (y_true, y_pred).
accuracy_score(y_test, y_pred)

0.958