In [1]:
import os

import pandas as pd, numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, ExtraTreesClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import classification_report, roc_auc_score, recall_score, precision_score, log_loss, f1_score
from imblearn.over_sampling import SMOTE
import warnings, mlflow
warnings.simplefilter("ignore")

In [2]:
# Location of the pollution dataset CSV.
# Set the POLLUTION_DATASET_PATH environment variable to run the notebook on
# another machine; when it is unset the original local path is used.
file = os.environ.get(
    'POLLUTION_DATASET_PATH',
    r'C:\Users\GANAPA\Downloads\MLFlow (MLOps)\pollution_dataset.csv',
)
df = pd.read_csv(file)
df.head()

Unnamed: 0,Temperature,Humidity,PM2.5,PM10,NO2,SO2,CO,Proximity_to_Industrial_Areas,Population_Density,Air Quality
0,29.8,59.1,5.2,17.9,18.9,9.2,1.72,6.3,319,Moderate
1,28.3,75.6,2.3,12.2,30.8,9.7,1.64,6.0,611,Moderate
2,23.1,74.7,26.7,33.8,24.4,12.6,1.63,5.2,619,Moderate
3,27.1,39.1,6.1,6.3,13.5,5.3,1.15,11.1,551,Good
4,26.5,70.7,6.9,16.0,21.9,5.6,1.01,12.7,303,Good


In [3]:
# Summarise column dtypes and non-null counts to check the schema after loading.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 10 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   Temperature                    5000 non-null   float64
 1   Humidity                       5000 non-null   float64
 2   PM2.5                          5000 non-null   float64
 3   PM10                           5000 non-null   float64
 4   NO2                            5000 non-null   float64
 5   SO2                            5000 non-null   float64
 6   CO                             5000 non-null   float64
 7   Proximity_to_Industrial_Areas  5000 non-null   float64
 8   Population_Density             5000 non-null   int64  
 9   Air Quality                    5000 non-null   object 
dtypes: float64(8), int64(1), object(1)
memory usage: 390.8+ KB


In [4]:
# Number of missing values in each column.
df.isnull().sum()

Temperature                      0
Humidity                         0
PM2.5                            0
PM10                             0
NO2                              0
SO2                              0
CO                               0
Proximity_to_Industrial_Areas    0
Population_Density               0
Air Quality                      0
dtype: int64

In [5]:
# Separate the label column from the predictor columns.
y = df['Air Quality']
x = df.drop(columns=['Air Quality'])

In [6]:
# Class distribution of the target before any resampling.
y.value_counts()

Air Quality
Good         2000
Moderate     1500
Poor         1000
Hazardous     500
Name: count, dtype: int64

In [7]:
# Oversample the under-represented classes with SMOTE. The fixed seed makes the
# synthetic samples (and every score computed from them) reproducible.
# NOTE(review): resampling is applied to the full dataset, before the
# train/test split further down. Synthetic points interpolated from rows that
# later land in the test set can therefore end up in the training set, so the
# reported scores are likely optimistic. Consider splitting first and
# resampling only the training portion.
smote = SMOTE(random_state=42)
x_resampled, y_resampled = smote.fit_resample(x, y)

In [8]:
# Class distribution after SMOTE; compare with the pre-resampling counts.
y_resampled.value_counts()

Air Quality
Moderate     2000
Good         2000
Hazardous    2000
Poor         2000
Name: count, dtype: int64

In [9]:
# Point MLflow at the tracking server used for all runs in this notebook.
# Assumes an MLflow tracking server is reachable at this address.
# The experiment itself is selected later, right before the logging loop,
# so the call below is left disabled.
#mlflow.set_experiment('Air Quality Prediction')
mlflow.set_tracking_uri('http://localhost:5000')

In [10]:
# Hold out 20% of the resampled data for evaluation; the seed fixes the shuffle.
xtrain, xtest, ytrain, ytest = train_test_split(
    x_resampled,
    y_resampled,
    test_size=0.2,
    random_state=42,
)

In [11]:
# Quick baseline: fit an ExtraTrees model and predict on the held-out split.
# Both estimators are seeded so the reports in the following cells can be
# reproduced on a fresh kernel.
rf = RandomForestClassifier(random_state=42)  # NOTE(review): not used by any visible cell
extratrees = ExtraTreesClassifier(random_state=42)
extratrees.fit(xtrain, ytrain)
ypred = extratrees.predict(xtest)

In [133]:
# Plain-text per-class precision / recall / F1 for the predictions currently held in `ypred`.
report = classification_report(ytest,ypred)
print(report)

              precision    recall  f1-score   support

        Good       1.00      1.00      1.00       418
   Hazardous       0.99      0.97      0.98       385
    Moderate       0.97      0.98      0.98       417
        Poor       0.95      0.96      0.96       380

    accuracy                           0.98      1600
   macro avg       0.98      0.98      0.98      1600
weighted avg       0.98      0.98      0.98      1600



In [134]:
# Same report as a nested dict (label -> metric -> value) so individual
# scores can be looked up programmatically.
report_dict = classification_report(ytest,ypred,output_dict=True)
report_dict

{'Good': {'precision': 1.0,
  'recall': 0.9976076555023924,
  'f1-score': 0.9988023952095808,
  'support': 418.0},
 'Hazardous': {'precision': 0.9868421052631579,
  'recall': 0.974025974025974,
  'f1-score': 0.9803921568627451,
  'support': 385.0},
 'Moderate': {'precision': 0.9714964370546318,
  'recall': 0.9808153477218226,
  'f1-score': 0.9761336515513126,
  'support': 417.0},
 'Poor': {'precision': 0.9528795811518325,
  'recall': 0.9578947368421052,
  'f1-score': 0.9553805774278216,
  'support': 380.0},
 'accuracy': 0.978125,
 'macro avg': {'precision': 0.9778045308674055,
  'recall': 0.9775859285230736,
  'f1-score': 0.977677195262865,
  'support': 1600.0},
 'weighted avg': {'precision': 0.9782140410098711,
  'recall': 0.978125,
  'f1-score': 0.9781517085682695,
  'support': 1600.0}}

In [19]:
# Class counts of the target column in `df` (the frame as loaded, not the resampled data).
# NOTE(review): this repeats the earlier `y.value_counts()` check.
df['Air Quality'].value_counts()

Air Quality
Good         2000
Moderate     1500
Poor         1000
Hazardous     500
Name: count, dtype: int64

In [13]:
# Candidate classifiers compared in the loops below.
# Every estimator with a stochastic component is seeded so that repeated runs
# (and the metrics logged to MLflow) are reproducible.
# KNeighborsClassifier and GaussianNB take no seed; LogisticRegression is left
# at its defaults.
# NOTE: `gb` is GaussianNB (not gradient boosting) and `rn` is the random
# forest; the names are kept because later cells reference them.
lr = LogisticRegression()
dt = DecisionTreeClassifier(random_state=42)
rn = RandomForestClassifier(random_state=42)
knn = KNeighborsClassifier()
gb = GaussianNB()
sgd = SGDClassifier(random_state=42)
gbc = GradientBoostingClassifier(random_state=42)
abc = AdaBoostClassifier(random_state=42)
extratrees = ExtraTreesClassifier(random_state=42)

In [14]:
# Distinct target labels present in the data.
df['Air Quality'].unique()

array(['Moderate', 'Good', 'Hazardous', 'Poor'], dtype=object)

In [20]:
# Fit every candidate model on the training split and collect its test-set
# classification report.
#   models             - display names, in evaluation order
#   model_dict         - display name -> full classification_report dict
#   model_dict_df      - display name -> test accuracy
#   evaluation_results - display name -> report as a tidy DataFrame
Li_model = [lr, dt, rn, knn, gb, sgd, gbc, abc, extratrees]
models = []
model_dict = {}
model_dict_df = {}
evaluation_results = {}
for model in Li_model:
    # Turn the class name into a readable label, e.g. "RandomForest Classifier".
    model_name = type(model).__name__.replace('Classifier', ' Classifier').replace('Regression', ' Regression')
    print(model_name)
    model.fit(xtrain, ytrain)
    ypred = model.predict(xtest)
    report = classification_report(ytest, ypred, output_dict=True)
    model_dict[model_name] = report
    # Keep one table per model. Previously this name was rebound to a single
    # DataFrame on every iteration, so only the last model's table survived.
    evaluation_results[model_name] = (
        pd.DataFrame(report)
        .transpose()
        .reset_index()
        .rename(columns={'index': 'Metrics'})
    )
    models.append(model_name)
    model_dict_df[model_name] = report['accuracy']

Logistic Regression
DecisionTree Classifier
RandomForest Classifier
KNeighbors Classifier
GaussianNB
SGD Classifier
GradientBoosting Classifier
AdaBoost Classifier
ExtraTrees Classifier


In [None]:
# Train every candidate model inside its own MLflow run and log the model,
# its hyper-parameters, train/test metrics, the source CSV and the datasets.
experiment_name = 'Air Quality Pred'
mlflow.set_experiment(experiment_name)

Li_model = [lr, dt, rn, knn, gb, sgd, gbc, abc, extratrees]
models = []
model_dict = dict()

# The dataset descriptors are the same for every run, so build them once.
train_dataset = mlflow.data.from_pandas(xtrain)
eval_dataset = mlflow.data.from_pandas(xtest)

for model in Li_model:
    # Turn the class name into a readable label, e.g. "RandomForest Classifier".
    model_name = type(model).__name__.replace('Classifier', ' Classifier').replace('Regression', ' Regression')
    # The context manager closes the run on exit (also on error), so no
    # explicit mlflow.end_run() is needed afterwards.
    with mlflow.start_run(run_name=model_name, description = f'{model_name} model for air quality classification using metrics like accuracy and recall for various classes.') as run:
        model.fit(xtrain, ytrain)
        ypred = model.predict(xtest)
        # Predict on the training split once and reuse it for all training metrics.
        ytrain_pred = model.predict(xtrain)
        report = classification_report(ytest, ypred, output_dict=True)
        model_dict[model_name] = report

        mlflow.sklearn.log_model(model, model_name, input_example = xtrain.iloc[0:1])
        metrics = {
            'Accuracy': report['accuracy'],
            'F1_Score_Macro': report['macro avg']['f1-score'],
            'Train_score': model.score(xtrain, ytrain),
            'Test_score': model.score(xtest, ytest),
            'Training_f1_score': f1_score(ytrain, ytrain_pred, average='macro'),
            'Training_precision_score': precision_score(ytrain, ytrain_pred, average='macro'),
            'Training_recall_score': recall_score(ytrain, ytrain_pred, average='macro'),
        }

        # Per-class recall/precision; skip the aggregate rows of the report.
        for class_label, class_scores in report.items():
            if class_label not in ['accuracy', 'macro avg', 'weighted avg']:
                metrics[f'Recall_Class_{class_label}'] = class_scores['recall']
                metrics[f'Precision_Class_{class_label}'] = class_scores['precision']
        # Only probabilistic models can report a log loss. Passing the model's
        # own class order keeps the probability columns aligned with the labels.
        if hasattr(model, "predict_proba"):
            metrics['Training_log_loss'] = log_loss(ytrain, model.predict_proba(xtrain), labels=model.classes_)
        mlflow.log_metrics(metrics)
        mlflow.log_params(model.get_params())

        model_uri = f'runs:/{run.info.run_id}/{model_name}'
        # mlflow.register_model(model_uri, model_name)    # to register the model for deployment

        mlflow.log_params({'Model': model_name})
        mlflow.log_artifact(file)
        models.append(model_name)

        mlflow.log_input(train_dataset, context="Train")
        mlflow.log_input(eval_dataset, context="Eval")

Downloading artifacts: 100%|██████████| 7/7 [00:00<00:00, 1750.34it/s]


🏃 View run LogisticRegression at: http://localhost:5000/#/experiments/964761882172761097/runs/7b1b03ab7069413b9e357547d919da94
🧪 View experiment at: http://localhost:5000/#/experiments/964761882172761097


Downloading artifacts: 100%|██████████| 7/7 [00:00<00:00, 1749.29it/s]


🏃 View run DecisionTreeClassifier at: http://localhost:5000/#/experiments/964761882172761097/runs/3f021a33d30f4594bf76f555bd5c1fb8
🧪 View experiment at: http://localhost:5000/#/experiments/964761882172761097


Downloading artifacts: 100%|██████████| 7/7 [00:00<00:00, 1750.23it/s]


🏃 View run RandomForestClassifier at: http://localhost:5000/#/experiments/964761882172761097/runs/d8cc3ad7f14342cdaecdc86a0c6c6acc
🧪 View experiment at: http://localhost:5000/#/experiments/964761882172761097


Downloading artifacts: 100%|██████████| 7/7 [00:00<00:00, 1400.10it/s]


🏃 View run KNeighborsClassifier at: http://localhost:5000/#/experiments/964761882172761097/runs/8c5aad5cce7a4ecd90eb0ea77f5a64ce
🧪 View experiment at: http://localhost:5000/#/experiments/964761882172761097


Downloading artifacts: 100%|██████████| 7/7 [00:00<00:00, 1749.61it/s]


🏃 View run GaussianNB at: http://localhost:5000/#/experiments/964761882172761097/runs/45113a05cd85431b8624741e520eadd4
🧪 View experiment at: http://localhost:5000/#/experiments/964761882172761097


Downloading artifacts: 100%|██████████| 7/7 [00:00<00:00, 1751.38it/s]


🏃 View run SGDClassifier at: http://localhost:5000/#/experiments/964761882172761097/runs/084273499d0f4f179e3226db96cf7d1c
🧪 View experiment at: http://localhost:5000/#/experiments/964761882172761097


Downloading artifacts: 100%|██████████| 7/7 [00:00<00:00, 2332.58it/s]


🏃 View run GradientBoostingClassifier at: http://localhost:5000/#/experiments/964761882172761097/runs/2622547cf47c41dbb46060ac62a2d52e
🧪 View experiment at: http://localhost:5000/#/experiments/964761882172761097


Downloading artifacts: 100%|██████████| 7/7 [00:00<00:00, 1397.37it/s]


🏃 View run AdaBoostClassifier at: http://localhost:5000/#/experiments/964761882172761097/runs/f2ed1bc8200842369f6f9403f5a62669
🧪 View experiment at: http://localhost:5000/#/experiments/964761882172761097


Downloading artifacts: 100%|██████████| 7/7 [00:00<00:00, 1000.11it/s]


🏃 View run ExtraTreesClassifier at: http://localhost:5000/#/experiments/964761882172761097/runs/50ea5b1dc2fd47e98788e9afe9de68ad
🧪 View experiment at: http://localhost:5000/#/experiments/964761882172761097


In [28]:
# Display names of the models collected by the training loop, in insertion order.
# NOTE(review): the saved output below does not match the order of `Li_model`
# (and lacks the ExtraTrees entry); it looks stale - re-run to refresh it.
print(models)

['Logistic Regression', 'SGD Classifier', 'KNeighbors Classifier', 'GaussianNB', 'RandomForest Classifier', 'DecisionTree Classifier', 'SGD Classifier', 'GradientBoosting Classifier', 'AdaBoost Classifier']


In [30]:
# Use a registered model to predict.
# NOTE(review): the register_model call in the logging loop is commented out,
# so this presumably relies on the model having been registered some other
# way (e.g. through the MLflow UI) - confirm before running.

name = 'RandomForest Classifier'
model_version = '1'
model_uri = 'models:/{}/{}'.format(name, model_version)
load_model = mlflow.sklearn.load_model(model_uri)
y_pred = load_model.predict(xtest)
y_pred[:5]

Downloading artifacts: 100%|██████████| 7/7 [00:02<00:00,  3.24it/s]


array(['Hazardous', 'Good', 'Moderate', 'Hazardous', 'Good'], dtype=object)

In [None]:
# Admin code: MLflow experiment housekeeping (listing and deleting experiments)

In [13]:
from mlflow.tracking import MlflowClient

# Print the ID and name of each experiment returned by the tracking client.
client = MlflowClient()
experiments = client.search_experiments()

for experiment in experiments:
    print("Experiment ID: {}, Name: {}".format(experiment.experiment_id, experiment.name))

Experiment ID: 585084346383979865, Name: Air Quality Prediction
Experiment ID: 0, Name: Default


In [None]:
# Delete an experiment by name (destructive - run deliberately).
# Requires `MlflowClient` to have been imported by the preceding cell.

experiment_name = 'Air Quality Prediction'
client = MlflowClient()
experiment = client.get_experiment_by_name(experiment_name)

# Report a missing experiment first; otherwise delete it by ID.
if not experiment:
    print(f"Experiment '{experiment_name}' does not exist.")
else:
    mlflow.delete_experiment(experiment.experiment_id)
    print(f"Experiment '{experiment_name}' deleted.")