In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the dataset from the CSV file and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='article.csv')
df.head(n=5)

Unnamed: 0,ID,Gender,Ever_Married,Graduated,Spending_Score,Segmentation
0,462809,Male,No,No,Low,D
1,462643,Female,Yes,Yes,Average,A
2,466315,Female,Yes,Yes,Low,B
3,461735,Male,Yes,Yes,High,B
4,462669,Female,Yes,Yes,High,A


In [3]:
# Build the model-ready frame from `df` WITHOUT overwriting `df`.
# Reassigning `df` in place made this cell non-idempotent: a second run raised
# KeyError when dropping 'ID' again and would have mapped the already-encoded
# target to NaN. `df` is now left exactly as loaded.

# integer codes for the target classes
target_mapping = {"A": 0, "B": 1, "C": 2, "D": 3}

# Fail fast on labels (or missing values) that the mapping does not cover;
# Series.map() would otherwise silently turn them into NaN.
unmapped_labels = df.loc[~df['Segmentation'].isin(list(target_mapping)), 'Segmentation'].unique()
if len(unmapped_labels) > 0:
    raise ValueError(
        f"Segmentation contains values missing from target_mapping: {list(unmapped_labels)}"
    )

# drop the identifier column and encode the target variable
df_encoded = (
    df.drop(columns=['ID'])
      .assign(Segmentation=lambda frame: frame['Segmentation'].map(target_mapping))
)

# one-hot encode the categorical columns; drop_first=True drops the first
# level of each column so the remaining indicators are not redundant
one_hot_df = pd.get_dummies(
    df_encoded,
    columns=['Gender', 'Ever_Married', 'Graduated', 'Spending_Score'],
    drop_first=True,
)
one_hot_df.head()

Unnamed: 0,Segmentation,Gender_Male,Ever_Married_Yes,Graduated_Yes,Spending_Score_High,Spending_Score_Low
0,3,1,0,0,0,1
1,0,0,1,1,0,0
2,1,0,1,1,0,1
3,1,1,1,1,1,0
4,0,0,1,1,1,0


In [4]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [5]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Features are every encoded column except the target.
X = one_hot_df.drop(columns=['Segmentation'])
y = one_hot_df['Segmentation']

# Fixed seed shared by the split and the forest so that the reported accuracy
# is reproducible after a kernel restart (both steps are otherwise random).
RANDOM_STATE = 42

# split the data: 70% train / 30% test
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=RANDOM_STATE
)

# parameters for the model (also logged to MLflow later in the notebook)
max_depth = 5
n_estimators = 100

# create the model and fit it on the training split
rf = RandomForestClassifier(
    n_estimators=n_estimators,
    max_depth=max_depth,
    random_state=RANDOM_STATE,
)
rf.fit(X_train, y_train)

# predict on the held-out split and compute accuracy, rounded to 2 decimals
y_pred = rf.predict(X_test)
acc = round(accuracy_score(y_test, y_pred), 2)

### Tracking with MLflow

In [1]:
import mlflow
import mlflow.sklearn

# Point the client at the tracking server BEFORE selecting the experiment.
# set_experiment() looks up / creates the experiment in whichever tracking
# store is active when it is called, so calling it first would resolve
# "Classification_Task" against the default store instead of this server.
mlflow.set_tracking_uri("http://127.0.0.1:5000")
mlflow.set_experiment("Classification_Task")

# Log the hyperparameters, the test accuracy and the fitted model in one run.
# Relies on max_depth, n_estimators, acc and rf from the training cell.
with mlflow.start_run(run_name="RandomForest_Classifier"):
    mlflow.log_param("max_depth", max_depth)
    mlflow.log_param("n_estimators", n_estimators)

    mlflow.log_metric("Accuracy", acc)
    mlflow.sklearn.log_model(rf, "RandomForest Classifier")
    print("Logging Complete")

### Tracking with MLflow autolog()

In [12]:
# Hyperparameters for the autologged run. random_state is fixed so the fitted
# forest (and therefore the metrics recorded for this run) is reproducible.
params = {
    "max_depth": 5,
    "n_estimators": 100,
    "random_state": 42,
}

# Enable scikit-learn autologging, then train inside a run; no explicit
# log_param / log_metric / log_model calls are made in this cell.
mlflow.sklearn.autolog()
with mlflow.start_run():
    rf = RandomForestClassifier(**params)
    rf.fit(X_train, y_train)
    y_pred = rf.predict(X_test)
    acc = round(accuracy_score(y_test, y_pred), 2)

    print("Logging Complete")

Logging Complete


### List all the runs in an experiment using MLflow

In [21]:
# Fetch every run recorded under the experiment; the result is displayed as the cell output.
mlflow.search_runs(
    experiment_names=["Classification_Task"],
)

Unnamed: 0,run_id,experiment_id,status,artifact_uri,start_time,end_time,metrics.training_score,metrics.training_roc_auc,metrics.training_log_loss,metrics.training_f1_score,...,params.max_depth,params.n_jobs,params.warm_start,tags.mlflow.source.name,tags.mlflow.source.type,tags.mlflow.user,tags.estimator_name,tags.mlflow.runName,tags.mlflow.log-model.history,tags.estimator_class
0,b063b2d014cc4dfaabbbb930c452418f,656952028390482534,FINISHED,mlflow-artifacts:/656952028390482534/b063b2d01...,2023-03-21 20:45:21.887000+00:00,2023-03-21 20:45:41.583000+00:00,0.416985,0.668269,1.263294,0.407887,...,5,,False,C:\Users\abdurrahman\Anaconda3\lib\site-packag...,LOCAL,abdurrahman,RandomForestClassifier,youthful-ram-582,"[{""run_id"": ""b063b2d014cc4dfaabbbb930c452418f""...",sklearn.ensemble._forest.RandomForestClassifier
1,226c1c420d204efa8d58872906ea8250,656952028390482534,FINISHED,mlflow-artifacts:/656952028390482534/226c1c420...,2023-03-21 20:20:55.104000+00:00,2023-03-21 20:21:29.085000+00:00,,,,,...,5,,,C:\Users\abdurrahman\Anaconda3\lib\site-packag...,LOCAL,abdurrahman,,RandomForest_Classifier,"[{""run_id"": ""226c1c420d204efa8d58872906ea8250""...",
