In [17]:
from sklearn.datasets import load_iris
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import mlflow
mlflow.set_tracking_uri("http://127.0.0.1:8080")

In [22]:
load_iris = load_iris()

In [3]:
load_iris.keys()

dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename', 'data_module'])

**a) Create a data processing class that implements:**

1.  Conversion of data to pandas DataFrame with proper column names
2.  Feature scaling using StandardScaler
3.  Train-test split with experiment tracking



In [26]:
class IrisDataProcessor:
  def __init__(self,load_iris):
    self.iris = load_iris

  def prepare_data(self):
    X = load_iris.data
    scalar = StandardScaler()
    X = scalar.fit_transform(X)
    y = load_iris.target
    feature_names = load_iris.feature_names

    df = pd.DataFrame(data = X , columns = feature_names)
    df['target'] = y

    return df

  def get_feature_stats(self):
    df = self.prepare_data()
    return df.describe()


In [29]:
iris = IrisDataProcessor(load_iris)
df = iris.prepare_data()
df.head()


Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,-0.900681,1.019004,-1.340227,-1.315444,0
1,-1.143017,-0.131979,-1.340227,-1.315444,0
2,-1.385353,0.328414,-1.397064,-1.315444,0
3,-1.506521,0.098217,-1.283389,-1.315444,0
4,-1.021849,1.249201,-1.340227,-1.315444,0


In [28]:
df_stat = iris.get_feature_stats()
df_stat.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
count,150.0,150.0,150.0,150.0,150.0
mean,-1.468455e-15,-1.823726e-15,-1.610564e-15,-9.473903e-16,1.0
std,1.00335,1.00335,1.00335,1.00335,0.819232
min,-1.870024,-2.433947,-1.567576,-1.447076,0.0
25%,-0.9006812,-0.592373,-1.226552,-1.183812,0.0


In [35]:
X_train ,X_test ,y_train ,y_test = train_test_split(df.drop('target',axis = 1) , df['target'] , test_size = 0.2 , random_state = 42)

Question 2: Experiment Tracking and Model
Development (20 marks)

In [36]:
from sklearn.linear_model import LogisticRegressionCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

lr = LogisticRegressionCV(cv = 5)
rf = RandomForestClassifier()

lr.fit(X_train, y_train)
rf.fit(X_train, y_train)

y_pred_lr = lr.predict(X_test)
y_pred_rf = rf.predict(X_test)

accuracy_lr = accuracy_score(y_test, y_pred_lr)
accuracy_rf = accuracy_score(y_test, y_pred_rf)
precision_lr = precision_score(y_test, y_pred_lr, average='macro')
precision_rf = precision_score(y_test, y_pred_rf, average='macro')
recall_lr = recall_score(y_test, y_pred_lr, average='macro')
recall_rf = recall_score(y_test, y_pred_rf, average='macro')
f1_lr = f1_score(y_test, y_pred_lr, average='macro')
f1_rf = f1_score(y_test, y_pred_rf, average='macro')

In [None]:
class IrisExperiment:
  def __init__(self, data_processor):
    self.data_processor = data_processor
    pass
  def run_experiment(self):
    self.lr_params = {'cv':5,'penalty'='elasticnet'}
    self.lr = LogisticRegressionCV(**lr_params)
    self.rf_params = {'criterion'='gini','bootstrap'=True}
    self.rf = RandomForestClassifier(**rf_params)
    X_train = self.data_processor.X_train
    y_train = self.data_processor.y_train
    lr.fit(X_train, y_train)
    rf.fit(X_train, y_train)



  def log_results(self):
    X_test = self.data_processor.X_test
    y_test = self.data_processor.y_test
    y_pred_lr = lr.predict(X_test)
    y_pred_rf = rf.predict(X_test)

    accuracy_lr = accuracy_score(y_test, y_pred_lr)
    accuracy_rf = accuracy_score(y_test, y_pred_rf)
    precision_lr = precision_score(y_test, y_pred_lr, average='macro')
    precision_rf = precision_score(y_test, y_pred_rf, average='macro')
    recall_lr = recall_score(y_test, y_pred_lr, average='macro')
    recall_rf = recall_score(y_test, y_pred_rf, average='macro')
    f1_lr = f1_score(y_test, y_pred_lr, average='macro')
    f1_rf = f1_score(y_test, y_pred_rf, average='macro')

    metric_lr = {'accuracy': accuracy_lr, 'precision': precision_lr, 'recall': recall_lr, 'f1': f1_lr}
    metric_rf = {'accuracy': accuracy_rf, 'precision': precision_rf, 'recall': recall_rf, 'f1': f1_rf}
    with mlflow.start_run(run_name=logistic) as run:
      # Log the parameters used for the model fit
      mlflow.log_params(self.lr_params)

      # Log the error metrics that were calculated during validation
      mlflow.log_metrics(metrics_lr)

      # Log an instance of the trained model for later use
      mlflow.sklearn.log_model(
          sk_model=rf, input_example=X_val, artifact_path=artifact_path
    )

    with mlflow.start_run(run_name=random_forest) as run:
      mlfolw.log_params(self.rf_params)
      mlflow.log_metrics(metrics_rf)
      mlflow.sklearn.log_model(
          sk_model=rf, input_example=X_val, artifact_path=artifact_path
    )


Question 3: Model Optimization and Testing (15 marks)

In [37]:
lr = LogisticRegressionCV(cv = 5)
rf = RandomForestClassifier()

lr.fit(X_train, y_train)
rf.fit(X_train, y_train)