In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV

%matplotlib inline

# Loading the Dataset

In [2]:
# Read the raw car-purchase records from the CSV file in the data folder.
df = pd.read_csv(filepath_or_buffer='../data/car_data.csv')

In [3]:
# Preview the first five rows of the raw frame.
df.head(n=5)

Unnamed: 0,User ID,Gender,Age,AnnualSalary,Purchased
0,385,Male,35,20000,0
1,681,Male,40,43500,0
2,353,Male,49,74000,0
3,895,Male,40,107500,1
4,661,Male,25,79000,0


In [4]:
# One-hot encode the categorical columns. drop_first=True removes the first
# level of each encoded column, so k levels become k-1 indicator columns.
categorical_variables = ["Gender"]
df_final = pd.get_dummies(
    data=df,
    columns=categorical_variables,
    drop_first=True,
)

In [5]:
# Remove the 'User ID' column, which this notebook treats as useless for modelling.
df_final = df_final.drop(columns=['User ID'])

In [6]:
# Preview the first five rows of the encoded frame.
df_final.head(n=5)

Unnamed: 0,Age,AnnualSalary,Purchased,Gender_Male
0,35,20000,0,1
1,40,43500,0,1
2,49,74000,0,1
3,40,107500,1,1
4,25,79000,0,1


## Data Split and Preprocessing

In [7]:
# Separate the target column ('Purchased') from the feature columns.
y = df_final['Purchased']
X = df_final.drop(columns=['Purchased'])

# Hold out 30% of the rows for testing; the fixed random_state keeps the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [8]:
# Learn the scaling statistics from the training features only, then apply
# the same fitted transformation to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Classifiers

In [9]:
# Registries keyed by model name: test accuracy and the fitted estimator.
accuracy_list, model_list = dict(), dict()

## Decision Tree Model 

In [13]:
# Build and fit the decision tree in one step (fit returns the estimator);
# random_state is fixed so the fitted tree is reproducible.
dtree = DecisionTreeClassifier(random_state=0).fit(X_train, y_train)

In [14]:
# Predict the test-set labels with the fitted decision tree.
y_pred = dtree.predict(X=X_test)

In [15]:
# Print the per-class metrics, a blank separator, and the confusion matrix.
for section in (
    classification_report(y_test, y_pred),
    "\n",
    confusion_matrix(y_test, y_pred),
):
    print(section)

              precision    recall  f1-score   support

           0       0.88      0.92      0.90       172
           1       0.88      0.83      0.85       128

    accuracy                           0.88       300
   macro avg       0.88      0.87      0.88       300
weighted avg       0.88      0.88      0.88       300



[[158  14]
 [ 22 106]]


In [16]:
# Record the decision tree and its test accuracy in the shared registries.
model_name, model = "DecisionTreeClassifier", dtree

model_list[model_name] = model
accuracy_list[model_name] = accuracy_score(y_true=y_test, y_pred=y_pred)

# Show the accuracies collected so far.
accuracy_list

{'DecisionTreeClassifier': 0.88}

## Logistic Regression

In [17]:
# Create the logistic regression estimator, then fit it on the scaled
# training data (fit returns the estimator itself).
logistic_classifier = LogisticRegression(random_state=0)
logistic_classifier = logistic_classifier.fit(X_train, y_train)

In [18]:
# Predict the test-set labels with the fitted logistic regression.
y_pred = logistic_classifier.predict(X=X_test)

In [19]:
# Print the per-class metrics, a blank separator, and the confusion matrix.
for section in (
    classification_report(y_test, y_pred),
    "\n",
    confusion_matrix(y_test, y_pred),
):
    print(section)

              precision    recall  f1-score   support

           0       0.79      0.91      0.85       172
           1       0.85      0.67      0.75       128

    accuracy                           0.81       300
   macro avg       0.82      0.79      0.80       300
weighted avg       0.82      0.81      0.81       300



[[157  15]
 [ 42  86]]


In [20]:
# Record the logistic regression and its test accuracy in the shared registries.
model_name, model = "LogisticRegression", logistic_classifier

model_list[model_name] = model
accuracy_list[model_name] = accuracy_score(y_true=y_test, y_pred=y_pred)

# Show the accuracies collected so far.
accuracy_list

{'DecisionTreeClassifier': 0.88, 'LogisticRegression': 0.81}

## XGBoost Classifier

XGBoost is an optimized distributed gradient boosting library designed to be highly efficient, flexible and portable. It implements machine learning algorithms under the Gradient Boosting framework. XGBoost provides parallel tree boosting (also known as GBDT, GBM) that solves many data science problems in a fast and accurate way. https://xgboost.readthedocs.io/en/stable/

In [21]:
# Gradient-boosted tree classifier with a fixed random_state.
xgb_model = XGBClassifier(random_state=0)

# Fit on the scaled training data; as the last expression of the cell, the
# returned estimator is displayed.
xgb_model.fit(X=X_train, y=y_train)

In [22]:
# Predict the test-set labels with the fitted XGBoost classifier.
y_pred = xgb_model.predict(X=X_test)

In [23]:
# Print the per-class metrics, a blank separator, and the confusion matrix.
for section in (
    classification_report(y_test, y_pred),
    "\n",
    confusion_matrix(y_test, y_pred),
):
    print(section)

              precision    recall  f1-score   support

           0       0.87      0.94      0.91       172
           1       0.91      0.81      0.86       128

    accuracy                           0.89       300
   macro avg       0.89      0.88      0.88       300
weighted avg       0.89      0.89      0.89       300



[[162  10]
 [ 24 104]]


In [24]:
# Record the XGBoost classifier and its test accuracy in the shared registries.
model_name, model = "XGBClassifier", xgb_model

model_list[model_name] = model
accuracy_list[model_name] = accuracy_score(y_true=y_test, y_pred=y_pred)

# Show the accuracies collected so far.
accuracy_list

{'DecisionTreeClassifier': 0.88,
 'LogisticRegression': 0.81,
 'XGBClassifier': 0.8866666666666667}

## Model results / summary

In [25]:
# Select the registered model with the highest test accuracy.
# On ties, max() keeps the first entry in insertion order.
# NOTE(review): the winner is chosen on the same test split that is later used
# to report its performance, so that score is optimistic — consider selecting
# on a separate validation split or with cross-validation.
item_max_val = max(accuracy_list, key=accuracy_list.get)
print(f'Max Accuracy value: {accuracy_list[item_max_val]:.2f} for the {item_max_val}')

# Keep the fitted estimator of the winning model for the deployment pipeline.
best_model = model_list[item_max_val]
print(f"The best model is the {item_max_val} as it presents the highest accuracy")

Max Accuracy value: 0.89 for the XGBClassifier
The best model can is the be the XGBClassifier as presents the highest accuracy


# Preparation of the Deployment model

In [26]:
# Chain the scaler and the selected model so that raw (unscaled) features can
# be passed directly to predict(). Both steps were fitted in earlier cells.
deployment_steps = [
    ('StandardScaler', scaler),
    ('model', best_model),
]
pipeline = Pipeline(steps=deployment_steps)

## Reload test data

In [27]:
# Read the dataset again from disk to start from unprocessed data.
df = pd.read_csv(filepath_or_buffer='../data/car_data.csv')

# Preview the first five rows.
df.head(n=5)

Unnamed: 0,User ID,Gender,Age,AnnualSalary,Purchased
0,385,Male,35,20000,0
1,681,Male,40,43500,0
2,353,Male,49,74000,0
3,895,Male,40,107500,1
4,661,Male,25,79000,0


In [28]:
# Repeat the preprocessing in a single chain: one-hot encode the categorical
# columns (dropping the first level), then remove the 'User ID' column.
categorical_variables = ["Gender"]
df_final = (
    pd.get_dummies(data=df, columns=categorical_variables, drop_first=True)
    .drop(columns=['User ID'])
)

# Preview the first five rows.
df_final.head(n=5)

Unnamed: 0,Age,AnnualSalary,Purchased,Gender_Male
0,35,20000,0,1
1,40,43500,0,1
2,49,74000,0,1
3,40,107500,1,1
4,25,79000,0,1


In [29]:
# Separate the target column ('Purchased') from the feature columns.
y = df_final['Purchased']
X = df_final.drop(columns=['Purchased'])

# Same test_size and random_state as the first split. The features are left
# unscaled here because the pipeline applies the scaler itself.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [30]:
# Predict on the unscaled test features with the full pipeline.
y_pred = pipeline.predict(X=X_test)

# Print the per-class metrics, a blank separator, and the confusion matrix.
for section in (
    classification_report(y_test, y_pred),
    "\n",
    confusion_matrix(y_test, y_pred),
):
    print(section)

              precision    recall  f1-score   support

           0       0.87      0.94      0.91       172
           1       0.91      0.81      0.86       128

    accuracy                           0.89       300
   macro avg       0.89      0.88      0.88       300
weighted avg       0.89      0.89      0.89       300



[[162  10]
 [ 24 104]]


The model executes well, so we will proceed to save it.

In [45]:
from joblib import dump

# Serialize the pipeline to disk; dump returns the list of file names
# written, which is displayed as the cell output.
dump(value=pipeline, filename='../model/model.joblib')

['../model/model.joblib']

In [42]:
# Display the class of the object that was saved.
type(pipeline)

sklearn.pipeline.Pipeline

Now we will test the saved model.

In [40]:
from joblib import load

# Load the pipeline from the same path it was dumped to. The previous relative
# path 'model.joblib' pointed at the notebook's working directory, not at
# '../model/', so it read a different (possibly stale or missing) file.
# NOTE: joblib files are pickle-based; only load files from a trusted source.
loaded_pipeline = load('../model/model.joblib')

In [33]:
# Display the (unscaled) test features that are fed to the loaded pipeline.
X_test

Unnamed: 0,Age,AnnualSalary,Gender_Male
521,41,73500,1
737,59,135500,1
740,25,59500,1
660,47,42500,0
411,46,135500,0
...,...,...,...
468,59,106500,0
935,35,65000,0
428,42,46500,1
7,42,64000,1


In [34]:
# Predict with the pipeline that was read back from disk.
y_pred = loaded_pipeline.predict(X=X_test)

# Print the per-class metrics, a blank separator, and the confusion matrix.
for section in (
    classification_report(y_test, y_pred),
    "\n",
    confusion_matrix(y_test, y_pred),
):
    print(section)

              precision    recall  f1-score   support

           0       0.87      0.94      0.91       172
           1       0.91      0.81      0.86       128

    accuracy                           0.89       300
   macro avg       0.89      0.88      0.88       300
weighted avg       0.89      0.89      0.89       300



[[162  10]
 [ 24 104]]


In [35]:
# Accuracy of the reloaded pipeline on the test split.
accuracy_score(y_true=y_test, y_pred=y_pred)

0.8866666666666667

The saved model looks good

## Test with the synthetic batch data

In [36]:
import pandas as pd

# Read the unlabeled batch file (it has no 'Purchased' column).
df_batch = pd.read_csv(filepath_or_buffer='../data/unknown_batch.csv')

# Preview the first five rows.
df_batch.head(n=5)

Unnamed: 0,User ID,Age,AnnualSalary,Gender
0,1,45,57791,Male
1,2,44,42052,Male
2,3,30,151570,Female
3,4,62,86157,Male
4,5,27,15771,Female


In [37]:
# Encode the batch the same way as the training data.
#
# The Gender levels are pinned explicitly instead of being inferred from the
# batch. With inferred levels, a batch containing a single gender would lose
# its only indicator column under drop_first=True, and the feature layout
# would no longer match what the pipeline was fitted on. With pinned levels,
# 'Gender_Male' is always produced.
# Values outside the pinned levels become missing and are encoded as 0.
# NOTE(review): assumes the training data contained exactly the levels
# "Female" and "Male" (so "Female" was the dropped first level) — confirm.
categorical_variables = ["Gender"]
gender_levels = ["Female", "Male"]

df_batch_final = df_batch.assign(
    Gender=pd.Categorical(df_batch["Gender"], categories=gender_levels)
)
df_batch_final = pd.get_dummies(df_batch_final, columns=categorical_variables, drop_first=True)

# 'User ID' is not a model feature.
df_batch_final = df_batch_final.drop('User ID', axis=1)
df_batch_final.head()

Unnamed: 0,Age,AnnualSalary,Gender_Male
0,45,57791,1
1,44,42052,1
2,30,151570,0
3,62,86157,1
4,27,15771,0


In [38]:
from joblib import load

# Load the pipeline from the path it was dumped to ('../model/model.joblib').
# The previous relative path 'model.joblib' pointed at the notebook's working
# directory, so it read a different (possibly stale or missing) file.
# NOTE: joblib files are pickle-based; only load files from a trusted source.
loaded_pipeline = load('../model/model.joblib')

Predictions done with our ML model

In [39]:
# Score the encoded batch and attach the predictions to the original batch
# frame as a new 'Purchased' column.
batch_predictions = loaded_pipeline.predict(df_batch_final)
df_batch["Purchased"] = batch_predictions

# Display the batch together with its predictions.
df_batch

Unnamed: 0,User ID,Age,AnnualSalary,Gender,Purchased
0,1,45,57791,Male,0
1,2,44,42052,Male,0
2,3,30,151570,Female,1
3,4,62,86157,Male,1
4,5,27,15771,Female,0
...,...,...,...,...,...
994,995,25,104955,Female,0
995,996,24,132855,Male,1
996,997,29,17633,Male,0
997,998,34,80648,Female,0


# Test model execution from the Model Registry

In [2]:
import pandas as pd

# Read the unlabeled batch file (it has no 'Purchased' column).
df_batch = pd.read_csv(filepath_or_buffer='../data/unknown_batch.csv')

# Preview the first five rows.
df_batch.head(n=5)

Unnamed: 0,User ID,Age,AnnualSalary,Gender
0,1,45,57791,Male
1,2,44,42052,Male
2,3,30,151570,Female
3,4,62,86157,Male
4,5,27,15771,Female


In [3]:
# Encode the batch the same way as the training data.
#
# The Gender levels are pinned explicitly instead of being inferred from the
# batch. With inferred levels, a batch containing a single gender would lose
# its only indicator column under drop_first=True, and the feature layout
# would no longer match what the model was fitted on. With pinned levels,
# 'Gender_Male' is always produced.
# Values outside the pinned levels become missing and are encoded as 0.
# NOTE(review): assumes the training data contained exactly the levels
# "Female" and "Male" (so "Female" was the dropped first level) — confirm.
categorical_variables = ["Gender"]
gender_levels = ["Female", "Male"]

df_batch_final = df_batch.assign(
    Gender=pd.Categorical(df_batch["Gender"], categories=gender_levels)
)
df_batch_final = pd.get_dummies(df_batch_final, columns=categorical_variables, drop_first=True)

# 'User ID' is not a model feature.
df_batch_final = df_batch_final.drop('User ID', axis=1)
df_batch_final.head()

Unnamed: 0,Age,AnnualSalary,Gender_Male
0,45,57791,1
1,44,42052,1
2,30,151570,0
3,62,86157,1
4,27,15771,0


In [29]:
import mlflow
from mlflow.tracking import MlflowClient
from mlflow.entities import ViewType

# Point the MLflow client at the tracking server on the local machine (port 5000).
mlflow.set_tracking_uri(uri="http://127.0.0.1:5000")

In [30]:
# Look up the experiment that holds the candidate models.
client = MlflowClient()
EXPERIMENT_NAME = "car-purchase-best-models"
experiment = client.get_experiment_by_name(EXPERIMENT_NAME)
if experiment is None:
    # get_experiment_by_name returns None for an unknown name; fail with a
    # clear message instead of an AttributeError on the next line.
    raise ValueError(f"MLflow experiment '{EXPERIMENT_NAME}' was not found")

# Sort by accuracy in DESCENDING order so that the first result is the most
# accurate run. The previous ascending sort made `[0]` the LEAST accurate run.
candidate_runs = client.search_runs(
    experiment_ids=[experiment.experiment_id],
    run_view_type=ViewType.ACTIVE_ONLY,
    max_results=5,
    order_by=["metrics.accuracy DESC"],
)
if not candidate_runs:
    raise ValueError(f"No active runs found in MLflow experiment '{EXPERIMENT_NAME}'")

best_run = candidate_runs[0]

In [31]:
# Build the model URI of the selected run. The run id is read directly from
# the run's info object instead of going through a dictionary conversion.
# NOTE(review): assumes the model artifact was logged under the path "model"
# — confirm against the training/registration code.
best_run_id = best_run.info.run_id
logged_model = f"runs:/{best_run_id}/model"

# Load model as a PyFuncModel.
loaded_model = mlflow.pyfunc.load_model(logged_model)

In [32]:
import pandas as pd

# Score the encoded batch (a pandas DataFrame) with the model loaded from
# MLflow and attach the predictions to the original batch frame as a new
# 'Purchased' column.
registry_predictions = loaded_model.predict(df_batch_final)
df_batch["Purchased"] = registry_predictions

# Display the batch together with its predictions.
df_batch

Unnamed: 0,User ID,Age,AnnualSalary,Gender,Purchased
0,1,45,57791,Male,0
1,2,44,42052,Male,0
2,3,30,151570,Female,1
3,4,62,86157,Male,1
4,5,27,15771,Female,0
...,...,...,...,...,...
994,995,25,104955,Female,1
995,996,24,132855,Male,1
996,997,29,17633,Male,0
997,998,34,80648,Female,0
