In [1]:
import numpy as np
import pandas as pd
import sklearn
import mlflow

In [None]:
!mlflow server --host 127.0.0.1 --port 8080

In [15]:
mlflow.set_tracking_uri('http://localhost:8080')

In [16]:
train_dataset = pd.read_csv('train.csv')
test_dataset = pd.read_csv('test.csv')
validation_dataset = pd.read_csv('validation.csv')

In [17]:
train_dataset

Unnamed: 0,Email,Spam
0,candlestick charts fyi fallout forwarded mike ...,0
1,fyi,0
2,take action immediately miss attention valued ...,1
3,ameriflash newsletter business highlights coal...,0
4,viagrra scores hello welcome pharmonlin purita...,1
...,...,...
3412,save money buy getting thing tried cialls yet ...,1
3413,programming rdi model michelle met cecil chris...,0
3414,wish dd tried sooner save supper medlcations r...,1
3415,summer internship hi vince writing time inquir...,0


In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer()

In [19]:
X_train = tfidf.fit_transform(train_dataset['Email']).toarray()
y_train = train_dataset['Spam']

In [20]:
X_test = tfidf.transform(test_dataset['Email']).toarray()
y_test = test_dataset['Spam']


In [21]:
X_validation = tfidf.transform(validation_dataset['Email']).toarray()
y_validation = validation_dataset['Spam']

In [22]:
from sklearn.metrics import auc, precision_score, recall_score, confusion_matrix, precision_recall_curve
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import ParameterGrid
from random import seed

import warnings
warnings.filterwarnings("ignore")

param_grid = {
    'max_depth': [2,8,13,20, None],
    'min_samples_split': [2, 7, 15, 20],
    'min_samples_leaf': [1, 2, 4, 7, 10]
}

with mlflow.start_run(run_name= 'Decision Tree'):
    best_auc = 0
    best_params = None
    for params in ParameterGrid(param_grid):
        seed(564)
        clf = DecisionTreeClassifier(**params)
        clf.fit(X_train, y_train)

        mlflow.sklearn.log_model(clf, f"[ {params['max_depth']}, {params['min_samples_split']}, {params['min_samples_leaf']} ]")
        precision, recall, _ = precision_recall_curve(y_validation, clf.predict_proba(X_validation)[:,1])
        sorted_indices = np.argsort(precision)
        precision = precision[sorted_indices]
        recall = recall[sorted_indices]
        validation_auc = auc(precision, recall)
        if validation_auc > best_auc:
            best_auc = validation_auc
            best_params = params
            
        
    mlflow.log_params(best_params)
    mlflow.log_metric("validation_auc", best_auc)

    seed(564)

    best_model_DT = DecisionTreeClassifier(**best_params)
    best_model_DT.fit(X_train, y_train)

    mlflow.sklearn.log_model(best_model_DT, "Best Decision Tree model")
        

In [23]:
from sklearn.linear_model import LogisticRegression

warnings.filterwarnings("ignore")

param_grid = {
    'l1_ratio':list(np.arange(0.0, 1.01, 0.1)),
    'penalty':['elasticnet'],
    'solver': ['saga']
}

with mlflow.start_run(run_name= 'Logistic Regression'):
    best_auc = 0
    best_params = None
    for params in ParameterGrid(param_grid):
        seed(789)
        clf = LogisticRegression(**params)
        clf.fit(X_train, y_train)

        mlflow.sklearn.log_model(clf, 'lamda: ' + str(params['l1_ratio']))
        precision, recall, _ = precision_recall_curve(y_validation, clf.predict_proba(X_validation)[:,1])
        sorted_indices = np.argsort(precision)
        precision = precision[sorted_indices]
        recall = recall[sorted_indices]
        validation_auc = auc(precision, recall)
        if validation_auc > best_auc:
            best_auc = validation_auc
            best_params = params
            
        
    mlflow.log_params(best_params)
    mlflow.log_metric("validation_auc", best_auc)

    seed(789)

    best_model_logistic = LogisticRegression(**best_params)
    best_model_logistic.fit(X_train, y_train)

    mlflow.sklearn.log_model(best_model_logistic, "Best Logistic model")
        

MlflowException: API request to http://localhost:8080/api/2.0/mlflow-artifacts/artifacts/0/2c755a6966324ef192d7a7f26282895c/artifacts/lamda: 0.0/conda.yaml failed with exception HTTPConnectionPool(host='localhost', port=8080): Max retries exceeded with url: /api/2.0/mlflow-artifacts/artifacts/0/2c755a6966324ef192d7a7f26282895c/artifacts/lamda:%200.0/conda.yaml (Caused by ResponseError('too many 500 error responses'))

In [24]:
from sklearn.naive_bayes import MultinomialNB

warnings.filterwarnings("ignore")

param_grid = {
    'alpha': list(np.arange(0.0, 10.01, 0.5))
}

with mlflow.start_run(run_name= 'Naive Bayes'):
    best_auc = 0
    best_params = None
    for params in ParameterGrid(param_grid):
        seed(561)
        clf = MultinomialNB(**params)
        clf.fit(X_train, y_train)

        mlflow.sklearn.log_model(clf, 'alpha: ' + str(params['alpha']))
        precision, recall, _ = precision_recall_curve(y_validation, clf.predict_proba(X_validation)[:,1])
        sorted_indices = np.argsort(precision)
        precision = precision[sorted_indices]
        recall = recall[sorted_indices]
        validation_auc = auc(precision, recall)
        if validation_auc > best_auc:
            best_auc = validation_auc
            best_params = params
            
        
    mlflow.log_params(best_params)
    mlflow.log_metric("validation_auc", best_auc)

    seed(561)

    best_model_NB = MultinomialNB(**best_params)
    best_model_NB.fit(X_train, y_train)

    mlflow.sklearn.log_model(best_model_NB, "Best Naive Bayes model")

MlflowException: API request to http://localhost:8080/api/2.0/mlflow-artifacts/artifacts/0/2ae48a87f9774d9fbc618fc6da7a86ff/artifacts/alpha: 0.0/conda.yaml failed with exception HTTPConnectionPool(host='localhost', port=8080): Max retries exceeded with url: /api/2.0/mlflow-artifacts/artifacts/0/2ae48a87f9774d9fbc618fc6da7a86ff/artifacts/alpha:%200.0/conda.yaml (Caused by ResponseError('too many 500 error responses'))

In [25]:
from sklearn.svm import SVC
from sklearn.metrics import auc, precision_recall_curve
from sklearn.model_selection import ParameterGrid
from numpy.random import seed
import warnings
import mlflow
import numpy as np

warnings.filterwarnings("ignore")

param_grid = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'rbf', 'poly'],
    'gamma': ['scale', 'auto']
}

with mlflow.start_run(run_name='SVM'):
    best_auc = 0
    best_params = None
    for params in ParameterGrid(param_grid):
        seed(789)
        clf = SVC(**params, probability=True)
        clf.fit(X_train, y_train)

        mlflow.sklearn.log_model(clf, 'C: ' + str(params['C']) + ', kernel: ' + str(params['kernel']) + ', gamma: ' + str(params['gamma']))
        precision, recall, _ = precision_recall_curve(y_validation, clf.predict_proba(X_validation)[:,1])
        sorted_indices = np.argsort(precision)
        precision = precision[sorted_indices]
        recall = recall[sorted_indices]
        validation_auc = auc(recall, precision)
        if validation_auc > best_auc:
            best_auc = validation_auc
            best_params = params

    mlflow.log_params(best_params)
    mlflow.log_metric("validation_auc", best_auc)

    seed(789)

    best_model_svm = SVC(**best_params, probability=True)
    best_model_svm.fit(X_train, y_train)

    mlflow.sklearn.log_model(best_model_svm, "Best SVM model")


MlflowException: API request to http://localhost:8080/api/2.0/mlflow-artifacts/artifacts/0/6eec938ab88c41dbaa78bff46933e0f4/artifacts/C: 0.1, kernel: linear, gamma: scale/conda.yaml failed with exception HTTPConnectionPool(host='localhost', port=8080): Max retries exceeded with url: /api/2.0/mlflow-artifacts/artifacts/0/6eec938ab88c41dbaa78bff46933e0f4/artifacts/C:%200.1,%20kernel:%20linear,%20gamma:%20scale/conda.yaml (Caused by ResponseError('too many 500 error responses'))

In [30]:
DT_model = mlflow.sklearn.load_model('runs:/03b63dc9b4b245f290aa0d62363f322b/Best Decision Tree model')
Logistic_model = mlflow.sklearn.load_model('runs:/d9ba0ea035814f43853fe816dd785b53/Best Logistic model')
NB_model = mlflow.sklearn.load_model('runs:/77489b68f4e246c4a213cbe81cb68f8d/Best Naive Bayes model')

Downloading artifacts: 100%|█████████████████████| 5/5 [00:00<00:00, 305.39it/s]
Downloading artifacts: 100%|█████████████████████| 5/5 [00:00<00:00, 255.52it/s]
Downloading artifacts: 100%|█████████████████████| 5/5 [00:00<00:00, 236.16it/s]


In [32]:
models = [DT_model, Logistic_model, NB_model]
aucpr = []
for i in range(3):
    precision, recall, _ = precision_recall_curve(y_test, models[i].predict_proba(X_test)[:,1])
    sorted_indices = np.argsort(precision)
    precision = precision[sorted_indices]
    recall = recall[sorted_indices]
    aucpr.append(auc(precision, recall))

In [33]:
result = {"Model":["Decision Tree", "Logistic Regression", "Naive Bayes"]}

In [34]:
result['AUCPR'] = aucpr

In [35]:
pd.DataFrame(result)

Unnamed: 0,Model,AUCPR
0,Decision Tree,0.654417
1,Logistic Regression,0.756142
2,Naive Bayes,0.754453


Since AUCPR is maximum for Logistic Regression model. It is the model we would choose. 

In [36]:
import pickle

with open('decision_tree.pkl', 'wb') as f:
    pickle.dump(DT_model, f)

with open('logistic_regression.pkl', 'wb') as f:
    pickle.dump(Logistic_model, f)


with open('Naive_Bayes.pkl', 'wb') as f:
    pickle.dump(NB_model, f)