In [11]:
import numpy as np
import pandas as pd
import sklearn
import mlflow
from sklearn.feature_extraction.text import TfidfVectorizer

In [12]:
mlflow.set_tracking_uri("http://localhost:5081")

In [13]:
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')
validation_data = pd.read_csv('validation.csv')

In [14]:
train_data

Unnamed: 0,Email,Spam
0,candlestick charts fyi fallout forwarded mike ...,0
1,fyi,0
2,take action immediately miss attention valued ...,1
3,ameriflash newsletter business highlights coal...,0
4,viagrra scores hello welcome pharmonlin purita...,1
...,...,...
3412,save money buy getting thing tried cialls yet ...,1
3413,programming rdi model michelle met cecil chris...,0
3414,wish dd tried sooner save supper medlcations r...,1
3415,summer internship hi vince writing time inquir...,0


In [15]:
test_data

Unnamed: 0,Email,Spam
0,storage book ravi samer met morning sara ledbe...,0
1,meeting wharton office next week please send d...,0
2,enron recruiting mscf speaker series vince pie...,0
3,research sign steve sign research group someth...,0
4,f u iris mack mba phd enron vince iris receive...,0
...,...,...
1134,visit enron frank great idea think opportunity...,0
1135,meeting bob butts scheduled pm thursday th off...,0
1136,visit vince kaminski enron corp research dear ...,0
1137,resume mark giancola attached resume mark gian...,0


In [16]:
validation_data

Unnamed: 0,Email,Spam
0,get latest version cds download wide range sof...,1
1,yana kristal rotation hi vince already spoke m...,0
2,select small cap astute investors momentum ale...,1
3,power plant model jeff comments model reservat...,0
4,bullet points hi vince thanks bullets regardin...,0
...,...,...
1134,make dialup go faster visioson hpp za net find...,1
1135,save money buy getting thing tried cialls yet ...,1
1136,investor insight oil gas advisory oi gas enter...,1
1137,percent life insurance get free quote instantl...,1


In [17]:
tfidf = TfidfVectorizer()

In [18]:
X_train = tfidf.fit_transform(train_data['Email']).toarray()
y_train = train_data['Spam']

X_test = tfidf.transform(test_data['Email']).toarray()
y_test = test_data['Spam']

X_validation = tfidf.transform(validation_data['Email']).toarray()
y_validation = validation_data['Spam']

In [19]:
from sklearn.model_selection import ParameterGrid
from sklearn.metrics import precision_recall_curve, auc
from numpy.random import seed
import warnings

warnings.filterwarnings("ignore")

## Logistic Regression

In [20]:
from sklearn.linear_model import LogisticRegression

param_grid = {
    
    'l1_ratio': np.arange(0.0, 1.01, 0.14),
    'penalty': ['elasticnet'],
    'solver': ['saga']
}

with mlflow.start_run(run_name ='Logistic Regression'):
    best_auc = 0
    best_para = None
    
    for para in ParameterGrid(param_grid):
        seed(555)
        clf = LogisticRegression(**para)
        clf.fit(X_train, y_train)

        mlflow.sklearn.log_model(clf, 'lambda: ' + str(para['l1_ratio']))
        
        precision, recall, _ = precision_recall_curve(y_validation, clf.predict_proba(X_validation)[:, 1])
        sorted_indices = np.argsort(precision)
        precision = precision[sorted_indices]
        recall = recall[sorted_indices]
        validation_auc = auc(precision, recall)
        
        if validation_auc > best_auc:
            best_auc = validation_auc
            best_para = para
            
    mlflow.log_params(best_para)
    mlflow.log_metric("validation_auc", best_auc)

    seed(555)
    best_model_logistic = LogisticRegression(**best_para)
    best_model_logistic.fit(X_train, y_train)
    mlflow.sklearn.log_model(best_model_logistic, "Best Logistic model")

## Decision Tree

In [21]:
from sklearn.tree import DecisionTreeClassifier

param_grid = {
    'max_depth': [4, 8, 15, 25, None],
    'min_samples_split': [5, 10, 20, 40],
    'min_samples_leaf': [2, 4, 8, 12, 16]
}

with mlflow.start_run(run_name ='Decision Tree'):
    best_auc = 0
    best_para = None
    
    for para in ParameterGrid(param_grid):
        seed(444)
        clf = DecisionTreeClassifier(**para)
        clf.fit(X_train, y_train)

        mlflow.sklearn.log_model(clf, f"[{para['max_depth']}, {para['min_samples_split']}, {para['min_samples_leaf']}]")
        
        precision, recall, _ = precision_recall_curve(y_validation, clf.predict_proba(X_validation)[:, 1])
        sorted_indices = np.argsort(precision)
        precision = precision[sorted_indices]
        recall = recall[sorted_indices]
        validation_auc = auc(precision, recall)
        
        if validation_auc > best_auc:
            best_auc = validation_auc
            best_para = para
            
    mlflow.log_params(best_para)
    mlflow.log_metric("validation_auc", best_auc)

    seed(444)
    best_model_DT = DecisionTreeClassifier(**best_para)
    best_model_DT.fit(X_train, y_train)
    mlflow.sklearn.log_model(best_model_DT, "Best Decision Tree model")


## Adaboost

In [38]:
from sklearn.ensemble import AdaBoostClassifier

param_grid = {
    'n_estimators': [50, 75, 100],
    'learning_rate': [0.01]
}

with mlflow.start_run(run_name ='Adaboost'):
    best_auc = 0
    best_para = None
    for para in ParameterGrid(param_grid):
        seed(333)
        clf = AdaBoostClassifier(**para)
        clf.fit(X_train, y_train)

        mlflow.sklearn.log_model(clf, 'n_estimators: ' + str(para['n_estimators']) + ', learning_rate:' + str(para['learning_rate']))
        precision, recall, _ = precision_recall_curve(y_validation, clf.predict_proba(X_validation)[:,1])
        sorted_indices = np.argsort(precision)
        precision = precision[sorted_indices]
        recall = recall[sorted_indices]
        validation_auc = auc(recall, precision)
        if validation_auc > best_auc:
            best_auc = validation_auc
            best_para = para

    mlflow.log_params(best_para)
    mlflow.log_metric("validation_auc", best_auc)

    seed(333)
    best_model_svm = AdaBoostClassifier(**best_para)
    best_model_svm.fit(X_train, y_train)

    mlflow.sklearn.log_model(best_model_svm, "Best ada model")


In [39]:
LogisticRegression_model = mlflow.sklearn.load_model('runs:/88bb95ec5fab44afb9b50980cb147dc4/Best Logistic model')
DecisionTree_model = mlflow.sklearn.load_model('runs:/4a0267da3e25478f903a07d7996a04a5/Best Decision Tree model')
Adaboost_model = mlflow.sklearn.load_model('runs:/2fe6a95bb3d04f79939336d98fac5492/Best ada model')

Downloading artifacts: 100%|██████████████████████| 5/5 [00:00<00:00, 94.16it/s]
Downloading artifacts: 100%|█████████████████████| 5/5 [00:00<00:00, 375.50it/s]
Downloading artifacts: 100%|█████████████████████| 5/5 [00:00<00:00, 298.14it/s]


In [41]:
models = [DecisionTree_model, LogisticRegression_model, Adaboost_model]
auc_pr = []

for i in range(3):
    precision, recall, _ = precision_recall_curve(y_test, models[i].predict_proba(X_test)[:,1])
    sorted_indices = np.argsort(precision)
    precision = precision[sorted_indices]
    recall = recall[sorted_indices]
    auc_pr.append(auc(precision, recall))

In [44]:
Model_AUCPR = {"Model":["Logistic Regression", "Decision Tree", "AdaBoost"]}

Model_AUCPR['AUCPR'] = auc_pr

In [45]:
pd.DataFrame(Model_AUCPR)

Unnamed: 0,Model,AUCPR
0,Logistic Regression,0.705601
1,Decision Tree,0.757358
2,AdaBoost,0.582078


Since AUCPR is maximum for Decision Tree model. It is the model we would choose. 