# Preparation

## Import Packages

In [4]:
# used to ignore "FutureWarnings"
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [5]:
# data packages
import pandas as pd
import numpy as np

# preprocessing
from sklearn.preprocessing import StandardScaler

# splits and hyper paramater tuning
from sklearn.model_selection import train_test_split, GridSearchCV

# models
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# metrics for model evaluation
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, classification_report

## Import Data

In [8]:
from ucimlrepo import fetch_ucirepo 

# fetch dataset 
spambase = fetch_ucirepo(id=94) 
  
# data (as pandas dataframes) 
X = spambase.data.features 
y = spambase.data.targets

# peek at the head of data set 
pd.concat([X, y], axis=1).head()

# No preprocessing necessary in terms of cleaning. Data set is pre-cleaned.

Unnamed: 0,word_freq_make,word_freq_address,word_freq_all,word_freq_3d,word_freq_our,word_freq_over,word_freq_remove,word_freq_internet,word_freq_order,word_freq_mail,...,char_freq_;,char_freq_(,char_freq_[,char_freq_!,char_freq_$,char_freq_#,capital_run_length_average,capital_run_length_longest,capital_run_length_total,Class
0,0.0,0.64,0.64,0.0,0.32,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.778,0.0,0.0,3.756,61,278,1
1,0.21,0.28,0.5,0.0,0.14,0.28,0.21,0.07,0.0,0.94,...,0.0,0.132,0.0,0.372,0.18,0.048,5.114,101,1028,1
2,0.06,0.0,0.71,0.0,1.23,0.19,0.19,0.12,0.64,0.25,...,0.01,0.143,0.0,0.276,0.184,0.01,9.821,485,2259,1
3,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.0,0.137,0.0,0.137,0.0,0.0,3.537,40,191,1
4,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.0,0.135,0.0,0.135,0.0,0.0,3.537,40,191,1


# Logistic Regression

## Pre-processing

In [12]:
# Standardize data for logistic regression
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_scaled = pd.DataFrame(X_scaled, columns=X.columns)

## Hyper Paramater Tuning

In [16]:
# fix dimensions of y
y = y.to_numpy().ravel()

In [53]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=47)
logreg = LogisticRegression()

# hyper tuning with GridSearchCV
param_grid = [
    {
        'penalty': ['l2'],
        'C': [0.001, 0.01, 0.1],
        'solver': ['newton-cg', 'lbfgs', 'sag'],
        'max_iter': [2000, 3000, 5000]
    },
    {
        'penalty': ['l1', 'l2'],
        'C': [0.001, 0.01, 0.1],
        'solver': ['saga'],
        'max_iter': [2000, 3000, 5000]
    }
]

grid_search = GridSearchCV(estimator=logreg, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

In [46]:
# Get tuned paramaters
print(f'{grid_search.best_params_=}')
print(f'{grid_search.best_score_=}')

grid_search.best_params_={'C': 0.1, 'max_iter': 2000, 'penalty': 'l2', 'solver': 'newton-cg'}
grid_search.best_score_=0.9182065217391304


## Train and Test with tuned hyper parameters

In [48]:
# split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=47)

# logistic regression with tuned parameters
logreg = LogisticRegression(C=0.1, max_iter=2000, solver='newton-cg', penalty='l2')

# fit the data to the model
logreg.fit(X_train, y_train)

# get predictions on testing and training sets
y_pred = logreg.predict(X_test)
y_pred_train = logreg.predict(X_train)

# test overfitting
print(f'Training precision: {precision_score(y_train, y_pred_train)}')
print(f'Testing precision: {precision_score(y_test, y_pred)}')

Training precision: 0.9266123054114158
Testing precision: 0.9303621169916435


# Decision Tree

## Train Model

In [23]:
# split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=47)

# Decision Tree Classifier Instantiation
dtc = DecisionTreeClassifier(ccp_alpha=0.01)

# fit the data to the model
dtc.fit(X_train, y_train)

## Test Model

In [25]:
# Training and testing predictions
y_pred_train = dtc.predict(X_train)
y_pred = dtc.predict(X_test)

In [26]:
print(f'Training precision: {precision_score(y_train, y_pred_train)}')
print(f'Testing precision: {precision_score(y_test, y_pred)}\n')
print(f'Classification Report:\n {classification_report(y_test, y_pred)}')
print(f'Confusion Matrix:\n {confusion_matrix(y_test, y_pred)}')

Training precision: 0.9171270718232044
Testing precision: 0.9188405797101449

Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.95      0.92       539
           1       0.92      0.83      0.87       382

    accuracy                           0.90       921
   macro avg       0.90      0.89      0.89       921
weighted avg       0.90      0.90      0.90       921

Confusion Matrix:
 [[511  28]
 [ 65 317]]


In [27]:
# Get feature importance
feature_importance = pd.DataFrame(dtc.feature_importances_, index=X.columns)

# Displays with values of 0.0 importance dropped
feature_importance[(feature_importance != 0).all(axis=1)]

Unnamed: 0,0
word_freq_remove,0.211144
word_freq_free,0.032872
word_freq_hp,0.066041
char_freq_!,0.134977
char_freq_$,0.507705
capital_run_length_total,0.04726


# Random Forest

## Train Model

In [125]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=47)

# Random Forest Classifier instantiation
rfc = RandomForestClassifier(random_state=47, n_jobs=-1, min_samples_split=5)
rfc.fit(X_train, y_train)

## Test Model

In [128]:
y_pred_train = rfc.predict(X_train)
y_pred = rfc.predict(X_test)


print(f'Classification Report Training:\n {classification_report(y_train, y_pred_train)}')
print(f'Classification Report:\n {classification_report(y_test, y_pred)}')

Classification Report Training:
               precision    recall  f1-score   support

           0       0.99      1.00      1.00      2249
           1       1.00      0.99      0.99      1431

    accuracy                           0.99      3680
   macro avg       0.99      0.99      0.99      3680
weighted avg       0.99      0.99      0.99      3680

Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.97      0.96       539
           1       0.95      0.92      0.94       382

    accuracy                           0.95       921
   macro avg       0.95      0.94      0.95       921
weighted avg       0.95      0.95      0.95       921



In [113]:
# model may be overfitting, but the test split still performs well

# XGBoost