Simple spam-filter classifier

## IMPORTS

#### Standards

In [1]:
from pathlib import Path
import warnings

warnings.filterwarnings("ignore")

#### Externals

In [13]:
# Data handling
import pandas as pd
import numpy as np
import sklearn as sk
from sklearn import compose, metrics, model_selection, pipeline, preprocessing
import joblib

## CONFIGS

In [34]:
# Paths
ROOT = Path("../")
DATA_PATH = ROOT / "data" / "SMSSpamCollection"
MODEL_DIR = ROOT / "models"

# Randomness
RANDOM_STATE = 42

# Options
pd.options.display.max_rows = None
pd.options.display.max_columns = None
sk.set_config(transform_output='pandas')



# Dataset
TEST_SIZE = 0.2
TEXT_COLUMN = "message"
TARGET_COLUMN = "label"
COLUMN_NAMES = [TARGET_COLUMN, TEXT_COLUMN]
SEPARATOR = '\t'
CV_FOLDS = 5

# Pipeline
NGRAM_RANGE = (1, 2)
MAX_FEATURES = 10000
SCORING = "f1"

## DATASETS

In [35]:
spam = pd.read_csv(DATA_PATH, 
                     delimiter=SEPARATOR,
                     header = None, 
                     names = COLUMN_NAMES)
print("Dataset shape: ",spam.shape)
spam.head()

Dataset shape:  (5572, 2)


Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [36]:
spam.info()

<class 'pandas.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype
---  ------   --------------  -----
 0   label    5572 non-null   str  
 1   message  5572 non-null   str  
dtypes: str(2)
memory usage: 87.2 KB


In [37]:
X, y = spam.drop(TARGET_COLUMN, axis='columns'), spam[TARGET_COLUMN]
print("Inputs shape: ", X.shape, "Targets shape: " , y.shape)

Inputs shape:  (5572, 1) Targets shape:  (5572,)


In [38]:
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, 
                                                                    y, 
                                                                    test_size=TEST_SIZE, 
                                                                    random_state=RANDOM_STATE,
                                                                    stratify=y)

## ANALYSIS