In [40]:
import kaggle
import os
import chardet

import pandas   as pd
import numpy    as np

from dotenv import load_dotenv
from typing import Tuple, Annotated

from loguru import logger

load_dotenv()
os.chdir('..')

KAGGLE_USERNAME = os.getenv('KAGGLE_USERNAME')
PROJECT_ROOT = os.getenv('PROJECT_ROOT')

pd.options.plotting.backend = 'plotly'

In [2]:
# download spam data set
dataset_slug = "uciml/sms-spam-collection-dataset"
DATA_PATH = os.path.join(PROJECT_ROOT, 'data')

kaggle.api.dataset_download_files(
    dataset_slug,
    DATA_PATH,
    unzip=True,
    quiet=False,
    )

Downloading sms-spam-collection-dataset.zip to /home/azdt/code_zone/data_science/data-science-overview/logistic_regression/data


100%|██████████| 211k/211k [00:00<00:00, 317kB/s]







# EDA and writing initial functions

## Loading in data

In [59]:
path_spam: str = os.path.join(DATA_PATH, 'spam.csv')

def get_file_encoding(path: str) -> Annotated[str, 'file_encoding']:
    try:
        with open(path, 'rb') as bin_data:
            result = chardet.detect(bin_data.read(100_000))
    except Exception as e:
        logger.info(
            f'An error has occurred while trying to detect file encoding: {e}.'
            )
    return result['encoding']

def load_data(path: str) -> Annotated[pd.DataFrame, 'df']:
    try:
        logger.info('Loading in spam dataset.')
        df: pd.DataFrame = pd.read_csv(path, encoding=get_file_encoding(path)) \
            .iloc[:, [0, 1]]
        logger.info('Data loaded successfully')
        return df
    except Exception as e:
        logger.info(f'An error has occurred in `load_data`: {e}.')

df = load_data(path_spam)

def create_mapper(df: pd.DataFrame, values: list[str]) -> Annotated[
    dict[str, str], 'mapper']:
    keys = df.columns
    values: list[str] = ['label', 'message']
    mapper = {k:v for k, v in zip(keys, values)}
    return mapper

def rename_columns(df: pd.DataFrame, mapper: dict[str, str]=None) -> Annotated[
    pd.DataFrame, 'df_renamed']:
    if mapper == None:
        raise Exception('mapping dict is None')
    else:
        df_renamed = df.rename(mapper, axis=1)
        return df_renamed
    
def update_labels(df: pd.DataFrame) -> Annotated[pd.DataFrame, 
                                                 'df_label_updated']:
    df = df.copy()
    label_mapper = {k:v for k,v in zip(df.label.unique(), (0, 1))}
    df['label'] = df.label.map(label_mapper)
    return df
    
mapper = create_mapper(df, 'label message'.split())
df = rename_columns(df, mapper)
df = update_labels(df)

[32m2024-03-31 22:58:38.246[0m | [1mINFO    [0m | [36m__main__[0m:[36mload_data[0m:[36m15[0m - [1mLoading in spam dataset.[0m
[32m2024-03-31 22:58:38.588[0m | [1mINFO    [0m | [36m__main__[0m:[36mload_data[0m:[36m18[0m - [1mData loaded successfully[0m


# EDA

In [4]:
print('class count')
display(df.label.value_counts())
print('class count normalized')
display(df.label.value_counts().div(len(df)).mul(100).round(2))

class count


label
ham     4825
spam     747
Name: count, dtype: int64

class count normalized


label
ham     86.59
spam    13.41
Name: count, dtype: float64

Unbalanced classes, have to consider that while developing the model

In [5]:
def sample_class(df: pd.DataFrame, class_label: str='spam', n: int=5) -> \
    Annotated[pd.DataFrame, 'df_sample']:
    mask_class: pd.Series = df.label == class_label
    df_sample: pd.DataFrame = df.loc[mask_class, :].sample(n)
    return df_sample

display(sample_class(df, ))
display(sample_class(df, 'ham'))

Unnamed: 0,label,message
4582,spam,For ur chance to win a å£250 wkly shopping spr...
5190,spam,Our records indicate u maybe entitled to 5000 ...
4726,spam,Had your mobile 10 mths? Update to the latest ...
5147,spam,Get your garden ready for summer with a FREE s...
15,spam,"XXXMobileMovieClub: To use your credit, click ..."


Unnamed: 0,label,message
5259,ham,Can help u swoop by picking u up from wherever...
3265,ham,tap & spile at seven. * Is that pub on gas st ...
4898,ham,"Haha, that was the first person I was gonna ask"
112,ham,I'm ok wif it cos i like 2 try new things. But...
5383,ham,Good day to You too.Pray for me.Remove the tee...


# Preprocessing
Since the main feature is textual, we need to use NLP techniques in order to encode
nature language to numbers for logistic model.

classification pipeline:

$\rightarrow$ [email] $\rightarrow$ [model] $\rightarrow$ [spam or not]

In [6]:
from nltk.corpus import stopwords
import string
import nltk
def preprocess_text(text):
    text = text.translate(
        str.maketrans('', '', string.punctuation)
    )
    text = [word for word in text.split()
             if word.lower() not in stopwords.words('english')]
    text = ' '.join(text)
    return text


In [61]:
from sklearn.model_selection import train_test_split
def split_data(X, y, test_size=.3) -> Tuple[
    Annotated[np.ndarray, 'X_train'],
    Annotated[np.ndarray, 'X_test'],
    Annotated[np.ndarray, 'y_train'],
    Annotated[np.ndarray, 'y_test'],
]:
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=test_size,
                                                        random_state=2447,)
    return X_train, X_test, y_train, y_test

X = df.copy()
y = X.pop('label')

X_train, X_test, y_train, y_test = train_test_split(X, y)

In [46]:
print(X_train.shape, y_train.shape)

(4179, 1) (4179,)


In [63]:
y_train

4649    0
847     0
5045    0
1101    0
5209    0
       ..
2489    0
4835    0
1196    0
4994    0
896     0
Name: label, Length: 4179, dtype: int64

This preprocessing step removes punctuation and stopwords as they do not provide any useful information to the model.

**Q: why stopwords are useless to NLP models?**

**A: Stopwords appear often in text and they do not provide much information for how much they appear and stopword removage is kind of data normalization step.**

# Model development
Since this is a classification model will use logistic regression as the first algorithm alongside a dummy classifier as a baseline.

## Dummy classifier

In [65]:
from sklearn.dummy      import DummyClassifier
from sklearn.base       import ClassifierMixin
from sklearn.metrics    import f1_score, accuracy_score
# search for best dummy classifier
def make_dummy_cls() -> ClassifierMixin:
    '''
    This function returns the higest scoring classifier amongest the dummy classifier strategies which are:
        * most_frequent
        * prior
        * stratified
        * uniform
        * constant
    '''
    strategies = 'most_frequent prior stratified uniform'.split()
    for strategy in strategies:
        cls = DummyClassifier(strategy=strategy, random_state=2447)
        cls.fit(X_train, y_train)
        y_hat = cls.predict(X_test)
        print(f'Strategy = {strategy}, has f1_score = {f1_score(y_test, y_hat)}')
make_dummy_cls()         

Strategy = most_frequent, has f1_score = 0.0
Strategy = prior, has f1_score = 0.0
Strategy = stratified, has f1_score = 0.11931818181818182
Strategy = uniform, has f1_score = 0.22482435597189696
