In [2]:
import kaggle
import os
import chardet

import pandas as pd

from dotenv import load_dotenv
from typing import Tuple, Annotated

from loguru import logger

# Load variables from a `.env` file into the process environment.
load_dotenv()

# Move the working directory one level up, but only relative to where the
# kernel started. A bare `os.chdir('..')` climbs one more level every time
# this cell is re-run, so the starting directory is remembered on the first
# execution and reused on every later one.
if '_NOTEBOOK_START_DIR' not in globals():
    _NOTEBOOK_START_DIR = os.getcwd()
os.chdir(os.path.join(_NOTEBOOK_START_DIR, os.pardir))

KAGGLE_USERNAME = os.getenv('KAGGLE_USERNAME')
PROJECT_ROOT = os.getenv('PROJECT_ROOT')

# Fail here with a clear message instead of later with an opaque TypeError
# when PROJECT_ROOT is joined into a path.
if PROJECT_ROOT is None:
    raise RuntimeError(
        'Environment variable PROJECT_ROOT is not set; '
        'define it in the environment or in the .env file.'
        )

# Use plotly for the pandas `.plot` accessor.
pd.options.plotting.backend = 'plotly'

In [3]:
# download spam data set (skipped when the extracted CSV is already on disk)
dataset_slug = "uciml/sms-spam-collection-dataset"
DATA_PATH = os.path.join(PROJECT_ROOT, 'data')

# Guard the network call so that re-running the notebook does not download
# the archive again when the CSV is already present.
if os.path.exists(os.path.join(DATA_PATH, 'spam.csv')):
    logger.info(f'Dataset already present in {DATA_PATH}; skipping download.')
else:
    kaggle.api.dataset_download_files(
        dataset_slug,
        DATA_PATH,
        unzip=True,
        quiet=False,
        )

Downloading sms-spam-collection-dataset.zip to /home/azdt/code_zone/data_science/data-science-overview/logistic_regression/data


100%|██████████| 211k/211k [00:00<00:00, 350kB/s]







# EDA and writing initial functions

## Loading in data

In [4]:
# Path of the `spam.csv` file inside DATA_PATH; passed to `load_data` below.
path_spam: str = os.path.join(DATA_PATH, 'spam.csv')

def get_file_encoding(path: str, sample_size: int = 100_000) -> Annotated[str, 'file_encoding']:
    '''
    Detect the text encoding of a file with `chardet`.

    Parameters:
        path: location of the file to inspect.
        sample_size: number of leading bytes handed to `chardet.detect`.

    Returns:
        The value stored under the 'encoding' key of the `chardet.detect`
        result.

    Raises:
        Any exception raised while opening, reading or analysing the file is
        logged and re-raised unchanged.
    '''
    try:
        with open(path, 'rb') as bin_data:
            result = chardet.detect(bin_data.read(sample_size))
    except Exception as e:
        logger.error(
            f'An error has occurred while trying to detect file encoding: {e}.'
            )
        # Re-raise: falling through to the return would hit an unbound
        # `result` and replace the real error with an UnboundLocalError.
        raise
    return result['encoding']

def load_data(path: str) -> Annotated[pd.DataFrame, 'df']:
    '''
    Read a CSV file and keep only its first two columns.

    The encoding passed to `pd.read_csv` is obtained from
    `get_file_encoding(path)`.

    Parameters:
        path: location of the CSV file.

    Returns:
        A DataFrame holding the first two columns of the file.

    Raises:
        Any exception raised while detecting the encoding or reading the file
        is logged and re-raised, so a failed load is never returned as `None`.
    '''
    logger.info('Loading in spam dataset.')
    try:
        df: pd.DataFrame = pd.read_csv(path, encoding=get_file_encoding(path)) \
            .iloc[:, [0, 1]]
    except Exception as e:
        logger.error(f'An error has occurred in `load_data`: {e}.')
        # Surface the real cause here instead of returning None implicitly.
        raise
    logger.info('Data loaded successfully')
    return df

# Load the CSV at `path_spam`; `load_data` keeps only its first two columns.
df = load_data(path_spam)

def create_mapper(df: pd.DataFrame, values: list[str]) -> Annotated[
    dict[str, str], 'mapper']:
    '''
    Pair the current column names of `df` with new names.

    Parameters:
        df: frame whose column names become the dictionary keys.
        values: new column names, in column order.

    Returns:
        A dict mapping each existing column name to the name at the same
        position in `values`. Pairing is done with `zip`, so it stops at the
        shorter of the two sequences.
    '''
    # Use the caller's `values`; it used to be overwritten by a hard-coded
    # ['label', 'message'] list, so the argument was ignored.
    return dict(zip(df.columns, values))

def rename_columns(df: pd.DataFrame, mapper: dict[str, str]=None) -> Annotated[
    pd.DataFrame, 'df_renamed']:
    '''
    Return a copy of `df` with its columns renamed according to `mapper`.

    Parameters:
        df: frame to rename; it is not modified.
        mapper: dict of old column name -> new column name.

    Returns:
        The renamed DataFrame.

    Raises:
        ValueError: if `mapper` is None. ValueError is a subclass of
            Exception, so callers catching Exception still work.
    '''
    # Identity check: `== None` goes through `__eq__`, `is None` does not.
    if mapper is None:
        raise ValueError('mapping dict is None')
    return df.rename(mapper, axis=1)
    
# Build the old-name -> new-name mapping for the loaded columns, then apply it.
mapper = create_mapper(df, ['label', 'message'])
df = rename_columns(df, mapper=mapper)

[32m2024-03-26 23:15:11.234[0m | [1mINFO    [0m | [36m__main__[0m:[36mload_data[0m:[36m13[0m - [1mLoading in spam dataset.[0m
[32m2024-03-26 23:15:11.597[0m | [1mINFO    [0m | [36m__main__[0m:[36mload_data[0m:[36m15[0m - [1mData loaded successfully[0m


# EDA

In [13]:
# Number of messages per class.
label_counts = df['label'].value_counts()
print('class count')
display(label_counts)

# The same counts as a percentage of all rows, rounded to two decimals.
label_share = label_counts.div(len(df)).mul(100).round(2)
print('class count normalized')
display(label_share)

class count


label
ham     4825
spam     747
Name: count, dtype: int64

class count normalized


label
ham     86.59
spam    13.41
Name: count, dtype: float64

The classes are imbalanced (about 87% ham vs. 13% spam), so this has to be taken into account when developing and evaluating the model.

In [29]:
def sample_class(df: pd.DataFrame, class_label: str='spam', n: int=5,
                 random_state=None) -> \
    Annotated[pd.DataFrame, 'df_sample']:
    '''
    Draw a random sample of rows whose `label` column equals `class_label`.

    Parameters:
        df: frame with a `label` column.
        class_label: value of `label` to filter on.
        n: number of rows to draw.
        random_state: forwarded to `DataFrame.sample`; pass a fixed seed to
            get the same rows on every run. The default (None) keeps the
            previous unseeded behaviour.

    Returns:
        A DataFrame with `n` sampled rows of the requested class.
    '''
    mask_class: pd.Series = df['label'] == class_label
    df_sample: pd.DataFrame = df.loc[mask_class, :].sample(
        n, random_state=random_state
        )
    return df_sample

# Show a handful of example messages from each class.
display(sample_class(df, class_label='spam'))
display(sample_class(df, class_label='ham'))

Unnamed: 0,label,message
2861,spam,Adult 18 Content Your video will be with you s...
822,spam,"SMSSERVICES. for yourinclusive text credits, p..."
814,spam,U were outbid by simonwatson5120 on the Shinco...
5466,spam,http//tms. widelive.com/index. wml?id=820554ad...
4901,spam,* FREE* POLYPHONIC RINGTONE Text SUPER to 8713...


Unnamed: 0,label,message
431,ham,Does she usually take fifteen fucking minutes ...
4131,ham,Hi baby ive just got back from work and i was ...
2281,ham,"I hav almost reached. Call, i m unable to conn..."
4945,ham,I'm already back home so no probably not
2577,ham,Hey whats up? U sleeping all morning?


# Preprocessing

# Model development
Since this is a classification problem, we will use logistic regression as the first algorithm, alongside a dummy classifier as a baseline.

## Dummy classifier

In [None]:
from sklearn.dummy  import DummyClassifier
from sklearn.base   import ClassifierMixin
# search for best dummy classifier
def make_dummy_cls() -> ClassifierMixin:
    '''
    This function returns the higest scoring classifier amongest the dummy classifier strategies which are:
        * most_frequent
        * prior
        * stratified
        * uniform
        * constant
    '''
    strategies = 'most_frequent prior stratified uniform constant'.split()
    for strategy in strategies:
        

