In [35]:
import kaggle
import os
import chardet

import pandas as pd

from dotenv import load_dotenv
from typing import Tuple, Annotated

from loguru import logger

# Load variables from a .env file into the process environment (python-dotenv).
load_dotenv()
# Move the working directory one level up.
# NOTE(review): the path is relative to the current working directory, so every
# re-run of this cell climbs one more level — restart the kernel before re-running.
os.chdir('..')

# Both values come from the environment; os.getenv returns None when a variable is unset.
KAGGLE_USERNAME = os.getenv('KAGGLE_USERNAME')
PROJECT_ROOT = os.getenv('PROJECT_ROOT')

# Route pandas `.plot` calls through plotly.
pd.options.plotting.backend = 'plotly'

In [3]:
# Fetch the SMS spam collection from Kaggle and unzip it into <PROJECT_ROOT>/data.
DATA_PATH = os.path.join(PROJECT_ROOT, 'data')
dataset_slug = "uciml/sms-spam-collection-dataset"

kaggle.api.dataset_download_files(
    dataset=dataset_slug,
    path=DATA_PATH,
    quiet=False,
    unzip=True,
)

Downloading sms-spam-collection-dataset.zip to /home/azdt/code_zone/data_science/data-science-overview/logistic_regression/data


100%|██████████| 211k/211k [00:00<00:00, 390kB/s]







# EDA and writing initial functions

## Loading in data

In [None]:
# Path of `spam.csv` inside the data directory (DATA_PATH).
path_spam: str = os.path.join(DATA_PATH, 'spam.csv')

def get_file_encoding(path: str) -> Annotated[str, 'file_encoding']:
    """Guess the text encoding of a file with chardet.

    Only the first 100,000 bytes of the file are read and passed to
    `chardet.detect`.

    Args:
        path: Path of the file to inspect.

    Returns:
        The value stored under the 'encoding' key of chardet's result.
        NOTE(review): chardet may report None when it cannot decide — confirm
        that callers handle this.

    Raises:
        Exception: Any error raised while opening, reading or analysing the
            file is logged and then re-raised unchanged.
    """
    try:
        with open(path, 'rb') as bin_data:
            result = chardet.detect(bin_data.read(100_000))
    except Exception as e:
        logger.error(f'An error has occurred while trying to detect file encoding: {e}.')
        # Re-raise: falling through would hit `result` while it is still unbound
        # and replace the real error with an UnboundLocalError.
        raise
    return result['encoding']

def load_data(path: str) -> Annotated[pd.DataFrame, 'df']:
    """Read a CSV file and keep only its first two columns.

    The file encoding is detected with `get_file_encoding` before the file is
    parsed by `pd.read_csv`.

    Args:
        path: Path of the CSV file to load.

    Returns:
        A DataFrame holding the first two columns (positions 0 and 1) of the file.

    Raises:
        Exception: Any error raised while detecting the encoding, parsing the
            file or selecting the columns is logged and then re-raised.
    """
    try:
        logger.info('Loading in spam dataset.')
        df: pd.DataFrame = pd.read_csv(path, encoding=get_file_encoding(path)).iloc[:, [0, 1]]
    except Exception as e:
        logger.error(f'An error has occurred in `load_data`: {e}.')
        # Re-raise instead of implicitly returning None, which would only
        # surface later as an unrelated AttributeError on the result.
        raise
    logger.info('Data loaded successfully')
    return df

# Load the CSV at `path_spam` into the working DataFrame.
df = load_data(path_spam)

## Preprocessing functions

In [32]:
def create_mapper(df: pd.DataFrame, values: list[str]) -> Annotated[dict[str, str], 'mapper']:
    """Build a rename mapping from the current column names to new ones.

    Columns and new names are paired by position. Pairing stops at the shorter
    of the two sequences, so surplus columns or surplus names are left out.

    Args:
        df: DataFrame whose column labels become the keys of the mapping.
        values: New column names, in the same order as `df.columns`.

    Returns:
        A dict mapping each paired column label to its new name.
    """
    # Use the caller-supplied names; they were previously overwritten by a
    # hard-coded ['label', 'message'] list.
    return dict(zip(df.columns, values))

def rename_columns(df: pd.DataFrame, mapper: dict[str, str]=None) -> Annotated[pd.DataFrame, 'df_renamed']:
    """Return a copy of `df` with its columns renamed according to `mapper`.

    Args:
        df: DataFrame whose columns should be renamed; it is not modified.
        mapper: Mapping from existing column labels to new labels.

    Returns:
        A new DataFrame with the renamed columns.

    Raises:
        ValueError: If `mapper` is None.
    """
    # Identity check: `== None` can be overridden by the operand's __eq__.
    if mapper is None:
        raise ValueError('mapping dict is None')
    return df.rename(columns=mapper)
# Build the column mapping and apply it to the working DataFrame.
mapper = create_mapper(df, ['label', 'message'])
df = rename_columns(df, mapper=mapper)

In [34]:
# Preview the first rows of the DataFrame after renaming its columns.
df.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


# EDA

In [40]:
# Share of each label as a percentage of all rows, rounded to two decimals.
(df['label'].value_counts() / len(df) * 100).round(2)

label
ham     86.59
spam    13.41
Name: count, dtype: float64

In [41]:
def sample_class(df: pd.DataFrame, class_label: str='spam', n: int=5, random_state: int=None) -> Annotated[pd.DataFrame, 'df_sample']:
    """Draw a random sample of rows whose `label` column equals `class_label`.

    Args:
        df: DataFrame containing a `label` column.
        class_label: Label value used to select rows.
        n: Number of rows to draw. Sampling is without replacement, so pandas
            raises if `n` exceeds the number of matching rows.
        random_state: Optional seed forwarded to `DataFrame.sample`; pass a
            fixed value to get the same rows on every run. The default (None)
            leaves the sample unseeded.

    Returns:
        A DataFrame with `n` randomly chosen rows of the requested class.
    """
    # Bracket access avoids clashing with DataFrame attributes named like a column.
    mask_class: pd.Series = df['label'] == class_label
    df_sample: pd.DataFrame = df.loc[mask_class, :].sample(n, random_state=random_state)
    return df_sample

# Show a random sample of messages for each class: spam first, then ham.
display(sample_class(df, class_label='spam'))
display(sample_class(df, class_label='ham'))

Unnamed: 0,label,message
1658,spam,RGENT! This is the 2nd attempt to contact U!U ...
4899,spam,"ou are guaranteed the latest Nokia Phone, a 40..."
1317,spam,Win the newest ‰ÛÏHarry Potter and the Order o...
4234,spam,FREEMSG: Our records indicate you may be entit...
2260,spam,SplashMobile: Choose from 1000s of gr8 tones e...


Unnamed: 0,label,message
2959,ham,Sir send to group mail check it.
383,ham,Hey i will be late ah... Meet you at 945+
1165,ham,"Haha yeah I see that now, be there in a sec"
5182,ham,I sent them. Do you like?
2563,ham,K..k..i'm also fine:)when will you complete th...
