In [17]:
import os

download_name = "SMSSpamCollection.txt.bz2"
if not os.path.exists(download_name):
    import requests
    response = requests.get(f"https://raw.githubusercontent.com/bzitko/nlp_repo/main/assignments/a02/{download_name}")
    with open(download_name, "wb") as fp:
        fp.write(response.content)
    response.close()
        
name = "SMSSpamCollection.txt"
if not os.path.exists(name):
    import bz2
    with open(download_name, 'rb') as bzf, open(name, 'wb') as fp:
        fp.write(bz2.decompress(bzf.read()))    

# SMS Spam Preprocessing Solution

In [18]:
import collections
import numpy as np
import pandas as pd
import re
from tqdm.notebook import trange, tqdm

from argparse import Namespace

### Arguments

raw dataset and split dataset are included, as well as proportions for training, validation and evaluation.

In [19]:
args = Namespace(
    raw_dataset_csv="SMSSpamCollection.txt",
    train_proportion=0.7,
    val_proportion=0.15,
    test_proportion=0.15,
    output_munged_csv="sms_spam_with_splits.csv",
    seed=1337
)


### Read Dataset

👍 Read raw dataset's csv file into pandas DataFrame.

In [20]:
# Read raw data
sms_df = pd.read_csv(args.raw_dataset_csv, sep="\t", names=['cls', 'sms'])
sms_df

Unnamed: 0,cls,sms
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


👍 Count how many datapoints are in each class.

In [21]:
sms_df.cls.value_counts()

ham     4825
spam     747
Name: cls, dtype: int64

### Split to TRAIN, VAL, TEST and add field SPLIT to reviews

👍 Create new DataFrame by shuffling datapoints from raw dataset.
Append DataFrame with column "split" which will tell if datapoint is for train, val or test.
Use percentages defined in args do determine how many datapoints will be for train, val and test.

In [22]:
# Splitting the subset by cls to create our new train, val, and test splits
np.random.seed(args.seed)

by_cls = collections.defaultdict(list)
for _, row in sms_df.iterrows():
    by_cls[row.cls].append(row.to_dict())

final_list = []
for cls, item_list in by_cls.items():
    np.random.shuffle(item_list)

    n_total = len(item_list)
    n_train = int(args.train_proportion * n_total)
    n_val = int(args.val_proportion * n_total)
    n_test = int(args.test_proportion * n_total)

    # Give data point a split attribute
    for item in item_list[:n_train]:
        item['split'] = 'train'
    
    for item in item_list[n_train:n_train+n_val]:
        item['split'] = 'val'
        
    for item in item_list[n_train+n_val:]:
        item['split'] = 'test'    

    final_list.extend(item_list)        

final_sms_df = pd.DataFrame(final_list)
print(final_sms_df.split.value_counts())
final_sms_df


train    3899
test      838
val       835
Name: split, dtype: int64


Unnamed: 0,cls,sms,split
0,ham,Guai... Ü shd haf seen him when he's naughty.....,train
1,ham,Btw regarding that we should really try to see...,train
2,ham,"I dont knw pa, i just drink milk..",train
3,ham,She doesnt need any test.,train
4,ham,Lol u still feeling sick?,train
...,...,...,...
5567,spam,YOU VE WON! Your 4* Costa Del Sol Holiday or £...,test
5568,spam,Congrats! Nokia 3650 video camera phone is you...,test
5569,spam,Congratulations ur awarded 500 of CD vouchers ...,test
5570,spam,"Claim a 200 shopping spree, just call 08717895...",test


### Preprocess sms text

👍 Define function for preprocessing text which will:
* convert text to lower case
* put one space before and after each interpunction (.,!? are interpunctions)
* replace all non-letter and non-interpunction characters with one space

Apply this function to all SMS-es in previous dataframe.

In [23]:
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r"([.,!?])", r" \1 ", text)
    text = re.sub(r"[^a-zA-Z.,!?]+", r" ", text)
    return text

final_sms_df.sms = final_sms_df.sms.apply(preprocess_text)
final_sms_df

Unnamed: 0,cls,sms,split
0,ham,guai . . . shd haf seen him when he s naughty ...,train
1,ham,btw regarding that we should really try to see...,train
2,ham,"i dont knw pa , i just drink milk . .",train
3,ham,she doesnt need any test .,train
4,ham,lol u still feeling sick ?,train
...,...,...,...
5567,spam,you ve won ! your costa del sol holiday or awa...,test
5568,spam,congrats ! nokia video camera phone is your ca...,test
5569,spam,congratulations ur awarded of cd vouchers or g...,test
5570,spam,"claim a shopping spree , just call now ! have ...",test


### Save dataset

👍 Save DataFrame with header to tab separated file defined by arguments.

In [24]:
final_sms_df.to_csv(args.output_munged_csv, sep="\t", index=False)