Finetuning for classification

Preparing a dataset

In [1]:
import urllib.request
import zipfile
import os
from pathlib import Path

# Remote location of the zipped SMS Spam Collection and the local paths it is stored under.
url = "https://archive.ics.uci.edu/static/public/228/sms+spam+collection.zip"
zip_path = "sms_spam_collection.zip"    # downloaded archive
extracted_path = "sms_spam_collection"  # directory the archive is unpacked into
# Final location of the data file once it has been given a .tsv extension.
data_file_path = Path(extracted_path, "SMSSpamCollection.tsv")

def download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path):
    """Fetch the zipped dataset, unpack it, and give the data file a .tsv suffix.

    Nothing is downloaded or extracted when ``data_file_path`` already exists.

    Args:
        url: Location of the zip archive to download.
        zip_path: Local path the downloaded archive is written to.
        extracted_path: Directory the archive is extracted into.
        data_file_path: ``Path`` of the final ``.tsv`` file; the extracted
            ``SMSSpamCollection`` file is renamed to this path.
    """
    already_present = data_file_path.exists()
    if already_present:
        print(f"{data_file_path} already exists. Skipping download and extraction.")
        return

    # Fetch the archive and store its bytes on disk.
    with urllib.request.urlopen(url) as response, open(zip_path, "wb") as out_file:
        payload = response.read()
        out_file.write(payload)

    # Unpack every member of the archive into the target directory.
    with zipfile.ZipFile(zip_path, "r") as archive:
        archive.extractall(extracted_path)

    # The archive member has no extension; rename it so it ends in .tsv.
    extracted_file = Path(extracted_path) / "SMSSpamCollection"
    os.rename(extracted_file, data_file_path)
    print(f"File downloaded and saved as {data_file_path}")

# Try the primary location first; on a network failure fall back to the backup URL.
try:
    download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path)
except (urllib.error.URLError, TimeoutError) as download_error:  # URLError also covers HTTPError
    print(f"Primary URL failed: {download_error}. Trying backup URL...")
    url = "https://f001.backblazeb2.com/file/LLMs-from-scratch/sms%2Bspam%2Bcollection.zip"
    download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path)

File downloaded and saved as sms_spam_collection/SMSSpamCollection.tsv


In [2]:
import pandas as pd

In [7]:
# The file is read as tab-separated with no header row, so the column names are supplied here.
df = pd.read_csv(
    data_file_path,
    sep="\t",
    header=None,
    names=["Label", "Text"],
)
df.head()

Unnamed: 0,Label,Text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [8]:
# Count rows per class in the raw data to see how the two labels are distributed.
df['Label'].value_counts()

Label
ham     4825
spam     747
Name: count, dtype: int64

In [10]:
def create_balanced_dataset(df, random_state=123):
    """Return a class-balanced frame by downsampling the larger class.

    Args:
        df: DataFrame with a ``Label`` column holding ``'ham'`` / ``'spam'``.
        random_state: Seed passed to ``DataFrame.sample`` so the selection is
            reproducible. Defaults to 123.

    Returns:
        A new DataFrame with the ham rows first, followed by the spam rows,
        both classes having the same number of rows. Original index labels
        are kept. The input frame is not modified.
    """
    # Select each class once instead of recomputing the boolean mask.
    spam_rows = df[df['Label'] == 'spam']
    ham_rows = df[df['Label'] == 'ham']

    if len(ham_rows) >= len(spam_rows):
        # Usual case for this dataset: ham is the majority class.
        ham_rows = ham_rows.sample(len(spam_rows), random_state=random_state)
    else:
        # Sampling more ham rows than exist would raise, so shrink spam instead.
        spam_rows = spam_rows.sample(len(ham_rows), random_state=random_state)

    return pd.concat([ham_rows, spam_rows])
# Build the balanced frame, then show the per-class counts of the result.
balanced_df = create_balanced_dataset(df)
balanced_df["Label"].value_counts()

Label
ham     747
spam    747
Name: count, dtype: int64

In [11]:
# Preview the first rows of the balanced frame.
balanced_df.head()

Unnamed: 0,Label,Text
4307,ham,Awww dat is sweet! We can think of something t...
4138,ham,Just got to &lt;#&gt;
4831,ham,"The word ""Checkmate"" in chess comes from the P..."
4461,ham,This is wishing you a great day. Moji told me ...
5440,ham,Thank you. do you generally date the brothas?


In [12]:
# Encode the string labels as integers: ham -> 0, spam -> 1.
map_dict = {'ham':0,'spam':1}
# Values that are not keys of map_dict (i.e. already-encoded 0/1) pass through
# unchanged, so re-running this cell is harmless; a plain .map(map_dict) would
# turn an already-encoded column into NaN on the second run.
balanced_df['Label'] = balanced_df['Label'].map(lambda label: map_dict.get(label, label))
# Fail loudly if anything other than the two known classes slipped through.
assert balanced_df['Label'].isin(list(map_dict.values())).all(), "unexpected value in Label column"

In [13]:
# Preview the frame again to check the Label column after the mapping step.
balanced_df.head()

Unnamed: 0,Label,Text
4307,0,Awww dat is sweet! We can think of something t...
4138,0,Just got to &lt;#&gt;
4831,0,"The word ""Checkmate"" in chess comes from the P..."
4461,0,This is wishing you a great day. Moji told me ...
5440,0,Thank you. do you generally date the brothas?


In [15]:
def random_split(df, train_frac=0.7, valid_frac=0.2, random_state=123):
    """Shuffle ``df`` and split it into train, validation and test frames.

    Args:
        df: DataFrame to split. It is not modified.
        train_frac: Fraction of rows assigned to the training split.
        valid_frac: Fraction of rows assigned to the validation split.
        random_state: Seed for the shuffle, for reproducibility. Defaults to 123.

    Returns:
        ``(train_df, valid_df, test_df)``. The test split receives every row
        not assigned to the other two. The shuffled frame gets a fresh
        0..n-1 index, and each split keeps its slice of that index.

    Raises:
        ValueError: If a fraction is negative or the two fractions sum to
            more than 1.
    """
    # Small tolerance so float sums such as 0.58 + 0.42 are not rejected.
    if train_frac < 0 or valid_frac < 0 or train_frac + valid_frac > 1 + 1e-9:
        raise ValueError(
            "train_frac and valid_frac must be non-negative and sum to at most 1, "
            f"got train_frac={train_frac}, valid_frac={valid_frac}"
        )

    # frac=1 samples every row, i.e. a full shuffle; drop the old index.
    shuffled = df.sample(frac=1, random_state=random_state).reset_index(drop=True)

    n_rows = len(shuffled)
    train_end = int(n_rows * train_frac)
    valid_end = train_end + int(n_rows * valid_frac)

    # Explicit positional slicing.
    train_df = shuffled.iloc[:train_end]
    valid_df = shuffled.iloc[train_end:valid_end]
    test_df = shuffled.iloc[valid_end:]
    return train_df, valid_df, test_df

In [16]:
# 70% train, 10% validation; random_split puts the remaining rows in the test split.
train_df, valid_df, test_df = random_split(balanced_df, train_frac=0.7, valid_frac=0.1)

# Write each split to its own CSV file without the index column.
for split_frame, csv_name in (
    (train_df, "train.csv"),
    (valid_df, "validation.csv"),
    (test_df, "test.csv"),
):
    split_frame.to_csv(csv_name, index=None)

In [19]:
# Display the test split (pandas truncates the rendered rows).
test_df

Unnamed: 0,Label,Text
1194,1,85233 FREE>Ringtone!Reply REAL
1195,1,Ur cash-balance is currently 500 pounds - to m...
1196,1,"Thanks for your ringtone order, reference numb..."
1197,0,We live in the next &lt;#&gt; mins
1198,1,1st wk FREE! Gr8 tones str8 2 u each wk. Txt N...
...,...,...
1489,1,FREE2DAY sexy St George's Day pic of Jordan!Tx...
1490,1,Urgent! Please call 09066612661 from your land...
1491,1,For your chance to WIN a FREE Bluetooth Headse...
1492,1,* FREE* POLYPHONIC RINGTONE Text SUPER to 8713...
