In [10]:
import os
import urllib.request
import zipfile
import pandas as pd
from pathlib import Path

data_set_url = "https://archive.ics.uci.edu/static/public/228/sms+spam+collection.zip"
zip_path = "ssm_spam_collection.zip"
extracted_path = "ssm_spam_collection"
save_data_file_path = Path(extracted_path) / "SMSSpamCollection.tsv"


def download_and_zip_data(url, zip_path, extracted_path, data_file_path):
    if save_data_file_path.exists():
        print("Data already downloaded")
        return
    
    with urllib.request.urlopen(url) as response:
        with open(zip_path, 'wb') as f:
            f.write(response.read())

    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(extracted_path)

    original_file_path = Path(extracted_path) / "SMSSpamCollection"
    os.rename(original_file_path, data_file_path)
    print(f"Data downloaded and saved to {data_file_path}")

In [9]:
download_and_zip_data(data_set_url, zip_path, extracted_path, save_data_file_path)

Data downloaded and saved to ssm_spam_collection/SMSSpamCollection.tsv


In [26]:
def read_data(file_path):
    return pd.read_csv(file_path, sep='\t', header=None, names=['label', 'text'])

In [27]:
read_data(save_data_file_path)

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [28]:
df = read_data(save_data_file_path)
df['label'].value_counts(), df.shape

(label
 ham     4825
 spam     747
 Name: count, dtype: int64,
 (5572, 2))

In [29]:
def create_balance_dataset(df:pd.DataFrame):
    spam_num = df[df['label'] == 'spam'].shape[0]
    ham_subset  = df[df['label'] == 'ham'].sample(spam_num, random_state=123)
    # print(ham_subset)
    balanced_df = pd.concat([ham_subset, df[df['label'] == 'spam']])
    return balanced_df

In [30]:
balance_df = create_balance_dataset(df)
balance_df['label'].value_counts(), balance_df.shape

(label
 ham     747
 spam    747
 Name: count, dtype: int64,
 (1494, 2))

In [31]:
map_dict = {
    "ham": 0,
    "spam": 1
}
balance_df['label'] = balance_df['label'].map(map_dict)
balance_df.head(), balance_df.tail()

(      label                                               text
 4307      0  Awww dat is sweet! We can think of something t...
 4138      0                             Just got to  &lt;#&gt;
 4831      0  The word "Checkmate" in chess comes from the P...
 4461      0  This is wishing you a great day. Moji told me ...
 5440      0      Thank you. do you generally date the brothas?,
       label                                               text
 5537      1  Want explicit SEX in 30 secs? Ring 02073162414...
 5540      1  ASKED 3MOBILE IF 0870 CHATLINES INCLU IN FREE ...
 5547      1  Had your contract mobile 11 Mnths? Latest Moto...
 5566      1  REMINDER FROM O2: To get 2.50 pounds free call...
 5567      1  This is the 2nd time we have tried 2 contact u...)

In [32]:
def random_split(df:pd.DataFrame, train_ratio=0.7, val_ration=0.2):
    df = df.sample(frac=1, random_state=123).reset_index(drop=True)
    tran_end = int(len(df) * train_ratio)
    val_end = tran_end + int(len(df) * val_ration)
    train_df = df[:tran_end]
    val_df = df[tran_end:val_end]
    test_df = df[val_end:]
    return train_df, val_df, test_df

In [44]:
train_df, val_df, test_df = random_split(balance_df)
train_df

Unnamed: 0,label,text
0,0,Dude how do you like the buff wind.
1,0,Tessy..pls do me a favor. Pls convey my birthd...
2,1,Reminder: You have not downloaded the content ...
3,1,Got what it takes 2 take part in the WRC Rally...
4,1,"Shop till u Drop, IS IT YOU, either 10K, 5K, £..."
...,...,...
1040,1,4mths half price Orange line rental & latest c...
1041,1,Thanks for the Vote. Now sing along with the s...
1042,1,IMPORTANT INFORMATION 4 ORANGE USER 0796XXXXXX...
1043,1,Urgent! call 09066612661 from landline. Your c...


In [45]:
import tiktoken
import torch
from torch.utils.data import  Dataset

class SMSSDataset(Dataset):
    def __init__(self, csv_file:pd.DataFrame, tokenizer, max_len=None, pad_token_id=50256):
        self.data = csv_file
        self.encode_texts = [tokenizer.encode(text) for text in self.data['text']]
    
        if max_len is None:
            self.max_len = self._longest_encode_length()
        else:
            self.max_len = max_len
            self.encode_texts = [
                self.encode_texts[:self.max_len]
            ]
    
    def __getitem__(self, index):
        encode = self.encode_texts[index]
        label = self.data.iloc[index]['label']
        return torch.tensor(encode, dtype=torch.long), torch.tensor(label, dtype=torch.long)

    def __len__(self):
        return len(self.data)

    def _longest_encode_length(self):
        max_length = 0
        for encode in self.encode_texts:
            encode_length = len(encode)
            if encode_length > max_length:
                max_length = encode_length
        # return max([len(encode) for encode in self.encode_texts])
        return max_length


In [48]:
tokenizer = tiktoken.get_encoding("gpt2")
train_dataset = SMSSDataset(train_df, tokenizer, None)
eval_dataset = SMSSDataset(val_df, tokenizer, None)
test_dataset = SMSSDataset(test_df, tokenizer, None)


In [49]:
from torch.utils.data import DataLoader
torch.manual_seed(123)
num_workers = 4
batch_size = 16
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=num_workers,drop_last=True)
eval_dataloader = DataLoader(eval_dataset, batch_size=batch_size, shuffle=False, num_workers=num_workers,drop_last=False)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=num_workers,drop_last=False)

In [5]:
from transformers import AutoModelForSeq2SeqLM
from peft import get_peft_config, get_peft_model, get_peft_model_state_dict, LoraConfig, TaskType
import torch
from datasets import load_dataset
import os

os.environ["TOKENIZERS_PARALLELISM"] = "false"
from transformers import AutoTokenizer
from torch.utils.data import DataLoader
from transformers import default_data_collator, get_linear_schedule_with_warmup
from tqdm import tqdm
from datasets import load_dataset

device = "cuda"
model_name_or_path = "bigscience/mt0-large"
tokenizer_name_or_path = "bigscience/mt0-large"

checkpoint_name = "financial_sentiment_analysis_lora_v1.pt"
text_column = "sentence"
label_column = "text_label"
max_length = 128
lr = 1e-3
num_epochs = 3
batch_size = 8

In [6]:
# creating model
peft_config = LoraConfig(task_type=TaskType.SEQ_2_SEQ_LM, inference_mode=False, r=8, lora_alpha=32, lora_dropout=0.1)

model = AutoModelForSeq2SeqLM.from_pretrained(model_name_or_path)
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()
model

model.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

trainable params: 2,359,296 || all params: 1,231,940,608 || trainable%: 0.1915


PeftModelForSeq2SeqLM(
  (base_model): LoraModel(
    (model): MT5ForConditionalGeneration(
      (shared): Embedding(250112, 1024)
      (encoder): MT5Stack(
        (embed_tokens): Embedding(250112, 1024)
        (block): ModuleList(
          (0): MT5Block(
            (layer): ModuleList(
              (0): MT5LayerSelfAttention(
                (SelfAttention): MT5Attention(
                  (q): lora.Linear(
                    (base_layer): Linear(in_features=1024, out_features=1024, bias=False)
                    (lora_dropout): ModuleDict(
                      (default): Dropout(p=0.1, inplace=False)
                    )
                    (lora_A): ModuleDict(
                      (default): Linear(in_features=1024, out_features=8, bias=False)
                    )
                    (lora_B): ModuleDict(
                      (default): Linear(in_features=8, out_features=1024, bias=False)
                    )
                    (lora_embedding_A): ParameterDict()
  

In [7]:
# loading dataset
dataset = load_dataset("financial_phrasebank", "sentences_allagree")
dataset = dataset["train"].train_test_split(test_size=0.1)
dataset["validation"] = dataset["test"]
del dataset["test"]

classes = dataset["train"].features["label"].names
dataset = dataset.map(
    lambda x: {"text_label": [classes[label] for label in x["label"]]},
    batched=True,
    num_proc=1,
)

dataset["train"][0]

README.md:   0%|          | 0.00/8.88k [00:00<?, ?B/s]

financial_phrasebank.py:   0%|          | 0.00/6.04k [00:00<?, ?B/s]