In [1]:

!pip install transformers[torch]
!pip install accelerate -U

# Restart your kernel after running the above commands

import urllib.request
import zipfile
import os
from pathlib import Path
import pandas as pd

url = "https://archive.ics.uci.edu/static/public/228/sms+spam+collection.zip"
zip_path = "sms_spam_collection.zip"
extracted_path = "sms_spam_collection"
data_file_path = Path(extracted_path) / "SMSSpamCollection.tsv"

def download_and_unzip(url, zip_path, extracted_path, data_file_path):
    if data_file_path.exists():
        print(f"{data_file_path} already exists. Skipping download and extraction.")
        return

    # Downloading the file
    with urllib.request.urlopen(url) as response:
        with open(zip_path, "wb") as out_file:
            out_file.write(response.read())

    # Unzipping the file
    with zipfile.ZipFile(zip_path, "r") as zip_ref:
        zip_ref.extractall(extracted_path)

    # Add .tsv file extension
    original_file_path = Path(extracted_path) / "SMSSpamCollection"
    os.rename(original_file_path, data_file_path)
    print(f"File downloaded and saved as {data_file_path}")

download_and_unzip(url, zip_path, extracted_path, data_file_path)

df = pd.read_csv(data_file_path, sep="\t", header=None, names=["Label", "Text"])
df["Label"] = df["Label"].map({"ham": 0, "spam": 1})
df.rename(columns={'Text':'Scammer'},inplace=True)


def create_balanced_dataset(df):
    num_spam = df[df["Label"] == "spam"].shape[0]
    ham_subset = df[df["Label"] == "ham"].sample(num_spam, random_state=123)
    balanced_df = pd.concat([ham_subset, df[df["Label"] == "spam"]])
    return balanced_df



balanced_df = create_balanced_dataset(df)
print(balanced_df["Label"].value_counts())

balanced_df["Label"] = balanced_df["Label"].map({"ham": 0, "spam": 1})

# Assuming the ScamDataNew.csv and scams13.xlsx files are already in place
data = pd.read_csv('/kaggle/input/scam-dataset/ScamDataNew.csv')
testData = pd.read_excel('/kaggle/input/scam-dataset/scams13.xlsx')

# For demonstration purposes, I'll create dummy data to replace the above two dataset


testData.rename(columns={'content': 'Scammer'}, inplace=True)
testData.rename(columns={'is scam': 'Label'}, inplace=True)
balanced_df= pd.concat([balanced_df, data], ignore_index=True)
balanced_df= pd.concat([balanced_df, testData], ignore_index=True)

def random_split(df, train_frac, validation_frac):
    df = df.sample(frac=1, random_state=123).reset_index(drop=True)
    train_end = int(len(df) * train_frac)
    validation_end = train_end + int(len(df) * validation_frac)
    train_df = df[:train_end]
    validation_df = df[train_end:validation_end]
    test_df = df[validation_end:]
    return train_df, validation_df, test_df

train_df, validation_df, test_df = random_split(balanced_df, 0.7, 0.1)

train_df.to_csv("train.csv", index=None)
validation_df.to_csv("validation.csv", index=None)
test_df.to_csv("test.csv", index=None)

Collecting accelerate
  Downloading accelerate-0.31.0-py3-none-any.whl.metadata (19 kB)
Downloading accelerate-0.31.0-py3-none-any.whl (309 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m309.4/309.4 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0mm
[?25hInstalling collected packages: accelerate
  Attempting uninstall: accelerate
    Found existing installation: accelerate 0.30.1
    Uninstalling accelerate-0.30.1:
      Successfully uninstalled accelerate-0.30.1
Successfully installed accelerate-0.31.0
File downloaded and saved as sms_spam_collection/SMSSpamCollection.tsv
Series([], Name: count, dtype: int64)


In [2]:
train_df

Unnamed: 0,Label,Scammer,scam type,trick type,attack type,reason
0,0,Automatic payment of Rs.3000 will be deducted ...,,,,
1,1,"Hi there, I'm reaching out from HDFC's custome...",,,,
2,1,Urgent! Your PayPa1 account has been temporarl...,Phishing,"Scarcity, Using Manipulative Language","Intentional spelling mistakes, Homograph Attack","[""Creates a sense of urgency with 'Urgent!' to..."
3,0,"Nitesh, 3 days for our Merry Xmas Party with P...",,,,
4,1,"Hello, I've been trying to send the amount for...",,,,
...,...,...,...,...,...,...
1287,0,Alert: We've detected unusual activity on your...,Phishing,"Authority, Making False Threats","Intentional spelling mistakes, Homograph Attack","[""It's from a trusted source (the account prov..."
1288,1,"Dear Valued Customer, This is Officier John fr...",Phishing,"Scarcity, Using Fake Accents or Identities","Intentional spelling mistakes, Homograph Attack",['Uses scarcity tactic to create a sense of ur...
1289,0,FLAT 50% OFF on purchase of 25K at the Benetto...,,,,
1290,1,Good day Sir. With our FAMILYSAFE plan your fa...,,,,


In [3]:
from transformers import GPT2Tokenizer, GPT2ForSequenceClassification, Trainer, TrainingArguments, DataCollatorWithPadding
import torch
from torch.utils.data import Dataset
import pandas as pd


# Add a padding token to the GPT-2 tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token

class SpamDataset(Dataset):
    def __init__(self, csv_file, tokenizer, max_length=None):
        self.data = pd.read_csv(csv_file)
        print(f"Loaded data from {csv_file}:")
        print(self.data.head())  # Print the first few rows for debugging

        self.tokenizer = tokenizer
        self.max_length = max_length or 250
        self.encoded_texts = self.tokenizer(
            self.data["Scammer"].tolist(),
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt"
        )
        print(f"Tokenization successful for {csv_file}. Sample encoding:")
        print(self.encoded_texts)

    def __getitem__(self, index):
        item = {key: val[index] for key, val in self.encoded_texts.items()}
        item['labels'] = torch.tensor(self.data.iloc[index]["Label"], dtype=torch.long)
        return item

    def __len__(self):
        return len(self.data)

# Load datasets
print("Loading training dataset...")
train_dataset = SpamDataset('/kaggle/working/train.csv', tokenizer)

print("Loading validation dataset...")
validation_dataset = SpamDataset('/kaggle/working/validation.csv', tokenizer)

# Model setup
model = GPT2ForSequenceClassification.from_pretrained('gpt2', num_labels=2)
model.config.pad_token_id = tokenizer.pad_token_id

# Training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=5,
    learning_rate=2e-3,
    weight_decay=0.01,
    per_device_train_batch_size=8,
    evaluation_strategy="epoch",
    save_steps=10_000,
    save_total_limit=2,
)

# Create a data collator
data_collator = DataCollatorWithPadding(tokenizer)

# Trainer setup
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
    data_collator=data_collator,
)

# Train the model
trainer.train()

2024-06-27 09:44:37.743026: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-06-27 09:44:37.743154: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-06-27 09:44:38.023802: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]



config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Loading training dataset...
Loaded data from /kaggle/working/train.csv:
   Label                                            Scammer scam type  \
0      0  Automatic payment of Rs.3000 will be deducted ...       NaN   
1      1  Hi there, I'm reaching out from HDFC's custome...       NaN   
2      1  Urgent! Your PayPa1 account has been temporarl...  Phishing   
3      0  Nitesh, 3 days for our Merry Xmas Party with P...       NaN   
4      1  Hello, I've been trying to send the amount for...       NaN   

                              trick type  \
0                                    NaN   
1                                    NaN   
2  Scarcity, Using Manipulative Language   
3                                    NaN   
4                                    NaN   

                                       attack type  \
0                                              NaN   
1                                              NaN   
2  Intentional spelling mistakes, Homograph Attack   
3       

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ·····················································································································································································································································································································


[34m[1mwandb[0m: [32m[41mERROR[0m API key must be 40 characters long, yours was 309
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc




Epoch,Training Loss,Validation Loss
1,No log,0.22952
2,No log,0.067336
3,No log,0.078781
4,No log,0.148165
5,No log,0.112105


TrainOutput(global_step=405, training_loss=0.17226213996793016, metrics={'train_runtime': 316.0174, 'train_samples_per_second': 20.442, 'train_steps_per_second': 1.282, 'total_flos': 824207523840000.0, 'train_loss': 0.17226213996793016, 'epoch': 5.0})

In [11]:
torch.save(model.state_dict(), 'model.bin')

In [21]:


# Export the model to ONNX format
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
model.to(device)
dummy_input = torch.randint(0, 50257, size=(1, 250), dtype=torch.long,device=device)
torch.onnx.export(model, dummy_input, 'model.safetensors', export_params=True, opset_version=12)



In [13]:
dummy_input

tensor([[ 2.3372e-01,  1.4083e+00, -2.0686e+00,  8.6857e-01, -8.0322e-01,
         -1.1209e+00,  1.9564e-01, -7.8152e-01, -6.1194e-01,  6.0262e-01,
         -8.3618e-01, -3.3326e-01, -4.8010e-01, -1.2872e+00,  7.3888e-01,
          3.3895e-02, -1.1860e+00,  1.2986e+00,  8.9667e-01, -2.1818e+00,
          6.1277e-02,  8.5261e-02,  7.4813e-01, -1.6356e-01,  3.0481e-01,
          5.1303e-01, -1.2514e+00, -8.3081e-01,  4.9816e-01, -1.2000e+00,
          1.2711e-01,  4.4037e-01,  4.7277e-01,  3.6402e-01, -2.8120e-01,
         -1.0375e+00, -1.8737e+00,  2.3259e+00, -9.2039e-01,  6.6611e-01,
          8.9822e-01, -1.5388e-01, -5.6820e-01, -8.6795e-02, -8.4834e-01,
          1.6489e+00,  1.6006e+00, -7.8589e-02,  9.7003e-01, -6.7577e-01,
          2.0425e-01, -2.6476e-02, -4.1379e-01,  5.1841e-01, -7.0154e-01,
         -4.3234e-01,  6.6608e-02, -9.1199e-01,  3.6821e-01,  7.0497e-01,
         -1.0838e+00, -3.8893e-01,  8.1261e-01,  1.4981e+00,  3.1258e-01,
         -5.2286e-02, -1.8611e-01, -7.

In [22]:
from google.colab import drive

ModuleNotFoundError: No module named 'google.colab'