<a href="https://colab.research.google.com/github/abdulrehman898998/gpt2/blob/main/spam_non_pam_classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

!pip install tiktoken
!pip install transformers datasets tiktoken


In [None]:
import pandas as pd

# The raw file is tab-separated and has no header row, so column names are
# supplied explicitly: the class label first, then the message text.
df = pd.read_csv(
    '/content/SMSSpamCollection',
    sep='\t',
    header=None,
    names=['labels', 'text'],
)
df

In [None]:
def balanced_data(df):
  """Downsample the larger class so 'ham' and 'spam' have equal row counts.

  Args:
    df: DataFrame with a ``labels`` column holding ``"ham"`` / ``"spam"``.

  Returns:
    A new DataFrame with the ham rows first, followed by the spam rows.
    The rows are grouped by class (not shuffled), so shuffle before taking
    positional slices of the result.
  """
  spam_df = df[df["labels"] == "spam"]
  ham_df = df[df["labels"] == "ham"]

  # Size of the smaller class; either class may be the minority.
  n_per_class = min(len(spam_df), len(ham_df))

  # Ham is always drawn with the fixed seed, which reproduces the original
  # selection whenever ham is the majority class.
  ham_df = ham_df.sample(n_per_class, random_state=123)

  # Spam is only downsampled when it is the larger class; otherwise it is
  # kept whole and in its original order.
  if len(spam_df) > n_per_class:
    spam_df = spam_df.sample(n_per_class, random_state=123)

  return pd.concat([ham_df, spam_df])

# Balance the two classes, then print the per-class row counts as a check.
balanced_df = balanced_data(df)
print(balanced_df.loc[:, "labels"].value_counts())


In [None]:
def train_test(balanced_df,train_ratio,val_ratio,shuffle=True,random_state=123):
  """Split a DataFrame into train / validation / test partitions.

  Args:
    balanced_df: DataFrame to split.
    train_ratio: Fraction of rows assigned to the training split.
    val_ratio: Fraction of rows assigned to the validation split; the
      remaining rows form the test split.
    shuffle: When True (default) rows are shuffled before slicing. The
      splits are positional slices, so an input that is grouped by class
      would otherwise yield single-class validation/test splits.
    random_state: Seed used for the shuffle.

  Returns:
    Tuple ``(train_data, val_data, test_data)``.
  """
  if shuffle:
    balanced_df = balanced_df.sample(frac=1, random_state=random_state)

  n_rows = len(balanced_df)
  # Each split size is derived from its own ratio; adding the two ratios
  # first can lose a row to floating-point rounding (0.7 + 0.2 < 0.9).
  train_end = int(train_ratio * n_rows)
  val_end = train_end + int(val_ratio * n_rows)

  train_data = balanced_df[:train_end]
  val_data = balanced_df[train_end:val_end]
  test_data = balanced_df[val_end:]
  return train_data,val_data,test_data

In [None]:
# Split the balanced frame using the ratios 0.7 (train) and 0.2 (validation).
train_data, val_data, test_data = train_test(
    balanced_df,
    train_ratio=0.7,
    val_ratio=0.2,
)

In [None]:
# Persist each split as a CSV without the index column; SpamDataset reads
# these files back by path.
train_data.to_csv(path_or_buf='/content/train_data.csv', index=False)
val_data.to_csv(path_or_buf='/content/val_data.csv', index=False)
test_data.to_csv(path_or_buf='/content/test_data.csv', index=False)

In [None]:
# Only the last expression of a cell is displayed, so the split sizes are
# printed explicitly.
print(len(train_data), len(val_data), len(test_data))

# Class distribution of the raw data. The frame was loaded with the columns
# 'labels' and 'text'; it has no 'target' column.
df['labels'].value_counts()

In [None]:
import tiktoken
# GPT-2 byte-pair encoding from tiktoken; this is the tokenizer passed to
# SpamDataset when the CSV splits are encoded below.
tokenizer = tiktoken.get_encoding("gpt2")

In [None]:
import torch
from torch.utils.data import Dataset
import pandas as pd

class SpamDataset(Dataset):
    """Map-style dataset of tokenized messages padded to one common length.

    Reads a CSV with a ``labels`` column ("ham" / "spam") and a ``text``
    column, encodes every text with ``tokenizer.encode``, truncates each
    sequence to ``max_length`` tokens and right-pads shorter sequences with
    ``pad_token_id``.

    Args:
        csv_file: Path of the CSV file to load (passed to ``pd.read_csv``).
        tokenizer: Object whose ``encode(text)`` returns a list of token ids.
        max_length: Upper bound on the sequence length. ``None`` uses the
            longest encoded text; otherwise the effective length is the
            smaller of ``max_length`` and the longest encoded text.
        pad_token_id: Id used for right-padding. Defaults to 50256, GPT-2's
            ``<|endoftext|>`` id and the same padding id ``classify_review``
            uses at inference time.

    Attributes set by the constructor: ``labels`` (0 = ham, 1 = spam),
    ``max_length`` (effective padded length) and ``encoded_data`` (list of
    equal-length token-id lists).
    """

    def __init__(self, csv_file, tokenizer, max_length=None, pad_token_id=50256):
        data = pd.read_csv(csv_file)

        self.labels = data["labels"].map({"ham": 0, "spam": 1}).values

        encoded_data = [tokenizer.encode(text) for text in data["text"]]

        # default=0 keeps an empty CSV from crashing max() on an empty sequence.
        longest = max((len(ids) for ids in encoded_data), default=0)
        self.max_length = longest if max_length is None else min(max_length, longest)

        self.encoded_data = []
        for ids in encoded_data:
            ids = ids[:self.max_length]
            padding = [pad_token_id] * (self.max_length - len(ids))
            self.encoded_data.append(ids + padding)

    def __len__(self):
        """Return the number of examples."""
        return len(self.encoded_data)

    def __getitem__(self, idx):
        """Return ``(token_ids, label)`` for example ``idx`` as long tensors."""
        return (
            torch.tensor(self.encoded_data[idx], dtype=torch.long),
            torch.tensor(self.labels[idx], dtype=torch.long),
        )


In [None]:
# Build one SpamDataset per exported split, in train / val / test order.
train_dataset, val_dataset, test_dataset = (
    SpamDataset(csv_file=split_path, tokenizer=tokenizer)
    for split_path in (
        '/content/train_data.csv',
        '/content/val_data.csv',
        '/content/test_data.csv',
    )
)

In [None]:
# Peek at the first training example: a (token-id tensor, label tensor) pair.
train_dataset[0]

In [None]:
from torch.utils.data import DataLoader

# Settings shared by all three loaders.
num_workers = 0
batch_size = 8

# Fix the global torch RNG seed for reproducibility.
torch.manual_seed(123)

# Training loader: shuffled, and a trailing partial batch is dropped.
train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=num_workers,
    drop_last=True,
)

# Evaluation loaders: no shuffling, every example is kept.
val_loader = DataLoader(
    val_dataset, batch_size=batch_size, num_workers=num_workers, drop_last=False
)
test_loader = DataLoader(
    test_dataset, batch_size=batch_size, num_workers=num_workers, drop_last=False
)

In [None]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Checkpoint name on the Hugging Face Hub. Other variants include
# "gpt2-medium", "gpt2-large" and "gpt2-xl".
model_name = "gpt2"

# Note: from here on `tokenizer` refers to the Hugging Face tokenizer and no
# longer to the tiktoken encoding created earlier in the notebook.
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)


In [None]:
# Show the module tree of the loaded GPT-2 model.
print(model)

In [None]:
# Freeze every existing weight so gradient updates skip the pretrained model.
for weight in model.parameters():
    weight.requires_grad = False

In [None]:
# Display the configuration object attached to the loaded model.
model.config

In [None]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel

# Sanity-check the pretrained model by sampling a short continuation.
#
# The `model` and `tokenizer` loaded earlier are reused on purpose. Calling
# from_pretrained again here would rebind `model` to a fresh copy whose
# parameters all require gradients, silently discarding the freeze applied in
# the earlier cell.
input_text = "who are humans"
input_ids = tokenizer.encode(input_text, return_tensors="pt")

# do_sample=True draws tokens randomly, so the text differs between runs.
# GPT-2 defines no padding token, so the end-of-sequence id is passed
# explicitly as the padding id for generate().
output = model.generate(
    input_ids,
    max_length=50,
    num_return_sequences=1,
    do_sample=True,
    pad_token_id=tokenizer.eos_token_id,
)
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)

print("Generated text:", generated_text)


In [None]:
# Seed the RNG so the new head gets the same initial weights on every run.
torch.manual_seed(123)

num_classes = 2

# Swap the language-modelling head for a 2-way classification layer. The input
# width is read from the model config instead of hard-coding 768, so the cell
# also works for the larger GPT-2 checkpoints. A freshly created nn.Linear has
# trainable parameters regardless of how the rest of the model was frozen.
model.lm_head = torch.nn.Linear(in_features=model.config.n_embd, out_features=num_classes)

In [None]:
# Print the module tree again; lm_head is now the Linear layer assigned above.
print(model)

In [None]:
def calculate_acc(model,device,data_loader,num_batches=None):
  """Compute classification accuracy over (part of) a data loader.

  The prediction for each example is the argmax of the logits at the last
  sequence position.

  Args:
    model: Callable returning an object with a ``logits`` attribute that is
      indexed as ``[:, -1, :]``.
    device: Device the input and label batches are moved to.
    data_loader: Sized iterable yielding ``(inputs, labels)`` batches.
    num_batches: Maximum number of batches to evaluate; ``None`` means all.

  Returns:
    Fraction of correctly classified examples, or ``nan`` when no example
    was evaluated (empty loader or ``num_batches=0``).
  """
  model.eval()
  if num_batches is None:
    num_batches=len(data_loader)
  else:
    num_batches=min(num_batches,len(data_loader))

  correct_pred=0
  example_seen=0
  for i,(inputs,labels) in enumerate(data_loader):
    if i>=num_batches:
      break
    inputs=inputs.to(device)
    labels=labels.to(device)
    with torch.no_grad():
      logits=model(inputs).logits[:, -1, :]
    predictions=torch.argmax(logits,dim=-1)

    example_seen+=inputs.shape[0]
    correct_pred+=(predictions==labels).sum().item()

  # Avoid ZeroDivisionError when nothing was evaluated.
  if example_seen==0:
    return float("nan")
  return correct_pred/example_seen

In [None]:
def calculate_loss_batch(model,device,inputs,labels):
  """Return the cross-entropy loss of one batch.

  The model, inputs and labels are moved to ``device``; the loss is computed
  from the logits at the last sequence position only.
  """
  model.to(device)
  last_position_logits = model(inputs.to(device)).logits[:, -1, :]
  return torch.nn.functional.cross_entropy(last_position_logits, labels.to(device))

In [None]:
def calculate_loss(model,device,data_loader,num_batches=None):
  """Average the per-batch loss over (part of) a data loader.

  Args:
    model: Model passed through to ``calculate_loss_batch``.
    device: Device passed through to ``calculate_loss_batch``.
    data_loader: Sized iterable yielding ``(inputs, labels)`` batches.
    num_batches: Maximum number of batches to average; ``None`` means all.

  Returns:
    Mean of the batch losses, or ``nan`` when there is nothing to evaluate
    (empty loader or ``num_batches=0``).
  """
  if num_batches is None:
    num_batches=len(data_loader)
  else:
    num_batches=min(num_batches,len(data_loader))

  # Avoid ZeroDivisionError when there are no batches to average.
  if num_batches==0:
    return float("nan")

  total_loss=0
  for i,(inputs,labels) in enumerate(data_loader):
    if i>=num_batches:
      break
    total_loss+=calculate_loss_batch(model,device,inputs,labels).item()
  return total_loss/num_batches

In [None]:
def eval_model(model,train_loader,val_loader,device,eval_iter):
  """Estimate training and validation loss on at most ``eval_iter`` batches each.

  The model is switched to eval mode for the measurement and back to train
  mode before returning.

  Returns:
    Tuple ``(train_loss, val_loss)``.
  """
  model.eval()
  # No gradients are needed for evaluation; disabling autograd avoids keeping
  # activations for a backward pass that never happens.
  with torch.no_grad():
    train_loss=calculate_loss(model,device,train_loader,num_batches=eval_iter)
    val_loss=calculate_loss(model,device,val_loader,num_batches=eval_iter)
  model.train()
  return train_loss,val_loss

In [None]:
def training(model,num_epochs,optimizer,eval_freq,train_loader,val_loader,device,eval_iter):
  """Run the fine-tuning loop.

  Every ``eval_freq`` optimizer steps (starting with the first) the train and
  validation losses are estimated on ``eval_iter`` batches and printed. After
  each epoch the train and validation accuracies are computed on
  ``eval_iter`` batches and printed.

  Returns:
    Tuple ``(train_losses, val_losses, train_accs, val_accs, examples_seen)``.
  """
  train_losses, val_losses = [], []
  train_accs, val_accs = [], []
  examples_seen = 0
  global_step = -1  # becomes 0 on the first optimizer step

  for epoch in range(num_epochs):
    model.train()

    for inputs, labels in train_loader:
      optimizer.zero_grad()
      batch_loss = calculate_loss_batch(model, device, inputs, labels)
      batch_loss.backward()
      optimizer.step()

      examples_seen += inputs.shape[0]
      global_step += 1

      # Skip the periodic loss report on all other steps.
      if global_step % eval_freq != 0:
        continue

      train_loss, val_loss = eval_model(model, train_loader, val_loader, device, eval_iter)
      train_losses.append(train_loss)
      val_losses.append(val_loss)
      print(f"Ep {epoch+1} (Step {global_step:06d}): "
            f"Train loss {train_loss:.3f}, Val loss {val_loss:.3f}")

    # End-of-epoch accuracy on a limited number of batches.
    train_accuracy = calculate_acc(model, device, train_loader, num_batches=eval_iter)
    val_accuracy = calculate_acc(model, device, val_loader, num_batches=eval_iter)
    print(f"Training accuracy: {train_accuracy*100:.2f}% | ", end="")
    print(f"Validation accuracy: {val_accuracy*100:.2f}%")
    train_accs.append(train_accuracy)
    val_accs.append(val_accuracy)

  return train_losses, val_losses, train_accs, val_accs, examples_seen

In [None]:
import time

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Move the model to the target device before the optimizer is created, so the
# optimizer is built from parameters that already live on that device.
model.to(device)

start_time = time.time()

torch.manual_seed(123)

optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, weight_decay=0.1)

# Hyperparameters are passed by keyword so the call documents itself.
train_losses, val_losses, train_accs, val_accs, examples_seen = training(
    model,
    num_epochs=6,
    optimizer=optimizer,
    eval_freq=50,
    train_loader=train_loader,
    val_loader=val_loader,
    device=device,
    eval_iter=5,
)

end_time = time.time()
execution_time_minutes = (end_time - start_time) / 60
print(f"Training completed in {execution_time_minutes:.2f} minutes.")

In [None]:
def classify_review(text, model, tokenizer, device, max_length=None, pad_token_id=50256):
    """Classify a single text as ``"spam"`` or ``"not spam"``.

    Args:
        text: Text to classify.
        model: Callable whose output, indexed with ``[0]``, gives logits that
            are read at the last sequence position.
        tokenizer: Object whose ``encode(text)`` returns a list of token ids.
        device: Device on which the input tensor is created.
        max_length: When given, the token ids are truncated to this length
            and right-padded up to it. ``None`` (default) feeds the ids
            unchanged.
        pad_token_id: Token id used for right-padding.

    Returns:
        ``"spam"`` if the predicted class index is 1, otherwise ``"not spam"``.
    """
    model.eval()

    input_ids = tokenizer.encode(text)
    if max_length is not None:
        # Truncate first so an over-long text cannot exceed max_length, then
        # pad short texts up to exactly max_length.
        input_ids = input_ids[:max_length]
        input_ids = input_ids + [pad_token_id] * (max_length - len(input_ids))
    input_tensor = torch.tensor(input_ids, device=device).unsqueeze(0)

    with torch.no_grad():
        logits = model(input_tensor)[0][:, -1, :]

    predicted_label = torch.argmax(logits, dim=-1).item()

    return "spam" if predicted_label == 1 else "not spam"

In [None]:
# Sample message to classify.
text_1 = (
    "We are pleased to announce that regular academic activities will resume in Face-to-Face (FTF) mode at all H-12 institutions, CoEME, and MCS, effective Thursday, November 28, 2024."
)

# Pad the input to the sequence length used by the training dataset.
print(classify_review(text_1, model, tokenizer, device, max_length=train_dataset.max_length))

In [None]:
!pip install huggingface_hub

import torch
from huggingface_hub import notebook_login, Repository
import shutil
import os
import json

# Step 1: Save the fine-tuned model and tokenizer
model_save_path = "/content/fine_tuned_gpt2.pth"
tokenizer_save_path = "/content/gpt2_tokenizer"

# Save the model weights
torch.save(model.state_dict(), model_save_path)

# Save the tokenizer
model.save_pretrained(tokenizer_save_path)

# Step 2: Log in to Hugging Face
notebook_login()

# Step 3: Clone the existing repository (Replace with your username and repo name)
username = "abdulrehman89OK"  # Replace with your username
repo_name = "spam_non_sapm_classifier"  # Replace with your repo name
repo_url = f"https://huggingface.co/{username}/{repo_name}"

# Clone the repository to the Colab environment
repo = Repository(local_dir="model_repo", clone_from=repo_url)

# Step 4: Verify that model and tokenizer files exist
if not os.path.exists(model_save_path):
    raise FileNotFoundError(f"Model file not found at {model_save_path}")
if not os.path.exists(tokenizer_save_path):
    raise FileNotFoundError(f"Tokenizer directory not found at {tokenizer_save_path}")

# Step 5: Overwrite the model and tokenizer files in the repo directory
shutil.copy(model_save_path, "model_repo/pytorch_model.bin")  # Overwrite with fine-tuned model
shutil.copytree(tokenizer_save_path, "model_repo/tokenizer", dirs_exist_ok=True)  # Overwrite tokenizer

# Step 6: Optional: Update the `config.json` file if needed (if your model config has changed)
config_path = "model_repo/config.json"
config = {
    "model_type": "gpt2",
    "num_labels": 2,  # Number of classification labels (spam and ham)
    "id2label": {0: "ham", 1: "spam"},
    "label2id": {"ham": 0, "spam": 1},
}
with open(config_path, "w") as f:
    json.dump(config, f)

# Step 7: Git configuration and pushing to Hugging Face
!git config --global user.email "abdulrehmanpti12@gmail.com"  # Replace with your email
!git config --global user.name "abdulrehman89OK"  # Replace with your username

# Commit and push the updates to Hugging Face
repo.push_to_hub(commit_message="Updated model with fine-tuned GPT-2 and classification head")




VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

For more details, please read https://huggingface.co/docs/huggingface_hub/concepts/git_vs_http.
Cloning https://huggingface.co/abdulrehman89OK/spam_non_sapm_classifier into local empty directory.


Download file pytorch_model.bin:   0%|          | 8.00k/475M [00:00<?, ?B/s]

Download file tokenizer/model.safetensors:   0%|          | 32.0k/475M [00:00<?, ?B/s]

Clean file pytorch_model.bin:   0%|          | 1.00k/475M [00:00<?, ?B/s]

Clean file tokenizer/model.safetensors:   0%|          | 1.00k/475M [00:00<?, ?B/s]

FileNotFoundError: Model file not found at /content/fine_tuned_gpt2.pth