In [None]:
from datasets import load_dataset
import pandas as pd

# Load the Gen-Z slang dataset from the Hugging Face Hub and convert its
# 'train' split to a pandas DataFrame; the last line previews the first rows.
dataset = load_dataset("MLBtrio/genz-slang-dataset")
df = dataset['train'].to_pandas()
df.head()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Unnamed: 0,Slang,Description,Example,Context
0,W,Shorthand for win,"Got the job today, big W!",Typically used in conversations to celebrate s...
1,L,Shorthand for loss/losing,"I forgot my wallet at home, that’s an L.",Often used when referring to a failure or mish...
2,L+ratio,Response to a comment or action on the interne...,Your tweet got 5 likes and 100 replies calling...,Popularized on social media platforms to signi...
3,Dank,excellent or of very high quality,That meme is so dank!,Commonly used in internet slang to refer to me...
4,Cheugy,Derogatory term for Millennials. Used when mil...,"That phrase is so cheugy, no one says that any...",Used to refer to things that were once popular...


In [None]:
# Display the train split's summary (feature names and number of rows).
dataset['train']

Dataset({
    features: ['Slang', 'Description', 'Example', 'Context'],
    num_rows: 1779
})

In [None]:
# Count missing values in each column.
df.isnull().sum()

Unnamed: 0,0
Slang,0
Description,0
Example,0
Context,0


In [None]:
import nltk
# Fetch the NLTK 'punkt' and 'punkt_tab' resources
# (presumably what `word_tokenize` needs at runtime — TODO confirm for the installed NLTK version).
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [None]:
import re
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
import nltk
from nltk.tokenize import word_tokenize
# NOTE(review): 'punkt' is already downloaded by the previous cell, so this call is redundant.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
def clean_and_tokenize(text):
    """Strip punctuation from ``text``, lowercase it and split it into tokens.

    Every character that is not an ASCII letter, a digit, whitespace or a
    straight apostrophe is deleted before tokenizing, so hyphens, curly quotes
    and other symbols vanish (e.g. "e-boy" becomes "eboy").

    Returns:
        The token list produced by ``word_tokenize`` on the cleaned, lowercased text.
    """
    stripped = re.sub(r"[^a-zA-Z0-9\s']", '', text)
    lowered = stripped.lower()
    return word_tokenize(lowered)

In [None]:
def tag_sentence(example, slang):
    """Tokenize ``example`` and BIO-tag every occurrence of ``slang`` inside it.

    Both strings are normalised with ``clean_and_tokenize`` so they are compared
    token by token in the same form.

    Args:
        example: Sentence that (usually) contains the slang term.
        slang: The slang term; may consist of several tokens.

    Returns:
        ``(tokens, tags)`` — two lists of equal length. The first token of each
        match is tagged ``'B-SLANG'``, the remaining tokens of the match
        ``'I-SLANG'`` and everything else ``'O'``.
    """
    tokens = clean_and_tokenize(example)
    tags = ['O'] * len(tokens)
    slang_tokens = clean_and_tokenize(slang)
    span = len(slang_tokens)

    # A slang term that cleans down to nothing (e.g. only punctuation) would compare
    # equal to every empty slice and mark *all* tokens as B-SLANG; leave it untagged.
    if span == 0:
        return tokens, tags

    # Only windows that fit entirely inside the sentence can match.
    for start in range(len(tokens) - span + 1):
        if tokens[start:start + span] == slang_tokens:
            tags[start] = 'B-SLANG'
            tags[start + 1:start + span] = ['I-SLANG'] * (span - 1)
    return tokens, tags

In [None]:
# Sanity-check the tagger on the first row: show each token next to its tag.
sample = df.iloc[0]
tokens, tags = tag_sentence(sample['Example'], sample['Slang'])
list(zip(tokens, tags))

[('got', 'O'),
 ('the', 'O'),
 ('job', 'O'),
 ('today', 'O'),
 ('big', 'O'),
 ('w', 'B-SLANG')]

In [None]:
# Build the labelled corpus: one (tokens, tags) pair per DataFrame row.
data = []
for _, row in df.iterrows():
    tokens, tags = tag_sentence(row['Example'], row['Slang'])
    # Skip rows whose example produced no tokens at all.
    if len(tokens) > 0:
        data.append((tokens, tags))

print(f"Prepared {len(data)} labeled sentences.")

Prepared 1779 labeled sentences.


In [None]:
from collections import Counter

# Vocabulary: ids 0 and 1 are reserved for padding / unknown words, every
# token seen in `data` gets an id from 2 upwards, most frequent first.
word_counts = Counter(word for sent, _ in data for word in sent)
vocab = {
    word: idx
    for idx, (word, _) in enumerate(word_counts.most_common(), start=2)
}
vocab['<PAD>'] = 0
vocab['<UNK>'] = 1

# Tag <-> id lookup tables for the BIO scheme.
tag2id = {'O': 0, 'B-SLANG': 1, 'I-SLANG': 2}
id2tag = {idx: tag for tag, idx in tag2id.items()}

In [None]:
from torch.nn.utils.rnn import pad_sequence

MAX_LEN = 40  # fixed sequence length; could instead be derived from the data's length distribution

def encode_sentence(tokens, tags):
    """Convert parallel token / tag lists into two id tensors.

    Tokens missing from ``vocab`` are mapped to the ``'<UNK>'`` id; tags are
    looked up in ``tag2id``.

    Returns:
        ``(token_ids, tag_ids)`` as ``torch.tensor`` objects built from the id lists.
    """
    encoded_tokens = []
    for tok in tokens:
        encoded_tokens.append(vocab.get(tok, vocab['<UNK>']))

    encoded_tags = []
    for tag in tags:
        encoded_tags.append(tag2id[tag])

    return torch.tensor(encoded_tokens), torch.tensor(encoded_tags)

In [None]:
class SlangNERDataset(Dataset):
    """Dataset over ``(tokens, tags)`` pairs that yields fixed-length id tensors."""

    def __init__(self, data, max_len=MAX_LEN):
        """Keep a reference to ``data`` and the length every item is cut/padded to."""
        self.max_len = max_len
        self.data = data

    def __len__(self):
        """Number of labelled sentences."""
        return len(self.data)

    def __getitem__(self, idx):
        """Encode item ``idx`` and bring both tensors to exactly ``max_len`` entries.

        Returns a dict with ``'input_ids'`` (right-padded with F.pad's default
        fill) and ``'labels'`` (right-padded with the id of ``'O'``, so padded
        positions carry the same label as genuine non-slang tokens).
        """
        tokens, tags = self.data[idx]
        token_ids, tag_ids = encode_sentence(tokens, tags)

        limit = self.max_len
        token_ids, tag_ids = token_ids[:limit], tag_ids[:limit]

        pad = torch.nn.functional.pad
        input_ids = pad(token_ids, (0, limit - len(token_ids)))
        labels = pad(tag_ids, (0, limit - len(tag_ids)), value=tag2id['O'])
        return {'input_ids': input_ids, 'labels': labels}

In [None]:
# Hold out 10% of the labelled sentences for validation (fixed random_state for a repeatable split).
train_data, val_data = train_test_split(data, test_size=0.1, random_state=42)
train_dataset = SlangNERDataset(train_data)
val_dataset = SlangNERDataset(val_data)

# Only the training batches are shuffled.
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32)

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from tqdm.auto import tqdm

In [None]:
class BiLSTM_NER(nn.Module):
    """Token tagger: embedding -> bidirectional LSTM -> dropout -> linear layer."""

    def __init__(self, vocab_size, tagset_size, embedding_dim=100, hidden_dim=128, dropout=0.3):
        """Create the layers.

        Args:
            vocab_size: Number of rows in the embedding table (index 0 is the padding index).
            tagset_size: Number of output tags per token.
            embedding_dim: Size of each token embedding.
            hidden_dim: Hidden size of the LSTM per direction.
            dropout: Dropout probability applied to the LSTM outputs.
        """
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(
            embedding_dim,
            hidden_dim,
            batch_first=True,
            bidirectional=True,
        )
        self.dropout = nn.Dropout(dropout)
        # Forward and backward states are concatenated, hence 2 * hidden_dim inputs.
        self.fc = nn.Linear(hidden_dim * 2, tagset_size)

    def forward(self, input_ids):
        """Return per-token tag logits for a batch of token-id sequences."""
        sequence_states, _ = self.lstm(self.embedding(input_ids))
        return self.fc(self.dropout(sequence_states))

In [None]:
# Run on the GPU when one is available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

vocab_size = len(vocab)
tagset_size = len(tag2id)

model = BiLSTM_NER(vocab_size, tagset_size).to(device)
# Do NOT pass ignore_index=tag2id['O'] here: that removes every 'O' position from the
# loss, so the model is never penalised for putting a slang tag on an ordinary word and
# ends up tagging almost everything as slang. Padding positions are labelled 'O' by
# SlangNERDataset, so they are simply trained as 'O' as well.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)

In [None]:
def train_model(model, train_loader, val_loader, criterion, optimizer, epochs=10):
    """Train ``model`` for ``epochs`` passes and print train/validation loss per epoch.

    Each batch is a dict with ``'input_ids'`` and ``'labels'``; logits and labels
    are flattened so ``criterion`` sees one row per token position. Validation
    loss comes from ``evaluate_model``. Nothing is returned.
    """
    for epoch_idx in range(epochs):
        epoch_no = epoch_idx + 1
        model.train()
        running_loss = 0

        progress = tqdm(train_loader, desc=f"Epoch {epoch_no}/{epochs}")
        for batch in progress:
            optimizer.zero_grad()
            input_ids = batch['input_ids'].to(device)
            labels = batch['labels'].to(device)

            logits = model(input_ids)
            num_tags = logits.shape[-1]
            loss = criterion(logits.view(-1, num_tags), labels.view(-1))

            loss.backward()
            optimizer.step()
            running_loss += loss.item()

        # Mean of the per-batch losses for this epoch.
        avg_train_loss = running_loss / len(train_loader)
        val_loss = evaluate_model(model, val_loader, criterion)
        print(f"Epoch {epoch_no}: Train Loss = {avg_train_loss:.4f} | Val Loss = {val_loss:.4f}")

In [None]:
def evaluate_model(model, val_loader, criterion):
    """Return the mean per-batch ``criterion`` loss of ``model`` over ``val_loader``.

    The model is switched to eval mode and gradients are disabled while the
    batches are scored.
    """
    model.eval()
    batch_losses = []
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            labels = batch['labels'].to(device)

            logits = model(input_ids)
            flat_logits = logits.view(-1, logits.shape[-1])
            flat_labels = labels.view(-1)
            batch_losses.append(criterion(flat_logits, flat_labels).item())
    return sum(batch_losses) / len(val_loader)

In [None]:
# Train the BiLSTM tagger for 8 epochs, printing train/validation loss after each one.
train_model(model, train_loader, val_loader, criterion, optimizer, epochs=8)

Epoch 1/8:   0%|          | 0/51 [00:00<?, ?it/s]

Epoch 1: Train Loss = 0.3399 | Val Loss = 0.1440


Epoch 2/8:   0%|          | 0/51 [00:00<?, ?it/s]

Epoch 2: Train Loss = 0.0866 | Val Loss = 0.1258


Epoch 3/8:   0%|          | 0/51 [00:00<?, ?it/s]

Epoch 3: Train Loss = 0.0719 | Val Loss = 0.1333


Epoch 4/8:   0%|          | 0/51 [00:00<?, ?it/s]

Epoch 4: Train Loss = 0.0875 | Val Loss = 0.1268


Epoch 5/8:   0%|          | 0/51 [00:00<?, ?it/s]

Epoch 5: Train Loss = 0.0348 | Val Loss = 0.1355


Epoch 6/8:   0%|          | 0/51 [00:00<?, ?it/s]

Epoch 6: Train Loss = 0.0185 | Val Loss = 0.1397


Epoch 7/8:   0%|          | 0/51 [00:00<?, ?it/s]

Epoch 7: Train Loss = 0.0100 | Val Loss = 0.1524


Epoch 8/8:   0%|          | 0/51 [00:00<?, ?it/s]

Epoch 8: Train Loss = 0.0074 | Val Loss = 0.1491


In [None]:
from sklearn.metrics import classification_report

def evaluate_predictions(model, data_loader):
    """Print a per-tag classification report for ``model`` on ``data_loader``.

    Every real (non-padding) token is scored, including 'O' tokens, so slang tags
    predicted on ordinary words count against precision. Positions whose input id
    equals ``vocab['<PAD>']`` are skipped.

    ``model`` is called as ``model(input_ids)`` and its output is arg-maxed over the
    last dimension to obtain one predicted tag id per token.
    """
    model.eval()
    all_preds, all_labels = [], []
    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            labels = batch['labels'].to(device)
            preds = torch.argmax(model(input_ids), dim=-1)

            # Select real tokens via the input ids instead of filtering on the gold
            # label: dropping gold-'O' positions hides every false positive.
            real_tokens = input_ids != vocab['<PAD>']
            all_preds.extend(preds[real_tokens].tolist())
            all_labels.extend(labels[real_tokens].tolist())

    # Pass the label ids explicitly so target_names always lines up with them; without
    # `labels=` the report raises when a tag is absent from this split.
    print(classification_report(
        all_labels,
        all_preds,
        labels=list(tag2id.values()),
        target_names=list(tag2id.keys()),
        zero_division=0,
    ))

In [None]:
from sklearn.metrics import classification_report

def evaluate_predictions(model, data_loader):
    """Print precision / recall / F1 per tag for ``model`` on ``data_loader``.

    All real tokens are evaluated — 'O' included — and only padding positions
    (input id equal to ``vocab['<PAD>']``) are left out. Filtering on the gold
    label instead would discard every 'O' token and make a model that tags
    everything as slang look almost perfect.

    NOTE(review): this redefines ``evaluate_predictions`` from the previous cell;
    the earlier copy is shadowed and can be deleted.
    """
    model.eval()
    all_preds, all_labels = [], []
    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            labels = batch['labels'].to(device)
            logits = model(input_ids)
            preds = torch.argmax(logits, dim=-1)

            # Boolean mask over (batch, position): True where the token is not padding.
            keep = input_ids != vocab['<PAD>']
            all_preds.extend(preds[keep].tolist())
            all_labels.extend(labels[keep].tolist())

    # Explicitly provide label ids and their names so rows stay aligned even when a
    # tag has no support in this split.
    label_ids = list(tag2id.values())
    target_names = list(tag2id.keys())

    print(classification_report(
        all_labels,
        all_preds,
        labels=label_ids,
        target_names=target_names,
        zero_division=0
    ))

In [None]:
# Report per-tag metrics on the validation split.
evaluate_predictions(model, val_loader)

              precision    recall  f1-score   support

           O       0.00      0.00      0.00         0
     B-SLANG       0.97      0.99      0.98       170
     I-SLANG       0.00      0.00      0.00         5

    accuracy                           0.97       175
   macro avg       0.32      0.33      0.33       175
weighted avg       0.94      0.97      0.95       175



In [None]:
def predict_entities(sentence):
    """Tag every token of ``sentence`` with the global ``model``.

    The sentence is tokenized with ``clean_and_tokenize``; tokens missing from
    ``vocab`` are fed in as ``'<UNK>'``.

    Returns:
        A list of ``(token, tag_name)`` pairs, or ``[]`` when the sentence has
        no tokens left after cleaning.
    """
    tokens = clean_and_tokenize(sentence)
    # With no tokens, torch.tensor([[]]) is a float tensor of shape (1, 0), which the
    # embedding layer rejects; there is nothing to tag anyway.
    if not tokens:
        return []

    input_ids = torch.tensor([[vocab.get(t, vocab['<UNK>']) for t in tokens]], device=device)
    model.eval()  # make sure dropout is disabled regardless of what ran before
    with torch.no_grad():
        logits = model(input_ids)
        preds = torch.argmax(logits, dim=-1).squeeze(0).tolist()
    return list(zip(tokens, [id2tag[i] for i in preds]))

In [None]:
# Try the trained tagger on a hand-written sentence.
predict_entities("That concert was a huge W last night!")

[('that', 'B-SLANG'),
 ('concert', 'B-SLANG'),
 ('was', 'B-SLANG'),
 ('a', 'B-SLANG'),
 ('huge', 'B-SLANG'),
 ('w', 'B-SLANG'),
 ('last', 'I-SLANG'),
 ('night', 'B-SLANG')]

## Using CRF

In [None]:
# NOTE(review): the package version is not pinned; consider pinning it (pytorch-crf==<version>) for reproducibility.
!pip install pytorch-crf



In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchcrf import CRF
from tqdm.auto import tqdm

In [None]:
def tag_sentence(example, slang):
    """Lowercase and tokenize ``example`` and BIO-tag each occurrence of ``slang``.

    The slang term is tokenized the same way and matched as a whole-token
    sequence. A substring test (``slang in tok``) is deliberately avoided: it
    tags unrelated words (slang "l" is contained in "wallet"), tags every token
    when the slang is empty, and can never match a multi-word slang term.

    Returns:
        ``(tokens, tags)`` — equal-length lists; a match starts with
        ``'B-SLANG'``, continues with ``'I-SLANG'`` and all other tokens are ``'O'``.

    NOTE(review): this tokenizes with ``word_tokenize`` directly, so punctuation
    tokens are kept, unlike ``clean_and_tokenize``. Also, in the cells shown the
    notebook does not rebuild ``data`` after this redefinition, so the existing
    loaders still carry labels from the earlier tagger — confirm that is intended.
    """
    tokens = word_tokenize(example.lower())
    slang_tokens = word_tokenize(slang.lower())
    tags = ['O'] * len(tokens)
    span = len(slang_tokens)

    # Nothing to look for: leave the sentence untagged.
    if span == 0:
        return tokens, tags

    for start in range(len(tokens) - span + 1):
        if tokens[start:start + span] == slang_tokens:
            tags[start] = 'B-SLANG'
            tags[start + 1:start + span] = ['I-SLANG'] * (span - 1)
    return tokens, tags

In [None]:
class BiLSTM_CRF_NER(nn.Module):
    """BiLSTM tagger whose per-token scores are fed to a CRF layer."""

    def __init__(self, vocab_size, tagset_size, embedding_dim=100, hidden_dim=128, dropout=0.3):
        """Create embedding, bidirectional LSTM, dropout, linear projection and CRF layers.

        Args:
            vocab_size: Number of rows in the embedding table (index 0 is the padding index).
            tagset_size: Number of tags scored per token and modelled by the CRF.
            embedding_dim: Size of each token embedding.
            hidden_dim: Hidden size of the LSTM per direction.
            dropout: Dropout probability applied to the LSTM outputs.
        """
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(
            embedding_dim,
            hidden_dim,
            batch_first=True,
            bidirectional=True,
        )
        self.dropout = nn.Dropout(dropout)
        # Both LSTM directions are concatenated, hence 2 * hidden_dim inputs.
        self.fc = nn.Linear(hidden_dim * 2, tagset_size)
        self.crf = CRF(tagset_size, batch_first=True)

    def forward(self, input_ids, labels=None, mask=None):
        """Return a training loss when ``labels`` is given, otherwise decoded tags.

        With ``labels``: the negated value of ``self.crf(...)`` computed with
        ``reduction='mean'``. Without ``labels``: whatever ``self.crf.decode``
        returns for the emissions (callers index it per sentence).
        """
        sequence_states, _ = self.lstm(self.embedding(input_ids))
        emissions = self.fc(self.dropout(sequence_states))

        if labels is None:
            return self.crf.decode(emissions, mask=mask)
        return -self.crf(emissions, labels, mask=mask, reduction='mean')

In [None]:
# Run on the GPU when one is available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
vocab_size = len(vocab)
tagset_size = len(tag2id)

# Rebinds the global `model` (and `optimizer`) to a fresh BiLSTM-CRF, replacing the earlier BiLSTM tagger.
model = BiLSTM_CRF_NER(vocab_size, tagset_size).to(device)
optimizer = optim.Adam(model.parameters(), lr=1e-3)

In [None]:
def train_model(model, train_loader, val_loader, optimizer, epochs=10):
    """Train a model that returns its own loss and print per-epoch train/val loss.

    ``model(input_ids, labels=..., mask=...)`` must return a scalar loss. The
    mask marks positions whose input id is non-zero. Gradients are clipped to a
    total norm of 5.0 before each optimizer step. Nothing is returned.
    """
    for epoch_idx in range(epochs):
        epoch_no = epoch_idx + 1
        model.train()
        running_loss = 0

        progress = tqdm(train_loader, desc=f"Epoch {epoch_no}/{epochs}")
        for batch in progress:
            optimizer.zero_grad()
            input_ids = batch['input_ids'].to(device)
            labels = batch['labels'].to(device)
            non_pad = input_ids != 0

            loss = model(input_ids, labels=labels, mask=non_pad)
            loss.backward()
            nn.utils.clip_grad_norm_(model.parameters(), max_norm=5.0)
            optimizer.step()
            running_loss += loss.item()

        val_loss = evaluate_model(model, val_loader)
        print(f"Epoch {epoch_no}: Train Loss = {running_loss/len(train_loader):.4f} | Val Loss = {val_loss:.4f}")

In [None]:
def evaluate_model(model, val_loader):
    """Return the mean per-batch loss reported by ``model`` over ``val_loader``.

    The model is put in eval mode and called with labels and a non-zero-id mask
    under ``torch.no_grad()``.
    """
    model.eval()
    batch_losses = []
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            labels = batch['labels'].to(device)
            non_pad = input_ids != 0
            batch_losses.append(model(input_ids, labels=labels, mask=non_pad).item())
    return sum(batch_losses) / len(val_loader)

In [None]:
# Train the BiLSTM-CRF tagger with the default number of epochs (10).
train_model(model, train_loader, val_loader, optimizer)

Epoch 1/10:   0%|          | 0/51 [00:00<?, ?it/s]

Epoch 1: Train Loss = 3.8162 | Val Loss = 2.5700


Epoch 2/10:   0%|          | 0/51 [00:00<?, ?it/s]

Epoch 2: Train Loss = 2.3132 | Val Loss = 2.2739


Epoch 3/10:   0%|          | 0/51 [00:00<?, ?it/s]

Epoch 3: Train Loss = 1.8115 | Val Loss = 2.0422


Epoch 4/10:   0%|          | 0/51 [00:00<?, ?it/s]

Epoch 4: Train Loss = 1.4184 | Val Loss = 1.9825


Epoch 5/10:   0%|          | 0/51 [00:00<?, ?it/s]

Epoch 5: Train Loss = 1.0639 | Val Loss = 1.9926


Epoch 6/10:   0%|          | 0/51 [00:00<?, ?it/s]

Epoch 6: Train Loss = 0.7617 | Val Loss = 2.0164


Epoch 7/10:   0%|          | 0/51 [00:00<?, ?it/s]

Epoch 7: Train Loss = 0.5214 | Val Loss = 1.9914


Epoch 8/10:   0%|          | 0/51 [00:00<?, ?it/s]

Epoch 8: Train Loss = 0.3470 | Val Loss = 2.2363


Epoch 9/10:   0%|          | 0/51 [00:00<?, ?it/s]

Epoch 9: Train Loss = 0.2546 | Val Loss = 2.2868


Epoch 10/10:   0%|          | 0/51 [00:00<?, ?it/s]

Epoch 10: Train Loss = 0.1732 | Val Loss = 2.4514


In [None]:
def predict_entities(sentence):
    """Tag every token of ``sentence`` with the global CRF ``model``.

    The sentence is tokenized with ``clean_and_tokenize``; tokens missing from
    ``vocab`` are fed in as ``'<UNK>'``. The model is called without labels, and
    the first decoded sequence is used.

    Returns:
        A list of ``(token, tag_name)`` pairs, or ``[]`` when the sentence has
        no tokens left after cleaning.
    """
    tokens = clean_and_tokenize(sentence)
    # An empty sentence would produce a (1, 0) float tensor that the model cannot
    # process; there is nothing to tag in that case.
    if not tokens:
        return []

    input_ids = torch.tensor([[vocab.get(t, vocab['<UNK>']) for t in tokens]], device=device)
    mask = input_ids != 0

    model.eval()  # make sure dropout is disabled regardless of what ran before
    with torch.no_grad():
        preds = model(input_ids, mask=mask)[0]

    return list(zip(tokens, [id2tag[i] for i in preds]))

In [None]:
def visualize_predictions(sentence):
    """Print the tokens of ``sentence`` on one line, slang tokens in green.

    A token counts as slang when its predicted tag contains "SLANG"; it is then
    wrapped in the ANSI escape codes for green text. Every token is followed by
    a single space and the line ends with a newline.
    """
    rendered = []
    for tok, tag in predict_entities(sentence):
        rendered.append(f"\033[92m{tok}\033[0m" if "SLANG" in tag else tok)  # green for slang
    print(''.join(piece + ' ' for piece in rendered))

# Demo: highlight the predicted slang tokens of one sentence.
visualize_predictions("He’s such an e-boy with those black nails and chains.")

hes such an [92meboy[0m with those black nails and chains 


In [None]:
# Print the (token, tag) predictions for a few hand-written sentences.
for sent in ["That’s a big W!", "No cap, that was wild", "This party was mid fr"]:
    print("\nSentence:", sent)
    print(predict_entities(sent))


Sentence: That’s a big W!
[('thats', 'O'), ('a', 'O'), ('big', 'O'), ('w', 'B-SLANG')]

Sentence: No cap, that was wild
[('no', 'O'), ('cap', 'O'), ('that', 'O'), ('was', 'O'), ('wild', 'O')]

Sentence: This party was mid fr
[('this', 'O'), ('party', 'O'), ('was', 'O'), ('mid', 'B-SLANG'), ('fr', 'O')]
