In [5]:
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
from datasets import Dataset
import pandas as pd


In [6]:
df = pd.read_csv("genz_slang.csv")

# Lowercase the text so every term has a single spelling.
# Only string-typed columns are touched: the `.str` accessor raises
# AttributeError on numeric columns (e.g. an integer index column that was
# written into the CSV), which would abort the whole cell.
for col in df.columns:
    if pd.api.types.is_string_dtype(df[col].dtype):
        df[col] = df[col].str.lower()

display(df)

Unnamed: 0,Slang,Description,Example,Context
0,w,shorthand for win,"got the job today, big w!",typically used in conversations to celebrate s...
1,l,shorthand for loss/losing,"i forgot my wallet at home, that’s an l.",often used when referring to a failure or mish...
2,l+ratio,response to a comment or action on the interne...,your tweet got 5 likes and 100 replies calling...,popularized on social media platforms to signi...
3,dank,excellent or of very high quality,that meme is so dank!,commonly used in internet slang to refer to me...
4,cheugy,derogatory term for millennials. used when mil...,"that phrase is so cheugy, no one says that any...",used to refer to things that were once popular...
...,...,...,...,...
1774,zh,sleeping hour,"it’s zh, goodnight!",refers to the time when someone usually goes t...
1775,zomg,oh my god,"zomg, i can’t believe you did that!","an exaggerated or enthusiastic version of ""omg..."
1776,zot,zero tolerance,our school has a zot policy for bullying.,refers to a strict policy where certain behavi...
1777,zup,what’s up?,"hey, zup with you today?",a casual way to ask how someone is doing or wh...


In [7]:
# Slang terms and their descriptions as plain Python lists
# (neither list is referenced again in the cells visible here)
train_texts, train_labels = (df[column].tolist() for column in ('Slang', 'Description'))

# Tokenizer and seq2seq model are both loaded from the same pretrained checkpoint
checkpoint = 't5-small'
tokenizer = T5Tokenizer.from_pretrained(checkpoint)
model = T5ForConditionalGeneration.from_pretrained(checkpoint)

# Tokenize the data
def tokenize_function(examples, max_input_length=32, max_target_length=128):
    """Tokenize a batch of slang terms and their descriptions for training.

    Args:
        examples: Batch mapping with 'Slang' and 'Description' entries, each a
            list of strings (the form `Dataset.map(..., batched=True)` passes).
        max_input_length: Token length every slang term is padded/truncated to.
        max_target_length: Token length every description is padded/truncated to.

    Returns:
        The tokenizer output for the slang terms, with an added "labels" entry
        holding the description token ids where padding is replaced by -100.
    """
    # An explicit max_length matters: with padding="max_length" and no length
    # given, the tokenizer pads to the model's maximum input size, which makes
    # every short slang term as expensive as a full-length document.
    inputs = tokenizer(
        examples['Slang'],
        padding="max_length",
        truncation=True,
        max_length=max_input_length,
    )
    targets = tokenizer(
        examples['Description'],
        padding="max_length",
        truncation=True,
        max_length=max_target_length,
    )

    # -100 is the label value the model's loss ignores; leaving the pad token
    # id in place would train the model to predict padding.
    pad_id = tokenizer.pad_token_id
    inputs["labels"] = [
        [token_id if token_id != pad_id else -100 for token_id in sequence]
        for sequence in targets["input_ids"]
    ]
    return inputs

# Wrap the DataFrame in a Hugging Face Dataset
dataset = Dataset.from_pandas(df)

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable
train_test_split = dataset.train_test_split(test_size=0.2, seed=42)
train_dataset, eval_dataset = train_test_split['train'], train_test_split['test']

# Run the tokenizer over each split in batches
tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True)
tokenized_eval_dataset = eval_dataset.map(tokenize_function, batched=True)

# Evaluate and checkpoint once per epoch, then reload the checkpoint with the
# lowest evaluation loss when training finishes
training_config = dict(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
)
training_args = TrainingArguments(**training_config)

# Bind the model, its configuration and both tokenized splits together
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_eval_dataset,
)

# Kick off fine-tuning
trainer.train()

Map: 100%|██████████| 1423/1423 [00:00<00:00, 3833.88 examples/s]
Map: 100%|██████████| 356/356 [00:00<00:00, 7003.75 examples/s]
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [None]:
# Write the current model weights and the tokenizer files into one directory
# so both can be reloaded together later
model_path = "./results/final_model"
for artifact in (model, tokenizer):
    artifact.save_pretrained(model_path)

# Function to translate a sentence using the trained model
def translate_sentence(sentence):
    """Generate the model's translation for a single slang sentence.

    Args:
        sentence: The text to translate, as one string.

    Returns:
        The decoded text of the first generated sequence, with special tokens
        stripped.
    """
    # The training data was lowercased when the CSV was loaded, so normalise
    # the query the same way before tokenizing.
    inputs = tokenizer(
        sentence.lower(),
        return_tensors="pt",
        padding=True,
        truncation=True,
    )

    # Trainer may have moved the model to an accelerator; generate() needs the
    # input tensors on the same device as the model weights.
    inputs = inputs.to(model.device)

    # Beam search without sampling is deterministic, so no temperature is set
    # (temperature only affects sampling-based decoding).
    output = model.generate(
        input_ids=inputs.input_ids,
        attention_mask=inputs.attention_mask,
        max_length=50,  # Adjust as needed for your translations
        num_beams=4,    # Beam search for better results
        no_repeat_ngram_size=2,
        early_stopping=True,
    )

    # Only one sentence went in, so only the first sequence is decoded
    return tokenizer.decode(output[0], skip_special_tokens=True)

# A handful of sample phrases to sanity-check the model
test_sentences = ["This party is lit", "No cap", "That's fire", "I'm so dead", "She ate that"]

# Print each phrase next to the model's output, with a rule between entries
separator = "-" * 40
print("Example translations:", separator, sep="\n")
for sentence in test_sentences:
    translation = translate_sentence(sentence)
    print(f"Input:  {sentence}\nOutput: {translation}\n{separator}")

# Interactive testing
def interactive_translation():
    """Prompt for sentences in a loop and print the model's translation of each.

    The loop ends when the user types 'exit' (any case, surrounding whitespace
    ignored) or when standard input is closed. Blank lines are skipped rather
    than being sent to the model.
    """
    while True:
        try:
            user_input = input("\nEnter a sentence to translate (or 'exit' to quit): ")
        except EOFError:
            # No more input available (e.g. stdin closed): leave the loop quietly
            break

        user_input = user_input.strip()
        if user_input.lower() == 'exit':
            break
        if not user_input:
            # Nothing to translate; ask again
            continue

        translation = translate_sentence(user_input)
        print(f"Translation: {translation}")

# Run interactive testing
# NOTE: interactive_translation() waits on input() until the user types 'exit',
# so this cell will stall an unattended "Run All".
print("\nInteractive translation mode:")
interactive_translation()