<a href="https://colab.research.google.com/github/anticuch0/.ai/blob/main/FT_LLM_Second_Exercise.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Category 1: Dataset preparation

1. Synthetic Dataset Creation and Augmentation

In [4]:
import random

# Positiivisia fraaseja
positive_phrases = [
    "This is fantastic!",
    "I am extremely satisfied.",
    "Great product and amazing quality.",
    "I would definitely recommend this to others.",
    "Exceeded my expectations!",
    "Fantastic value for the price.",
    "Very well made and durable.",
    "Service was excellent and quick.",
    "An absolute pleasure to use.",
    "I couldn't be happier with this purchase.",
    "Absolutely love this product!",
    "Top-notch quality and performance.",
    "Best purchase I've made in a long time.",
    "This has made my life so much easier.",
    "Five stars all the way!",
    "Outstanding customer service!",
    "Totally worth the price.",
    "High quality and very reliable.",
    "A must-have for everyone.",
    "I am beyond happy with this!"
]

# Negatiivisia fraaseja
negative_phrases = [
    "This is terrible.",
    "I am extremely disappointed.",
    "Poor quality and not worth the price.",
    "I would not recommend this to anyone.",
    "Completely failed to meet my expectations.",
    "Terrible value for money.",
    "Broke after a few uses.",
    "Service was slow and unhelpful.",
    "Not at all what I expected.",
    "This was a complete waste of money.",
    "This was a huge disappointment.",
    "The quality is shockingly bad.",
    "I regret buying this product.",
    "Nothing about this works as it should.",
    "Save your money and avoid this.",
    "The worst experience I've ever had.",
    "Very poor craftsmanship and design.",
    "Completely unusable and frustrating.",
    "I can't believe how bad this is.",
    "This is a total rip-off."
]


dataset = []

for _ in range(250):
  dataset.append(random.choice(positive_phrases))
  dataset.append(random.choice(negative_phrases))

print(f"First 5 sentences:\n{dataset[:5]}")

First 5 sentences:
['Service was excellent and quick.', 'Completely unusable and frustrating.', 'Absolutely love this product!', 'Service was slow and unhelpful.', 'Five stars all the way!']


In [5]:
!pip install nlpaug
import nlpaug.augmenter.word as naw
from nltk.corpus import wordnet
import nltk
nltk.download('averaged_perceptron_tagger_eng')


# Lataa WordNet-sanakirja
nltk.download('wordnet')

# Augmentointimenetelmät
synonym_aug = naw.SynonymAug(aug_src='wordnet')  # Synonyymien korvaus WordNetin avulla
insertion_aug = naw.ContextualWordEmbsAug(model_path='bert-base-uncased', action="insert")  # Sanan lisääminen
deletion_aug = naw.RandomWordAug(action="delete")  # Sanojen poisto

augmented_dataset = []

for sentence in dataset:
  synonym_augmented = synonym_aug.augment(sentence)
  insertion_augmented = insertion_aug.augment(sentence)
  deletion_augmented = deletion_aug.augment(sentence)

  augmented_dataset.extend(synonym_augmented)
  augmented_dataset.extend(insertion_augmented)
  augmented_dataset.extend(deletion_augmented)

augmented_dataset = augmented_dataset[:500]

print(f"First 5 sentences of the augmented dataset: {augmented_dataset[:5]}")



[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


First 5 sentences of the augmented dataset: ['Service of process was excellent and quick.', 'our service detail was excellent and quick.', 'Service was excellent.', 'Entirely unuseable and frustrating.', 'now completely unusable... and wildly frustrating.']


2. Handling missing values


In [6]:
def simulate_missing_data(augmented_dataset, missing_percentage=0.1):
    missing_count = int(len(augmented_dataset) * missing_percentage)
    missing_indices = random.sample(range(len(augmented_dataset)), missing_count)
    dataset_with_missing = [sentence if i not in missing_indices else None for i, sentence in enumerate(augmented_dataset)]
    return dataset_with_missing, missing_indices

# Simulate missing values (10%)
dataset_with_missing, missing_indices = simulate_missing_data(augmented_dataset, missing_percentage=0.1)


2.2 Reconstructing the missing sentences with T5

In [7]:
!pip install transformers
from transformers import T5ForConditionalGeneration, T5Tokenizer

model_name = "t5-small"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

def reconstruct_sentences(dataset_with_missing):
  reconstructed_dataset = []
  for sentence in dataset_with_missing:
    if sentence is None:
      # Use T5 to reconstruct a missing sentence
      input_text = "reconstruct: Fill in the missing sentence based on context."
      input_ids = tokenizer.encode(input_text, return_tensors="pt")
      output_ids = model.generate(input_ids, max_length=20, num_beams=5, early_stopping=True)
      reconstructed_sentence = tokenizer.decode(output_ids[0], skip_special_tokens=True)
      reconstructed_dataset.append(reconstructed_sentence)
    else:
      reconstructed_dataset.append(sentence)
  return reconstructed_dataset

reconstructed_dataset = reconstruct_sentences(dataset_with_missing)

print(reconstructed_dataset)



tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

['Service of process was excellent and quick.', 'our service detail was excellent and quick.', 'Service was excellent.', 'Entirely unuseable and frustrating.', 'now completely unusable... and wildly frustrating.', 'Completely unusable.', 'Absolutely hump this mathematical product!', 'and absolutely love this specific product!', 'Love this product!', 'Service was dull and unhelpful.', 'service was only slow sometimes and most unhelpful.', 'Service slow and.', 'Five stars wholly the manner!', 'get five stars all the damn way!', 'Five stars the!', 'I repent corrupt this product.', 'i regret almost buying this fine product.', 'Regret this product.', 'High pitched quality and rattling reliable.', 'high good quality equipment and very reliable.', 'Quality and very reliable.', 'Non at all what I await.', 'still not at nearly all what what i expected.', 'At all I.', 'This be marvelous!', 'because this is both fantastic!', 'Is fantastic!', 'This be a full rip - off.', 'this book is a huge total

3. Kaggle Dataset Preprocessing

In [8]:
!pip install kaggle
!pip install datasets
!pip install scikit-learn
import pandas as pd


# Lataa Kaggle-datasetti
!kaggle datasets download -d lakshmi25npathi/imdb-dataset-of-50k-movie-reviews

!unzip imdb-dataset-of-50k-movie-reviews.zip

# Lue CSV-tiedosto
df = pd.read_csv("IMDB Dataset.csv")

# Tarkista datasetin sisältö
print(df.head())

# Poista tyhjät rivit
df.dropna(inplace=True)

# Muunna sentimentit binäärimuotoon
df['sentiment'] = df['sentiment'].map({'positive': 1, 'negative': 0})

print(df.head())



Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m13.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m12.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl 

3.2 Tokenizing with Hugging Face


In [9]:
from transformers import AutoTokenizer

# Lataa tokenisaattori
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Tokenoi datasetin tekstit
def tokenize_data(texts):
    return tokenizer(texts, padding=True, truncation=True, max_length=128, return_tensors="pt")

# Tokenisoi datasetti
texts = df['review'].tolist()
labels = df['sentiment'].tolist()
tokenized_data = tokenize_data(texts)

# Näytä esimerkki tokenisoinnista
print(tokenized_data.keys())


dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])


3.3 Training a baseline model and comparing it to a transformer model

In [10]:
#Train baseline model with Logistic Regression

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Jaa datasetti koulutus- ja testisetteihin
X_train, X_test, y_train, y_test = train_test_split(tokenized_data['input_ids'], labels, test_size=0.2, random_state=42)

# Kouluta Logistic Regression -malli
X_train_flat = [x.numpy().flatten() for x in X_train]
X_test_flat = [x.numpy().flatten() for x in X_test]

model = LogisticRegression(max_iter=1000)
model.fit(X_train_flat, y_train)

# Ennusta ja arvioi tarkkuus
predictions = model.predict(X_test_flat)
print("Logistic Regression Accuracy:", accuracy_score(y_test, predictions))


# Logistic Regression Accuracy: 0.5121


Logistic Regression Accuracy: 0.5121


3. Transformer Model

In [11]:
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
import torch

# Luo Dataset-objekti, jossa on input_ids, attention_mask ja labels
train_dataset = Dataset.from_dict({
    "input_ids": [x.tolist() for x in tokenized_data["input_ids"][:40000]],  # 80% koulutukseen
    "attention_mask": [x.tolist() for x in tokenized_data["attention_mask"][:40000]],
    "labels": labels[:40000]
})

eval_dataset = Dataset.from_dict({
    "input_ids": [x.tolist() for x in tokenized_data["input_ids"][40000:]],  # 20% testaukseen
    "attention_mask": [x.tolist() for x in tokenized_data["attention_mask"][40000:]],
    "labels": labels[40000:]
})

# Varmistetaan, että tiedot ovat oikeassa muodossa
print(train_dataset)
print(eval_dataset)

# 🔹 Lataa TinyBERT-malli
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments

model = AutoModelForSequenceClassification.from_pretrained("prajjwal1/bert-tiny", num_labels=2)

# 🔹 Määritä koulutusargumentit
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
)

# 🔹 Luo Trainer uudella Dataset-muodolla
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

# 🔹 Kouluta malli
trainer.train()

# Vertailu?

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 40000
})
Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 10000
})


config.json:   0%|          | 0.00/285 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/17.8M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-tiny and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


model.safetensors:   0%|          | 0.00/17.7M [00:00<?, ?B/s]



<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mantti-halme[0m ([33mantti-halme-tampere-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


Epoch,Training Loss,Validation Loss
1,0.4725,0.446375
2,0.4236,0.434181
3,0.4014,0.419194


TrainOutput(global_step=7500, training_loss=0.4611923400878906, metrics={'train_runtime': 164.084, 'train_samples_per_second': 731.333, 'train_steps_per_second': 45.708, 'total_flos': 38114611200000.0, 'train_loss': 0.4611923400878906, 'epoch': 3.0})

Category 2: Tokenization

In [None]:
# Category 2: Tokenization
!pip install datasets

from datasets import load_dataset

# Lataa esimerkkidatasetti (IMDB-elokuva-arvostelut)
dataset = load_dataset("imdb", split="train[:1000]")  # Käytä vain 1000 riviä testin nopeuttamiseksi
texts = dataset["text"]

from transformers import AutoTokenizer

# Lataa tokenisaattorit
bert_tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
gpt_tokenizer = AutoTokenizer.from_pretrained("gpt2")
roberta_tokenizer = AutoTokenizer.from_pretrained("roberta-base")

# Tokenisoi data
bert_tokens = bert_tokenizer(texts, padding=True, truncation=True, max_length=128, return_tensors="pt")
gpt_tokens = gpt_tokenizer(texts, padding=True, truncation=True, max_length=128, return_tensors="pt")
roberta_tokens = roberta_tokenizer(texts, padding=True, truncation=True, max_length=128, return_tensors="pt")

# Tulosta tokenisaatiotuloksia
print("BERT Tokenized Input IDs:", bert_tokens["input_ids"][:1])
print("GPT Tokenized Input IDs:", gpt_tokens["input_ids"][:1])
print("RoBERTa Tokenized Input IDs:", roberta_tokens["input_ids"][:1])







In [None]:
!pip install tokenizers

from datasets import load_dataset

# Lataa Wikipedia-artikkeleita sisältävä datasetti
wiki_dataset = load_dataset("wikipedia", "20220301.en", split="train[:1%]")  # Käytä 1 % datasta
texts = wiki_dataset["text"]


from tokenizers import Tokenizer, models, trainers, pre_tokenizers

# Luo tyhjä tokenisaattori
tokenizer = Tokenizer(models.WordPiece())

# Määritä esikäsittely ja tokenien pilkkominen
tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()

# Määritä kouluttaja
trainer = trainers.WordPieceTrainer(vocab_size=5000, special_tokens=["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"])

# Kouluta tokenisaattori
tokenizer.train_from_iterator(texts, trainer)

# Tallenna tokenisaattori tiedostoon
tokenizer.save("custom_tokenizer.json")

from tokenizers import Tokenizer

# Lataa tokenisaattori tiedostosta
custom_tokenizer = Tokenizer.from_file("custom_tokenizer.json")

# Tokenisoi uusi datasetti
new_texts = ["This is an example sentence.", "Custom tokenizers are useful."]
encoded = [custom_tokenizer.encode(text).tokens for text in new_texts]

print("Tokenized New Texts:", encoded)


Category 3: Pre-trained models

Fine-Tune DistilBERT for Sentiment Analysis

In [None]:
!pip install evaluate

from datasets import load_dataset

# Lataa IMDB-datasetti
dataset = load_dataset("imdb")
dataset = dataset.map(lambda x: {'label': 1 if x['label'] == 'positive' else 0})  # Muunna sentimentti binääriseksi

from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments

# Lataa DistilBERT ja tokenisaattori
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

# Tokenisoi dataset
def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True, padding="max_length", max_length=128)

tokenized_datasets = dataset.map(tokenize_function, batched=True)
tokenized_datasets = tokenized_datasets.remove_columns(["text"])
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
tokenized_datasets.set_format("torch")

# Määritä koulutusparametrit
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=1,
)

# Luo Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
)

# Kouluta malli
trainer.train()

# Arvioi malli
trainer.evaluate()


Binary classification task

In [None]:
import pandas as pd
from datasets import Dataset

# Lataa dataset
url = "https://raw.githubusercontent.com/datasciencedojo/datasets/master/sms.csv"
df = pd.read_csv(url, encoding="latin-1")
df = df.rename(columns={"v1": "label", "v2": "text"})
df["label"] = df["label"].map({"ham": 0, "spam": 1})  # Muunna binääriseksi
dataset = Dataset.from_pandas(df)

from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments

# Lataa BERT-tokenisaattori ja malli
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

# Tokenisoi dataset
def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True, padding="max_length", max_length=128)

tokenized_datasets = dataset.map(tokenize_function, batched=True)
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
tokenized_datasets.set_format("torch")

# Jaa dataset koulutus- ja testiosiin
train_test_split = tokenized_datasets.train_test_split(test_size=0.2)
train_dataset = train_test_split["train"]
test_dataset = train_test_split["test"]

# Määritä koulutusparametrit
training_args = TrainingArguments(
    output_dir="./sms_results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    num_train_epochs=1,
    weight_decay=0.01,
)

# Luo Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

# Kouluta malli
trainer.train()

# Arvioi malli
predictions = trainer.predict(test_dataset)

from sklearn.metrics import classification_report

# Ennusteet ja todelliset arvot
predicted_labels = predictions.predictions.argmax(axis=1)
true_labels = test_dataset["labels"]

# Laske metriikat
print(classification_report(true_labels, predicted_labels, target_names=["Ham", "Spam"]))


HTTPError: HTTP Error 404: Not Found

BLIP for Image Captioning



In [None]:
from transformers import BlipProcessor, BlipForConditionalGeneration
from PIL import Image

# Lataa BLIP-malli ja prosessori
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")

# Lataa kuvat
image_paths = ["image1.jpg", "image2.jpg", "image3.jpg", "image4.jpg", "image5.jpg"]
images = [Image.open(img_path).convert("RGB") for img_path in image_paths]

# Generoi kuvatekstit
captions = []
for image in images:
    inputs = processor(images=image, return_tensors="pt")
    outputs = model.generate(**inputs)
    captions.append(processor.decode(outputs[0], skip_special_tokens=True))

print("Generated Captions:", captions)

from transformers import MarianMTModel, MarianTokenizer

# Lataa MarianMT-malli
translation_model_name = "Helsinki-NLP/opus-mt-en-fi"  # Englannista suomeksi
tokenizer = MarianTokenizer.from_pretrained(translation_model_name)
model = MarianMTModel.from_pretrained(translation_model_name)

# Käännä kuvatekstit
translations = []
for caption in captions:
    inputs = tokenizer(caption, return_tensors="pt", padding=True, truncation=True)
    outputs = model.generate(**inputs)
    translation = tokenizer.decode(outputs[0], skip_special_tokens=True)
    translations.append(translation)

print("Translated Captions:", translations)



Hyperparameter Tuning with Ray Tune

In [None]:
!pip install ray[tune]

from datasets import load_dataset

# Lataa IMDB-datasetti
dataset = load_dataset("imdb")
dataset = dataset.map(lambda x: {'label': 1 if x['label'] == 'positive' else 0})  # Binäärimuotoilu

from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from ray import tune
from ray.tune.schedulers import ASHAScheduler
import numpy as np

# Lataa tokenisaattori
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenisoi data
def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True, padding="max_length", max_length=128)

tokenized_datasets = dataset.map(tokenize_function, batched=True)
tokenized_datasets = tokenized_datasets.remove_columns(["text"])
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
tokenized_datasets.set_format("torch")

# Jaa koulutus- ja validointijoukkoihin
train_test_split = tokenized_datasets["train"].train_test_split(test_size=0.2)
train_dataset = train_test_split["train"]
eval_dataset = train_test_split["test"]

def model_init():
    return AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

def tune_transformer(config):
    # Koulutusparametrit
    training_args = TrainingArguments(
        output_dir="./results",
        learning_rate=config["lr"],
        per_device_train_batch_size=config["batch_size"],
        num_train_epochs=3,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        logging_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="accuracy",
        disable_tqdm=True,
    )

    # Luo Trainer
    trainer = Trainer(
        model_init=model_init,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        tokenizer=tokenizer,
    )

    # Kouluta malli
    trainer.train()

    # Hae validointitarkkuus
    metrics = trainer.evaluate(eval_dataset)
    tune.report(accuracy=metrics["eval_accuracy"], loss=metrics["eval_loss"])


search_space = {
    "lr": tune.loguniform(1e-5, 1e-3),          # Oppimisnopeus (1e-5 - 1e-3)
    "batch_size": tune.choice([16, 32, 64]),    # Eräkoko (16, 32, 64)
    "dropout": tune.uniform(0.1, 0.5),          # Dropout (0.1 - 0.5)
}

scheduler = ASHAScheduler(
    metric="accuracy",
    mode="max",
    max_t=3,  # Maksimi-epochit
    grace_period=1,  # Minimimäärä epocheja ennen pysäytystä
    reduction_factor=2,
)

from ray.tune import run

# Suorita hyperparametrien optimointi
analysis = tune.run(
    tune.with_parameters(tune_transformer),
    resources_per_trial={"cpu": 2, "gpu": 1},
    config=search_space,
    num_samples=20,  # 20 kokeilua
    scheduler=scheduler,
    local_dir="./ray_results",  # Tulosten tallennuspaikka
)

# Parhaat tulokset
print("Best hyperparameters found were: ", analysis.best_config)

best_hyperparams = analysis.best_config

training_args = TrainingArguments(
    output_dir="./best_model",
    learning_rate=best_hyperparams["lr"],
    per_device_train_batch_size=best_hyperparams["batch_size"],
    num_train_epochs=3,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
)

trainer = Trainer(
    model_init=model_init,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
)

trainer.train()
trainer.evaluate()


Category 5: Training Optimization

In [None]:
pip install torch transformers accelerate datasets
pip install deepspeed

import torch
import time
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments
from datasets import load_dataset

# Model and dataset selection
model_name = "bert-base-uncased"
dataset = load_dataset("imdb")

# Tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

# Tokenize dataset
tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Prepare for PyTorch
train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(2000))  # Use a subset for faster training
eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(500))

# Load model
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

# Training configurations
def train_model(fp16: bool):
    training_args = TrainingArguments(
        output_dir="./results",
        evaluation_strategy="epoch",
        save_strategy="no",
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        num_train_epochs=2,
        logging_dir="./logs",
        logging_steps=10,
        fp16=fp16,  # Enable mixed precision training
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
    )

    start_time = time.time()
    trainer.train()
    end_time = time.time()

    return end_time - start_time, torch.cuda.memory_allocated()

# Train in FP32
fp32_time, fp32_memory = train_model(fp16=False)

# Train in FP16
fp16_time, fp16_memory = train_model(fp16=True)

# Results
print(f"FP32 Training Time: {fp32_time:.2f} seconds, Memory Usage: {fp32_memory / 1e6:.2f} MB")
print(f"FP16 Training Time: {fp16_time:.2f} seconds, Memory Usage: {fp16_memory / 1e6:.2f} MB")


In [None]:
import deepspeed

# DeepSpeed training configuration
ds_config = {
    "train_micro_batch_size_per_gpu": 64,
    "gradient_accumulation_steps": 1,
    "zero_optimization": {
        "stage": 2,  # Memory optimization
    },
    "fp16": {"enabled": True},  # Mixed precision
}

training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="no",
    per_device_train_batch_size=64,  # Large batch size
    per_device_eval_batch_size=64,
    num_train_epochs=2,
    logging_dir="./logs",
    logging_steps=10,
    report_to="none",
    deepspeed=ds_config,  # Enable DeepSpeed
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

# Train the model
trainer.train()


Category 6: Evaluation and visualization

In [None]:
pip install torch transformers datasets evaluate

import numpy as np
import evaluate

# Use sklearn.metrics to compute precision, recall and F1 Score
from sklearn.metrics import precision_score, recall_score, f1_score

# Load Hugging Face's built-in evaluation metric
hf_metric = evaluate.load("accuracy")

# Custom metric function
def compute_custom_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    precision = precision_score(labels, predictions, average="binary")
    recall = recall_score(labels, predictions, average="binary")
    f1 = f1_score(labels, predictions, average="binary")
    accuracy = hf_metric.compute(predictions=predictions, references=labels)["accuracy"]

    return {"accuracy": accuracy, "precision": precision, "recall": recall, "f1-score": f1}

# Example Usage (Simulated Predictions)
logits = np.array([[2.3, 0.2], [0.5, 1.5], [2.1, 0.4], [1.1, 1.2]])
labels = np.array([0, 1, 0, 1])

metrics = compute_custom_metrics((logits, labels))
print(metrics)



In [None]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="no",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=2,
    logging_dir="./logs",
    logging_steps=10,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_custom_metrics,
)

trainer.evaluate()


In [None]:
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import matplotlib.pyplot as plt

# Load a pre-trained BERT model
model_name = "bert-base-uncased"
model = AutoModelForSequenceClassification.from_pretrained(model_name, output_attentions=True)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Define an input sentence
sentence = "The movie was absolutely fantastic and I loved it!"
inputs = tokenizer(sentence, return_tensors="pt")

# Forward pass to get attention weights
with torch.no_grad():
    outputs = model(**inputs, output_attentions=True)

# Extract attention weights
attentions = outputs.attentions  # Shape: (num_layers, num_heads, seq_len, seq_len)



In [None]:
import seaborn as sns
import numpy as np

def plot_attention(attentions, layer=0, head=0):
    """Plot attention heatmap for a specific layer and head."""
    attn = attentions[layer][0, head].cpu().numpy()  # Extract a single head's attention weights

    tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])

    plt.figure(figsize=(10, 8))
    sns.heatmap(attn, xticklabels=tokens, yticklabels=tokens, cmap="Blues", annot=True, fmt=".2f")
    plt.xlabel("Key Tokens")
    plt.ylabel("Query Tokens")
    plt.title(f"Attention Map - Layer {layer}, Head {head}")
    plt.show()

# Visualize attention for the first layer, first head
plot_attention(attentions, layer=0, head=0)


Category 7: Advanced API Tasks


In [None]:
pip install kaggle datasets transformers
mkdir -p ~/.kaggle
mv kaggle.json ~/.kaggle/
chmod 600 ~/.kaggle/kaggle.json
kaggle datasets download -d lakshmi25npathi/imdb-dataset-of-50k-movie-reviews
unzip imdb-dataset-of-50k-movie-reviews.zip -d dataset/

import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments
from datasets import load_dataset

# Load dataset
dataset = load_dataset("csv", data_files={"train": "dataset/IMDB Dataset.csv"})

# Preprocessing
tokenizer = AutoTokenizer.from_pretrained("prajjwal1/bert-tiny")

def tokenize_function(examples):
    return tokenizer(examples["review"], padding="max_length", truncation=True)

tokenized_datasets = dataset.map(tokenize_function, batched=True)

train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(2000))

# Load TinyBERT model
model = AutoModelForSequenceClassification.from_pretrained("prajjwal1/bert-tiny", num_labels=2)

# Training setup
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=2,
    logging_dir="./logs",
    save_strategy="epoch",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
)

trainer.train()

# Save model & tokenizer
model.save_pretrained("tinybert_model")
tokenizer.save_pretrained("tinybert_model")


In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch

model = AutoModelForSequenceClassification.from_pretrained("/content/drive/MyDrive/tinybert_model")
tokenizer = AutoTokenizer.from_pretrained("/content/drive/MyDrive/tinybert_model")

def predict(text):
    inputs = tokenizer(text, return_tensors="pt")
    with torch.no_grad():
        logits = model(**inputs).logits
    prediction = torch.argmax(logits).item()
    return "Positive" if prediction == 1 else "Negative"

print(predict("This movie was amazing, I loved it!"))  # Expected output: Positive
print(predict("The plot was terrible and boring."))    # Expected output: Negative


In [None]:
pip install fastapi uvicorn requests
from fastapi import FastAPI
import requests
import os

app = FastAPI()

# Set your API key
GROQ_API_KEY = os.getenv("GROQ_API_KEY")  # Or replace with your actual key

@app.get("/translate")
def translate(text: str, target_lang: str = "fi"):
    url = "https://api.groq.com/v1/translate"
    headers = {"Authorization": f"Bearer {GROQ_API_KEY}"}
    data = {"text": text, "target_language": target_lang}

    response = requests.post(url, headers=headers, json=data)

    if response.status_code == 200:
        return {"translated_text": response.json()["translation"]}
    return {"error": response.text}

# Run the API with: uvicorn app:app --reload


In [None]:
uvicorn app:app --reload
http://127.0.0.1:8000/translate?text=Hello,%20how%20are%20you?&target_lang=fi
