# Tutorial 1

In [None]:
!pip install transformers

In [None]:
from transformers import pipeline

#### Text Classification

In [None]:
# Build the default sentiment-analysis pipeline on device 0 and classify one review.
review = "i was so not happy with the last mission impossible movie"
classifier = pipeline(task='sentiment-analysis', device=0)
result = classifier(review)
print(result)

### Text Generation

In [None]:
# Let the default text-generation pipeline (on device 0) continue a short prompt.
seed_text = "say my name"
text_gen = pipeline(task='text-generation', device=0)
result = text_gen(seed_text)
print(result)

# Question Answering

In [None]:
# Question answering with the default pipeline on device 0.
qa = pipeline("question-answering", device=0)
q = 'Who is the protector of gotham?'
c = 'Batman protects gotham'
# Pass the inputs by keyword: `question=` / `context=` is the documented call
# form, while positional inputs are deprecated in recent transformers releases.
qa(question=q, context=c)

# Tokenization

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Use a checkpoint that was fine-tuned for sentiment classification. A bare
# base checkpoint such as "albert/albert-base-v2" has no trained classification
# head, so the head is randomly initialised and the predicted label is noise.
model_name = "distilbert/distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# Same pipeline as before, but with an explicitly loaded model/tokenizer pair.
classifier = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)
print(classifier('i am loki. king of asgard'))

In [None]:
from transformers import AutoTokenizer

# Sentence to split; the irregular spacing is part of the demo input.
text = 'the hardest choices  requires strongest will !!'

# Tokenizer that belongs to the 'dslim/bert-base-NER' checkpoint.
tokenizer = AutoTokenizer.from_pretrained('dslim/bert-base-NER')

# Break the sentence into token strings and show them.
tokens = tokenizer.tokenize(text, split_special_tokens=True)
print(tokens)

In [None]:
# Map each token string to its integer id.
ids = tokenizer.convert_tokens_to_ids(tokens)
print(ids)

In [None]:
# Call the tokenizer directly on the raw text; as the last expression of the
# cell, the returned encoding is displayed as the cell output.
tokenizer(text)

In [None]:
# Turn the ids back into a single string.
tokenizer.decode(ids)

# Datasets

In [None]:
from datasets import load_dataset

In [None]:
# Load the 'stanfordnlp/imdb' dataset.
dataset = load_dataset('stanfordnlp/imdb')

In [None]:
# Pull out the train and test splits.
train, test = dataset['train'], dataset['test']

In [None]:
# Display the first entry of the training split's 'text' column.
train['text'][0]

In [None]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")


def token_fun(ex):
    """Tokenize the ``text`` field of ``ex``, truncating and padding to 256 tokens."""
    encode_options = {'truncation': True, 'padding': 'max_length', 'max_length': 256}
    return tokenizer(ex['text'], **encode_options)


# batched=True passes a batch of examples to token_fun per call.
tokenized_data = dataset.map(token_fun, batched=True)

In [None]:
# Display the tokenized dataset object.
tokenized_data

In [None]:
# Display the first tokenized training example.
tokenized_data['train'][0]

# Training

In [None]:
# Subset individual datasets.
# The IMDB splits are stored grouped by label, so taking the first 1000 rows
# as-is yields a single-class subset. Shuffle first (fixed seed keeps the
# subset reproducible) so both classes are represented.
train_subset = tokenized_data['train'].shuffle(seed=42).select(range(1000))
test_subset = tokenized_data['test'].shuffle(seed=42).select(range(1000))

In [None]:
from transformers import AutoModelForSequenceClassification, Trainer
import os
os.environ["WANDB_DISABLED"] = "true"  # turn off Weights & Biases reporting

# Load the model from the same checkpoint as the tokenizer that produced
# `tokenized_data`. Token ids are only meaningful for the vocabulary they were
# created with; pairing them with a different architecture's embeddings
# (previously 'albert/albert-base-v1') feeds the model mismatched ids.
model = AutoModelForSequenceClassification.from_pretrained(tokenizer.name_or_path, num_labels=2)

# No TrainingArguments are passed, so the Trainer uses its defaults.
trainer = Trainer(model=model, train_dataset=train_subset, eval_dataset=test_subset)

In [None]:
# Run training on the Trainer's train_dataset.
trainer.train()

In [None]:
# Evaluate on the eval_dataset given to the Trainer.
trainer.evaluate()

In [None]:
# Write the model and the tokenizer to local directories.
model.save_pretrained('model')
tokenizer.save_pretrained('tokenizer')

# Arxiv - Summarization

In [None]:
!pip install arxiv

In [None]:
import arxiv
import pandas as pd

In [None]:
query = 'ai OR artificial intelligence OR machine learning'
search = arxiv.Search(query=query, max_results=10, sort_by=arxiv.SortCriterion.SubmittedDate)

# `Search.results()` is deprecated; results are fetched through a Client,
# which handles paging and request pacing.
client = arxiv.Client()

# One row per paper: publication timestamp, title, abstract and categories.
papers = [
    {
        'published': result.published,
        'title': result.title,
        'abstract': result.summary,
        'categories': result.categories,
    }
    for result in client.results(search)
]

df = pd.DataFrame(papers)
df

In [None]:
# Display the first five rows of the papers table.
df.head(5)

In [None]:
# Summarization pipeline backed by 'facebook/bart-large-cnn' on device 0.
summarizer = pipeline('summarization', model='facebook/bart-large-cnn', device=0)

# Summarize the abstract stored under index label 0 and print the result.
abstract = df['abstract'][0]
summary = summarizer(abstract)
print(summary)

# Text Summarizer

In [None]:
!pip install evaluate

In [None]:
from transformers import pipeline, set_seed
from datasets import load_dataset, load_from_disk
import evaluate
import matplotlib.pyplot as plt
import pandas as pd

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

import nltk
from nltk.tokenize import sent_tokenize

from tqdm import tqdm
import torch
nltk.download('punkt')

In [None]:
# Use the GPU when PyTorch reports one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
device

In [None]:
# Checkpoint used for the summarization section.
model_name = 'google/pegasus-cnn_dailymail'
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
# Load the seq2seq model from the same checkpoint and move it to `device`.
model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)

In [None]:
!pip install py7zr

In [None]:
# Load the 'Samsung/samsum' dataset.
dataset_samsum = load_dataset('Samsung/samsum')

In [None]:
# Display the dataset object.
dataset_samsum

In [None]:
# First entry of the training split's 'dialogue' column ...
dataset_samsum['train']['dialogue'][0]

In [None]:
# ... and the first entry of its 'summary' column.
dataset_samsum['train']['summary'][0]

In [None]:
# Number of examples in each split.
split_len = [len(dataset_samsum[data]) for data in dataset_samsum]
split_len

In [None]:
# Column names of the training split.
dataset_samsum['train'].column_names

In [None]:
def convert_examples_to_features(example_batch):
    """Tokenize a batch of examples into model inputs and labels.

    Args:
        example_batch: Mapping with ``'dialogue'`` and ``'summary'`` entries
            (used with ``Dataset.map(..., batched=True)``).

    Returns:
        dict with ``input_ids`` and ``attention_mask`` for the dialogues
        (truncated to 1024 tokens) and ``labels`` holding the token ids of
        the summaries (truncated to 128 tokens).
    """
    input_encoding = tokenizer(example_batch['dialogue'], max_length=1024, truncation=True)

    # `text_target=` tokenizes the summaries as targets; it replaces the
    # deprecated `tokenizer.as_target_tokenizer()` context manager.
    output_encoding = tokenizer(text_target=example_batch['summary'], max_length=128, truncation=True)

    return {
        'input_ids': input_encoding['input_ids'],
        'attention_mask': input_encoding['attention_mask'],
        'labels': output_encoding['input_ids'],
    }

In [None]:
# Tokenize every split; batched=True passes batches to the conversion function.
dataset_samsum_pt = dataset_samsum.map(convert_examples_to_features, batched=True)

In [None]:
# Display the tokenized dataset object.
dataset_samsum_pt

In [None]:
# Token ids of the first training dialogue.
dataset_samsum_pt['train']['input_ids'][0]

In [None]:
from transformers import DataCollatorForSeq2Seq

# Pass the model object, not its checkpoint name: the collator uses the
# model's `prepare_decoder_input_ids_from_labels` method when it is available,
# which a plain string cannot provide.
seq2seq_data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [None]:
from transformers import TrainingArguments, Trainer

train_args = TrainingArguments(
    output_dir='pegasus-samsum',
    num_train_epochs=1,
    warmup_steps=500,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    weight_decay=0.01,
    logging_steps=10,
    eval_strategy='steps',
    eval_steps=500,
    # Integer step count: float values are documented as ratios in [0, 1).
    # NOTE(review): the huge value presumably exists to avoid intermediate
    # checkpoints - confirm.
    save_steps=1_000_000,
    # With a per-device batch size of 1, gradients are accumulated over 16 steps.
    gradient_accumulation_steps=16,
)

In [None]:
# Train on the training split. Training on 'test' would leak the data that the
# ROUGE evaluation later in this notebook is computed on, making those scores
# meaningless. For a quicker run, train on a slice instead, e.g.
# dataset_samsum_pt['train'].select(range(1000)).
trainer = Trainer(
    model=model,
    args=train_args,
    data_collator=seq2seq_data_collator,
    train_dataset=dataset_samsum_pt['train'],
    eval_dataset=dataset_samsum_pt['validation'],
)

In [None]:
# Run training with the Trainer built in the previous cell.
trainer.train()

In [None]:
def generate_batch_sized_chunks(list_of_elements, batch_size):
    """Yield consecutive slices of ``list_of_elements`` with at most ``batch_size`` items.

    The last slice is shorter when the length is not a multiple of
    ``batch_size``.
    """
    starts = range(0, len(list_of_elements), batch_size)
    yield from (list_of_elements[begin:begin + batch_size] for begin in starts)

def calc_metric_on_test_ds(dataset, metric, model, tokenizer,
                          batch_size=16, device=device,
                          column_text='dialogue',
                          column_summary='summary'):
    """Generate summaries for ``dataset`` in batches and score them with ``metric``.

    Args:
        dataset: Indexable by column name; ``dataset[column_text]`` and
            ``dataset[column_summary]`` must be sliceable sequences of strings.
        metric: Object exposing ``add_batch(predictions=, references=)`` and
            ``compute()``.
        model: Model whose ``generate`` method produces the summary token ids.
        tokenizer: Tokenizer used to encode the inputs and decode the outputs.
        batch_size: Number of examples generated per step.
        device: Device the input tensors are moved to before generation.
        column_text: Name of the column holding the source texts.
        column_summary: Name of the column holding the reference summaries.

    Returns:
        The value returned by ``metric.compute()`` once every batch was added.
    """
    article_batches = list(generate_batch_sized_chunks(dataset[column_text], batch_size))
    target_batches = list(generate_batch_sized_chunks(dataset[column_summary], batch_size))

    for art_batch, tar_batch in tqdm(zip(article_batches, target_batches), total=len(article_batches)):
        inputs = tokenizer(art_batch, max_length=1024, truncation=True,
                           padding="max_length", return_tensors='pt')

        # Beam search with the same settings for every batch.
        summaries = model.generate(input_ids=inputs['input_ids'].to(device),
                                   attention_mask=inputs['attention_mask'].to(device),
                                   length_penalty=0.8, num_beams=8, max_length=128)

        decoded_summaries = [
            tokenizer.decode(s, skip_special_tokens=True, clean_up_tokenization_spaces=True)
            for s in summaries
        ]
        # Pegasus emits the literal token "<n>" for line breaks; replace it with
        # a space. The earlier `replace("", " ")` matched the empty string and
        # therefore inserted a space between every character, wrecking the scores.
        decoded_summaries = [d.replace("<n>", " ") for d in decoded_summaries]

        metric.add_batch(predictions=decoded_summaries, references=tar_batch)

    return metric.compute()

In [None]:
!pip install rouge_score

In [None]:
# ROUGE variants of interest, and the metric object loaded via `evaluate`.
rouge_names = ['rouge1', 'rouge2', "rougeL", "rougeLsum"]
rouge_metric = evaluate.load('rouge')

In [None]:
score = calc_metric_on_test_ds(dataset_samsum['test'], rouge_metric, trainer.model, tokenizer, batch_size=2)

# Show the result instead of discarding it: one column per ROUGE variant.
rouge_dict = {name: score[name] for name in rouge_names}
pd.DataFrame(rouge_dict, index=['pegasus'])

In [None]:
# Save the model and the tokenizer to directories relative to the working directory.
model.save_pretrained('new_model')
tokenizer.save_pretrained('new_tokenizer')

In [None]:
# Reload from the same relative directory the tokenizer was saved to; the
# former absolute '/kaggle/working/...' path only resolved on Kaggle.
tokenizer = AutoTokenizer.from_pretrained('new_tokenizer')

In [None]:
# Generation settings under the names `generate()` understands
# (`length_penalty` / `max_length`, not `len_penalty` / `max_len`).
gen_kwargs = {'length_penalty': 0.8, 'num_beams': 8, 'max_length': 128}

sample_text = dataset_samsum['test'][0]['dialogue']
sample_summary = dataset_samsum['test'][0]['summary']

pipe = pipeline("summarization", model='new_model', tokenizer=tokenizer, device=0)

# Forward the settings to the call; previously they were defined but never used.
print(pipe(sample_text, **gen_kwargs))
# Reference summary for comparison.
print(sample_summary)

# Text to Image

In [None]:
!pip install diffusers transformers accelerate

In [None]:
from diffusers import StableDiffusionPipeline
import matplotlib.pyplot as plt
import torch

In [None]:
# Two checkpoint ids; only model1 is loaded in this cell.
model1 = 'dreamlike-art/dreamlike-diffusion-1.0'
model2 = 'stabilityai/stable-diffusion-xl-base-1.0'

# Load the pipeline with float16 weights from safetensors files, then move it to CUDA.
pipe = StableDiffusionPipeline.from_pretrained(
    model1,
    torch_dtype=torch.float16,
    use_safetensors=True,
).to('cuda')

In [None]:
# Each cell below runs the pipeline on one prompt and displays the first returned image.
prompt='batman flying'
pipe(prompt).images[0]

In [None]:
prompt='batman vs superman vs hulk'
pipe(prompt).images[0]

In [None]:
# Same subject as the previous cell, with extra style instructions appended.
prompt='batman vs superman vs hulk. make it cinematic and realistic'
pipe(prompt).images[0]

In [None]:
prompt='AI taking over the world concept with lot of robots including iron man fighting army and air force. have wanda as well. make it cinimatic'
pipe(prompt).images[0]