<a href="https://colab.research.google.com/github/asgardian1196/asg-ml/blob/main/T5_text_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Libraries

In [None]:
import scipy.special
from typing import List, Dict, Tuple
import numpy as np
import sklearn.metrics

## Load Model and Tokenizer

In [None]:
!pip install transformers
!pip install datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.9.0-py3-none-any.whl (462 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m462.8/462.8 KB[0m [31m22.0 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess
  Downloading multiprocess-0.70.14-py38-none-any.whl (132 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m132.0/132.0 KB[0m [31m17.8 MB/s[0m eta [36m0:00:00[0m
Collecting responses<0.19
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Collecting xxhash
  Downloading xxhash-3.2.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (213 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m213.0/213.0 KB[0m [31m22.3 MB/s[0m eta [36m0:00:00[0m
Collecting urllib3<1.27,>=1.21.1
  Downloading urllib3-1.26.14-py2.py3-none-any.whl (

In [None]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Single source of truth for the checkpoint so the model and tokenizer
# are always loaded from the same pretrained weights/vocabulary.
MODEL_NAME = "google/flan-t5-base"

model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/308M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

Downloading (…)"spiece.model";:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

## Process Data

In [None]:
from datasets import load_dataset

# TweetEval benchmark, "emotion" configuration; the splits are accessed by
# name further down (e.g. dataset['test']).
dataset = load_dataset(path='tweet_eval', name='emotion')

Downloading and preparing dataset tweet_eval/emotion to /root/.cache/huggingface/datasets/tweet_eval/emotion/1.1.0/12aee5282b8784f3e95459466db4cdf45c6bf49719c25cdb0743d71ed0410343...


Downloading data files:   0%|          | 0/6 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/134k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/60.3k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/569 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/16.9k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/183 [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/6 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/3257 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1421 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/374 [00:00<?, ? examples/s]

Dataset tweet_eval downloaded and prepared to /root/.cache/huggingface/datasets/tweet_eval/emotion/1.1.0/12aee5282b8784f3e95459466db4cdf45c6bf49719c25cdb0743d71ed0410343. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
# Peek at the first test record to see the schema (a 'text' string and an integer 'label').
example = dataset['test'][0]
example

{'text': '#Deppression is real. Partners w/ #depressed people truly dont understand the depth in which they affect us. Add in #anxiety &amp;makes it worse', 'label': 3}


In [None]:
# Emotion label names; a name's position in this list is the integer label
# it is paired with when `class_idx` is built.
classes = ['anger', 'joy', 'optimism', 'sadness']

In [None]:
# Lookup table: integer label -> class name.
class_idx = dict(enumerate(classes))

In [None]:
# One vocabulary id per class: the first id the tokenizer produces for the class name.
# Only this first id is kept, even if a name is split into several sub-words.
class_tokens = [tokenizer.encode(c)[0] for c in classes]

# Two class names that start with the same sub-word would be scored identically
# downstream, so fail loudly instead of silently producing ties.
assert len(set(class_tokens)) == len(class_tokens), \
    f"class names must map to distinct first tokens, got {dict(zip(classes, class_tokens))}"

## Define model execution

In [None]:
def classify_tweet(tweet: str,
                   classes: List[str] = classes,
                   class_tokens: List[int] = class_tokens,
                   model = model,
                   tokenizer = tokenizer) -> Tuple[str, Dict[str, float]]:
    """Classify ``tweet`` into one of ``classes`` by prompting a seq2seq model.

    The model is asked to generate exactly one token. The scores of that
    single decoding step are read at the vocabulary ids in ``class_tokens``
    and renormalised with a softmax, so the returned scores sum to 1 over
    ``classes`` only (not over the whole vocabulary).

    Args:
        tweet: Text to classify.
        classes: Candidate label names; they are listed verbatim in the prompt.
        class_tokens: One vocabulary id per entry of ``classes``, in the same order.
        model: Seq2seq model exposing ``generate`` and ``device``.
        tokenizer: Tokenizer matching ``model``.

    Returns:
        A tuple ``(best_label, scores)`` where ``scores`` maps every class
        name to its softmax-normalised score.

    Raises:
        ValueError: If ``classes`` and ``class_tokens`` have different lengths
            (``zip`` would otherwise silently drop the surplus entries).
    """
    if len(classes) != len(class_tokens):
        raise ValueError(
            f'classes and class_tokens must have the same length, '
            f'got {len(classes)} and {len(class_tokens)}'
        )

    prompt = f'Classify the following sentence by sentiment. Sentence: {tweet} Possible sentiments: {" ".join(classes)}'
    # Put the inputs on the model's device so the call also works on GPU.
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(**inputs, return_dict_in_generate=True, output_scores=True, max_new_tokens=1)

    # scores[0] is the (only) decoding step; row 0 is the single prompt in the batch.
    first_step_scores = outputs.scores[0][0]
    # Gather all class scores in one indexing op, then move to CPU before converting to numpy.
    class_scores = first_step_scores[class_tokens].cpu().numpy()
    probabilities = scipy.special.softmax(class_scores)
    return classes[np.argmax(probabilities)], dict(zip(classes, probabilities))

In [None]:
# Smoke-test the classifier on a short input, using the default classes, model and tokenizer.
emotion, scores = classify_tweet('wow!')

In [None]:
# Display the predicted label together with the per-class scores.
emotion, scores

('joy',
 {'anger': 0.00572423,
  'joy': 0.7449225,
  'optimism': 0.011245241,
  'sadness': 0.238108})

## Run model inference on the test set

In [None]:
# Classify every test tweet, collecting predicted and gold labels as class-name strings.
preds = []
targets = []
for i, example in enumerate(dataset['test']):
    # Index the returned tuple rather than unpacking into `_`: assigning `_` at
    # notebook top level overwrites IPython's "last output" variable.
    pred = classify_tweet(example['text'])[0]
    target = class_idx[example['label']]
    preds.append(pred)
    targets.append(target)
    if i % 100 == 0:
        print(i)  # coarse progress indicator

0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400


## Metrics

In [None]:
# Per-class precision/recall/F1. A class that is never predicted has an undefined
# precision; `zero_division=0` reports it as 0 explicitly instead of emitting a
# warning for every affected average.
print(sklearn.metrics.classification_report(targets, preds, zero_division=0))

              precision    recall  f1-score   support

       anger       0.98      0.10      0.18       558
         joy       0.96      0.29      0.45       358
    optimism       0.00      0.00      0.00       123
     sadness       0.30      0.99      0.46       382

    accuracy                           0.38      1421
   macro avg       0.56      0.35      0.27      1421
weighted avg       0.71      0.38      0.31      1421



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
