In [1]:
!pip -q install transformers accelerate bitsandbytes sentencepiece
!pip -q install huggingface_hub

In [2]:
import functools
import random
import re

import numpy as np
from transformers import pipeline
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Single source for the checkpoint id so the tokenizer and the model always match.
MODEL_NAME = "google/flan-t5-large"

tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)

# Loading options forwarded unchanged to `from_pretrained`.
model_load_options = dict(
    device_map="auto",
    load_in_8bit=True,
    low_cpu_mem_usage=True,
)
model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME, **model_load_options)

# Text-to-text pipeline used by every prompt helper in this notebook.
generator = pipeline("text2text-generation", model=model, tokenizer=tokenizer)




Welcome to bitsandbytes. For bug reports, please submit your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
CUDA SETUP: CUDA runtime path found: /usr/local/cuda/lib64/libcudart.so
CUDA SETUP: Highest compute capability among GPUs detected: 7.5
CUDA SETUP: Detected CUDA version 118
CUDA SETUP: Loading binary /usr/local/lib/python3.9/dist-packages/bitsandbytes/libbitsandbytes_cuda118.so...


  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)


In [3]:
def flan_t5(func):
    """Decorator that turns a prompt-building function into a model call.

    The decorated function is expected to return the prompt text. The wrapper
    passes that prompt to the module-level `generator` pipeline and returns the
    `generated_text` of the first completion.

    Args:
        func: Callable returning the prompt to send to the model.

    Returns:
        A callable accepting the same arguments as `func` and returning the
        model's generated text.
    """
    # Preserve the wrapped function's name and docstring for help() and debugging.
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        # Forward every argument so prompt builders are not limited to a
        # single positional parameter.
        to_model = func(*args, **kwargs)
        completion = generator(to_model)
        return completion[0]['generated_text']
    return wrapper

# Target class names that labels in this notebook are classified into.
categories = [
    "Science",
    "Technology",
    "Art",
    "Engineering",
    "Mathematics",
]

# Instead of hand-writing few-shot examples, the T5 model itself is asked to
# invent one example subject per category.
@flan_t5
def create_example(category):
    """Build the prompt asking for a creative example subject in `category`.

    Because of the `flan_t5` decorator, calling this function returns the
    model's generated text rather than the prompt string built here.
    """
    instruction = "Create a creative example subject that falls under the given category. "
    demonstrations = "For example 'Transportation': 'buses go rogue', 'Soda Cans': 'mt. dew cans have been exploding!'. "
    return f"{instruction}{demonstrations}Category: {category}"

# Fixed seed so the few-shot example order (and therefore the prompt built from
# it) is identical on every Restart & Run All.
EXAMPLE_SHUFFLE_SEED = 0

# One model-generated example subject per category, in category order.
example_subjects = [create_example(category) for category in categories]

# (subject, category) pairs used as few-shot demonstrations.
examples = list(zip(example_subjects, categories))

# A private Random instance makes the shuffle reproducible without touching
# the state of the global `random` generator.
random.Random(EXAMPLE_SHUFFLE_SEED).shuffle(examples)

# One "subject: category" demonstration per line.
examples_prompt = "\n".join(
    f"{subject}: {category}" for subject, category in examples
)

@flan_t5
def clean_label(text):
    """Build the classification prompt for one free-form label.

    The prompt lists the allowed `categories`, shows the few-shot
    demonstrations from `examples_prompt`, and ends with the label to
    classify. Because of the `flan_t5` decorator, calling this function
    returns the model's generated text rather than the prompt itself.

    Args:
        text: The raw label to classify.
    """
    return (
        "Your task is to classify a given label into the following categories: "
        # Trailing space keeps "For example" from fusing with the category list.
        f"{', '.join(categories)}. "
        # Trailing newline keeps the last demonstration's category from
        # running straight into "Label:".
        f"For example: {examples_prompt}\n"
        f"Label: {text}"
    )

In [4]:
# Sanity check on a single label; as the cell's last expression the result is
# displayed directly (matching the recorded repr-style output).
clean_label("bridge building")

'Engineering'

In [11]:
# A handful of free-form labels to map onto the fixed category set.
dirty_labels = [
    "bridge building",
    "alcoholic",
    "pretzel sculptor",
    "watermelon portrait creator",
    "wifi",
]
# One model call per label; results keep the order of `dirty_labels`.
clean_labels = list(map(clean_label, dirty_labels))
print(clean_labels)



['Engineering', 'Science', 'Science', 'Art', 'Technology']


In [5]:
# Alternative to a hard label: ask the model for a fuzzy 1-10 fit score per
# class (inspired by an example that used scores / probabilities from ChatGPT).
@flan_t5
def class_confidence_score(data):
    """Build the prompt asking how well a label fits one class.

    Args:
        data: Mapping with the keys 'text' (the dirty label) and 'class'
            (the class name to score it against).

    Because of the `flan_t5` decorator, calling this function returns the
    model's generated text rather than the prompt string built here.
    """
    class_list = ', '.join(categories)
    label = data['text']
    class_name = data['class']
    prompt_parts = [
        f"Classes to choose from are: {class_list}. ",
        "Your task is to provide a score from 1-10 for how well a dirty label fits a class name. ",
        # Few-shot demonstrations of the expected question/score format.
        "How well does 'addition' fit the 'College Algebra' class? 5 \n",
        "How well does 'space travel' fit the 'Space' class? 10 \n",
        "How well does 'tomato soup' fit the 'Fruits' class? 6 \n",
        f"How well does '{label}' fit the '{class_name}' class?",
    ]
    return "".join(prompt_parts)

def softmax(x):
    """Return the softmax of `x`, normalised along axis 0.

    The overall maximum of `x` is subtracted before exponentiating; this
    leaves the resulting ratios unchanged while keeping `np.exp` from
    overflowing on large inputs.
    """
    shifted = x - np.max(x)
    exponentials = np.exp(shifted)
    normaliser = exponentials.sum(axis=0)
    return exponentials / normaliser

def _parse_score(raw):
    """Convert one model completion into an integer score.

    A plain integer string is parsed directly. Otherwise the first integer
    found in the text is used, so completions such as "Score: 7" still work.

    Raises:
        ValueError: If the completion contains no integer at all.
    """
    text = str(raw)
    try:
        return int(text)
    except ValueError:
        match = re.search(r"-?\d+", text)
        if match is None:
            raise ValueError(
                f"could not find an integer score in model output: {raw!r}"
            ) from None
        return int(match.group())


def probabilities(x, categories):
    """Score how well label `x` fits each category and softmax the scores.

    Args:
        x: The label text to score.
        categories: Iterable of class names to score the label against.

    Returns:
        A list of floats, one per entry of `categories` and in the same
        order (duplicate categories each get their own entry). An empty
        list is returned when `categories` is empty.

    Raises:
        ValueError: If a model completion contains no integer score.
    """
    scores = [
        _parse_score(class_confidence_score({'text': x, 'class': category}))
        for category in categories
    ]
    if not scores:
        # Nothing to normalise; softmax would fail on an empty array.
        return []
    return softmax(np.array(scores)).tolist()

# Per-category probabilities for one label, listed in the order of `categories`.
probabilities(x='bridge building', categories=categories)



[0.017065385160357122,
 0.017065385160357122,
 0.017065385160357122,
 0.9317384593585716,
 0.017065385160357122]