In [1]:
#!pip install transformers

In [2]:
#!pip install accelerate -U

In [3]:
#!pip install transformers[torch] -U

In [4]:
#!pip install pyarrow --upgrade

In [5]:
from transformers import pipeline, set_seed




In [6]:
# Sentiment Analysis

classifier = pipeline("sentiment-analysis")
classifier("illyes loves naruto.")

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


[{'label': 'POSITIVE', 'score': 0.9977988600730896}]

In [7]:
classifier(
    ["I've been waiting for a HuggingFace course my whole life.", "I hate this so much!"]
)

[{'label': 'POSITIVE', 'score': 0.9598049521446228},
 {'label': 'NEGATIVE', 'score': 0.9994558691978455}]

In [8]:
# Text Generation

generator = pipeline("text-generation")
generator("In this course, we will teach you how to")

No model was supplied, defaulted to gpt2 and revision 6c0e608 (https://huggingface.co/gpt2).
Using a pipeline without specifying a model name and revision in production is not recommended.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': 'In this course, we will teach you how to develop a fully functional and practical application for writing, visualizing, managing, maintaining and creating your own online apps using React. The course is about how to write something which in theory would be as easy'}]

In [9]:
generator = pipeline("text-generation", model="benjamin/gpt2-wechsel-french")
generator(
    "les chats",
    max_length=30,
    num_return_sequences=1,
)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': 'les chats de nos grands-parents.\nUn des petits plus de ce livre de recettes? Son mode de cuisson : un fond de moules accompagné d'}]

In [10]:
unmasker = pipeline("fill-mask")
unmasker("This course will teach you all about <mask> models.", top_k=2)

No model was supplied, defaulted to distilroberta-base and revision ec58a5b (https://huggingface.co/distilroberta-base).
Using a pipeline without specifying a model name and revision in production is not recommended.
Some weights of the model checkpoint at distilroberta-base were not used when initializing RobertaForMaskedLM: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


[{'score': 0.19619695842266083,
  'token': 30412,
  'token_str': ' mathematical',
  'sequence': 'This course will teach you all about mathematical models.'},
 {'score': 0.040526971220970154,
  'token': 38163,
  'token_str': ' computational',
  'sequence': 'This course will teach you all about computational models.'}]

In [11]:
# NER
ner = pipeline("ner", grouped_entities=True)
ner("roberto lives in mexico.")

No model was supplied, defaulted to dbmdz/bert-large-cased-finetuned-conll03-english and revision f2482bf (https://huggingface.co/dbmdz/bert-large-cased-finetuned-conll03-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


[{'entity_group': 'PER',
  'score': 0.38135758,
  'word': 'robe',
  'start': 0,
  'end': 4},
 {'entity_group': 'LOC',
  'score': 0.4991569,
  'word': 'mexi',
  'start': 17,
  'end': 21}]

In [12]:
classifier = pipeline("zero-shot-classification", model="MoritzLaurer/deberta-v3-large-zeroshot-v1.1-all-33")

In [13]:
text = "The market was very volatile today due to the unexpected interest rate hike."

candidate_labels = ['science', 'economy','health']

classifier(text, candidate_labels, multi_label=True)

{'sequence': 'The market was very volatile today due to the unexpected interest rate hike.',
 'labels': ['economy', 'science', 'health'],
 'scores': [0.9698341488838196, 0.00038923960528336465, 3.376215317985043e-05]}

In [14]:
text = "I don't like the food."

candidate_labels = ['satisfied', 'unsatisfied','neutral']
hypothesis_template = "The text show the notion of {} sentiment."

classifier(text, candidate_labels, multi_label=True, hypothesis_template=hypothesis_template)

{'sequence': "I don't like the food.",
 'labels': ['unsatisfied', 'neutral', 'satisfied'],
 'scores': [0.9968516826629639, 0.000567193899769336, 0.00014923184062354267]}

In [15]:
from transformers import pipeline, set_seed

In [16]:
def classify_new_text(new_text):
    generator = pipeline('text-generation', model='distilgpt2')
    set_seed(42)

    prompt_template = """The following are examples of text classification:
    Text: "The market was very volatile today due to the unexpected interest rate hike."
    Category: Economy

    Text: "The new fitness program includes routines that improve cardiovascular health."
    Category: Health

    Text: "A groundbreaking discovery in renewable energy has been announced."
    Category: Science

    Text: "The local sports team won their game last night in a surprising upset."
    Category: Sports

    Text: "{}"
    Category:"""

    prompt = prompt_template.format(new_text)

    response = generator(prompt, max_length=200, num_return_sequences=1, temperature=0.7)
    generated_text = response[0]['generated_text']

    # Attempt to extract the category from the generated text
    try:
        # Splitting the generated text to find the category part
        category_part = generated_text.split("Category:")[1].strip()
        # Assuming the category is the first word/phrase followed by any newline or extra text
        predicted_category = category_part.split('\n')[0].strip()
        print(f'Text: "{new_text}"\nPredicted Category: {predicted_category}')
    except IndexError:
        # If the expected format isn't found
        print("Failed to extract the category. Please check the generated text format.")

# Example usage
new_text = "The new fitness program includes routines that improve cardiovascular health."
classify_new_text(new_text)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Text: "The new fitness program includes routines that improve cardiovascular health."
Predicted Category: Economy


In [17]:
!pip install setfit

Defaulting to user installation because normal site-packages is not writeable


In [2]:
from datasets import load_dataset
from sentence_transformers.losses import CosineSimilarityLoss

from setfit import SetFitModel, SetFitTrainer




In [7]:
dataset = load_dataset('SetFit/SentEval-CR')

SyntaxError: invalid syntax (2051448715.py, line 1)

In [None]:
#To simulate a real-world scenario with just a few labeled examples, we'll sample 8 examples per class from the training set:
train_ds = dataset["train"].shuffle(seed=42).select(range(8 * 2))
test_ds = dataset["test"]