In [2]:
#for sentiment analysis we can use PyTorch or Tensor flow, in this case I've already intalled pytorch so just in case i'm installing tensorflow
#we're using PyTorch for this exercise
!pip install tensorflow

Collecting tensorflow
  Downloading tensorflow-2.9.1-cp37-cp37m-win_amd64.whl (444.0 MB)
Collecting gast<=0.4.0,>=0.2.1
  Downloading gast-0.4.0-py3-none-any.whl (9.8 kB)
Collecting google-pasta>=0.1.1
  Downloading google_pasta-0.2.0-py3-none-any.whl (57 kB)
Collecting libclang>=13.0.0
  Downloading libclang-14.0.6-py2.py3-none-win_amd64.whl (14.2 MB)
Collecting tensorflow-io-gcs-filesystem>=0.23.1
  Downloading tensorflow_io_gcs_filesystem-0.26.0-cp37-cp37m-win_amd64.whl (1.5 MB)
Collecting keras<2.10.0,>=2.9.0rc0
  Downloading keras-2.9.0-py2.py3-none-any.whl (1.6 MB)
Collecting opt-einsum>=2.3.2
  Downloading opt_einsum-3.3.0-py3-none-any.whl (65 kB)
Collecting astunparse>=1.6.0
  Downloading astunparse-1.6.3-py2.py3-none-any.whl (12 kB)
Collecting keras-preprocessing>=1.1.1
  Downloading Keras_Preprocessing-1.1.2-py2.py3-none-any.whl (42 kB)
Collecting absl-py>=1.0.0
  Downloading absl_py-1.2.0-py3-none-any.whl (123 kB)
Collecting tensorflow-estimator<2.10.0,>=2.9.0rc0
  Downloadi

In [3]:
!pip install transformers



In [4]:
from transformers import pipeline

In [5]:
import torch
import torch.nn.functional as F

# EXAMPLE

In [6]:
#specify the task that we want, we can find the list in https://huggingface.co/models
classifier = pipeline("sentiment-analysis")

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


Downloading config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/255M [00:00<?, ?B/s]

Downloading tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

Downloading vocab.txt:   0%|          | 0.00/226k [00:00<?, ?B/s]

In [9]:
#example shows that we're 99% the text is NEGATIVE sentiment
example_results = classifier("I really miss my country, I wish I can go more often")
example_results

[{'label': 'NEGATIVE', 'score': 0.9989567995071411}]

In [16]:
#we can also give this pipeline a list of texts to analyse 
example_results_2 = classifier(["I really miss my country, I wish I can go more often", 
                                "I love sushi", 
                                "this is disscusting, I feel like the news are only giving more stress to people.", 
                               ":)",
                               "I dont know yet, but I'm looking for the answer."])


for results in example_results_2:
    print(results)

{'label': 'NEGATIVE', 'score': 0.9989567995071411}
{'label': 'POSITIVE', 'score': 0.9998181462287903}
{'label': 'NEGATIVE', 'score': 0.9992493987083435}
{'label': 'POSITIVE', 'score': 0.6489291191101074}
{'label': 'NEGATIVE', 'score': 0.9972866773605347}


# EXAMPLE WITH TOKENIZDER AND A MODEL

In [17]:
#we want to tonkenize our data, the autotokenizer is a generic class and the next one is more specific
from transformers import AutoTokenizer, AutoModelForSequenceClassification

In [18]:
#you can get another model here:
model_name = "distilbert-base-uncased-finetuned-sst-2-english"

In [22]:
#from_pretrained is a popular function in hugging face 
model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [24]:
#we add the model and the tokenizer
classifier_2 = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)

In [25]:
#the results are the same because we're using the classifier
example_results_3 = classifier(["I really miss my country, I wish I can go more often", 
                                "I love sushi", 
                                "this is disscusting, I feel like the news are only giving more stress to people.", 
                               ":)",
                               "I dont know yet, but I'm looking for the answer."])


for results in example_results_3:
    print(results)

{'label': 'NEGATIVE', 'score': 0.9989567995071411}
{'label': 'POSITIVE', 'score': 0.9998181462287903}
{'label': 'NEGATIVE', 'score': 0.9992493987083435}
{'label': 'POSITIVE', 'score': 0.6489291191101074}
{'label': 'NEGATIVE', 'score': 0.9972866773605347}


In [31]:
#let's try now to use tokenize the sentence, if we only print the tokenizer.tokenize we got the 13 words separated
tokens = tokenizer.tokenize("I really miss my country, I wish I can go more often")

print(f'Tokens: {tokens}')

Tokens: ['i', 'really', 'miss', 'my', 'country', ',', 'i', 'wish', 'i', 'can', 'go', 'more', 'often']


In [33]:
#The token IDs are the mathematical word representation
token_ids = tokenizer.convert_tokens_to_ids(tokens)

print(f'Tokens IDs:{token_ids}')

Tokens IDs:[1045, 2428, 3335, 2026, 2406, 1010, 1045, 4299, 1045, 2064, 2175, 2062, 2411]


In [37]:
#we also can apply tokenizer directly and we get another result
#in the IDs the 101 means the beginning and the 102 the end of the sentence
#we can use this input token IDs to put them directly in our model

input_ids = tokenizer("I really miss my country, I wish I can go more often")

print(f'Input IDs:{input_ids}')

Input IDs:{'input_ids': [101, 1045, 2428, 3335, 2026, 2406, 1010, 1045, 4299, 1045, 2064, 2175, 2062, 2411, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


# MULTIPLE FRASES TOKENIZED

In [40]:
#this is how we get the input ids for several frases, they are inside a dictionary, separated by 101 and 102

X_train = ["I really miss my country, I wish I can go more often", 
          "I love sushi", 
          "this is disscusting, I feel like the news are only giving more stress to people.", 
          ":)", 
          "I dont know yet, but I'm looking for the answer."]    

#return_tensors="pt" is for pytorch if we want to use tensorflow you need to make other steps 
#for more directions watch youtube.com/watch?v=GSt00_-0ncQ&t=1s minute 30:56
batch = tokenizer(X_train, padding=True, truncation=True, max_length=512, return_tensors="pt")

batch

{'input_ids': tensor([[  101,  1045,  2428,  3335,  2026,  2406,  1010,  1045,  4299,  1045,
          2064,  2175,  2062,  2411,   102,     0,     0,     0,     0,     0,
             0,     0],
        [  101,  1045,  2293, 10514,  6182,   102,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0],
        [  101,  2023,  2003,  4487,  4757,  7874,  3436,  1010,  1045,  2514,
          2066,  1996,  2739,  2024,  2069,  3228,  2062,  6911,  2000,  2111,
          1012,   102],
        [  101,  1024,  1007,   102,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0],
        [  101,  1045,  2123,  2102,  2113,  2664,  1010,  2021,  1045,  1005,
          1049,  2559,  2005,  1996,  3437,  1012,   102,     0,     0,     0,
             0,     0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,

In [51]:
#we are going to pass the batch to our model, the ** means to unpack the dictionary
#inside tensor([[are the probabilities]])
# if our result has a loss, we can put labels=torch.tensor([1,0]) next to the **batch



with torch.no_grad():
    outputs = model(**batch)
    print(outputs)
    
    #inside tensor([[are the probabilities]])
    predictions = F.softmax(outputs.logits, dim=1)
    print(predictions)
    
    #give us the max and min
    labels = torch.argmax((predictions), dim=1)
    print(labels)
    
    #id2label give us the class name POSITIVE NEGATIVE
    labels = [model.config.id2label[label_id] for label_id in labels.tolist()]
    print(labels)

#NOTEEE!!! THE classifier pipeline GAVE US THE SAME RESULT AND MORE CLEAR

SequenceClassifierOutput(loss=None, logits=tensor([[ 3.7767, -3.0878],
        [-4.1509,  4.4612],
        [ 3.9423, -3.2516],
        [-0.2246,  0.3897],
        [ 3.2588, -2.6480]]), hidden_states=None, attentions=None)
tensor([[9.9896e-01, 1.0432e-03],
        [1.8186e-04, 9.9982e-01],
        [9.9925e-01, 7.5056e-04],
        [3.5107e-01, 6.4893e-01],
        [9.9729e-01, 2.7134e-03]])
tensor([0, 1, 0, 1, 0])
['NEGATIVE', 'POSITIVE', 'NEGATIVE', 'POSITIVE', 'NEGATIVE']


In [56]:
#here we're naming the folder "saved"
saved_directory = "saved"

#its going to save the jsons with the information above
tokenizer.save_pretrained(saved_directory)
model.save_pretrained(saved_directory)

#load to directory
tokenizer = AutoTokenizer.from_pretrained(saved_directory)
model = AutoModelForSequenceClassification.from_pretrained(saved_directory) 

# CHANGE MODEL TO SPANISH

In [61]:
#go to https://huggingface.co/models and select text classifaction, there you can select your language and copy it 
model_name_2 = "JonatanGk/roberta-base-bne-finetuned-cyberbullying-spanish"

In [62]:
tokenizer_spanish = AutoTokenizer.from_pretrained(model_name_2)
model_spanish = AutoModelForSequenceClassification.from_pretrained(model_name_2) 

text = ["Estamos a un estornudo de que los militares se encarguen de la Educación en México.", 
       "IGUAL QUE EL AEROPUERTO CHAFA Y LA REFINERIA QUE NO SIRVE PARA NADA", 
       "🥰🥰🥰 siempre apoyándome", 
       "Espero que tengas muy bonito día, todo va a estar bien 🤍",
        "Qué te sea leve y te mejores pronto querido ✨"]

batch = tokenizer(text, padding=True, truncation=True, max_length=512, return_tensors="pt")
print(batch)

with torch.no_grad():
    outputs = model(**batch)
    print(outputs)
    
    #inside tensor([[are the probabilities]])
    predictions = F.softmax(outputs.logits, dim=1)
    print(predictions)
    
    #give us the max and min
    labels = torch.argmax((predictions), dim=1)
    print(labels)
    
    #id2label give us the class name POSITIVE NEGATIVE
    labels = [model.config.id2label[label_id] for label_id in labels.tolist()]
    print(labels)


Downloading tokenizer_config.json:   0%|          | 0.00/1.09k [00:00<?, ?B/s]

Downloading vocab.json:   0%|          | 0.00/831k [00:00<?, ?B/s]

Downloading merges.txt:   0%|          | 0.00/497k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

Downloading special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/911 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/476M [00:00<?, ?B/s]

{'input_ids': tensor([[  101,  9765, 22591,  2015,  1037,  4895,  9765,  9691,  6784,  2080,
          2139, 10861,  3050, 23689,  6590,  6072,  7367,  4372, 10010,  9077,
          2078,  2139,  2474,  3968, 18100, 10446,  4372,  3290,  1012,   102],
        [  101,  1045, 19696,  2140, 10861,  3449, 18440, 14289,  8743,  2080,
         15775,  7011,  1061,  2474, 25416, 26455,  2401, 10861,  2053,  2909,
          3726, 11498, 23233,  2050,   102,     0,     0,     0,     0,     0],
        [  101,   100,  9033,  6633, 28139,  9706, 18232, 15482,  4168,   102,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0],
        [  101,  9686,  4842,  2080, 10861,  2702, 12617, 14163,  2100, 14753,
          9956, 22939,  1010, 28681,  2080, 12436,  1037,  9765,  2906, 29316,
           100,   102,     0,     0,     0,     0,     0,     0,     0,     0],
        [  101, 10861,  8915,  271

# TEST PIO TRANSLATE SPANISH TO ENGLISH

In [74]:
#Tried to used a spanish model but it didn't work, so I translate it to english and it worked better

classifier = pipeline("sentiment-analysis")

example_results_5= classifier(["We are one sneeze away from the military taking over education in Mexico.",
       "SAME AS THE CHAFA AIRPORT AND THE REFINERY THAT IS USELESS FOR NOTHING",
       " 🥰🥰🥰 always supporting me", 
        "I hope you have a very nice day, everything will be fine 🤍",
    "May it be light to you and get better soon dear ✨"])

for results in example_results_5:
    print(results)

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


{'label': 'NEGATIVE', 'score': 0.729952871799469}
{'label': 'NEGATIVE', 'score': 0.9986312985420227}
{'label': 'POSITIVE', 'score': 0.9996458292007446}
{'label': 'POSITIVE', 'score': 0.9998538494110107}
{'label': 'POSITIVE', 'score': 0.9941353797912598}


# MAKE YOUR OWN MODEL

In [75]:
#go to for more documentation huggingface.co/transformers/v3.2.0/custom_datasets.html
#there are five steps you need to follow

In [None]:
#1. Prepare dataset (loaded from a csv)

In [None]:
#2. load pretrained tokenizer then call it with dataset -> encoding (token ids)

In [None]:
#3. build a PyTorch dataset with encodings 

In [None]:
#4. load pretrained Model

In [None]:
#5. a)load trainer and train it b)or use native Pytorchtraining pipeline