<a href="https://colab.research.google.com/github/blancavazquez/PLN/blob/main/notebooks/BERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# BERT

El objetivo de esta libreta es usar el modelo BERT en tres tareas: tokenización de textos, análisis de sentimientos y reconocimiento de entidades nombradas (NER).

# Tokenizando textos

In [None]:
from transformers import BertTokenizer

# Load the pretrained tokenizer for the "bert-base-cased" checkpoint.
tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
# Sample English paragraph used as the tokenizer input in this and the next cell.
text = 'UNAM: It was founded on September 21, 1551, under the name of the Royal and Pontifical University of Mexico. It is the largest and most important university in Mexico. Throughout time, it has played a leading role in the history and formation of our country. UNAM is recognized as a space of freedom. Respect, tolerance, and dialogue are practiced. The plurality of ideas and thought is appreciated as a sign of its wealth.'
# Tokenize the text and encode it as token IDs.
encoding = tokenizer.encode(text)
print("Token IDs:", encoding)

In [None]:
# Map the token IDs from the previous cell back to their token strings so the
# tokenizer's split can be compared with the original text.
tokens = tokenizer.convert_ids_to_tokens(encoding)
print(f"Original text: {text}")
print("Tokens:", tokens)

# Análisis de sentimientos

In [None]:
import torch
from transformers import pipeline

# Build a ready-made sentiment-analysis pipeline around the DistilBERT
# checkpoint fine-tuned on SST-2 (tokenization and inference are handled
# inside the pipeline).
sentiment_analyzer = pipeline(
    "sentiment-analysis",
    model="distilbert-base-uncased-finetuned-sst-2-english"
)

# Sentence to classify.
text = "I absolutely love this product! Would buy again."

# Run the pipeline; the first element of the result carries the predicted
# 'label' and its 'score'.
result = sentiment_analyzer(text)
print(f"Sentiment: {result[0]['label']}")
print(f"Confidence: {result[0]['score']:.4f}")

# Análisis de sentimientos (sin pipeline)

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

class BERTSentimentAnalyzer:
    """Sentiment classifier built directly on a tokenizer and a model.

    Loads a tokenizer and a sequence-classification model by name, moves the
    model to the GPU when one is available, and exposes :meth:`predict` for
    classifying a single text.
    """

    def __init__(self, model_name="distilbert-base-uncased-finetuned-sst-2-english"):
        """Load the tokenizer and model for ``model_name``.

        Args:
            model_name: Name or path of a sequence-classification checkpoint
                understood by ``from_pretrained``.
        """
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForSequenceClassification.from_pretrained(model_name)
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model.to(self.device)
        # Inference only: switch the model to evaluation mode.
        self.model.eval()
        # Take the label names from the checkpoint's own config so that a
        # non-default ``model_name`` is reported with its real labels; fall
        # back to the SST-2 names if the config carries no mapping.
        id2label = getattr(self.model.config, "id2label", None) or {
            0: 'NEGATIVE',
            1: 'POSITIVE',
        }
        self.labels = [id2label[idx] for idx in sorted(id2label)]

    def preprocess_text(self, text, padding='max_length'):
        """Normalize whitespace in ``text`` and tokenize it into tensors.

        Args:
            text: The string to encode.
            padding: Padding strategy forwarded to the tokenizer. The default
                ``'max_length'`` pads to 512 tokens; pass ``False`` to keep
                the sequence at its natural length.

        Returns:
            A dict of the tokenizer's output tensors, moved to ``self.device``.
        """
        # Collapse runs of whitespace and strip the ends.
        text = ' '.join(text.split())

        inputs = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=512,
            padding=padding,
            truncation=True,
            return_tensors='pt'
        )

        # Move to GPU if available
        return {k: v.to(self.device) for k, v in inputs.items()}

    def predict(self, text):
        """Classify ``text`` and return the prediction in readable form.

        Args:
            text: The string to classify.

        Returns:
            A dict with the original ``'text'``, the predicted ``'sentiment'``
            label, its ``'confidence'`` (softmax probability) and the full
            ``'probabilities'`` mapping of label name to probability.
        """
        # A single sequence needs no padding; padding it to 512 tokens would
        # only make the forward pass run over padding positions.
        inputs = self.preprocess_text(text, padding=False)

        with torch.no_grad():
            outputs = self.model(**inputs)
            probabilities = torch.nn.functional.softmax(outputs.logits, dim=-1)

        # Only one text is encoded, so row 0 holds its class probabilities.
        probs = probabilities[0]
        best = int(probs.argmax())

        return {
            'text': text,
            'sentiment': self.labels[best],
            'confidence': probs[best].item(),
            'probabilities': {
                label: prob.item()
                for label, prob in zip(self.labels, probs)
            },
        }

In [None]:
def demonstrate_sentiment_analysis():
    """Classify three sample sentences and print a report for each one.

    For every sentence the report shows the text, the predicted sentiment,
    its confidence, and the probability assigned to each label.
    """
    classifier = BERTSentimentAnalyzer()

    samples = (
        "This product completely transformed my workflow!",
        "Terrible experience, would not recommend.",
        "It's decent for the price, but nothing special.",
    )

    # map() is lazy, so each sentence is classified right before it is printed.
    for report in map(classifier.predict, samples):
        print(f"\nText: {report['text']}")
        print(f"Sentiment: {report['sentiment']}")
        print(f"Confidence: {report['confidence']:.4f}")
        print("Detailed probabilities:")
        per_label = report['probabilities']
        for name in per_label:
            score = per_label[name]
            print(f"  {name}: {score:.4f}")

# Run the sentiment-analysis demonstration and print its report.
demonstrate_sentiment_analysis()

# Reconocimiento de entidades (NER)

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification

class BERTNamedEntityRecognizer:
    """Named-entity recognizer built on a token-classification model.

    Loads a tokenizer and a token-classification model, moves the model to
    the GPU when one is available, and exposes :meth:`recognize_entities`,
    which groups per-token B-/I-/O predictions into entity spans.
    """

    def __init__(self, model_name="dslim/bert-base-NER"):
        """Load the tokenizer and model for ``model_name``.

        Args:
            model_name: Name or path of a token-classification checkpoint
                understood by ``from_pretrained``.
        """
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForTokenClassification.from_pretrained(model_name)
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model.to(self.device)
        # Inference only: switch the model to evaluation mode.
        self.model.eval()

    def recognize_entities(self, text):
        """Find the named entities mentioned in ``text``.

        Args:
            text: The string to analyse.

        Returns:
            A list of dicts in order of appearance, each with the entity
            ``'type'`` (the label without its ``B-``/``I-`` prefix) and the
            entity ``'text'``.
        """
        inputs = self.tokenizer(
            text,
            add_special_tokens=True,
            return_tensors="pt",
            padding=True,
            truncation=True
        )

        # Move inputs to device
        inputs = {k: v.to(self.device) for k, v in inputs.items()}

        # Pick the highest-scoring label id for every token.
        with torch.no_grad():
            outputs = self.model(**inputs)
            predictions = outputs.logits.argmax(-1)

        # Only one text is encoded, so row 0 holds its tokens and labels.
        tokens = self.tokenizer.convert_ids_to_tokens(inputs['input_ids'][0])
        labels = [self.model.config.id2label[p.item()] for p in predictions[0]]

        special_tokens = set(getattr(self.tokenizer, 'all_special_tokens', ()))
        return self._group_entities(tokens, labels, special_tokens)

    @staticmethod
    def _group_entities(tokens, labels, special_tokens=frozenset()):
        """Merge per-token B-/I-/O labels into a list of entity dicts."""
        entities = []
        current = None

        for token, label in zip(tokens, labels):
            if token in special_tokens:
                # Special tokens are never part of an entity; they only
                # close whatever entity is open.
                if current is not None:
                    entities.append(current)
                    current = None
                continue

            if token.startswith('##'):
                # A continuation piece belongs to the word started by the
                # previous token, so it follows that word's label. Its own
                # prediction is ignored: otherwise an 'O' would cut the word
                # short and a 'B-' would open an entity starting with '##'.
                if current is not None:
                    current['text'] += token[2:]
                continue

            if label.startswith('B-'):
                if current is not None:
                    entities.append(current)
                current = {'type': label[2:], 'text': token}
            elif label.startswith('I-'):
                # An 'I-' label with no open entity is ignored.
                if current is not None:
                    current['text'] += ' ' + token
            elif label == 'O':
                if current is not None:
                    entities.append(current)
                    current = None

        if current is not None:
            entities.append(current)

        return entities

In [None]:
def demonstrate_ner():
    """Run entity recognition on a sample paragraph and print what was found.

    Each entity is printed on its own line as ``- <text> (<type>)``.
    """
    sample = """
    Apple CEO Tim Cook announced new AI features at their headquarters
    in Cupertino, California. Microsoft and Google are also investing
    heavily in artificial intelligence research.
    """

    # Build the recognizer and extract the entities in one step.
    found = BERTNamedEntityRecognizer().recognize_entities(sample)

    print("Found entities:")
    for item in found:
        surface, kind = item['text'], item['type']
        print(f"- {surface} ({kind})")

# Run the named-entity-recognition demonstration and print the entities found.
demonstrate_ner()