# The following sample code was produced in part using GPT-4

In [20]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from torch.nn.functional import softmax
import numpy as np

## Multilingual Sentiment Classifier

In [21]:
class SentimentClassifier:
    def __init__(self, model_name):
        self.tokenizer = BertTokenizer.from_pretrained(model_name)
        self.model = BertForSequenceClassification.from_pretrained(model_name)

    def classify(self, text):
        # Tokenize the input text and obtain model outputs
        inputs = self.tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
        with torch.no_grad():
            outputs = self.model(**inputs).logits
            probs = softmax(outputs, dim=1)
            rating = torch.dot(probs.view(-1), torch.tensor([1.0, 2.0, 3.0, 4.0, 5.0])).item()
            # label = torch.argmax(probs, dim=1).item()

        return np.round(rating, 4)

classifier = SentimentClassifier("nlptown/bert-base-multilingual-uncased-sentiment")

# Sample
text_english = "The course was well structured."
text_spanish = "El curso estuvo bien estructurado."
print(classifier.classify(text_english))
print(classifier.classify(text_spanish))


4.185
3.7266


## LM Translation

In [23]:
# Import necessary libraries from Hugging Face's transformers
from transformers import MarianMTModel, MarianTokenizer

class TranslationService:
    def __init__(self):
        # Initialize the English-to-Spanish model and tokenizer
        self.en_to_es_model = MarianMTModel.from_pretrained("Helsinki-NLP/opus-mt-en-es")
        self.en_to_es_tokenizer = MarianTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-es")

        # Initialize the Spanish-to-English model and tokenizer
        self.es_to_en_model = MarianMTModel.from_pretrained("Helsinki-NLP/opus-mt-es-en")
        self.es_to_en_tokenizer = MarianTokenizer.from_pretrained("Helsinki-NLP/opus-mt-es-en")

    def translate(self, text, target_language="es"):
        """Perform the translation.

        Args:
        - text (str): Input text to be translated.
        - target_language (str): Target language for translation. "es" for English-to-Spanish, "en" for Spanish-to-English.

        Returns:
        - str: Translated text.
        """
        if target_language == "es":
            # Translate English-to-Spanish
            tokenized_text = self.en_to_es_tokenizer(text, return_tensors="pt")
            translation = self.en_to_es_model.generate(**tokenized_text)
            return self.en_to_es_tokenizer.decode(translation[0], skip_special_tokens=True)
        elif target_language == "en":
            # Translate Spanish-to-English
            tokenized_text = self.es_to_en_tokenizer(text, return_tensors="pt")
            translation = self.es_to_en_model.generate(**tokenized_text)
            return self.es_to_en_tokenizer.decode(translation[0], skip_special_tokens=True)
        else:
            raise ValueError("Invalid target language. Choose 'es' or 'en'.")

# Example usage
service = TranslationService()
translated_text_es = service.translate("Hello, how are you?", target_language="es")
print(translated_text_es)  # Expected output: "Hola, ¿cómo estás?"
translated_text_en = service.translate("Hola, ¿cómo estás?", target_language="en")
print(translated_text_en)  # Expected output: "Hello, how are you?"


Downloading (…)lve/main/config.json: 100%|██████████| 1.47k/1.47k [00:00<00:00, 5.51MB/s]
Downloading pytorch_model.bin: 100%|██████████| 312M/312M [00:19<00:00, 16.4MB/s] 
Downloading (…)neration_config.json: 100%|██████████| 293/293 [00:00<00:00, 1.08MB/s]
Downloading (…)olve/main/source.spm: 100%|██████████| 802k/802k [00:00<00:00, 20.8MB/s]
Downloading (…)olve/main/target.spm: 100%|██████████| 826k/826k [00:00<00:00, 20.9MB/s]
Downloading (…)olve/main/vocab.json: 100%|██████████| 1.59M/1.59M [00:00<00:00, 19.4MB/s]
Downloading (…)okenizer_config.json: 100%|██████████| 44.0/44.0 [00:00<00:00, 226kB/s]
Downloading (…)lve/main/config.json: 100%|██████████| 1.44k/1.44k [00:00<00:00, 8.27MB/s]
Downloading pytorch_model.bin: 100%|██████████| 312M/312M [00:19<00:00, 16.2MB/s] 
Downloading (…)neration_config.json: 100%|██████████| 293/293 [00:00<00:00, 1.63MB/s]
Downloading (…)olve/main/source.spm: 100%|██████████| 826k/826k [00:00<00:00, 19.0MB/s]
Downloading (…)olve/main/target.spm: 100%

Hola, ¿cómo estás?
Hey, how are you?


## T5 (LLM) Translation

In [22]:
import torch
from transformers import T5ForConditionalGeneration, T5Tokenizer

class T5Translator:
    def __init__(self, model_name: str = "t5-small"):
        """
        Initialize the T5 Translator with the specified model.
        
        Args:
            model_name (str): Name of the T5 model. Default is "t5-small".
        """
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model = T5ForConditionalGeneration.from_pretrained(model_name).to(self.device)
        self.tokenizer = T5Tokenizer.from_pretrained(model_name)
        
    def translate(self, text: str, source_language: str, target_language: str) -> str:
        """
        Translate text from the source language to the target language.
        
        Args:
            text (str): The input text to be translated.
            source_language (str): The source language code (e.g., "en" for English).
            target_language (str): The target language code (e.g., "es" for Spanish).
            
        Returns:
            str: The translated text.
        """
        # Prepare the prompt text for the T5 model. For example: "translate English to Spanish: Hello"
        prompt = f"translate {source_language} to {target_language}: {text}"
        
        # Tokenize the prompt text
        inputs = self.tokenizer.encode(prompt, return_tensors="pt", max_length=512, truncation=True).to(self.device)
        
        # Generate the translated text
        with torch.no_grad():
            outputs = self.model.generate(inputs)
        
        # Decode the translated text
        translated_text = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
        
        return translated_text

# Instantiate the T5Translator
translator = T5Translator()

# Translate English to Spanish
english_text = "Hello, how are you?"
spanish_translation = translator.translate(english_text, "en", "es")
print(f"English to Spanish: {spanish_translation}")

# Translate Spanish to English
spanish_text = "Hola, ¿cómo estás?"
english_translation = translator.translate(spanish_text, "es", "en")
print(f"Spanish to English: {english_translation}")


Downloading (…)lve/main/config.json: 100%|██████████| 1.21k/1.21k [00:00<00:00, 7.22MB/s]
Downloading pytorch_model.bin: 100%|██████████| 242M/242M [00:14<00:00, 16.2MB/s] 
Downloading (…)neration_config.json: 100%|██████████| 147/147 [00:00<00:00, 513kB/s]
Downloading (…)ve/main/spiece.model: 100%|██████████| 792k/792k [00:00<00:00, 13.0MB/s]
Downloading (…)okenizer_config.json: 100%|██████████| 2.32k/2.32k [00:00<00:00, 9.11MB/s]


English to Spanish: en to es: Hello, how are you?
Spanish to English: es to en: Hola, cómo estás


## Multilingual Sentence Similarity

In [24]:
from sentence_transformers import SentenceTransformer, util
import torch

# 2. Load the multilingual Sentence Transformer model
model = SentenceTransformer('paraphrase-xlm-r-multilingual-v1')

def compute_similarity(sentence1: str, sentence2: str) -> float:
    """
    Computes the semantic similarity between two sentences using cosine similarity.

    Parameters:
    - sentence1 (str): The first sentence.
    - sentence2 (str): The second sentence.

    Returns:
    - float: The cosine similarity score between the two sentences.
    """
    
    # 3. Compute embeddings for both sentences
    embedding1 = model.encode(sentence1, convert_to_tensor=True)
    embedding2 = model.encode(sentence2, convert_to_tensor=True)
    
    # 4. Compute cosine similarity
    cosine_similarity = util.pytorch_cos_sim(embedding1, embedding2).item()

    return cosine_similarity

# Example usage:
english_sentence = "I love programming"
spanish_sentence = "Me encanta programar"
similarity_score = compute_similarity(english_sentence, spanish_sentence)
print(f"Similarity score between the English and Spanish sentences: {similarity_score:.4f}")



Downloading (…)31d34/.gitattributes: 100%|██████████| 345/345 [00:00<00:00, 2.15MB/s]
Downloading (…)_Pooling/config.json: 100%|██████████| 190/190 [00:00<00:00, 853kB/s]
Downloading (…)e4a1a31d34/README.md: 100%|██████████| 3.74k/3.74k [00:00<00:00, 21.8MB/s]
Downloading (…)a1a31d34/config.json: 100%|██████████| 718/718 [00:00<00:00, 3.89MB/s]
Downloading (…)ce_transformers.json: 100%|██████████| 122/122 [00:00<00:00, 454kB/s]
Downloading pytorch_model.bin: 100%|██████████| 1.11G/1.11G [00:59<00:00, 18.6MB/s]
Downloading (…)nce_bert_config.json: 100%|██████████| 53.0/53.0 [00:00<00:00, 171kB/s]
Downloading (…)tencepiece.bpe.model: 100%|██████████| 5.07M/5.07M [00:00<00:00, 16.3MB/s]
Downloading (…)cial_tokens_map.json: 100%|██████████| 150/150 [00:00<00:00, 425kB/s]
Downloading (…)31d34/tokenizer.json: 100%|██████████| 9.10M/9.10M [00:00<00:00, 19.0MB/s]
Downloading (…)okenizer_config.json: 100%|██████████| 550/550 [00:00<00:00, 1.98MB/s]
Downloading (…)1a31d34/modules.json: 100%|████

Similarity score between the English and Spanish sentences: 0.9509
