In [3]:
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from transformers import MarianMTModel, MarianTokenizer
import pandas as pd
from nltk.corpus import wordnet
from nltk.metrics.distance import edit_distance


nltk.download('punkt')
nltk.download('wordnet')

model_name = 'Helsinki-NLP/opus-mt-en-fr'
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)

def normalize_sentence(sentence):
    tokens = word_tokenize(sentence.lower())
    return tokens

def correct_spelling(word):

    synsets = wordnet.synsets(word)
    if synsets:
        return synsets[0].lemmas()[0].name()
    else:
        return word

def normalize_tokens(tokens):

    normalized_tokens = [correct_spelling(token) for token in tokens]
    return normalized_tokens

sentences = [
    "The girl in the pink dress.",
    "she hated exercising.",
    "She is writing a research paper."
]

results = []
for sentence in sentences:
    tokens = normalize_sentence(sentence)
    normalized_tokens = normalize_tokens(tokens)

    embeddings = {}
    for token in normalized_tokens:
        embeddings[token] = np.random.rand(300)

    sentence_length = len(normalized_tokens)
    embedding_dim = 300
    pos_enc = np.zeros((sentence_length, embedding_dim))
    for pos in range(sentence_length):
        for i in range(0, embedding_dim, 2):
            pos_enc[pos, i] = np.sin(pos / 10000 ** (2 * i / embedding_dim))
            pos_enc[pos, i + 1] = np.cos(pos / 10000 ** (2 * (i + 1) / embedding_dim))

    english_input = tokenizer.encode(" ".join(normalized_tokens), return_tensors="pt")
    translated = model.generate(english_input, max_length=50, num_beams=4, early_stopping=True)
    french_translation = tokenizer.decode(translated[0], skip_special_tokens=True)

    results.append({
        "English Sentence": sentence,
        "Normalized Tokens": normalized_tokens,
        "Word Embeddings": embeddings,
        "Positional Encodings": pos_enc,
        "French Translation": french_translation
    })

df = pd.DataFrame(results)
df


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...


Unnamed: 0,English Sentence,Normalized Tokens,Word Embeddings,Positional Encodings,French Translation
0,The girl in the pink dress.,"[the, girl, inch, the, pink, dress, .]","{'the': [0.2371990156251933, 0.848732353273496...","[[0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0,...",la fille de pouce la robe rose.
1,she hated exercising.,"[she, hate, exercise, .]","{'she': [0.5368751955655414, 0.790393718949663...","[[0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0,...",Elle déteste l'exercice.
2,She is writing a research paper.,"[she, be, writing, angstrom, research, paper, .]","{'she': [0.8097797069722391, 0.640307319194808...","[[0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0,...",Elle écrit un article de recherche sur l'angst...
