<a href="https://colab.research.google.com/github/allenjose24/Harnessing-Vector-Space-Models-for-Enhanced-Machine-Translations-and-Document-Search/blob/main/NLP_Research_Work.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Required libraries
!pip install googletrans==4.0.0-rc1
!pip install plotly autocorrect

import torch
from transformers import MarianMTModel, MarianTokenizer
import gensim.downloader as api
import nltk
from nltk.tokenize import word_tokenize
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import plotly.graph_objs as go
from googletrans import Translator
import pandas as pd

Collecting googletrans==4.0.0-rc1
  Downloading googletrans-4.0.0rc1.tar.gz (20 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting httpx==0.13.3 (from googletrans==4.0.0-rc1)
  Downloading httpx-0.13.3-py3-none-any.whl.metadata (25 kB)
Collecting hstspreload (from httpx==0.13.3->googletrans==4.0.0-rc1)
  Downloading hstspreload-2024.11.1-py3-none-any.whl.metadata (2.1 kB)
Collecting chardet==3.* (from httpx==0.13.3->googletrans==4.0.0-rc1)
  Downloading chardet-3.0.4-py2.py3-none-any.whl.metadata (3.2 kB)
Collecting idna==2.* (from httpx==0.13.3->googletrans==4.0.0-rc1)
  Downloading idna-2.10-py2.py3-none-any.whl.metadata (9.1 kB)
Collecting rfc3986<2,>=1.3 (from httpx==0.13.3->googletrans==4.0.0-rc1)
  Downloading rfc3986-1.5.0-py2.py3-none-any.whl.metadata (6.5 kB)
Collecting httpcore==0.9.* (from httpx==0.13.3->googletrans==4.0.0-rc1)
  Downloading httpcore-0.9.1-py3-none-any.whl.metadata (4.6 kB)
Collecting h11<0.10,>=0.8 (from httpcore==0.9.*->httpx==0.13.3->goog

In [None]:
# Download the NLTK resources used by this notebook.
# word_tokenize needs the Punkt sentence models: older NLTK releases load the
# pickled 'punkt' package, newer releases load 'punkt_tab' instead, so both are
# fetched to keep a fresh-kernel run working regardless of the installed version.
nltk.download('punkt')
nltk.download('punkt_tab')
# Reuters corpus: source of the English documents the queries are compared against.
nltk.download('reuters')

# Download pre-trained GloVe embeddings (VSM)
glove_vectors = api.load("glove-wiki-gigaword-100")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package reuters to /root/nltk_data...




In [None]:
# Reference table of language names and the codes a user can type at the prompts.
# NOTE(review): this table is only displayed; nothing in the notebook validates the
# entered codes against it, and whether every code is accepted by both translation
# back-ends (e.g. 'zh-cn') should be confirmed.
languages = dict([
    ('English', 'en'),
    ('Spanish', 'es'),
    ('French', 'fr'),
    ('German', 'de'),
    ('Italian', 'it'),
    ('Dutch', 'nl'),
    ('Russian', 'ru'),
    ('Chinese (Simplified)', 'zh-cn'),
    ('Japanese', 'ja'),
    ('Korean', 'ko'),
])

# Two-column frame (name, code) so the notebook renders the table nicely
lang_df = pd.DataFrame({
    'Language': list(languages.keys()),
    'Code': list(languages.values()),
})
print("Available Languages and Codes:")
lang_df

Available Languages and Codes:


Unnamed: 0,Language,Code
0,English,en
1,Spanish,es
2,French,fr
3,German,de
4,Italian,it
5,Dutch,nl
6,Russian,ru
7,Chinese (Simplified),zh-cn
8,Japanese,ja
9,Korean,ko


In [None]:
# Google Translate API
# Module-level googletrans client; translate_sentence_google() reads this global
# and calls translator.translate() on it.
translator = Translator()

# plotly.graph_objs is already imported as `go` in the first cell, so this
# re-import is redundant (harmless, it just rebinds the same name).
import plotly.graph_objs as go

# Function to plot the back-translation similarity scores
def plot_back_translation_scores(translation_results):
    """Render a two-bar Plotly chart of the back-translation similarity scores.

    Parameters
    ----------
    translation_results : dict
        Mapping with 'MarianMT' and 'GoogleTranslate' entries, each containing a
        'similarity_score' value (the structure returned by
        translate_and_evaluate).

    Returns
    -------
    None
        The chart is displayed via ``fig.show()``.
    """
    # Display label -> key in translation_results (order defines bar order)
    label_to_key = {'MarianMT': 'MarianMT', 'Google Translate': 'GoogleTranslate'}
    bar_labels = list(label_to_key)
    bar_heights = [
        translation_results[result_key]['similarity_score']
        for result_key in label_to_key.values()
    ]

    # One bar trace holding both engines, coloured individually
    score_bars = go.Bar(
        x=bar_labels,
        y=bar_heights,
        name='Back-Translation Similarity Scores',
        marker_color=['blue', 'lightblue'],
    )

    layout_options = dict(
        title='Back-Translation Similarity Scores for MarianMT and Google Translate',
        xaxis_title='Translation Method',
        yaxis_title='Similarity Score',
        yaxis_range=[0, 1],  # y axis fixed to the 0..1 band
        barmode='group',
        width=600,
        height=500,
    )

    fig = go.Figure()
    fig.add_trace(score_bars)
    fig.update_layout(**layout_options)
    fig.show()

# Function to translate a sentence using MarianMT
# Cache of already-loaded checkpoints: model name -> (tokenizer, model).
# Loading a MarianMT checkpoint is expensive, and the notebook translates with the
# same language pair several times (forward, back-translation, evaluation).
_MARIAN_CACHE = {}


def _load_marian(model_name):
    """Return the (tokenizer, model) pair for ``model_name``, loading it only once."""
    if model_name not in _MARIAN_CACHE:
        _MARIAN_CACHE[model_name] = (
            MarianTokenizer.from_pretrained(model_name),
            MarianMTModel.from_pretrained(model_name),
        )
    return _MARIAN_CACHE[model_name]


def translate_sentence_marian(sentence, source_lang, target_lang):
    """Translate ``sentence`` with a Helsinki-NLP MarianMT checkpoint.

    Parameters
    ----------
    sentence : str
        Text to translate.
    source_lang, target_lang : str
        Language codes inserted into the checkpoint name
        ``Helsinki-NLP/opus-mt-{source_lang}-{target_lang}``.

    Returns
    -------
    str
        Decoded translation of the first generated sequence, without special tokens.

    Notes
    -----
    Any error raised by ``from_pretrained`` (for example when no checkpoint exists
    for the requested language pair) propagates to the caller.
    """
    model_name = f'Helsinki-NLP/opus-mt-{source_lang}-{target_lang}'  # Dynamic model selection
    tokenizer_marian, model_marian = _load_marian(model_name)

    # truncation=True keeps over-long inputs within the model's maximum length
    inputs = tokenizer_marian(sentence, return_tensors="pt", padding=True, truncation=True)
    translated = model_marian.generate(**inputs)
    return tokenizer_marian.decode(translated[0], skip_special_tokens=True)

# Function to translate a sentence using Google Translate
def translate_sentence_google(sentence, target_lang, source_lang='auto'):
    """Translate ``sentence`` through the module-level googletrans ``translator``.

    Parameters
    ----------
    sentence : str
        Text to translate.
    target_lang : str
        Destination language code, passed as ``dest``.
    source_lang : str, optional
        Source language code, passed as ``src``. Defaults to ``'auto'`` (language
        auto-detection), which is what happened before this parameter existed.
        Passing the known source language avoids relying on detection for
        short inputs.

    Returns
    -------
    str
        The ``.text`` attribute of the object returned by ``translator.translate``.
    """
    return translator.translate(sentence, dest=target_lang, src=source_lang).text

# Function to get word embeddings for a sentence using GloVe
def get_word_embeddings(sentence):
    """Return one embedding row per token of ``sentence``.

    The sentence is lower-cased and tokenized with ``word_tokenize``; each token is
    looked up in the module-level ``glove_vectors``. Tokens missing from the
    vocabulary are represented by a zero vector.

    Parameters
    ----------
    sentence : str
        Text to embed.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n_tokens, dim)`` where ``dim`` is
        ``glove_vectors.vector_size``. When the sentence yields no tokens the
        result has shape ``(0, dim)`` so callers always receive a 2-D array.
    """
    # Take the dimension from the loaded model instead of hard-coding 100, so the
    # out-of-vocabulary placeholder always matches the real vectors.
    dim = glove_vectors.vector_size
    tokens = word_tokenize(sentence.lower())
    if not tokens:
        return np.zeros((0, dim))
    return np.array([
        glove_vectors[token] if token in glove_vectors else np.zeros(dim)
        for token in tokens
    ])

# Function to calculate document embeddings (mean vector for each document)
def get_embedding_for_documents(doc_list):
    """Embed each document as the mean of its token vectors.

    Parameters
    ----------
    doc_list : iterable of str
        Documents to embed.

    Returns
    -------
    numpy.ndarray
        One row per document, built from ``get_word_embeddings(doc)``. A document
        that yields no token vectors is represented by a zero vector of length
        ``glove_vectors.vector_size`` instead of a NaN mean, so the result stays
        rectangular and NaN-free for the similarity computation.
    """
    doc_vectors = []
    for doc in doc_list:
        token_vectors = get_word_embeddings(doc)
        if token_vectors.size == 0:
            # Averaging zero rows would produce NaN (and a RuntimeWarning)
            doc_vectors.append(np.zeros(glove_vectors.vector_size))
        else:
            doc_vectors.append(np.mean(token_vectors, axis=0))  # Mean vector for document
    return np.array(doc_vectors)

# Function to translate and evaluate
def _mean_sentence_vector(sentence):
    """Return the mean token embedding of ``sentence`` (zero vector if it has no tokens)."""
    token_vectors = get_word_embeddings(sentence)
    if token_vectors.size == 0:
        # Avoid a NaN mean, which cosine_similarity would reject
        return np.zeros(glove_vectors.vector_size)
    return np.mean(token_vectors, axis=0)


def _round_trip_report(original_vector, translated_sentence, back_translated_sentence):
    """Build one engine's result entry: both sentences plus the similarity score."""
    back_translated_vector = _mean_sentence_vector(back_translated_sentence)
    similarity_score = cosine_similarity([original_vector], [back_translated_vector])[0][0]
    return {
        "translated_sentence": translated_sentence,
        "back_translated_sentence": back_translated_sentence,
        "similarity_score": similarity_score,
    }


def translate_and_evaluate(sentence, source_lang, target_lang):
    """Round-trip ``sentence`` through MarianMT and Google Translate and score both.

    For each engine the sentence is translated to ``target_lang`` and back to
    ``source_lang``. The score is the cosine similarity between the mean
    embedding of the original sentence and that of the back-translation.

    Parameters
    ----------
    sentence : str
        Sentence in the source language.
    source_lang, target_lang : str
        Language codes handed to the two translation helpers.

    Returns
    -------
    dict
        ``{"MarianMT": {...}, "GoogleTranslate": {...}}`` where each inner dict has
        the keys ``translated_sentence``, ``back_translated_sentence`` and
        ``similarity_score``.
    """
    # The original sentence's embedding is the same for both engines: compute it once
    original_vector = _mean_sentence_vector(sentence)

    # MarianMT: forward translation, then back to the source language
    translated_sentence_marian = translate_sentence_marian(sentence, source_lang, target_lang)
    back_translated_sentence_marian = translate_sentence_marian(
        translated_sentence_marian, target_lang, source_lang
    )

    # Google Translate: forward translation, then back to the source language
    translated_sentence_google = translate_sentence_google(sentence, target_lang)
    back_translated_sentence_google = translate_sentence_google(translated_sentence_google, source_lang)

    # Return translation results and similarity scores for both MarianMT and Google Translate
    return {
        "MarianMT": _round_trip_report(
            original_vector, translated_sentence_marian, back_translated_sentence_marian
        ),
        "GoogleTranslate": _round_trip_report(
            original_vector, translated_sentence_google, back_translated_sentence_google
        ),
    }

In [None]:
# Pull the Reuters corpus reader and collect the raw text of the training split
from nltk.corpus import reuters

documents = reuters.fileids()
english_docs = list(map(
    reuters.raw,
    filter(lambda doc_id: doc_id.startswith('training/'), documents),
))

# Keep the example small: only the first 10 training documents are embedded
english_documents = english_docs[:10]

# Step 1: one mean-embedding row per selected English document
document_vectors = get_embedding_for_documents(english_documents)

In [None]:
# All file ids known to the Reuters corpus reader
documents = reuters.fileids()

# Take the very first id as the sample and fetch its raw text.
# NOTE(review): this indexes the full id list, not the 'training/' subset used
# for english_documents, so the sample may not be one of the embedded documents.
sample_doc_id = documents[0]
sample_document = reuters.raw(sample_doc_id)

# Show the sample text in the cell output
print(sample_document)

ASIAN EXPORTERS FEAR DAMAGE FROM U.S.-JAPAN RIFT
  Mounting trade friction between the
  U.S. And Japan has raised fears among many of Asia's exporting
  nations that the row could inflict far-reaching economic
  damage, businessmen and officials said.
      They told Reuter correspondents in Asian capitals a U.S.
  Move against Japan might boost protectionist sentiment in the
  U.S. And lead to curbs on American imports of their products.
      But some exporters said that while the conflict would hurt
  them in the long-run, in the short-term Tokyo's loss might be
  their gain.
      The U.S. Has said it will impose 300 mln dlrs of tariffs on
  imports of Japanese electronics goods on April 17, in
  retaliation for Japan's alleged failure to stick to a pact not
  to sell semiconductors on world markets at below cost.
      Unofficial Japanese estimates put the impact of the tariffs
  at 10 billion dlrs and spokesmen for major electronics firms
  said they would virtually halt exports

In [None]:
# AutoCorrect installation and setup
from autocorrect import Speller
spell = Speller(lang='en')  # English dictionary only

# User inputs for source language, target language, and query.
# Surrounding whitespace is stripped so a stray space cannot end up inside the
# MarianMT checkpoint name or the Google Translate language code.
source_language_code = input("Enter the source language code (e.g., 'en' for English): ").strip()
target_language_code = input("Enter the target language code (e.g., 'es' for Spanish): ").strip()
query = input("Enter the sentence to translate: ").strip()
if not query:
    raise ValueError("The sentence to translate must not be empty.")

# Step: Correct any spelling errors in the input query.
# The speller is English-only, so it is applied only to English input; running it
# on another language would "correct" valid words into English look-alikes.
if source_language_code.lower() == 'en':
    corrected_query = spell(query)
else:
    corrected_query = query

# Proceed with translation using the corrected query
translated_query_marian = translate_sentence_marian(corrected_query, source_language_code, target_language_code)
translated_query_google = translate_sentence_google(corrected_query, target_language_code)

print(f"\nCorrected Query: {corrected_query}")
print(f"Translated Query (MarianMT): {translated_query_marian}")
print(f"Translated Query (Google Translate): {translated_query_google}")

Enter the source language code (e.g., 'en' for English): en
Enter the target language code (e.g., 'es' for Spanish): de
Enter the sentence to translate: My name is Allen


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/768k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/797k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.27M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.33k [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/298M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]


Corrected Query: My name is Allen
Translated Query (MarianMT): Mein Name ist Allen.
Translated Query (Google Translate): Mein Name ist Allen


In [None]:
# Step 3: Get embeddings for the original (source language) query and the translated queries
query_vector = np.mean(get_word_embeddings(corrected_query), axis=0)
translated_query_vector_marian = np.mean(get_word_embeddings(translated_query_marian), axis=0)
translated_query_vector_google = np.mean(get_word_embeddings(translated_query_google), axis=0)

# Step 4: Compute similarity scores for the translated queries and documents
# (one score per row of document_vectors)
similarity_scores_cosine_marian = cosine_similarity([translated_query_vector_marian], document_vectors)[0]
similarity_scores_cosine_google = cosine_similarity([translated_query_vector_google], document_vectors)[0]

# Step 5: Print the evaluation from `translate_and_evaluate`.
# Use the spell-corrected query here as well, so the back-translation evaluation
# scores the same sentence that was translated and embedded above.
translation_results = translate_and_evaluate(corrected_query, source_language_code, target_language_code)

# Print results from MarianMT and Google Translate
print(f"\nTranslated Sentence (MarianMT): {translation_results['MarianMT']['translated_sentence']}")
print(f"Back-Translated Sentence (MarianMT): {translation_results['MarianMT']['back_translated_sentence']}")
print(f"Similarity Score (MarianMT): {translation_results['MarianMT']['similarity_score']}")

print(f"\nTranslated Sentence (Google Translate): {translation_results['GoogleTranslate']['translated_sentence']}")
print(f"Back-Translated Sentence (Google Translate): {translation_results['GoogleTranslate']['back_translated_sentence']}")
print(f"Similarity Score (Google Translate): {translation_results['GoogleTranslate']['similarity_score']}")

# Step 6: Output the similarity scores with the English documents (for both MarianMT and Google Translate)
print(f"\nSimilarity scores with English documents (MarianMT): {similarity_scores_cosine_marian}")
print(f"Similarity scores with English documents (Google Translate): {similarity_scores_cosine_google}")



tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/797k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/768k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.27M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/298M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]


Translated Sentence (MarianMT): Mein Name ist Allen.
Back-Translated Sentence (MarianMT): My name is Allen.
Similarity Score (MarianMT): 0.9896345734596252

Translated Sentence (Google Translate): Mein Name ist Allen
Back-Translated Sentence (Google Translate): My name is all
Similarity Score (Google Translate): 0.9581458568572998

Similarity scores with English documents (MarianMT): [0.60223426 0.61385108 0.54076646 0.61942167 0.61322769 0.60124079
 0.63365372 0.38557681 0.43327244 0.61906827]
Similarity scores with English documents (Google Translate): [0.37097818 0.38820708 0.32718655 0.4009632  0.38924232 0.38699939
 0.40896608 0.22016374 0.23026429 0.4090943 ]


In [None]:
# Prepare data for Plotly: one x position per embedded document
doc_indices = list(range(len(english_documents)))

# Plotting the similarity scores using Plotly
fig = go.Figure()

# Cosine Similarity
# Both traces share the same x positions: barmode='group' already places the two
# bars of a document side by side, so no manual x offset is needed and the bars
# stay aligned with their document index on the axis.
fig.add_trace(go.Bar(x=doc_indices, y=similarity_scores_cosine_marian, name='Cosine Similarity (MarianMT)', marker_color='blue'))
fig.add_trace(go.Bar(x=doc_indices, y=similarity_scores_cosine_google, name='Cosine Similarity (Google)', marker_color='lightblue'))

fig.update_layout(title='Document Similarity Scores for Translated Queries',
                  xaxis_title='Document Index',
                  yaxis_title='Similarity Score',
                  barmode='group',
                  width = 600,
                  height = 500)

fig.show()

In [None]:
# Bar chart of the MarianMT vs. Google Translate back-translation similarity
# scores stored in translation_results.
plot_back_translation_scores(translation_results)

In [None]:
# Write the per-document similarity scores of both engines to an Excel workbook
file_name = 'cosine_similarity_scores.xlsx'

# 'Document Index' is 1-based here (the bar chart of the same scores uses
# 0-based positions).
data = {
    'Document Index': [position + 1 for position in range(len(similarity_scores_cosine_marian))],
    'Cosine Similarity (MarianMT)': similarity_scores_cosine_marian,
    'Cosine Similarity (Google Translate)': similarity_scores_cosine_google,
}
similarity_df = pd.DataFrame(data)

# index=False: omit the DataFrame's own row index from the sheet
similarity_df.to_excel(file_name, index=False)

print(f"Cosine similarity scores have been saved to {file_name}")

Cosine similarity scores have been saved to cosine_similarity_scores.xlsx
