In [1]:
import pandas as pd
import nltk
import ast
import sys
from pathlib import Path
from nltk.metrics import jaccard_distance
from nltk.util import ngrams
from typing import List, Set
from scipy.stats import pearsonr
import numpy as np
from nltk.chunk import RegexpParser
import copy
from math import log

from nltk.corpus import wordnet_ic
from nltk.corpus import wordnet as wn
from nltk.corpus.reader.wordnet import WordNetError  # Import WordNetError
import pandas as pd
import stanza


stanza.download('en')
nlp_stanza = stanza.Pipeline('en', processors='tokenize,mwt,pos,lemma,depparse', tokenize_pretokenized=True)
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('wordnet_ic')

brown_ic = wordnet_ic.ic('ic-brown.dat')

# Download required resource
nltk.download('averaged_perceptron_tagger_eng')

# Add the project directory to the Python path
project_dir = Path.cwd().parent
sys.path.append(str(project_dir))

from Preprocessing.preprocessingUtils import TextPreprocessor

  from .autonotebook import tqdm as notebook_tqdm
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.9.0.json: 392kB [00:00, 26.7MB/s]                    
2024-12-10 20:05:47 INFO: Downloaded file to C:\Users\maric\stanza_resources\resources.json
2024-12-10 20:05:47 INFO: Downloading default packages for language: en (English) ...
2024-12-10 20:05:49 INFO: File exists: C:\Users\maric\stanza_resources\en\default.zip
2024-12-10 20:05:56 INFO: Finished downloading models and saved to C:\Users\maric\stanza_resources
2024-12-10 20:05:56 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.9.0.json: 392kB [00:00, 26.5MB/s]                    
2024-12-10 20:05:56 INFO: Downloaded file to C:\Users\maric\stanza_resources\reso

In [2]:
# Load the training dataset
train_token_df = pd.read_csv('../Preprocessing/STS_train.csv')

# Display the DataFrame
train_token_df.head()

Unnamed: 0,0,1,gs
0,"['But', 'other', 'sources', 'close', 'to', 'th...","['But', 'other', 'sources', 'close', 'to', 'th...",4.0
1,"['Micron', 'has', 'declared', 'its', 'first', ...","['Micron', ""'s"", 'numbers', 'also', 'marked', ...",3.75
2,"['The', 'fines', 'are', 'part', 'of', 'failed'...","['Perry', 'said', 'he', 'backs', 'the', 'Senat...",2.8
3,"['The', 'American', 'Anglican', 'Council', ','...","['The', 'American', 'Anglican', 'Council', ','...",3.4
4,"['The', 'tech-loaded', 'Nasdaq', 'composite', ...","['The', 'technology-laced', 'Nasdaq', 'Composi...",2.4


In [3]:
# Turn the 2 first columns from strings to actual lists of strings

n=len(train_token_df)

train_df = pd.DataFrame(columns=['0','1','gs'], index=range(n))
train_df.iloc[:, :2] = train_token_df.iloc[:, :2].map(ast.literal_eval)
train_df.loc[:, 'gs'] = train_token_df.loc[:, 'gs']

train_df.head()

Unnamed: 0,0,1,gs
0,"[But, other, sources, close, to, the, sale, sa...","[But, other, sources, close, to, the, sale, sa...",4.0
1,"[Micron, has, declared, its, first, quarterly,...","[Micron, 's, numbers, also, marked, the, first...",3.75
2,"[The, fines, are, part, of, failed, Republican...","[Perry, said, he, backs, the, Senate, 's, effo...",2.8
3,"[The, American, Anglican, Council, ,, which, r...","[The, American, Anglican, Council, ,, which, r...",3.4
4,"[The, tech-loaded, Nasdaq, composite, rose, 20...","[The, technology-laced, Nasdaq, Composite, Ind...",2.4


In [4]:
# Create the TextPreprocessor
preprocessor = TextPreprocessor()

# Remove punctuation, convert to lowercase and remove empty strings
train_df = preprocessor.remove_punctuation(train_df)
train_df = preprocessor.convert_to_lowercase(train_df)
train_df = preprocessor.remove_empty_strings(train_df)

# Create the syntactic features data frame

train_features_df=pd.DataFrame(columns=['translation_sim'], index=range(n))


In [5]:
import numpy as np
import pandas as pd
from typing import List, Dict, Tuple

class MosesSMT:
    def __init__(self):
        """
        Initialize Moses Statistical Machine Translation system components
        """
        # Translation probability table (word-level translation probabilities)
        self.translation_prob_table = {}
        
        # Lexical translation probabilities
        self.lexical_translation_probs = {}
        
        # Language model probabilities
        self.language_model_probs = {}
        
        # Phrase translation probabilities
        self.phrase_translation_probs = {}
        
        # Distortion probabilities (for word reordering)
        self.distortion_probs = {}
    
    def train_translation_model(self, 
                                 parallel_corpus: List[Tuple[List[str], List[str]]], 
                                 num_iterations: int = 5):
        """
        Train translation model using IBM Model 1 approach
        
        Parameters:
        -----------
        parallel_corpus : List[Tuple[List[str], List[str]]]
            Parallel corpus of source and target language sentence pairs
        num_iterations : int
            Number of EM (Expectation-Maximization) iterations for training
        """
        # Initialize uniform translation probabilities
        def initialize_translation_probabilities(parallel_corpus):
            # Create unique vocabulary for source and target languages
            src_vocab = set(word for src, _ in parallel_corpus for word in src)
            tgt_vocab = set(word for _, tgt in parallel_corpus for word in tgt)
            
            # Initialize uniform probabilities
            translation_probs = {}
            for src_word in src_vocab:
                translation_probs[src_word] = {
                    tgt_word: 1.0 / len(tgt_vocab) 
                    for tgt_word in tgt_vocab
                }
            return translation_probs
        
        # Initial translation probability estimation
        self.translation_prob_table = initialize_translation_probabilities(parallel_corpus)
        
        # EM Algorithm for refining translation probabilities
        for _ in range(num_iterations):
            # Expectation step: compute expected counts
            expected_counts = {}
            for src_sent, tgt_sent in parallel_corpus:
                # Compute normalization
                for src_word in src_sent:
                    total_prob = sum(
                        self.translation_prob_table[src_word].get(tgt_word, 0.0)
                        for tgt_word in tgt_sent
                    )
                    
                    # Update expected counts
                    for tgt_word in tgt_sent:
                        count = self.translation_prob_table[src_word].get(tgt_word, 0.0) / total_prob
                        expected_counts[(src_word, tgt_word)] = count
            
            # Maximization step: update translation probabilities
            for (src_word, tgt_word), count in expected_counts.items():
                self.translation_prob_table[src_word][tgt_word] = count
    
    def decode(self, source_sentence: List[str]) -> List[str]:
        """
        Decode source sentence using learned translation probabilities
        
        Parameters:
        -----------
        source_sentence : List[str]
            Input source language sentence to translate
        
        Returns:
        --------
        List[str]
            Translated target language sentence
        """
        # Simple decoding strategy based on highest translation probability
        decoded_sentence = []
        for src_word in source_sentence:
            # Find target word with highest translation probability
            best_translation = max(
                self.translation_prob_table.get(src_word, {}).items(),
                key=lambda x: x[1],
                default=(src_word, 1.0)
            )[0]
            decoded_sentence.append(best_translation)
        
        return decoded_sentence
    
    def compute_translation_probability(self, 
                                        source_word: str, 
                                        target_word: str) -> float:
        """
        Compute translation probability for a word pair
        
        Parameters:
        -----------
        source_word : str
            Word in source language
        target_word : str
            Word in target language
        
        Returns:
        --------
        float
            Translation probability
        """
        return self.translation_prob_table.get(source_word, {}).get(target_word, 0.0)

# Example usage
def main():
    # Simulated parallel corpus (source, target language pairs)
    parallel_corpus = [
        (["the", "cat", "sat"], ["le", "chat", "assis"]),
        (["on", "the", "mat"], ["sur", "le", "tapis"]),
        (["the", "dog", "barked"], ["le", "chien", "aboyé"])
    ]
    
    # Initialize and train Moses SMT model
    moses_smt = MosesSMT()
    moses_smt.train_translation_model(parallel_corpus)
    
    # Translate a sample sentence
    source_sentence = ["the", "cat", "sat"]
    translated_sentence = moses_smt.decode(source_sentence)
    
    print("Source Sentence:", source_sentence)
    print("Translated Sentence:", translated_sentence)
    
    # Demonstrate translation probability computation
    print("\nTranslation Probabilities:")
    for src_word in source_sentence:
        print(f"Translation probabilities for '{src_word}':")
        for tgt_word, prob in moses_smt.translation_prob_table[src_word].items():
            print(f"  {src_word} -> {tgt_word}: {prob:.4f}")

if __name__ == "__main__":
    main()

Source Sentence: ['the', 'cat', 'sat']
Translated Sentence: ['assis', 'assis', 'assis']

Translation Probabilities:
Translation probabilities for 'the':
  the -> assis: 0.3333
  the -> chat: 0.3333
  the -> chien: 0.3333
  the -> aboyé: 0.3333
  the -> le: 0.3333
  the -> tapis: 0.3333
  the -> sur: 0.3333
Translation probabilities for 'cat':
  cat -> assis: 0.3333
  cat -> chat: 0.3333
  cat -> chien: 0.1429
  cat -> aboyé: 0.1429
  cat -> le: 0.3333
  cat -> tapis: 0.1429
  cat -> sur: 0.1429
Translation probabilities for 'sat':
  sat -> assis: 0.3333
  sat -> chat: 0.3333
  sat -> chien: 0.1429
  sat -> aboyé: 0.1429
  sat -> le: 0.3333
  sat -> tapis: 0.1429
  sat -> sur: 0.1429


In [40]:
# compute the similarity 

train_features_df = pos_tagging_syntactic_ngrams(train_df, train_features_df)
train_features_df.head()

Unnamed: 0,POS_tagging_unigrams,POS_tagging_bigrams,POS_tagging_trigrams,chunk_sim_p,chunk_sim_s,chunk_sim_o,total_sim_chunks,sim_dependencies
0,0.702703,0.594595,0.514286,,,,,
1,0.571429,0.421053,0.352941,,,,,
2,0.5,0.25,0.090909,,,,,
3,0.777778,0.764706,0.75,,,,,
4,0.307692,0.083333,0.0,,,,,
