<a href="https://colab.research.google.com/github/akarsh323/NLP/blob/main/Untitled30.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import re
from collections import defaultdict

def tokenize(text):
    """
    Break text into a list of lowercase word tokens.

    The input is lowercased first. Every maximal run of word characters
    (as defined by the regex class ``\\w``) is then returned in order of
    appearance; everything between those runs, such as whitespace and
    punctuation, acts as a separator and is discarded.

    :param text: String to split into tokens
    :return: List of lowercase tokens (empty if no word characters occur)
    """
    lowered = text.lower()
    return [match.group() for match in re.finditer(r'\b\w+\b', lowered)]

def train_translation_model(corpus):
    """
    Estimate word-to-word translation probabilities from sentence pairs.

    Both sentences of each pair are tokenized and their tokens are paired
    up purely by position (first with first, second with second, ...).
    When the two sides differ in length, the surplus tokens of the longer
    side are ignored. Pair counts are then normalised per source word.

    :param corpus: List of tuples (source_sentence, target_sentence)
    :return: Dictionary mapping each source word to
             {target_word: probability}, where the probabilities for one
             source word sum to 1
    """
    pair_tally = {}
    for src_sentence, tgt_sentence in corpus:
        src_tokens = tokenize(src_sentence)
        tgt_tokens = tokenize(tgt_sentence)

        # Positional alignment: zip stops at the shorter token list
        for src_tok, tgt_tok in zip(src_tokens, tgt_tokens):
            row = pair_tally.setdefault(src_tok, {})
            row[tgt_tok] = row.get(tgt_tok, 0) + 1

    # Turn each row of raw counts into a probability distribution
    model = {}
    for src_tok, row in pair_tally.items():
        denom = sum(row.values())
        model[src_tok] = {tgt_tok: hits / denom for tgt_tok, hits in row.items()}
    return model

def train_language_model(sentences, n=2):
    """
    Train a simple successor model over windows of n tokens.

    Each sentence is tokenized and an '<END>' marker is appended. Every
    window of ``n`` consecutive tokens is used as a context, and the token
    that immediately follows the window is counted as its successor. The
    last window of a sentence has nothing after it, so its successor is
    recorded as ``None``. Sentences with fewer than ``n`` tokens (marker
    included) contribute nothing.

    :param sentences: List of sentences in the target language
    :param n: Number of tokens in each context window (default: 2)
    :return: Dictionary mapping each context tuple to
             {next_token_or_None: probability}
    """
    successor_tally = {}

    for sentence in sentences:
        tokens = tokenize(sentence) + ['<END>']
        window_count = len(tokens) - n + 1
        for start in range(window_count):
            stop = start + n
            context = tuple(tokens[start:stop])
            # None marks "nothing follows this window"
            follower = tokens[stop] if stop < len(tokens) else None
            row = successor_tally.setdefault(context, {})
            row[follower] = row.get(follower, 0) + 1

    def as_distribution(row):
        # Normalise one row of successor counts so it sums to 1
        denom = sum(row.values())
        return {follower: hits / denom for follower, hits in row.items()}

    return {context: as_distribution(row) for context, row in successor_tally.items()}

def translate(text, translation_model, language_model):
    """
    Translate a sentence word by word using the trained models.

    Decoding runs in two passes:

    1. Every source word is replaced by its most probable translation.
       Words missing from ``translation_model`` are kept unchanged (they
       could be names or numbers).
    2. For each position, the language model is looked up with the context
       formed by the previous and current first-pass words. A successor it
       suggests replaces the current word only if that successor is itself
       a known translation of the same source word; among such successors
       the one with the highest (language-model probability x translation
       probability) wins. Otherwise the first-pass word is kept.

    Restricting the second pass to known translations guarantees one
    output word per source word and keeps training markers such as
    '<END>' (or a ``None`` successor) out of the result.

    This is a deliberately simplistic greedy heuristic; real systems use
    a proper search such as beam search.

    :param text: Source text to translate
    :param translation_model: Word-level translation probabilities,
                              {source_word: {target_word: probability}}
    :param language_model: Successor probabilities,
                           {context_tuple: {next_word: probability}}
    :return: The translated words joined by single spaces
    """
    source_words = tokenize(text)

    # Pass 1: most probable word-level translation for every source word
    candidates_per_word = []
    draft = []
    for word in source_words:
        # An unknown word (or an empty entry) has no candidates to pick from
        candidates = translation_model.get(word) or {}
        candidates_per_word.append(candidates)
        draft.append(max(candidates, key=candidates.get) if candidates else word)

    # Pass 2: let the language model pick a different candidate translation
    final_translation = []
    for i, (draft_word, candidates) in enumerate(zip(draft, candidates_per_word)):
        # Context is the previous and current draft words (only the
        # current word at position 0)
        context = tuple(draft[max(0, i - 1):i + 1])
        successors = language_model.get(context, {})

        # Only successors that are plausible translations of this source
        # word may override the draft; this filters out '<END>' and None
        scores = {
            target: successors[target] * candidates[target]
            for target in successors
            if target in candidates
        }
        final_translation.append(max(scores, key=scores.get) if scores else draft_word)

    return ' '.join(final_translation)

# Demo: train both models on a tiny corpus and translate one sentence
if __name__ == "__main__":
    # Spanish -> English sentence pairs, for demonstration only
    corpus = [
        ("el gato negro", "the black cat"),
        ("el perro es grande", "the dog is big"),
        ("me gusta el gato", "i like the cat"),
        ("el perro corre", "the dog runs"),
    ]

    # The translation model learns from both sides of every pair; the
    # language model only needs the English (target) side.
    translation_model = train_translation_model(corpus)
    language_model = train_language_model(
        [english for _spanish, english in corpus]
    )

    # Run the decoder on a sentence that is not part of the corpus
    source_text = "el gato es grande"
    translated_text = translate(
        source_text,
        translation_model,
        language_model,
    )

    print(f"Original: {source_text}")
    print(f"Translated: {translated_text}")

Original: el gato es grande
Translated: the cat is <END>
