In [12]:
# Read the English and Hindi text files
with open('eng.txt', 'r', encoding='utf-8') as eng_file:
    english_sentences = eng_file.readlines()

with open('hin.txt', 'r', encoding='utf-8') as hin_file:
    hindi_sentences = hin_file.readlines()

# Ensure both files have the same number of sentences
assert len(english_sentences) == len(hindi_sentences), "The number of English and Hindi sentences do not match!"

# Strip any extra whitespace from each sentence
english_sentences = [sentence.strip() for sentence in english_sentences]
hindi_sentences = [sentence.strip() for sentence in hindi_sentences]

# Create a DataFrame with these two columns
import pandas as pd

df = pd.DataFrame({
    'English': english_sentences,
    'Hindi': hindi_sentences
})

# Limit to 10,000 rows if needed
df = df.head(10000)

# Display the first few rows
print(df.head())


                                             English  \
0  However, Paes, who was partnering Australia's ...   
1  Whosoever desires the reward of the world, wit...   
2  "The value of insects in the biosphere is enor...   
3  Mithali To Anchor Indian Team Against Australi...   
4  After the assent of the Honble President on 8t...   

                                               Hindi  
0  आस्ट्रेलिया के पाल हेनली के साथ जोड़ी बनाने वाल...  
1  और जो शख्स (अपने आमाल का) बदला दुनिया ही में च...  
2  जैव-मंडल में कीड़ों का मूल्य बहुत है, क्योंकि ...  
3    आस्ट्रेलिया के खिलाफ वनडे टीम की कमान मिताली को  
4  8 सितम्‍बर, 2016 को माननीय राष्‍ट्रपति की स्‍व...  


In [13]:
from collections import Counter
import re

# Function to clean and tokenize text (remove punctuation and convert to lowercase)
def tokenize(text):
    text = re.sub(r'[^\w\s]', '', text.lower())  # Remove punctuation and lowercase
    return text.split()

# Tokenize English and Hindi sentences
english_words = [tokenize(sentence) for sentence in df['English']]
hindi_words = [tokenize(sentence) for sentence in df['Hindi']]

# Flatten the lists of words
english_words_flat = [word for sublist in english_words for word in sublist]
hindi_words_flat = [word for sublist in hindi_words for word in sublist]

# Count word frequencies
english_freq = Counter(english_words_flat)
hindi_freq = Counter(hindi_words_flat)

# Filter words by frequency between 5 and 50
english_freq_filtered = {word: count for word, count in english_freq.items() if 5 <= count <= 50}
hindi_freq_filtered = {word: count for word, count in hindi_freq.items() if 5 <= count <= 50}

# Display some of the filtered frequencies
print(f"English Frequency (5-50): {english_freq_filtered}")
print(f"Hindi Frequency (5-50): {hindi_freq_filtered}")


Hindi Frequency (5-50): {'आसटरलय': 11, 'पल': 44, 'जड़': 19, 'हरय': 9, 'शखस': 16, 'आखरत': 9, 'सनत': 16, 'सबक': 32, 'कड': 34, 'समदध': 15, 'जव': 17, 'वनड': 24, 'कमन': 11, 'मतल': 5, '8': 39, 'सतमबर': 7, '2016': 21, 'मननय': 8, 'सवकत': 18, 'सवधन': 34, 'सशधन': 20, 'असततव': 8, 'सनवई': 21, 'फरवर': 20, 'तरख': 8, 'तय': 29, 'टरक': 26, 'कपय': 8, 'तरत': 26, '22': 27, 'सआरपएफ': 9, 'सपशल': 6, 'ऑपरशन': 9, 'गरप': 21, 'इलक': 45, 'तलश': 26, 'अभयन': 36, 'चलय': 7, 'झरखड': 23, 'हमत': 7, 'सरन': 6, 'पटआई': 5, 'सकटर': 16, 'अरवद': 13, 'गगरप': 7, 'बतचत': 50, 'परभर': 16, 'अनल': 24, 'गठन': 25, 'रजयपल': 30, 'यश': 40, 'शसतर': 8, 'मसह': 32, 'दऊद': 6, 'परटय': 6, 'वरषठ': 49, 'नतओ': 29, 'पकष': 43, 'परचर': 23, 'बइक': 10, 'दखई': 24, 'कटरन': 11, 'शयर': 39, 'तरग': 5, 'कथत': 20, 'अनदर': 9, 'हफत': 14, 'समपरक': 7, 'कशश': 49, 'पय': 32, 'नन': 7, 'तलन': 36, '800': 5, 'परपरक': 5, 'पतन': 49, 'पटर': 10, 'समनय': 30, 'पढ': 31, 'पज': 43, 'सनन': 17, '2019': 47, 'लसट': 8, 'जनए': 8, 'लडग': 7, 'लकसभ': 46, 'पड़': 12, 'यसफ': 19, 'भइय': 10, 'वरण

In [14]:
# Convert the frequency dictionaries to DataFrames for easier analysis
english_freq_df = pd.DataFrame(list(english_freq_filtered.items()), columns=['Word', 'Frequency_English'])
hindi_freq_df = pd.DataFrame(list(hindi_freq_filtered.items()), columns=['Word', 'Frequency_Hindi'])

# Merge the frequency data on the word
merged_freq_df = pd.merge(english_freq_df, hindi_freq_df, on="Word", how="inner")

# Calculate the difference in frequency
merged_freq_df['Frequency_Difference'] = merged_freq_df['Frequency_English'] - merged_freq_df['Frequency_Hindi']

# Filter based on frequency difference between -10 and +10
filtered_freq_df = merged_freq_df[(merged_freq_df['Frequency_Difference'] >= -10) & (merged_freq_df['Frequency_Difference'] <= 10)]

# Display the result
print(filtered_freq_df.head())


   Word  Frequency_English  Frequency_Hindi  Frequency_Difference
0  2016                 24               21                     3
1   800                  5                5                     0
2  2019                 49               47                     2
3    17                 33               30                     3
4    27                 17               16                     1


In [20]:
# ----------------------------
# Step 1: Compute sentence-level word counts
# ----------------------------
df['Word_Count_English'] = df['English'].apply(lambda x: len(tokenize(x)))
df['Word_Count_Hindi'] = df['Hindi'].apply(lambda x: len(tokenize(x)))

# ----------------------------
# Step 2: Keep only sentences where counts are between 5 and 50
# ----------------------------
df = df[(df['Word_Count_English'].between(5, 50)) &
        (df['Word_Count_Hindi'].between(5, 50))]

# ----------------------------
# Step 3: Calculate difference between counts
# ----------------------------
df['Difference'] = df['Word_Count_English'] - df['Word_Count_Hindi']

# ----------------------------
# Step 4: Keep only rows where difference is between -10 and +10
# ----------------------------
df = df[df['Difference'].between(-10, 10)]

# ----------------------------
# Step 5: Save final cleaned dataset to Excel
# ----------------------------
df.to_excel("cleaned_dataset.xlsx", index=False)



✅ Final cleaned dataset saved as cleaned_dataset.xlsx


In [21]:


# Load cleaned dataset from Assignment 1
df = pd.read_excel("cleaned_dataset.xlsx")

# Take first 100 (or random 100)
sample_df = df.sample(n=100, random_state=42)
english_sentences = sample_df['English'].tolist()


In [22]:
from transformers import pipeline

# Load a pre-trained translation model (English → Hindi)
translator = pipeline("translation", model="Helsinki-NLP/opus-mt-en-hi")

# Translate English → Hindi
translations = [translator(sentence, max_length=200)[0]['translation_text'] for sentence in english_sentences]

# Add translations to DataFrame
sample_df['Model_Translation'] = translations


config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/306M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/306M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/812k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/1.07M [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

Device set to use cpu


In [25]:
import sacrebleu

# Reference sentences (from your dataset)
references = sample_df['Hindi'].tolist()

# Candidate translations (from model)
candidates = sample_df['Model_Translation'].tolist()

# Calculate metrics
bleu = sacrebleu.corpus_bleu(candidates, [references])
chrf = sacrebleu.corpus_chrf(candidates, [references])
ter = sacrebleu.corpus_ter(candidates, [references])

# Save metrics to a .txt file
with open("translation_scores.txt", "w", encoding="utf-8") as f:
    f.write(f"BLEU Score: {bleu.score}\n")
    f.write(f"CHRF Score: {chrf.score}\n")
    f.write(f"TER Score: {ter.score}\n")


In [24]:
!pip install sacrebleu

Collecting sacrebleu
  Downloading sacrebleu-2.5.1-py3-none-any.whl.metadata (51 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/51.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting portalocker (from sacrebleu)
  Downloading portalocker-3.2.0-py3-none-any.whl.metadata (8.7 kB)
Collecting colorama (from sacrebleu)
  Downloading colorama-0.4.6-py2.py3-none-any.whl.metadata (17 kB)
Downloading sacrebleu-2.5.1-py3-none-any.whl (104 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)
Downloading portalocker-3.2.0-py3-none-any.whl (22 kB)
Installing collected packages: portalocker, colorama, sacrebleu
Successfully installed colorama-0.4.6 portalocker-3.2.0 sacrebleu-2.5.1


In [26]:
# Save only English + Model Translation columns
sample_df[['English', 'Model_Translation']].to_excel("translation_output.xlsx", index=False)


✅ Translation task completed: translation_output.xlsx & translation_scores.txt generated.
