In [52]:
import numpy as np
import pandas as pd
import tensorflow as tf
import torch
from tensorflow import keras
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from nltk.translate.bleu_score import sentence_bleu
from transformers import AutoTokenizer, AutoModel

In [53]:
# pandas is already imported in the first cell; repeating the import is harmless.
import pandas as pd

# Read the training CSV into a DataFrame for a quick look at its contents.
data=pd.read_csv('task3/team13_bn_train.csv')

# Show the 'target' column (pandas abbreviates a long Series when printed).
print(data['target'])

0        আজ শনিবার ভোররাতে ঢাকার হজরত শাহজালাল আন্তর্জা...
1                            কিন্তু আমরা সেটি করতে পারিনি।
2                                জীবন চলে তার নিজের গতিতে।
3                             চালককে পরে উদ্ধার করে পুলিশ।
4        ( ইব্রীয় ১০: ৩৬ - ৩৮) হাত শিথিল করার অথবা শয়তা...
                               ...                        
69995                   এই ব্যাপারে আমাদের আরো ভাবা জরুরি।
69996                                          বেইজ্জত কী?
69997    মন্ত্রী জানান, যাঁদের এখন চিকিৎসা চলছে, তাঁদের...
69998    এবং কেন নয়, যেহেতু তিনি এর সবেরই অভিজ্ঞতা লাভ ...
69999                             কংগ্রেসকে জবাব দিতে হবে।
Name: target, Length: 70000, dtype: object


In [54]:
# Load GloVe embeddings
def load_glove_embeddings(path="glove.6B/glove.6B.300d.txt"):
  """Read a GloVe text file into a ``{word: vector}`` dictionary.

  Args:
    path: Location of the GloVe file. Every non-empty line is expected to hold
      a token followed by its whitespace-separated float components. Defaults
      to the 300-dimensional 6B file, so existing zero-argument calls behave
      as before.

  Returns:
    dict mapping each token (str) to a 1-D ``float32`` numpy array.
  """
  embeddings_index = {}
  with open(path, 'r', encoding='utf-8') as f:
    for line in f:
      values = line.split()
      # A blank line (e.g. a trailing newline) has no token; skip it instead of
      # failing on values[0].
      if not values:
        continue
      embeddings_index[values[0]] = np.asarray(values[1:], dtype='float32')
  return embeddings_index

# Prepare GloVe embedding matrix
def create_embedding_matrix(tokenizer, embedding_dim, embeddings_index):
  """Assemble the lookup table that maps tokenizer indices to word vectors.

  Args:
    tokenizer: Object exposing ``word_index``, a ``{word: integer index}`` dict.
    embedding_dim: Width of every row in the resulting matrix.
    embeddings_index: ``{word: vector}`` mapping of pretrained vectors.

  Returns:
    Array of shape ``(len(word_index) + 1, embedding_dim)``. Rows for words that
    have no pretrained vector, and any row no word maps to, stay all-zero.
  """
  vocab = tokenizer.word_index
  matrix = np.zeros((len(vocab) + 1, embedding_dim))
  for token, row in vocab.items():
    pretrained = embeddings_index.get(token)
    if pretrained is None:
      # No pretrained vector for this word: leave its row at zero.
      continue
    matrix[row] = pretrained
  return matrix

# Tokenize and pad sequences
def tokenize_and_pad(texts, tokenizer, max_len):
  """Convert texts to index sequences, post-pad them, and return the FIRST row.

  Args:
    texts: List of strings handed to ``tokenizer.texts_to_sequences``.
    tokenizer: Fitted tokenizer providing ``texts_to_sequences``.
    max_len: Target length passed to ``pad_sequences`` as ``maxlen``.

  Returns:
    The padded sequence for ``texts[0]`` only; any further texts are encoded
    but discarded. Callers in this notebook pass single-element lists.
  """
  as_indices = tokenizer.texts_to_sequences(texts)
  # padding='post' appends the fill values after the real tokens.
  return pad_sequences(as_indices, maxlen=max_len, padding='post')[0]

def load_data(filename, with_references=False):
  """Load a parallel-corpus CSV and split off its last two columns.

  Args:
    filename: Path of the CSV file to read.
    with_references: Accepted for compatibility; not used by this function.

  Returns:
    Tuple ``(frame, en, bn)``: the full DataFrame, the values of the
    second-to-last column and the values of the last column.
  """
  frame = pd.read_csv(filename)
  # Columns are picked by position: -2 is returned as `en`, -1 as `bn`.
  english, bengali = (frame.iloc[:, position].values for position in (-2, -1))
  return frame, english, bengali

def indicbert_encode(text, tokenizer, max_beng_len):
  """Tokenise ``text`` into a fixed-length list of token ids.

  Args:
    text: Input handed to ``tokenizer``.
    tokenizer: Callable returning an object whose ``input_ids`` is a tensor.
    max_beng_len: Length the ids are padded / truncated to.

  Returns:
    Python list holding the ids of the first (row 0) encoded sequence.
  """
  encoding = tokenizer(
      text,
      return_tensors='pt',
      padding='max_length',
      truncation=True,
      max_length=max_beng_len,
  )
  # Tensor -> numpy, then keep only the first row of the batch.
  first_row = encoding.input_ids.numpy()[0]
  return list(first_row)

def indicbert_embed(text, tokenizer, model, max_beng_len):
  """Return the contextual embedding of ``text`` as a 2-D array.

  The text is padded / truncated to ``max_beng_len`` tokens and run through
  ``model`` without gradient tracking.

  Args:
    text: Input handed to ``tokenizer``.
    tokenizer: Callable returning an object with ``input_ids`` and, normally,
      ``attention_mask`` tensors.
    model: Callable taking ``input_ids`` plus an ``attention_mask`` keyword and
      returning an object with ``last_hidden_state``.
    max_beng_len: Length the token sequence is padded / truncated to.

  Returns:
    numpy array: ``last_hidden_state`` of the first (row 0) sequence.
  """
  encoded = tokenizer(text, return_tensors='pt', padding='max_length', truncation=True, max_length=max_beng_len)
  # The input is padded to max_length, so the mask must be forwarded; otherwise
  # the padding positions take part in attention and alter the real tokens'
  # embeddings. Falls back to None if the tokenizer does not supply a mask.
  attention_mask = getattr(encoded, 'attention_mask', None)
  with torch.no_grad():
    outputs = model(encoded.input_ids, attention_mask=attention_mask)
  # Drop the batch dimension: only the first sequence is returned.
  return outputs.last_hidden_state.numpy()[0]

In [65]:
# Load the training, validation and test splits.
# load_data returns (DataFrame, second-to-last column values, last column values);
# here those are bound as the full frame, the English side and the Bengali side.
train_data, train_en, train_bn = load_data("task3/team13_bn_train.csv")
val_data, val_en, val_bn = load_data("task3/team13_bn_valid.csv")
test_data, test_en, test_bn = load_data("task3/team13_bn_test.csv")

In [56]:
# Initialize the English tokenizer and build its vocabulary from the training sentences only.
eng_tokenizer = Tokenizer()
eng_tokenizer.fit_on_texts(train_en)

# Load GloVe embeddings and align them with the tokenizer's word indices.
embeddings_index = load_glove_embeddings()
# Must equal the vector width of the GloVe file read by load_glove_embeddings (the 300d file).
embedding_dim = 300
embedding_matrix = create_embedding_matrix(eng_tokenizer, embedding_dim, embeddings_index)

# Prepare English data: the padded length is the largest whitespace-separated word count
# among the training sentences.
# NOTE(review): the tokenizer may split sentences differently from str.split(), so some
# sequences could be longer than this and get truncated by pad_sequences — confirm.
max_eng_len = max(len(seq.split()) for seq in train_en)
# (disabled) tokenize_and_pad returns only the first padded row, so this would not encode the whole corpus:
# encoder_input_data = tokenize_and_pad(train_en, eng_tokenizer, max_eng_len)

In [57]:
# Sanity check: print the padded index sequence for one training sentence, then the raw sentence.
print(tokenize_and_pad([train_en[3]], eng_tokenizer, max_eng_len))
print(train_en[3])

[   1   39   44  111 9807    1  613    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0 

In [60]:
# Initialize the Bengali tokenizer and encoder from the pretrained 'ai4bharat/indic-bert' checkpoint.
# bert_tokenizer yields token ids; indicbert_model yields the hidden states used as decoder inputs.
bert_tokenizer = AutoTokenizer.from_pretrained('ai4bharat/indic-bert')
indicbert_model = AutoModel.from_pretrained('ai4bharat/indic-bert')

In [61]:
# Smoke test of both IndicBERT helpers on the first Bengali training sentence,
# padded / truncated to 220 tokens (the same value later assigned to max_beng_len).
# l  : list of token ids
# l1 : hidden-state array for the same sentence (computed but not displayed here)
l = (indicbert_encode(train_bn[0],bert_tokenizer, 220))
l1= indicbert_embed(train_bn[0], bert_tokenizer, indicbert_model, 220)
# print(tf.shape((l[0])))
print(l)
# print(indicbert_model.config.vocab_size)

[2, 1089, 8, 10239, 30032, 23081, 32417, 33209, 8671, 80619, 144742, 34480, 651, 5376, 26080, 2354, 116503, 4083, 17498, 29716, 45428, 17223, 442, 29196, 30032, 442, 3865, 6441, 33727, 46942, 10239, 41417, 2213, 15, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [62]:
# Prepare Bengali data
# Fixed token length that every Bengali sentence is padded / truncated to.
max_beng_len = 220 # tested on tokenized Bengali data

# (disabled) pre-allocation experiment for the decoder inputs:
# decoder_input_data = np.zeros((300, max_beng_len, indicbert_model.config.hidden_size))
# print(decoder_input_data.shape)


In [147]:
def gen_data(data):
  """Expand sentence pairs into next-token prediction samples.

  For every row, the Bengali sentence is encoded to token ids. For each prefix
  length ``j`` one sample is produced: the padded English sentence, the
  IndicBERT embedding of the first ``j`` Bengali content tokens, and the id of
  the token that follows that prefix.

  Args:
    data: DataFrame with 'source' and 'target' columns. Rows are read by
      position, so the index does not have to be 0..n-1.

  Returns:
    Tuple ``(encoder_input_data, decoder_input_data, decoder_output_data)``;
    the last element is one-hot encoded over the IndicBERT vocabulary size.
  """
  encoder_input_data = []
  decoder_input_data = []
  decoder_output_data = []
  for i in range(len(data)):
    eng_sent = data['source'].iloc[i]
    bn_sent = data['target'].iloc[i]
    bn_sent_encoded = indicbert_encode(bn_sent, bert_tokenizer, max_beng_len)
    # 0 is treated as the padding id. A sequence that fills the whole window has
    # no padding, so add a sentinel to make index(0) below well defined.
    if bn_sent_encoded[-1] != 0:
      bn_sent_encoded.append(0)
    n = bn_sent_encoded.index(0)  # number of non-padding ids
    # The English side is identical for every prefix of this pair: encode once.
    eng_encoded = tokenize_and_pad([eng_sent], eng_tokenizer, max_eng_len)
    for j in range(1,n-1):
      # Build the prefix from TOKENS 1..j (position 0 is the leading special
      # token) so that it lines up with the label at position j+1. Slicing the
      # raw string would count characters, not tokens.
      prefix_ids = [int(tok) for tok in bn_sent_encoded[1:j+1]]
      # NOTE(review): decode -> re-tokenise inside indicbert_embed is assumed to
      # reproduce these ids closely; confirm for this tokenizer.
      prefix_text = bert_tokenizer.decode(prefix_ids)
      encoder_input_data.append(eng_encoded)
      decoder_input_data.append(indicbert_embed([prefix_text], bert_tokenizer, indicbert_model, max_beng_len))
      decoder_output_data.append(bn_sent_encoded[j+1])
  encoder_input_data = np.array(encoder_input_data)
  decoder_input_data = np.array(decoder_input_data)
  # One-hot labels, as required by the 'categorical_crossentropy' loss.
  decoder_output_data  = keras.utils.to_categorical(decoder_output_data, num_classes=indicbert_model.config.vocab_size)
  decoder_output_data = np.array(decoder_output_data)
  return encoder_input_data, decoder_input_data, decoder_output_data

In [63]:
# Model architecture
latent_dim = 256  # size of the LSTM hidden / cell state shared by encoder and decoder

# Encoder: frozen GloVe embedding followed by an LSTM whose final states seed the decoder.
encoder_inputs = Input(shape=(max_eng_len,), name='encoder_inputs')
# mask_zero=True marks index 0 (the post-padding value) as padding so the LSTM skips those
# steps; without it the returned state is computed after the long run of padding that
# follows every sentence shorter than max_eng_len.
encoder_embedding = Embedding(input_dim=len(eng_tokenizer.word_index) + 1, output_dim=embedding_dim,
                              weights=[embedding_matrix], input_length=max_eng_len, trainable=False,
                              mask_zero=True)(encoder_inputs)
encoder_lstm = LSTM(latent_dim, return_state=True, name='encoder_lstm')
# Only the states are used downstream; encoder_outputs is kept for inspection.
encoder_outputs, state_h, state_c = encoder_lstm(encoder_embedding)
encoder_states = [state_h, state_c]



In [64]:
# Decoder
# Input: one embedded Bengali prefix per sample, max_beng_len steps of hidden_size features.
decoder_inputs = Input(shape=(max_beng_len,indicbert_model.config.hidden_size), name='decoder_inputs')
# return_sequences=False: the LSTM emits a single output per sample (not one per step).
decoder_lstm = LSTM(latent_dim, return_sequences=False, return_state=True, name='decoder_lstm')
# The decoder starts from the encoder's final states; its own returned states are discarded here.
decoder_outputs, _, _ = decoder_lstm(decoder_inputs, initial_state=encoder_states)
# Softmax over the IndicBERT vocabulary: a distribution for the next token.
decoder_dense = Dense(indicbert_model.config.vocab_size, activation='softmax', name='decoder_dense')
# Note: decoder_outputs is rebound from the LSTM output to the softmax output.
decoder_outputs = decoder_dense(decoder_outputs)

In [160]:
# Define the training model: (English indices, embedded Bengali prefix) -> next-token distribution.
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
# 'categorical_crossentropy' expects one-hot targets, which gen_data builds with to_categorical.
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()

Model: "model_10"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 encoder_inputs (InputLayer  [(None, 236)]                0         []                            
 )                                                                                                
                                                                                                  
 embedding_12 (Embedding)    (None, 236, 300)             1360500   ['encoder_inputs[0][0]']      
                                                          0                                       
                                                                                                  
 decoder_inputs (InputLayer  [(None, 220, 768)]           0         []                            
 )                                                                                         

In [172]:
# Generate samples from the first 5 training pairs only (a quick end-to-end check).
encoder_input_data, decoder_input_data, decoder_target_data = gen_data(train_data[:5])
# (disabled) earlier attempts at reshaping the encoder input:
# encoder_input_data = np.transpose(encoder_input_data)
# encoder_input_data = np.expand_dims(encoder_input_data, axis=0)
# encoder_input_data = encoder_input_data.reshape(-1)

# Training: 10 epochs over that small sample set.
model.fit([encoder_input_data, decoder_input_data],decoder_target_data, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x4a985cdd0>

In [182]:
# print(len(train_data))
# The training frame is processed in chunks of 100 sentence pairs: gen_data expands each
# chunk into many prefix samples, which model.fit then splits into mini-batches of 64.

# Train model
for epoch in range(5): #number of epochs
  # Distinct loop variables: the inner loop must not rebind the epoch counter.
  for start in range(0, len(train_data), 100):
    # reset_index returns a fresh frame; calling it in place on a slice of train_data
    # triggers pandas' chained-assignment warning.
    df = train_data.iloc[start:start+100].reset_index(drop=True)
    encoder_input_data, decoder_input_data, decoder_target_data = gen_data(df)
    model.fit([encoder_input_data, decoder_input_data], decoder_target_data, batch_size=64, epochs=1, validation_split=0.2)

 2/32 [>.............................] - ETA: 29s - loss: 5.3420 - accuracy: 0.0859

In [167]:
# Inference models
# Encoder: English indices -> final LSTM states [h, c].
encoder_model = Model(encoder_inputs, encoder_states)

# Decoder: takes the embedded Bengali prefix plus explicit initial states.
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

# Dedicated names are used for the inference tensors so that this cell does not overwrite
# `decoder_outputs`, `state_h` or `state_c`. Re-running the training-model cell after this
# one would otherwise pick up a tensor that depends on the state inputs above.
decoder_lstm_output, decoder_state_h, decoder_state_c = decoder_lstm(decoder_inputs, initial_state=decoder_states_inputs)
decoder_states = [decoder_state_h, decoder_state_c]
decoder_inference_outputs = decoder_dense(decoder_lstm_output)
decoder_model = Model([decoder_inputs] + decoder_states_inputs, [decoder_inference_outputs] + decoder_states)

In [None]:
# Translation function
def decode_sequence(input_seq):
  """Greedily translate one encoded English sentence into Bengali text.

  Decoding follows the way the training model is wired: the decoder LSTM always
  starts from the encoder's final states and reads the IndicBERT embedding of
  the whole prefix generated so far, then predicts a single next token.

  Args:
    input_seq: Padded English index sequence, either one unbatched row of
      length ``max_eng_len`` (as returned by ``tokenize_and_pad``) or a batch
      of shape ``(1, max_eng_len)``.

  Returns:
    The decoded Bengali string (without the end-of-sentence marker).
  """
  input_seq = np.asarray(input_seq)
  if input_seq.ndim == 1:
    # A bare row would be read as many scalar samples; add the batch axis.
    input_seq = input_seq[np.newaxis, :]
  # The encoder states stay fixed: each step re-feeds the full prefix from scratch.
  states_value = list(encoder_model.predict(input_seq))
  sep_id = bert_tokenizer.sep_token_id
  generated_ids = []

  # Two positions of the max_beng_len window are taken by the tokenizer's
  # leading and trailing special tokens.
  while len(generated_ids) < max_beng_len - 2:
    prefix_text = bert_tokenizer.decode(generated_ids) if generated_ids else ''
    # (max_beng_len, hidden) -> (1, max_beng_len, hidden), matching decoder_inputs.
    target_seq = indicbert_embed([prefix_text], bert_tokenizer, indicbert_model, max_beng_len)
    target_seq = target_seq[np.newaxis, ...]
    output_tokens, _, _ = decoder_model.predict([target_seq] + states_value)
    probs = np.asarray(output_tokens)[0]
    if probs.ndim > 1:
      # Defensive: if the decoder ever returns one row per step, use the last step.
      probs = probs[-1]
    sampled_token_index = int(np.argmax(probs))
    if sampled_token_index == sep_id:
      break
    generated_ids.append(sampled_token_index)

  return bert_tokenizer.decode(generated_ids)

In [None]:
# BLEU evaluation
from nltk.translate.bleu_score import corpus_bleu

def evaluate_bleu(model, data, eng_tokenizer, bert_tokenizer):
  """Translate every row of ``data`` and score the output with corpus BLEU-1..4.

  Args:
    model: Accepted for interface compatibility; decoding goes through
      ``decode_sequence`` and does not use this argument directly.
    data: DataFrame with 'source' (text to translate) and 'target' (reference
      translation) columns.
    eng_tokenizer: Tokenizer used to encode the source sentences.
    bert_tokenizer: Accepted for interface compatibility; not used here.

  Returns:
    dict with keys 'BLEU-1' .. 'BLEU-4' mapping to corpus-level BLEU scores.
  """
  references = []
  candidates = []

  for _, row in data.iterrows():
      input_seq = tokenize_and_pad([row['source']], eng_tokenizer, max_eng_len)
      decoded_sentence = decode_sequence(input_seq)
      # The reference is the gold translation ('target'), not the source sentence
      # that was fed to the model.
      references.append([row['target'].split()])
      candidates.append(decoded_sentence.split())

  third = 1.0 / 3.0  # exact uniform weight for the 3-gram variant
  bleu_scores = {
      'BLEU-1': corpus_bleu(references, candidates, weights=(1, 0, 0, 0)),
      'BLEU-2': corpus_bleu(references, candidates, weights=(0.5, 0.5, 0, 0)),
      'BLEU-3': corpus_bleu(references, candidates, weights=(third, third, third, 0)),
      'BLEU-4': corpus_bleu(references, candidates, weights=(0.25, 0.25, 0.25, 0.25))
  }

  return bleu_scores

# Evaluate on test data: decodes every test row, so the cost grows with the size of test_data.
test_bleu_scores = evaluate_bleu(model, test_data, eng_tokenizer, bert_tokenizer)
print(f"Test BLEU Scores: {test_bleu_scores}")


In [37]:
# Debug check: confirm the container type returned by load_data for the training split.
print(type(train_data)) 

<class 'pandas.core.frame.DataFrame'>


In [87]:
import random

# Choose 10 random sentence pairs from the training data. A fixed random_state keeps the
# selection identical across kernel restarts, and reset_index returns a fresh frame
# instead of mutating the sample in place.
train_data_sample = train_data.sample(n=10, random_state=42).reset_index(drop=True)

# Print each pair as "<source> , <target>".
for i in range(len(train_data_sample)):
    print(f"{train_data_sample['source'][i]}",end=" , ")
    print(f"{train_data_sample['target'][i]}")


At the instigation of the Corporation of London, city architect Horace Jones proposed a Gothic-style drawbridge to be built downstream from London Bridge. , কর্পোরেশন অভ্ লন্ডন এর অনুরোধে, শহরের স্থপতি হরেস জোনস্ লন্ডন ব্রিজ থেকে নদীর অভিমুখে গথিক ধাঁচে একটা টানা সেতু নির্মাণের প্রস্তাব দিয়েছিলেন।
I came here for work. , ইয়ে একটা কাজে তোমার কাছে এসেছিলাম।
There are many things. , অনেক বিষয় এসেছে।
The garment industry in Bangladesh employs a lot of women. , বাংলাদেশে গার্মেন্ট খাতের শ্রমিকদের সিংহভাগই নারী।
The police then fired back at them. , এরপরই কার্যত তাদের উপর ঝাঁপিয়ে পড়ে পুলিস।
Sidhu''s wife Navjot Kaur had blamed the chief minister for denial of party ticket to her for Chandigarh seat , সিধুর স্ত্রীকে লোকসভা ভোটের টিকিট না দেওয়ার জন্য অমরিন্দরকেই পাল্টা দোষারোপ করেছিলেন সিধু।
"""They are undergoing treatment at different hospitals." , তিনি বলেন, “একটি হাসপাতালে বিভিন্ন ধরনের রোগী থাকে।
Paul told Titus, who served congregations in Crete and who appointed overseers, that each ap