<a href="https://colab.research.google.com/github/babbugit9/Dl-Assignment-2/blob/main/Assignment%202%20again.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install tensorflow==2.12.0
!pip install pandas
!pip install gdown



In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, GRU, SimpleRNN, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


In [34]:
import pandas as pd

def fetch_translit_data():
    file_paths = {
        "training": "/content/hi.translit.sampled.train.tsv",
        "validation": "/content/hi.translit.sampled.dev.tsv",
        "testing": "/content/hi.translit.sampled.test.tsv"
    }

    def read_file(path):
        return pd.read_csv(path, delimiter='\t', header=None, names=["target_script", "source_script", "frequency"])

    dataset_train = read_file(file_paths["training"])
    dataset_dev = read_file(file_paths["validation"])
    dataset_test = read_file(file_paths["testing"])

    return dataset_train, dataset_dev, dataset_test

train_df, dev_df, test_df = fetch_translit_data()

print("Few samples from training dataset:")
print(train_df.sample(5))


Few samples from training dataset:
      target_script   source_script  frequency
4527        उज्ज्वल          ujjwal          1
3451     आवश्यकताएं  aavshyaktaayen          1
633        अद्वितीय        advitiya          2
43208       हटिंगटन      hatingatan          1
33701   राष्ट्रवादी     rashtrawadi          1


In [36]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
import tensorflow as tf

def prepare_texts(train, dev, test, seq_len=20):
    # Drop any missing entries
    train = train.dropna()
    dev = dev.dropna()
    test = test.dropna()

    # Rename for consistency
    train = train.rename(columns={"source_script": "latin", "target_script": "hindi"})
    dev = dev.rename(columns={"source_script": "latin", "target_script": "hindi"})
    test = test.rename(columns={"source_script": "latin", "target_script": "hindi"})

    # Ensure text columns are string
    for df in [train, dev, test]:
        df["latin"] = df["latin"].astype(str)
        df["hindi"] = df["hindi"].astype(str)

    # Combine for tokenizer training
    all_latin = pd.concat([train["latin"], dev["latin"], test["latin"]])
    all_hindi = pd.concat([train["hindi"], dev["hindi"], test["hindi"]])

    latin_tokenizer = Tokenizer(char_level=True)
    hindi_tokenizer = Tokenizer(char_level=True)
    latin_tokenizer.fit_on_texts(all_latin)
    hindi_tokenizer.fit_on_texts(all_hindi)

    def encode_sequences(texts, tokenizer, maxlen):
        seq = tokenizer.texts_to_sequences(texts)
        return pad_sequences(seq, maxlen=maxlen, padding='post')

    def encode_targets(texts, tokenizer, maxlen):
        seq = tokenizer.texts_to_sequences(texts)
        start_token = tokenizer.word_index.get('', 0)
        seq = [[start_token] + s + [start_token] for s in seq]
        return pad_sequences(seq, maxlen=maxlen + 2, padding='post')

    # Input sequences
    X_train = encode_sequences(train["latin"], latin_tokenizer, seq_len)
    X_val = encode_sequences(dev["latin"], latin_tokenizer, seq_len)
    X_test = encode_sequences(test["latin"], latin_tokenizer, seq_len)

    # Target sequences
    y_train = encode_targets(train["hindi"], hindi_tokenizer, seq_len)
    y_val = encode_targets(dev["hindi"], hindi_tokenizer, seq_len)
    y_test = encode_targets(test["hindi"], hindi_tokenizer, seq_len)

    # Decoder input/output
    decoder_input_train = y_train[:, :-1]
    decoder_target_train = y_train[:, 1:]

    decoder_input_val = y_val[:, :-1]
    decoder_target_val = y_val[:, 1:]

    vocab_size = len(hindi_tokenizer.word_index) + 1

    def one_hot(sequences, vocab_size):
        return np.array([tf.keras.utils.to_categorical(s, num_classes=vocab_size) for s in sequences])

    decoder_target_train = one_hot(decoder_target_train, vocab_size)
    decoder_target_val = one_hot(decoder_target_val, vocab_size)

    return (X_train, decoder_input_train, decoder_target_train,
            X_val, decoder_input_val, decoder_target_val,
            latin_tokenizer, hindi_tokenizer)


In [37]:
def make_translit_model(src_vocab, tgt_vocab, embedding_size=50, hidden_units=128, rnn_type='gru'):
    # Encoder
    inp_encoder = Input(shape=(None,))
    embed_enc = Embedding(src_vocab, embedding_size)(inp_encoder)

    if rnn_type == 'lstm':
        encoder_outputs, state_h, state_c = LSTM(hidden_units, return_state=True)(embed_enc)
        enc_states = [state_h, state_c]
    elif rnn_type == 'gru':
        encoder_outputs, state_h = GRU(hidden_units, return_state=True)(embed_enc)
        enc_states = [state_h]
    else:
        encoder_outputs, state_h = SimpleRNN(hidden_units, return_state=True)(embed_enc)
        enc_states = [state_h]

    # Decoder
    inp_decoder = Input(shape=(None,))
    embed_dec = Embedding(tgt_vocab, embedding_size)(inp_decoder)

    if rnn_type == 'lstm':
        dec_rnn = LSTM(hidden_units, return_sequences=True, return_state=True)
        dec_outputs, _, _ = dec_rnn(embed_dec, initial_state=enc_states)
    elif rnn_type == 'gru':
        dec_rnn = GRU(hidden_units, return_sequences=True, return_state=True)
        dec_outputs, _ = dec_rnn(embed_dec, initial_state=enc_states)
    else:
        dec_rnn = SimpleRNN(hidden_units, return_sequences=True, return_state=True)
        dec_outputs, _ = dec_rnn(embed_dec, initial_state=enc_states)

    final_dense = Dense(tgt_vocab, activation='softmax')
    dec_outputs = final_dense(dec_outputs)

    model = Model([inp_encoder, inp_decoder], dec_outputs)

    # Inference models
    encoder_model = Model(inp_encoder, enc_states)

    decoder_state_inputs = [Input(shape=(hidden_units,)) for _ in enc_states]
    embedded_inf_dec = Embedding(tgt_vocab, embedding_size)(inp_decoder)

    if rnn_type == 'lstm':
        dec_inf_out, *dec_inf_states = dec_rnn(embedded_inf_dec, initial_state=decoder_state_inputs)
    else:
        dec_inf_out, *dec_inf_states = dec_rnn(embedded_inf_dec, initial_state=decoder_state_inputs)

    dec_inf_out = final_dense(dec_inf_out)
    decoder_model = Model([inp_decoder] + decoder_state_inputs, [dec_inf_out] + dec_inf_states)

    return model, encoder_model, decoder_model


In [38]:
# Unpack all three models: training model, encoder model, decoder model
model, encoder_model, decoder_model = make_translit_model(
    len(latin_token.word_index) + 1,
    len(hindi_token.word_index) + 1,
    embedding_size=64,
    hidden_units=128,
    rnn_type='lstm'  # can be 'gru' or 'rnn' as well
)

# Compile the training model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Summary of the training model
model.summary()

# Train the model
model.fit(
    [X_train, dec_in_train],
    dec_out_train,
    epochs=30,
    batch_size=64,
    validation_data=([X_val, dec_in_val], dec_out_val)
)


Model: "model_7"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_11 (InputLayer)          [(None, None)]       0           []                               
                                                                                                  
 input_12 (InputLayer)          [(None, None)]       0           []                               
                                                                                                  
 embedding_8 (Embedding)        (None, None, 64)     1728        ['input_11[0][0]']               
                                                                                                  
 embedding_9 (Embedding)        (None, None, 64)     4096        ['input_12[0][0]']               
                                                                                            

<keras.callbacks.History at 0x799c2dd0cf50>

In [39]:
def transliterate_word(input_seq, encoder_model, decoder_model, tokenizer_src, tokenizer_tgt, max_len=20):
    states = encoder_model.predict(input_seq)

    target_seq = np.zeros((1, 1))
    start_token = tokenizer_tgt.word_index.get('', 0)
    target_seq[0, 0] = start_token

    reverse_index = {v: k for k, v in tokenizer_tgt.word_index.items()}
    output_chars = []

    for _ in range(max_len):
        decoder_inputs = [target_seq] + states
        preds = decoder_model.predict(decoder_inputs)
        token_index = np.argmax(preds[0][0, -1, :])
        predicted_char = reverse_index.get(token_index, '')

        if predicted_char == '':
            break

        output_chars.append(predicted_char)
        target_seq[0, 0] = token_index
        states = preds[1:]

    return ''.join(output_chars)



In [41]:
print("\nExample predictions:")
for idx in range(5):
    input_seq = X_val[idx:idx+1]
    prediction = transliterate_word(input_seq, encoder_model, decoder_model, latin_token, hindi_token)

    source_input = val_df['latin'].iloc[idx]
    target_output = val_df['hindi'].iloc[idx]

    print(f"Input (Latin): {source_input}")
    print(f"Target (Devanagari): {target_output}")
    print(f"Predicted (Devanagari): {prediction}")
    print("-" * 50)



Example predictions:
Input (Latin): ankan
Target (Devanagari): अंकन
Predicted (Devanagari): अंणण
--------------------------------------------------
Input (Latin): angkor
Target (Devanagari): अंगकोर
Predicted (Devanagari): अंंको
--------------------------------------------------
Input (Latin): angira
Target (Devanagari): अंगिरा
Predicted (Devanagari): अंगररा
--------------------------------------------------
Input (Latin): angithi
Target (Devanagari): अंगीठी
Predicted (Devanagari): अंििथ
--------------------------------------------------
Input (Latin): angrej
Target (Devanagari): अंग्रेज
Predicted (Devanagari): अंरगज
--------------------------------------------------
