In [1]:
! pip install kaggle
! mkdir ~/.kaggle
! cp kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json

mkdir: cannot create directory ‘/root/.kaggle’: File exists


In [2]:
! kaggle datasets download mathurinache/samanantar

samanantar.zip: Skipping, found more recently modified local copy (use --force to force download)


In [3]:
! unzip samanantar.zip

Archive:  samanantar.zip
replace final_data/en-as/train.as? [y]es, [n]o, [A]ll, [N]one, [r]ename: 

In [7]:
!pip install torch torchtext torchmetrics transformers datasets spacy tqdm




In [4]:
# Assuming you have downloaded and extracted the Samantar dataset
# You can modify the path accordingly

english_data_path = '/content/final_data/en-hi/train.en'
hindi_data_path = '/content/final_data/en-hi/train.hi'

def load_data(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        data = file.readlines()
    return [line.strip() for line in data]

# Load English and Hindi sentences
english_sentences = load_data(english_data_path)
hindi_sentences = load_data(hindi_data_path)

# Display a few examples
for i in range(5):
    print(f"English: {english_sentences[i]}")
    print(f"Hindi: {hindi_sentences[i]}")
    print("-" * 50)


English: In reply, Pakistan got off to a solid start.
Hindi: जिसके जवाब में पाक ने अच्छी शुरुआत की थी.
--------------------------------------------------
English: The European Union has seven principal decision-making bodies, its institutions: the European Parliament, the European Council, the Council of the European Union, the European Commission, the Court of Justice of the European Union, the European Central Bank and the European Court of Auditors.
Hindi: यूरोपीय संघ के महत्वपूर्ण संस्थानों में यूरोपियन कमीशन, यूरोपीय संसद, यूरोपीय संघ परिषद, यूरोपीय न्यायलय एवं यूरोपियन सेंट्रल बैंक इत्यादि शामिल हैं।
--------------------------------------------------
English: The Congress leader represents Sivaganga Lok Sabha segment from Tamil Nadu.
Hindi: कांग्रेस नेता तमिलनाडु से शिवगंगा लोकसभा क्षेत्र का प्रतिनिधित्व करते हैं.
--------------------------------------------------
English: Prompt the user about connection attempts
Hindi: संबंधन प्रयास के बारे में उपयोक्ता को प्रांप्ट करें
-------

In [5]:
type(english_sentences)

list

In [8]:
!pip install --upgrade tensorflow
import tensorflow as tf
tf.config.run_functions_eagerly(True)



In [None]:
import tensorflow as tf
import numpy as np
import string
import re

def preprocess(text):
    text = ''.join(ch for ch in text if ch not in string.punctuation)
    text = text.lower()
    text = re.sub(r'\d', '', text)
    text = re.sub(r'\s+', ' ', text)  # Remove extra spaces
    text = text.strip()
    return text

total_sentences = 2500  # Total sentences - reducing from the original
maxlen = 10  # Max length of sentences for both English and Hindi

en_data = []
hi_data = []

cnt = 0

for (en, hi) in zip(english_sentences, hindi_sentences):
    l = min(len(en.split()), len(hi.split()))
    if l <= maxlen:
        en_data.append(preprocess(en))
        hi_data.append(preprocess(hi))
        cnt += 1
    if cnt == total_sentences:
        break

# Tokenize the texts and convert to sequences
en_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='', oov_token='<OOV>', lower=False)
en_tokenizer.fit_on_texts(en_data)
en_sequences = en_tokenizer.texts_to_sequences(en_data)

hi_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='', oov_token='<OOV>', lower=False)
hi_tokenizer.fit_on_texts(hi_data)
hi_sequences = hi_tokenizer.texts_to_sequences(hi_data)

# Padding sequences
en_sequences = tf.keras.preprocessing.sequence.pad_sequences(en_sequences, padding='post')
hi_sequences = tf.keras.preprocessing.sequence.pad_sequences(hi_sequences, padding='post')

# Calculate vocabulary sizes
english_vocab_size = len(en_tokenizer.word_index) + 1
hindi_vocab_size = len(hi_tokenizer.word_index) + 1

batch_size = 32

# Prepare decoder data
decoder_inputs = hi_sequences[:, :-1]  # Remove the last token
decoder_outputs = hi_sequences[:, 1:]   # Remove the first token

# One-hot encode the decoder outputs
decoder_outputs = tf.keras.utils.to_categorical(decoder_outputs, num_classes=hindi_vocab_size)

# Choose a divisible number for the number of samples
num_samples = (len(en_sequences) // batch_size) * batch_size

# Use the chosen number of samples for training
en_sequences = en_sequences[:num_samples]
decoder_inputs = decoder_inputs[:num_samples]
decoder_outputs = decoder_outputs[:num_samples]

# Define Transformer model
d_model = 256  # dimension of the internal vector representations (h, c, and embedding vectors)
num_layers = 4
num_heads = 8
dff = 512

# Encoder layer
class EncoderLayer(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads, dff, rate=0.1):
        super(EncoderLayer, self).__init__()

        self.mha = tf.keras.layers.MultiHeadAttention(num_heads=num_heads, key_dim=d_model // num_heads)
        self.ffn = tf.keras.Sequential([
            tf.keras.layers.Dense(dff, activation='relu'),
            tf.keras.layers.Dense(d_model)
        ])

        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

        self.dropout1 = tf.keras.layers.Dropout(rate)
        self.dropout2 = tf.keras.layers.Dropout(rate)

    def call(self, x, training):
        attn_output = self.mha(x, x, x)  # self-attention
        out1 = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(x + out1)  # residual connection

        ffn_output = self.ffn(out1)  # feed-forward network
        out2 = self.dropout2(ffn_output, training=training)
        out2 = self.layernorm2(out1 + out2)  # residual connection

        return out2

class Encoder(tf.keras.layers.Layer):
    def __init__(self, num_layers, d_model, num_heads, dff, input_vocab_size, maximum_position_encoding, rate=0.1):
        super(Encoder, self).__init__()

        self.d_model = d_model
        self.num_layers = num_layers

        self.embedding = tf.keras.layers.Embedding(input_vocab_size, d_model)
        self.pos_encoding = positional_encoding(maximum_position_encoding, d_model)

        self.enc_layers = [EncoderLayer(d_model, num_heads, dff, rate) for _ in range(num_layers)]

        self.dropout = tf.keras.layers.Dropout(rate)

    def call(self, x, training):
        seq_len = tf.shape(x)[1]

        # Adding embedding and position encoding
        x = self.embedding(x)
        x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        x += self.pos_encoding[:, :seq_len, :]

        x = self.dropout(x, training=training)

        for i in range(self.num_layers):
            x = self.enc_layers[i](x, training)

        return x

def positional_encoding(position, d_model):
    angle_rads = get_angles(np.arange(position)[:, np.newaxis], np.arange(d_model)[np.newaxis, :], d_model)

    # apply sin to even indices in the array; 2i
    angle_rads[:, 0::2] = np.sin(angle_rads[:, 0::2])

    # apply cos to odd indices in the array; 2i+1
    angle_rads[:, 1::2] = np.cos(angle_rads[:, 1::2])

    pos_encoding = angle_rads[np.newaxis, ...]

    return tf.cast(pos_encoding, dtype=tf.float32)

def get_angles(pos, i, d_model):
    angle_rates = 1 / np.power(10000, (2 * (i // 2)) / np.float32(d_model))
    return pos * angle_rates

# Create Transformer model
encoder = Encoder(num_layers, d_model, num_heads, dff, english_vocab_size, maximum_position_encoding=10000)

# Define inputs
inputs = tf.keras.layers.Input(shape=(None,))
x = encoder(inputs, training=True)

# Flatten and Dense layer for the output
x = tf.keras.layers.GlobalAveragePooling1D()(x)

outputs = tf.keras.layers.Dense(hindi_vocab_size, activation='softmax')(x)


# Build model
model = tf.keras.models.Model(inputs=inputs, outputs=outputs)

# Compile model
loss = tf.keras.losses.SparseCategoricalCrossentropy()
model.compile(optimizer='adam', loss=loss, metrics=['accuracy'])

# Print model summary
model.summary()

# Train the model
epochs = 10
validation_split = 0.2

model.fit(en_sequences, decoder_outputs, epochs=epochs, validation_split=validation_split)


Model: "model_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_4 (InputLayer)        [(None, None)]            0         
                                                                 
 encoder_3 (Encoder)         (None, None, 256)         3239680   
                                                                 
 global_average_pooling1d_3  (None, 256)               0         
  (GlobalAveragePooling1D)                                       
                                                                 
 dense_36 (Dense)            (None, 5327)              1369039   
                                                                 
Total params: 4608719 (17.58 MB)
Trainable params: 4608719 (17.58 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
