## sentence completion

In [4]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
import random

In [1]:
# Step 1: Load and Preprocess Data
file_path = r"Dataset\\en.txt"

# Open the file and read the sentences
with open(file_path, "r", encoding="utf-8") as file:
    sentences = file.readlines()

# Clean the sentences (remove newline characters and extra spaces)
sentences = [sentence.strip() for sentence in sentences]



In [3]:
len(sentences)

1223596

In [9]:
dataset = sentences[:10000]

In [10]:
len(dataset)

10000

In [11]:
# Tokenization
tokenizer = Tokenizer()
tokenizer.fit_on_texts(dataset)
vocab_size = len(tokenizer.word_index) + 1

In [12]:
# Convert text to sequences
input_sequences = []
for line in dataset:
    token_list = tokenizer.texts_to_sequences([line])[0]
    for i in range(1, len(token_list)):
        n_gram_sequence = token_list[:i+1]
        input_sequences.append(n_gram_sequence)

In [None]:
input_sequences

In [14]:
# Padding sequences
max_seq_length = max(len(seq) for seq in input_sequences)
input_sequences = pad_sequences(input_sequences, maxlen=max_seq_length, padding='pre')
X, y = input_sequences[:, :-1], input_sequences[:, -1]
y = keras.utils.to_categorical(y, num_classes=vocab_size)


In [15]:

# Step 2: Define LSTM Model
model = keras.Sequential([
    keras.layers.Embedding(vocab_size, 128, input_length=max_seq_length - 1),
    keras.layers.LSTM(256, return_sequences=True),
    keras.layers.LSTM(256),
    keras.layers.Dense(128, activation='relu'),
    keras.layers.Dense(vocab_size, activation='softmax')
])

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])




In [17]:
# Step 3: Train Model
model.fit(X, y, epochs=1, verbose=1)

[1m2183/2183[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1499s[0m 687ms/step - accuracy: 0.0519 - loss: 6.7083


<keras.src.callbacks.history.History at 0x24103228800>

In [18]:
# Step 4: Generate Sentence Completions
def generate_text(seed_text, next_words=10):
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_seq_length - 1, padding='pre')
        predicted = np.argmax(model.predict(token_list), axis=-1)
        output_word = ""
        for word, index in tokenizer.word_index.items():
            if index == predicted:
                output_word = word
                break
        seed_text += " " + output_word
    return seed_text

In [None]:
# Example Completion
print(generate_text("where is  ", 2 ))


Where is the meeting


### pytorch

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchtext
from torchtext.data.utils import get_tokenizer
from collections import Counter
from torchtext.vocab import Vocab

# Load WikiText-2 dataset
from torchtext.datasets import WikiText2

train_iter = WikiText2(split='train')
tokenizer = get_tokenizer('basic_english')

# Tokenize the dataset
counter = Counter()
for line in train_iter:
    counter.update(tokenizer(line))

# Build vocabulary
vocab = Vocab(counter, min_freq=2, specials=['<unk>', '<pad>', '<bos>', '<eos>'])
vocab.set_default_index(vocab['<unk>'])

print(f"Vocabulary Size: {len(vocab)}")


In [None]:
def text_to_tensor(text_iter, vocab, tokenizer, seq_length=30):
    sequences = []
    for line in text_iter:
        tokens = tokenizer(line)
        indices = [vocab[token] for token in tokens]
        sequences.extend(indices)
    
    # Convert list to PyTorch tensor
    text_tensor = torch.tensor(sequences, dtype=torch.long)
    
    # Create input-output pairs
    inputs = []
    targets = []
    for i in range(len(text_tensor) - seq_length):
        inputs.append(text_tensor[i:i+seq_length])
        targets.append(text_tensor[i+1:i+seq_length+1])

    return torch.stack(inputs), torch.stack(targets)

train_iter = WikiText2(split='train')
X_train, y_train = text_to_tensor(train_iter, vocab, tokenizer)
print(f"Training Data Shape: {X_train.shape}, {y_train.shape}")


In [None]:
class LSTMLanguageModel(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size, num_layers):
        super(LSTMLanguageModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward(self, x, hidden):
        x = self.embedding(x)
        output, hidden = self.lstm(x, hidden)
        output = self.fc(output)
        return output, hidden

# Define model parameters
vocab_size = len(vocab)
embed_size = 128
hidden_size = 256
num_layers = 2

# Instantiate model
model = LSTMLanguageModel(vocab_size, embed_size, hidden_size, num_layers)


In [None]:
# Training setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
num_epochs = 5
batch_size = 64
seq_length = 30
hidden = None

for epoch in range(num_epochs):
    total_loss = 0
    for i in range(0, X_train.size(0), batch_size):
        inputs = X_train[i:i+batch_size].to(device)
        targets = y_train[i:i+batch_size].to(device)

        optimizer.zero_grad()
        output, hidden = model(inputs, hidden)

        loss = criterion(output.view(-1, vocab_size), targets.view(-1))
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
    
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {total_loss/len(X_train):.4f}")


In [2]:
import random

def generate_text(prompt, model, vocab, tokenizer, seq_length=10, max_words=50):
    model.eval()
    
    words = tokenizer(prompt)
    indices = [vocab[token] for token in words]
    input_seq = torch.tensor(indices, dtype=torch.long).unsqueeze(0).to(device)

    generated_words = words

    hidden = None
    for _ in range(max_words):
        output, hidden = model(input_seq, hidden)
        next_word_index = output.argmax(dim=-1)[:, -1].item()
        next_word = vocab.lookup_token(next_word_index)
        
        generated_words.append(next_word)
        input_seq = torch.cat([input_seq[:, 1:], torch.tensor([[next_word_index]], dtype=torch.long).to(device)], dim=1)

    return " ".join(generated_words)

# Example Usage
prompt = "The future of AI is"
generated_text = generate_text(prompt, model, vocab, tokenizer)
print("Generated Text:", generated_text)


NameError: name 'vocab' is not defined

# language translate

### torch

In [43]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.nn.utils.rnn import pad_sequence
from collections import Counter
import numpy as np


file_path = r"Dataset\\en.txt"
with open(file_path, "r", encoding="utf-8") as file:
    sentences = file.readlines()
english_sentences = [sentence.strip() for sentence in sentences]


file_path = r"Dataset\\ta.txt"

with open(file_path, "r", encoding="utf-8") as file:
    sentences = file.readlines()
tamil_sentences = [sentence.strip() for sentence in sentences]


# Tokenization function
def tokenize(sentences):
    tokenized = [sentence.lower().replace(",", "").replace("?", "").split() for sentence in sentences]
    return tokenized

# Create vocabularies
def build_vocab(tokenized_sentences):
    counter = Counter()
    for sentence in tokenized_sentences:
        counter.update(sentence)
    vocab = {word: i + 2 for i, (word, _) in enumerate(counter.most_common())}  # Start index from 2
    vocab["<pad>"] = 0
    vocab["<unk>"] = 1
    return vocab

# Convert sentences to numerical sequences
def encode_sentences(tokenized_sentences, vocab):
    return [[vocab.get(word, vocab["<unk>"]) for word in sentence] for sentence in tokenized_sentences]

# Tokenization
eng_tokenized = tokenize(english_sentences)
tam_tokenized = tokenize(tamil_sentences)

# Build vocabulary
eng_vocab = build_vocab(eng_tokenized)
tam_vocab = build_vocab(tam_tokenized)

# Convert text to sequences
eng_sequences = encode_sentences(eng_tokenized, eng_vocab)
tam_sequences = encode_sentences(tam_tokenized, tam_vocab)

# Pad sequences
eng_sequences = pad_sequence([torch.tensor(seq) for seq in eng_sequences], batch_first=True, padding_value=0)
tam_sequences = pad_sequence([torch.tensor(seq) for seq in tam_sequences], batch_first=True, padding_value=0)


In [44]:

# Define the LSTM Encoder-Decoder Model
class Seq2SeqModel(nn.Module):
    def __init__(self, input_dim, output_dim, embed_dim=128, hidden_dim=256):
        super(Seq2SeqModel, self).__init__()
        
        # Encoder
        self.encoder_embedding = nn.Embedding(input_dim, embed_dim, padding_idx=0)
        self.encoder_lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True)
        
        # Decoder
        self.decoder_embedding = nn.Embedding(output_dim, embed_dim, padding_idx=0)
        self.decoder_lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
        
    def forward(self, encoder_input, decoder_input):
        # Encoder
        enc_emb = self.encoder_embedding(encoder_input)
        _, (hidden, cell) = self.encoder_lstm(enc_emb)

        # Decoder
        dec_emb = self.decoder_embedding(decoder_input)
        decoder_output, _ = self.decoder_lstm(dec_emb, (hidden, cell))
        output = self.fc(decoder_output)  # Predict next words
        
        return output


In [45]:

# Model setup
input_dim = len(eng_vocab)
output_dim = len(tam_vocab)
model = Seq2SeqModel(input_dim, output_dim)

# Loss and optimizer
criterion = nn.CrossEntropyLoss(ignore_index=0)  # Ignore padding
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Prepare training data
X_train = eng_sequences
y_train = tam_sequences

# Shift decoder input (teacher forcing)
decoder_input = y_train[:, :-1]  # Remove last token
decoder_target = y_train[:, 1:]  # Remove first token


In [46]:

# Training loop
num_epochs = 1000
batch_size = 4

for epoch in range(num_epochs):
    optimizer.zero_grad()
    
    # Forward pass
    outputs = model(X_train, decoder_input)
    loss = criterion(outputs.view(-1, output_dim), decoder_target.reshape(-1))
    
    # Backpropagation
    loss.backward()
    optimizer.step()
    
    if epoch % 10 == 0:
        print(f"Epoch [{epoch}/{num_epochs}], Loss: {loss.item():.4f}")



Epoch [0/1000], Loss: 5.9562
Epoch [10/1000], Loss: 5.4402
Epoch [20/1000], Loss: 4.1186
Epoch [30/1000], Loss: 2.4821
Epoch [40/1000], Loss: 1.1143
Epoch [50/1000], Loss: 0.4310
Epoch [60/1000], Loss: 0.1852
Epoch [70/1000], Loss: 0.0945
Epoch [80/1000], Loss: 0.0576
Epoch [90/1000], Loss: 0.0402
Epoch [100/1000], Loss: 0.0308
Epoch [110/1000], Loss: 0.0251
Epoch [120/1000], Loss: 0.0212
Epoch [130/1000], Loss: 0.0183
Epoch [140/1000], Loss: 0.0161
Epoch [150/1000], Loss: 0.0143
Epoch [160/1000], Loss: 0.0129
Epoch [170/1000], Loss: 0.0116
Epoch [180/1000], Loss: 0.0106
Epoch [190/1000], Loss: 0.0097
Epoch [200/1000], Loss: 0.0089
Epoch [210/1000], Loss: 0.0082
Epoch [220/1000], Loss: 0.0076
Epoch [230/1000], Loss: 0.0071
Epoch [240/1000], Loss: 0.0066
Epoch [250/1000], Loss: 0.0062
Epoch [260/1000], Loss: 0.0058
Epoch [270/1000], Loss: 0.0055
Epoch [280/1000], Loss: 0.0052
Epoch [290/1000], Loss: 0.0049
Epoch [300/1000], Loss: 0.0046
Epoch [310/1000], Loss: 0.0044
Epoch [320/1000], L

In [47]:

# Translation function
def translate(sentence):
    model.eval()
    with torch.no_grad():
        sequence = encode_sentences(tokenize([sentence]), eng_vocab)
        sequence = pad_sequence([torch.tensor(seq) for seq in sequence], batch_first=True, padding_value=0)
        
        decoder_input = torch.zeros((1, sequence.shape[1]), dtype=torch.long)  # Empty decoder input
        states_value = model(sequence, decoder_input)
        predicted_seq = torch.argmax(states_value, dim=-1).squeeze(0).tolist()
        
        output_sentence = ' '.join([word for word, index in tam_vocab.items() if index in predicted_seq])
        return output_sentence


In [48]:
# Test Translation
print(translate("I like that movie."))

நான்


### tensorflow

In [1]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer 
from tensorflow.keras.preprocessing.sequence import pad_sequences 
import numpy as np


english_sentences = [
    "Hello, how are you?",
    "I love deep learning.",
    "What is your name?",
    "Where do you live?"
]

tamil_sentences = [
    "வணக்கம், நீங்கள் எப்படி இருக்கிறீர்கள்?",
    "எனக்கு ஆழ்ந்த கற்றல் பிடிக்கும்.",
    "உங்கள் பெயர் என்ன?",
    "நீங்கள் எங்கு வாழ்கிறீர்கள்?"
]

# Tokenize English sentences
eng_tokenizer = Tokenizer()
eng_tokenizer.fit_on_texts(english_sentences)
eng_vocab_size = len(eng_tokenizer.word_index) + 1
eng_sequences = eng_tokenizer.texts_to_sequences(english_sentences)

# Tokenize Tamil sentences
tam_tokenizer = Tokenizer()
tam_tokenizer.fit_on_texts(tamil_sentences)
tam_vocab_size = len(tam_tokenizer.word_index) + 1
tam_sequences = tam_tokenizer.texts_to_sequences(tamil_sentences)

# Pad sequences to ensure uniform length
max_length = max(max(len(seq) for seq in eng_sequences), max(len(seq) for seq in tam_sequences))
eng_sequences = pad_sequences(eng_sequences, maxlen=max_length, padding='post')
tam_sequences = pad_sequences(tam_sequences, maxlen=max_length, padding='post')

# Split input-output pairs
X_train = eng_sequences
y_train = tam_sequences

# Define Seq2Seq Model (Encoder-Decoder)
embedding_dim = 128
units = 256

# Encoder
encoder_inputs = keras.layers.Input(shape=(max_length,))
enc_emb = keras.layers.Embedding(eng_vocab_size, embedding_dim, mask_zero=True)(encoder_inputs)
encoder_lstm = keras.layers.LSTM(units, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(enc_emb)
encoder_states = [state_h, state_c]

# Decoder
decoder_inputs = keras.layers.Input(shape=(max_length,))
dec_emb = keras.layers.Embedding(tam_vocab_size, embedding_dim, mask_zero=True)(decoder_inputs)
decoder_lstm = keras.layers.LSTM(units, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(dec_emb, initial_state=encoder_states)
decoder_dense = keras.layers.Dense(tam_vocab_size, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Compile Model
model = keras.Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Shift decoder input (y_train) correctly
decoder_input_data = y_train[:, :-1]  # Remove last token
decoder_target_data = y_train[:, 1:]  # Remove first token

# Ensure shapes match
print("Encoder Input Shape:", X_train.shape)  # (num_samples, max_length)
print("Decoder Input Shape:", decoder_input_data.shape)  # (num_samples, max_length - 1)
print("Decoder Target Shape:", decoder_target_data.shape)  # (num_samples, max_length - 1)

# Fix shape mismatch by adjusting max_length
decoder_input_data = pad_sequences(decoder_input_data, maxlen=max_length, padding='post')
decoder_target_data = pad_sequences(decoder_target_data, maxlen=max_length, padding='post')

# Train Model
model.fit([X_train, decoder_input_data], decoder_target_data, batch_size=64, epochs=100)




Encoder Input Shape: (4, 4)
Decoder Input Shape: (4, 3)
Decoder Target Shape: (4, 3)
Epoch 1/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2s/step - accuracy: 0.0625 - loss: 2.6393
Epoch 2/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step - accuracy: 0.6250 - loss: 2.6221
Epoch 3/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step - accuracy: 0.6875 - loss: 2.6043
Epoch 4/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step - accuracy: 0.6875 - loss: 2.5854
Epoch 5/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step - accuracy: 0.7500 - loss: 2.5649
Epoch 6/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 48ms/step - accuracy: 0.7500 - loss: 2.5420
Epoch 7/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 48ms/step - accuracy: 0.7500 - loss: 2.5160
Epoch 8/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 48ms/step - accur

<keras.src.callbacks.history.History at 0x1f9fdce2db0>

In [2]:
# Function to translate English → Tamil
def translate(sentence):
    sequence = eng_tokenizer.texts_to_sequences([sentence])
    sequence = pad_sequences(sequence, maxlen=max_length, padding='post')
    states_value = model.predict([sequence, np.zeros((1, max_length))])
    predicted_seq = np.argmax(states_value, axis=-1)[0]
    output_sentence = ' '.join([word for word, index in tam_tokenizer.word_index.items() if index in predicted_seq])
    return output_sentence

In [9]:
# Test Translation
print(translate("what is  name"))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
பெயர் என்ன
