In [1]:
import pandas as pd
import numpy as np

Reading CSV File

In [2]:
df = pd.read_csv("./Final_IC.csv")
df.head(5)

Unnamed: 0,article_id,article_desc
0,Article 1 of Indian Constitution,"Name and territory of the Union India, that is..."
1,Article 2 of Indian Constitution,Admission or establishment of new States: Parl...
2,Article 2A of Indian Constitution,Sikkim to be associated with the Union Rep by ...
3,Article 3 of Indian Constitution,Formation of new States and alteration of area...
4,Article 4 of Indian Constitution,Laws made under Articles 2 and 3 to provide fo...


In [3]:
df['ques'] = 'What is ' + df['article_id'].astype(str)

Preprocessing of the text

In [4]:
df['ans'] = df['article_id'].astype(str) + ' states that ' + df['article_desc']
df.drop('article_id',inplace=True,axis=1)
df.drop('article_desc',inplace=True,axis = 1)

In [5]:
df.head(5)

Unnamed: 0,ques,ans
0,What is Article 1 of Indian Constitution,Article 1 of Indian Constitution states that N...
1,What is Article 2 of Indian Constitution,Article 2 of Indian Constitution states that A...
2,What is Article 2A of Indian Constitution,Article 2A of Indian Constitution states that ...
3,What is Article 3 of Indian Constitution,Article 3 of Indian Constitution states that F...
4,What is Article 4 of Indian Constitution,Article 4 of Indian Constitution states that L...


In [6]:
import tensorflow as tf
Tokenizer = tf.keras.preprocessing.text.Tokenizer
pad_sequences = tf.keras.preprocessing.sequence.pad_sequences
Sequential = tf.keras.models.Sequential
Embedding = tf.keras.layers.Embedding
SimpleRNN = tf.keras.layers.SimpleRNN
Dense = tf.keras.layers.Dense
LSTM = tf.keras.layers.LSTM
Dropout = tf.keras.layers.Dropout

In [7]:
questions = df['ques'].tolist()
answers = df['ans'].tolist()

In [8]:
tokenizer = Tokenizer(char_level=True, lower=True)
tokenizer.fit_on_texts(answers)

# Convert text to sequences
sequences = tokenizer.texts_to_sequences(answers)[0]

# Prepare input and target sequences
input_sequences = []
output_sequences = []

sequence_length = 100
for i in range(len(sequences) - sequence_length):
    input_sequences.append(sequences[i:i + sequence_length])
    output_sequences.append(sequences[i + sequence_length])

input_sequences = np.array(input_sequences)
output_sequences = np.array(output_sequences)

vocab_size = len(tokenizer.word_index) + 1

In [9]:
model = Sequential([
    Embedding(vocab_size, 32, input_length=sequence_length),
    LSTM(128, return_sequences=True, dropout=0.2, recurrent_dropout=0.2),
    LSTM(128, dropout=0.2, recurrent_dropout=0.2),
    Dense(vocab_size, activation="softmax"),
])

model.compile(loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["accuracy"])
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 100, 32)           1408      
                                                                 
 lstm (LSTM)                 (None, 100, 128)          82432     
                                                                 
 lstm_1 (LSTM)               (None, 128)               131584    
                                                                 
 dense (Dense)               (None, 44)                5676      
                                                                 
Total params: 221100 (863.67 KB)
Trainable params: 221100 (863.67 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [16]:
epochs = 50
batch_size = 32
model.fit(input_sequences, output_sequences, epochs=epochs, batch_size=batch_size)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.src.callbacks.History at 0x1ec1c447dc0>

In [17]:
def generate_text(seed_text, model, tokenizer, sequence_length, num_chars_to_generate):
    generated_text = seed_text

    for _ in range(num_chars_to_generate):
        token_list = tokenizer.texts_to_sequences([generated_text])
        token_list = pad_sequences(token_list, maxlen=sequence_length, padding="pre")
        predicted_probs = model.predict(token_list, verbose=0)
        predicted_token = np.argmax(predicted_probs, axis=-1)[0]  # Get the index of the predicted token

        output_word = ""
        for word, index in tokenizer.word_index.items():
            if index == predicted_token:
                output_word = word
                break

        generated_text += output_word

    return generated_text


In [18]:
seed_text = "Article 2 of Indian Constitution"

generated_text = generate_text(seed_text, model, tokenizer, sequence_length, num_chars_to_generate=150)
print(generated_text)

Article 2 of Indian Constitution territories the territories the territories the territories the territories the territories the territories the territories the territories the terri
