In [157]:
import pandas as pd
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense
from tensorflow.keras.models import Model

# Read the COBOL-code / summary pairs into a DataFrame
dataset_csv = "C:/Users/Jabasingh Daniel/Desktop/EGDK/Dataset/datasets/Cbl_sumry.csv"
df = pd.read_csv(dataset_csv)

In [158]:
# One tokenizer per column. filters='' disables character stripping, so
# punctuation stays inside the tokens.
tokenizer_cobol = Tokenizer(filters='')
tokenizer_summary = Tokenizer(filters='')
tokenizer_cobol.fit_on_texts(df['cobol_code'])
tokenizer_summary.fit_on_texts(df['summary'])

# Vocabulary sizes: one extra slot because id 0 is used for padding
vocab_size_cobol = 1 + len(tokenizer_cobol.word_index)
vocab_size_summary = 1 + len(tokenizer_summary.word_index)

In [159]:
#tokenizer_cobol.index_word

In [160]:
#tokenizer_summary.index_word

In [161]:
# Encode both text columns as lists of integer token ids
cobol_ids = tokenizer_cobol.texts_to_sequences(df['cobol_code'])
summary_ids = tokenizer_summary.texts_to_sequences(df['summary'])

# The longest sequence on each side fixes the padded width
max_len_cobol = max(map(len, cobol_ids))
max_len_summary = max(map(len, summary_ids))

# Right-pad with zeros into fixed-width integer arrays
X = pad_sequences(cobol_ids, maxlen=max_len_cobol, padding='post')
Y = pad_sequences(summary_ids, maxlen=max_len_summary, padding='post')

In [162]:
# Shift the target sequences for training:
# the decoder is fed summary tokens 0..T-2 and is trained to emit tokens 1..T-1.
# NOTE(review): the target slice never contains summary token 0, so the first
# word of a summary is only ever a decoder input, never a prediction target --
# unless the CSV text itself carries start/end markers; confirm.
Y_input = Y[:, :-1]
Y_output = Y[:, 1:]

# Define the Seq2Seq model
# latent_dim is used both as the embedding width and as the LSTM state size.
latent_dim = 512

# Encoder: padded COBOL token ids -> embeddings -> LSTM; only the final
# hidden/cell states are kept (encoder_outputs is not used further).
# mask_zero=True makes the layers ignore id 0, i.e. the post-padding.
encoder_inputs = Input(shape=(max_len_cobol,))
encoder_embedding = Embedding(vocab_size_cobol, latent_dim, mask_zero=True)(encoder_inputs)
encoder_lstm = LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(encoder_embedding)
encoder_states = [state_h, state_c]

# Decoder: takes the shifted summary (one token shorter than the padded
# summary width), starts from the encoder's final states and returns one
# output vector per time step.
decoder_inputs = Input(shape=(max_len_summary-1,))
decoder_embedding = Embedding(vocab_size_summary, latent_dim, mask_zero=True)(decoder_inputs)
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)
# Per-step softmax over the summary vocabulary.
decoder_dense = Dense(vocab_size_summary, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Training model: [COBOL ids, shifted summary ids] -> per-step token probabilities.
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

In [55]:
# Compile with a sparse loss so integer token ids serve directly as targets
model.compile(
    optimizer='rmsprop',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

# Teacher forcing: the decoder is fed every summary token but the last and is
# trained to emit every token but the first (targets get a trailing unit axis).
decoder_feed = Y[:, :-1]
next_token_targets = Y[:, 1:, np.newaxis]
model.fit(
    [X, decoder_feed],
    next_token_targets,
    batch_size=10,
    epochs=100,
    validation_split=0.2,
)


Epoch 1/100
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 986ms/step - accuracy: 0.0837 - loss: 5.1506 - val_accuracy: 0.5556 - val_loss: 5.1335
Epoch 2/100
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 483ms/step - accuracy: 0.7043 - loss: 5.0914 - val_accuracy: 0.5556 - val_loss: 5.0535
Epoch 3/100
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 483ms/step - accuracy: 0.7046 - loss: 4.9118 - val_accuracy: 0.0313 - val_loss: 5.0209
Epoch 4/100
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 475ms/step - accuracy: 0.2097 - loss: 4.6802 - val_accuracy: 0.5556 - val_loss: 5.4768
Epoch 5/100
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 476ms/step - accuracy: 0.7080 - loss: 4.5576 - val_accuracy: 0.0256 - val_loss: 5.2646
Epoch 6/100
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 473ms/step - accuracy: 0.2163 - loss: 4.2314 - val_accuracy: 0.5584 - val_loss: 5.4687
Epoch 7/100
[1m4/4[0m [32m━━━━

<keras.src.callbacks.history.History at 0x1a5eb64e250>

In [56]:
# Persist the trained network; the file name is relative to the working directory
model_filename = 'seq2seq_model_c2s.keras'
model.save(model_filename)

In [163]:
#testing

import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import load_model

# Evaluation data is encoded with the already-fitted tokenizers and padded to
# the same widths as the training arrays.
# NOTE(review): this reads the same CSV path that the notebook loads for
# training, so the score computed from it is not a held-out estimate.
df_test = pd.read_csv("C:/Users/Jabasingh Daniel/Desktop/EGDK/Dataset/datasets/Cbl_sumry.csv")
X_test = pad_sequences(
    tokenizer_cobol.texts_to_sequences(df_test['cobol_code']),
    maxlen=max_len_cobol,
    padding='post',
)
Y_test = pad_sequences(
    tokenizer_summary.texts_to_sequences(df_test['summary']),
    maxlen=max_len_summary,
    padding='post',
)

# Restore the trained network from its saved .keras file
model = load_model("C:/Users/Jabasingh Daniel/Desktop/EGDK/modeling/seq2seq_model_c2s.keras")

# Function to calculate sequence accuracy
def sequence_accuracy(y_true, y_pred):
    """Return the fraction of sequences that are predicted exactly.

    For each pair, zeros (padding) are removed from the true sequence and the
    predicted sequence is cut to that same length before the two are compared
    element-wise.

    Args:
        y_true: iterable of true token-id sequences (arrays or lists), 0 = padding.
        y_pred: iterable of predicted token-id sequences, paired with ``y_true``.

    Returns:
        float in [0, 1]; 0.0 when there are no sequences to compare.
    """
    correct = 0
    total = 0
    for true_seq, pred_seq in zip(y_true, y_pred):
        # Coerce so that plain Python lists work with boolean-mask indexing.
        true_seq = np.asarray(true_seq)
        pred_seq = np.asarray(pred_seq)
        true_tokens = true_seq[true_seq != 0]       # drop padding
        pred_tokens = pred_seq[:len(true_tokens)]   # compare only the real length
        if np.array_equal(true_tokens, pred_tokens):
            correct += 1
        total += 1
    # An empty batch has no defined accuracy; report 0.0 instead of dividing by zero.
    return correct / total if total else 0.0

# Teacher-forced forward pass: the decoder receives the true summary minus its
# last token and is scored against the true summary minus its first token.
decoder_feed_test = Y_test[:, :-1]
expected_tokens = Y_test[:, 1:]
Y_pred = model.predict([X_test, decoder_feed_test])

# Most probable token id at every time step
Y_pred_sequences = Y_pred.argmax(axis=-1)

# Share of summaries reproduced exactly (padding ignored)
accuracy = sequence_accuracy(expected_tokens, Y_pred_sequences)
print(f'Sequence Accuracy: {accuracy * 100:.2f}%')


[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 662ms/step
Sequence Accuracy: 46.51%


In [164]:
from tensorflow.keras.models import load_model

# Reload the trained seq2seq network from disk for interactive inference
trained_model_path = "C:/Users/Jabasingh Daniel/Desktop/EGDK/modeling/seq2seq_model_c2s.keras"
model = load_model(trained_model_path)

def preprocess_input(cobol_code, tokenizer_cobol, max_len_cobol):
    """Encode a single COBOL source string as a padded token-id array.

    Args:
        cobol_code: the COBOL text to encode.
        tokenizer_cobol: fitted tokenizer providing ``texts_to_sequences``.
        max_len_cobol: width to pad the sequence to.

    Returns:
        The result of ``pad_sequences`` for the one-element batch, zero-padded
        on the right to ``max_len_cobol``.
    """
    return pad_sequences(
        tokenizer_cobol.texts_to_sequences([cobol_code]),
        maxlen=max_len_cobol,
        padding='post',
    )


In [165]:
def predict_summary(cobol_code, model, tokenizer_cobol, tokenizer_summary, max_len_cobol, max_len_summary):
    """Greedily decode a summary for one COBOL snippet.

    The full model is re-run once per output position: at step ``i`` the most
    probable token at output position ``i`` is taken and written back into
    decoder position ``i`` for the following steps. Decoding stops at the
    first predicted padding id (0) or after ``max_len_summary - 1`` steps.

    NOTE(review): in this notebook's training call, decoder position ``i``
    holds summary token ``i`` and the target at position ``i`` is token
    ``i + 1``; the summaries get no explicit start marker, so the first word
    of a summary is never a prediction target. Confirm whether start/end
    markers should be added and the model retrained.

    Args:
        cobol_code: COBOL text to summarize.
        model: trained model whose ``predict`` takes ``[encoder_ids, decoder_ids]``.
        tokenizer_cobol: fitted tokenizer for the COBOL side.
        tokenizer_summary: fitted tokenizer for the summary side (``index_word`` is used).
        max_len_cobol: padded encoder width.
        max_len_summary: padded summary width; the decoder width is one less.

    Returns:
        The predicted words joined by single spaces ('' if nothing was predicted).
    """
    # Preprocess the input COBOL code
    input_seq = preprocess_input(cobol_code, tokenizer_cobol, max_len_cobol)

    # Integer buffer: ids are later used as dictionary keys, and pad_sequences
    # also yields integer arrays for the encoder side.
    decoder_input = np.zeros((1, max_len_summary - 1), dtype='int32')

    for i in range(max_len_summary - 1):
        # verbose=0 keeps the per-step Keras progress bar out of the cell output
        output_tokens = model.predict([input_seq, decoder_input], verbose=0)
        sampled_token_index = int(np.argmax(output_tokens[0, i, :]))

        # Id 0 is padding: no more meaningful tokens, leave the rest of the buffer zero
        if sampled_token_index == 0:
            break
        decoder_input[0, i] = sampled_token_index

    # Convert the non-padding ids back to words; unknown ids map to ''
    predicted_summary = [
        tokenizer_summary.index_word.get(int(token), '')
        for token in decoder_input[0]
        if token != 0
    ]

    return ' '.join(predicted_summary)


In [178]:
# Sample COBOL input to summarize
cobol_code_example = "PRODUCT-ID"

# Decode a summary for the sample and show it
summary = predict_summary(
    cobol_code=cobol_code_example,
    model=model,
    tokenizer_cobol=tokenizer_cobol,
    tokenizer_summary=tokenizer_summary,
    max_len_cobol=max_len_cobol,
    max_len_summary=max_len_summary,
)
print(f'Summary: {summary}')


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 129ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 129ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 126ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 148ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 123ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 123ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 107ms/step
Summary: is a 01 variable stores numeric
