In [62]:
import pandas as pd
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense
from tensorflow.keras.models import Model

# Load the dataset
df = pd.read_csv("C:/Users/Jabasingh Daniel/Desktop/EGDK/Dataset/cobol_sum.csv")

In [64]:
summary = df['summary'].values
summary = ['<start> ' + text + ' <end>' for text in summary]

In [65]:
summary 

['<start> display the message "HELLO WORLD!" on the screen <end>',
 '<start>  program accepts user input, displays the input along with a message, and then displays "HELLO WORLD! <end>',
 '<start> COBOL code defines a variable named USERINP with a size of 10 characters, allowing it to store up to 10 characters of alphanumeric user input <end>',
 '<start> Declares a numeric variable NUM1 with a picture clause of 2 digits and an initial value of 10. <end>',
 '<start> A loop that iterates from 1 to 10, displaying the value of {LOOP_VAR} in each iteration. <end>',
 '<start> Conditional statement checking if {VAR1} equals 10 and displaying appropriate messages. <end>',
 '<start> Opens a file named {FILE} for input. <end>',
 '<start> Reads a record from {FILE} into {RECORD}. <end>',
 '<start> Closes the file {FILE}. <end>',
 '<start> Declares an array of 10 numeric items. <end>',
 '<start> Calls a subroutine with a parameter. <end>',
 "<start> Declares a string variable {STRING1} with a leng

In [66]:
import pandas as pd

# Assuming df is your DataFrame with the 'summary' column
df['summary'] = '<start> ' + df['summary'] + ' <end>'


In [67]:
tokenizer_cobol = Tokenizer(filters='')
tokenizer_cobol.fit_on_texts(df['cobol_code'])
tokenizer_summary = Tokenizer(filters='')
tokenizer_summary.fit_on_texts(df['summary'])

# Define the vocabulary sizes
vocab_size_cobol = len(tokenizer_cobol.word_index) + 1
vocab_size_summary = len(tokenizer_summary.word_index) + 1

In [68]:
#tokenizer_cobol.index_word

In [70]:
#tokenizer_summary.index_word

In [71]:
# Prepare input-output pairs
X = tokenizer_cobol.texts_to_sequences(df['cobol_code'])
Y = tokenizer_summary.texts_to_sequences(df['summary'])
# Pad sequences
max_len_cobol = max([len(seq) for seq in X])
max_len_summary = max([len(seq) for seq in Y])

X = pad_sequences(X, maxlen=max_len_cobol, padding='post')
Y = pad_sequences(Y, maxlen=max_len_summary, padding='post')

In [29]:
# Shift the target sequences for training
Y_input = Y[:, :-1]
Y_output = Y[:, 1:]

# Define the Seq2Seq model
latent_dim = 512

encoder_inputs = Input(shape=(max_len_cobol,))
encoder_embedding = Embedding(vocab_size_cobol, latent_dim, mask_zero=True)(encoder_inputs)
encoder_lstm = LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(encoder_embedding)
encoder_states = [state_h, state_c]

decoder_inputs = Input(shape=(max_len_summary-1,))
decoder_embedding = Embedding(vocab_size_summary, latent_dim, mask_zero=True)(decoder_inputs)
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)
decoder_dense = Dense(vocab_size_summary, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

In [31]:
# Compile and train the model
model.compile(optimizer='rmsprop', loss='sparse_categorical_crossentropy',)
model.fit([X, Y[:,:-1]], Y.reshape(Y.shape[0], Y.shape[1], 1)[:,1:],batch_size=30, epochs=100, validation_split=0.2)


Epoch 1/100
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 339ms/step - loss: 3.4975 - val_loss: 5.1694
Epoch 2/100
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 225ms/step - loss: 3.3106 - val_loss: 5.1126
Epoch 3/100
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 236ms/step - loss: 3.1400 - val_loss: 5.1217
Epoch 4/100
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 224ms/step - loss: 3.1676 - val_loss: 5.1615
Epoch 5/100
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 222ms/step - loss: 3.0582 - val_loss: 5.1187
Epoch 6/100
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 224ms/step - loss: 3.0238 - val_loss: 5.2160
Epoch 7/100
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 222ms/step - loss: 2.9855 - val_loss: 5.0671
Epoch 8/100
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 222ms/step - loss: 2.8778 - val_loss: 5.0318
Epoch 9/100
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━

<keras.src.callbacks.history.History at 0x221b3461fd0>

In [33]:
# Save the model
model.save('seq2seq_model.keras') 

In [37]:
#testing

import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import load_model

# Assuming your test data is loaded and preprocessed similarly to the training data
df_test = pd.read_csv("C:/Users/Jabasingh Daniel/Desktop/EGDK/Dataset/test.csv")
X_test = tokenizer_cobol.texts_to_sequences(df_test['cobol_code'])
Y_test = tokenizer_summary.texts_to_sequences(df_test['summary'])

X_test = pad_sequences(X_test, maxlen=max_len_cobol, padding='post')
Y_test = pad_sequences(Y_test, maxlen=max_len_summary, padding='post')

# Load the trained model
model = load_model("C:/Users/Jabasingh Daniel/Desktop/EGDK/modeling/seq2seq_model.keras")

# Function to calculate sequence accuracy
def sequence_accuracy(y_true, y_pred):
    correct = 0
    total = 0
    for true_seq, pred_seq in zip(y_true, y_pred):
        true_seq = true_seq[true_seq != 0]  # Remove padding
        pred_seq = pred_seq[:len(true_seq)]  # Truncate to the length of the true sequence
        if np.array_equal(true_seq, pred_seq):
            correct += 1
        total += 1
    return correct / total

# Generate predictions
Y_pred = model.predict([X_test, Y_test[:, :-1]])

# Convert predictions to sequences of token ids
Y_pred_sequences = np.argmax(Y_pred, axis=-1)

# Calculate the accuracy
accuracy = sequence_accuracy(Y_test[:, 1:], Y_pred_sequences)
print(f'Sequence Accuracy: {accuracy * 100:.2f}%')


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 486ms/step
Sequence Accuracy: 66.67%


In [38]:
from tensorflow.keras.models import load_model

# Load the trained model
model = load_model("C:/Users/Jabasingh Daniel/Desktop/EGDK/modeling/seq2seq_model.keras")

def preprocess_input(cobol_code, tokenizer_cobol, max_len_cobol):
    # Tokenize the input COBOL code
    sequence = tokenizer_cobol.texts_to_sequences([cobol_code])
    # Pad the sequence
    padded_sequence = pad_sequences(sequence, maxlen=max_len_cobol, padding='post')
    return padded_sequence


In [43]:
def predict_summary(cobol_code, model, tokenizer_cobol, tokenizer_summary, max_len_cobol, max_len_summary):
    # Preprocess the input COBOL code
    input_seq = preprocess_input(cobol_code, tokenizer_cobol, max_len_cobol)
    
    # Initialize the decoder input
    decoder_input = np.zeros((1, max_len_summary - 1))
    
    # Predict the summary
    for i in range(max_len_summary - 1):
        output_tokens = model.predict([input_seq, decoder_input])
        sampled_token_index = np.argmax(output_tokens[0, i, :])
        decoder_input[0, i] = sampled_token_index
        
        # Stop if a zero token (padding) is predicted, indicating no more meaningful tokens
        if sampled_token_index == 0:
            break
    
    # Convert token indices back to words
    predicted_summary = []
    for token in decoder_input[0]:
        if token == 0:
            continue
        word = tokenizer_summary.index_word.get(token, '')
        predicted_summary.append(word)
    
    return ' '.join(predicted_summary)


In [50]:
# Define your COBOL code to be summarized
cobol_code_example = "SORT {DATASET} ON ASCENDING KEY {KEY1} USING {INPUT-FILE} GIVING {OUTPUT-FILE}."

# Predict the summary
summary = predict_summary(cobol_code_example, model, tokenizer_cobol, tokenizer_summary, max_len_cobol, max_len_summary)
print(f'Summary: {summary}')


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 48ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
Summary: data by and the in into into {result}.
