In [1]:
import pandas as pd
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense

In [2]:
df = pd.read_csv("C:/Users/Jabasingh Daniel/Desktop/EGDK/Dataset/cob_py.csv")

In [3]:
#df

In [4]:
cobol_tokenizer = Tokenizer(filters='', char_level=True)
cobol_tokenizer.fit_on_texts(df['cobol_code'])
cobol_seq = cobol_tokenizer.texts_to_sequences(df['cobol_code'])
cobol_seq_padded = pad_sequences(cobol_seq, padding='post')

In [7]:
#cobol_seq_padded

In [8]:
# Tokenization
df['python_code'] = df['python_code'].astype(str)
python_tokenizer = Tokenizer(filters='', char_level=True)
python_tokenizer.fit_on_texts(df['python_code'])
python_seq = python_tokenizer.texts_to_sequences(df['python_code'])
python_seq_padded = pad_sequences(python_seq, padding='post')

In [9]:
#python_seq_padded

In [10]:
# Define the model
cobol_vocab_size = len(cobol_tokenizer.word_index) + 1
python_vocab_size = len(python_tokenizer.word_index) + 1
latent_dim = 256

In [11]:
# Define the model
encoder_inputs = Input(shape=(None,))
encoder_embedding = Embedding(cobol_vocab_size, latent_dim, mask_zero=True)(encoder_inputs)
_, state_h, state_c = LSTM(latent_dim, return_state=True)(encoder_embedding)
encoder_states = [state_h, state_c]

decoder_inputs = Input(shape=(None,))
decoder_embedding = Embedding(python_vocab_size, latent_dim, mask_zero=True)(decoder_inputs)
decoder_lstm1 = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_lstm2 = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm1(decoder_embedding, initial_state=encoder_states)
decoder_outputs, _, _ = decoder_lstm2(decoder_outputs, initial_state=encoder_states)
decoder_dense = Dense(python_vocab_size, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

In [12]:
# Model
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

In [19]:
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')

In [20]:
# Train the model
model.fit([cobol_seq_padded, python_seq_padded], np.expand_dims(python_seq_padded, -1), batch_size=10, epochs=44, validation_split=0.2)


Epoch 1/44
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 4s/step - loss: 4.0578 - val_loss: 3.7041
Epoch 2/44
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 5s/step - loss: 3.6040 - val_loss: 3.5955
Epoch 3/44
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 6s/step - loss: 3.4414 - val_loss: 3.5467
Epoch 4/44
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 5s/step - loss: 3.3799 - val_loss: 3.5417
Epoch 5/44
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 6s/step - loss: 3.3584 - val_loss: 3.5413
Epoch 6/44
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 5s/step - loss: 3.3626 - val_loss: 3.5118
Epoch 7/44
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 7s/step - loss: 3.3404 - val_loss: 3.4926
Epoch 8/44
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 5s/step - loss: 3.3313 - val_loss: 3.4818
Epoch 9/44
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0

<keras.src.callbacks.history.History at 0x27912ab1010>

In [22]:
# Save the model
model.save('cob_to_py.keras')

In [26]:
import pandas as pd
import numpy as np
from tensorflow.keras.models import load_model
from sklearn.metrics import confusion_matrix, accuracy_score

# Load the dataset
df = pd.read_csv("C:/Users/Jabasingh Daniel/Desktop/EGDK/Dataset/cob_py.csv")

# Tokenization
# Assume you have already tokenized and padded the sequences as shown in your previous code

# Load the trained seq2seq model
model = load_model("C:/Users/Jabasingh Daniel/Desktop/EGDK/modeling/cob_to_py.keras")

# Evaluate the model on the dataset
predictions = model.predict([cobol_seq_padded, python_seq_padded])
predictions = np.argmax(predictions, axis=2)

# Flatten the predictions and ground truth sequences for computing accuracy
predictions_flat = predictions.flatten()
python_seq_padded_flat = python_seq_padded.flatten()

# Calculate accuracy and confusion matrix
accuracy = accuracy_score(python_seq_padded_flat, predictions_flat)
conf_matrix = confusion_matrix(python_seq_padded_flat, predictions_flat)

print("Accuracy:", accuracy)
print("Confusion Matrix:")
print(conf_matrix)

  saveable.load_own_variables(weights_store.get(inner_path))


[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 849ms/step
Accuracy: 0.43292546026044004
Confusion Matrix:
[[   0    0    0 ...    0    0    0]
 [   0 1131    0 ...    0    0    0]
 [   0    0  488 ...    0    0    0]
 ...
 [   0    0    0 ...    0    0    0]
 [   0    0    0 ...    0    0    0]
 [   0    0    0 ...    0    0    0]]


In [None]:
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

# Load the saved model
model = tf.keras.models.load_model("C:/Users/Jabasingh Daniel/Desktop/EGDK/modeling/cob_to_py.keras")

MAX_SEQUENCE_LENGTH=80
# Tokenize and pad the COBOL code snippet
cobol_code = "IDENTIFICATION DIVISION. PROGRAM-ID. HelloWorld. PROCEDURE DIVISION. DISPLAY 'Hello, World!'. STOP RUN."
cobol_seq = cobol_tokenizer.texts_to_sequences([cobol_code])
cobol_seq_padded = pad_sequences(cobol_seq, maxlen=MAX_SEQUENCE_LENGTH, padding='post')

cobol_tokenizer = Tokenizer(filters='', char_level=True)
cobol_tokenizer.fit_on_texts(df['cobol_code'])

python_tokenizer = Tokenizer(filters='\t\n', char_level=True)
python_tokenizer.fit_on_texts(df['python_code'].astype(str))
python_tokenizer.word_index['\n'] = len(python_tokenizer.word_index) + 1


# Generate the Python code snippet
zero_input = np.zeros((1, 1))

predicted_sequence = []
for _ in range(MAX_SEQUENCE_LENGTH):
    output = model.layers[2](zero_input)
    predicted_id = tf.argmax(output[0]).numpy()
    if (predicted_id == python_tokenizer.word_index['\n']).any():
      break
    predicted_sequence.append(predicted_id)
    zero_input = np.array([[predicted_id]])

predicted_python_code = python_tokenizer.sequences_to_texts([predicted_sequence])[0]
print("Predicted Python Code:")
print(predicted_python_code)


In [None]:
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense
import numpy as np
import pandas as pd

# Load the saved model
model = tf.keras.models.load_model("C:/Users/Jabasingh Daniel/Desktop/EGDK/modeling/cob_to_py.keras")

# Load the dataset
df = pd.read_csv("C:/Users/Jabasingh Daniel/Desktop/EGDK/Dataset/cob_py.csv")

MAX_SEQUENCE_LENGTH = 80

# Tokenize and pad the COBOL code snippet
cobol_tokenizer = Tokenizer(filters='', char_level=True)
cobol_tokenizer.fit_on_texts(df['cobol_code'])
cobol_code = "IDENTIFICATION DIVISION. PROGRAM-ID. HelloWorld. PROCEDURE DIVISION. DISPLAY 'Hello, World!'. STOP RUN."
cobol_seq = cobol_tokenizer.texts_to_sequences([cobol_code])
cobol_seq_padded = pad_sequences(cobol_seq, maxlen=MAX_SEQUENCE_LENGTH, padding='post')

# Tokenize and pad the Python code snippets
python_tokenizer = Tokenizer(filters='\t\n', char_level=True)
python_tokenizer.fit_on_texts(df['python_code'].astype(str))
python_tokenizer.word_index['\n'] = len(python_tokenizer.word_index) + 1

# Define the model with Embedding layer
python_vocab_size = len(python_tokenizer.word_index) + 1
latent_dim = 256

decoder_inputs = Input(shape=(None,))
decoder_embedding = Embedding(python_vocab_size, latent_dim, mask_zero=True)(decoder_inputs)
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=[state_h, state_c])
decoder_dense = Dense(python_vocab_size, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

# Generate the Python code snippet
zero_input = np.zeros((1, 1))

predicted_sequence = []
for _ in range(MAX_SEQUENCE_LENGTH):
    output = model.layers[2](zero_input)
    predicted_id = tf.argmax(output[0]).numpy()
    if (predicted_id == python_tokenizer.word_index['\n']).any():
        break
    predicted_sequence.append(predicted_id)
    zero_input = np.array([[predicted_id]])

predicted_python_code = python_tokenizer.sequences_to_texts([predicted_sequence])[0]
print("Predicted Python Code:")
print(predicted_python_code)


In [10]:
## Date : 12-05-2024

In [102]:
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense, RepeatVector, Attention, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping

# Load the CSV dataset
data = pd.read_csv("C:/Users/Jabasingh Daniel/Desktop/EGDK/Dataset/cob_py.csv")

# Preprocess the data
cobol_text = data["cobol_code"].tolist()
python_text = data["python_code"].tolist()
python_text = [str(item) for item in python_text]

# Create separate tokenizers for COBOL and Python
cobol_tokenizer = Tokenizer(num_words=5000)  # Adjust vocabulary size as needed
cobol_tokenizer.fit_on_texts(cobol_text)

python_tokenizer = Tokenizer(num_words=5000)
python_tokenizer.fit_on_texts(python_text)

# Convert text data to sequences of token indices
cobol_sequences = cobol_tokenizer.texts_to_sequences(cobol_text)
python_sequences = python_tokenizer.texts_to_sequences(python_text)

# Pad sequences to have the same length (max_cobol_length and max_python_length)
max_cobol_length = max(len(seq) for seq in cobol_sequences)
max_python_length = max(len(seq) for seq in python_sequences)



In [103]:
cobol_sequences = pad_sequences(cobol_sequences, maxlen=max_cobol_length, padding="post")
python_sequences = pad_sequences(python_sequences, maxlen=max_python_length, padding="post")

# Separate target sequences from decoder inputs (teacher forcing)
decoder_target_data = python_sequences[:, 1:]  # Exclude the first token (start of sequence)
decoder_input_data = python_sequences[:, :-1]  # Exclude the last token (end of sequence)

# Create embedding layers for COBOL and Python tokens
cobol_embedding_dim = 128  # Adjust embedding dimension as needed
python_embedding_dim = cobol_embedding_dim

cobol_embedding = Embedding(cobol_tokenizer.num_words, cobol_embedding_dim, mask_zero=True)
python_embedding = Embedding(python_tokenizer.num_words, python_embedding_dim, mask_zero=True)


In [104]:
#max_cobol_length

In [105]:
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Dropout

# Define the encoder
encoder_inputs = tf.keras.Input(shape=(max_cobol_length,))
encoded = cobol_embedding(encoder_inputs)
encoded = LSTM(1024, return_sequences=True)(encoded)  # Adjust LSTM units as needed
encoded = LSTM(512, return_sequences=True)(encoded) 
encoded = LSTM(360, return_sequences=True)(encoded)
encoded = LSTM(128)(encoded)  # Adjust LSTM units as needed

# Define the decoder
decoder_inputs = tf.keras.Input(shape=(max_python_length - 1,))
decoder_embedding = python_embedding(decoder_inputs)

# Decoder LSTM with dropout for regularization
decoder_lstm = LSTM(1024, return_sequences=True)(decoder_embedding)
decoder_lstm = LSTM(512, return_sequences=True)(decoder_lstm)
decoder_lstm = LSTM(360, return_sequences=True)(decoder_lstm)
decoder_lstm = LSTM(150, return_sequences=True)(decoder_lstm)

decoder_lstm = Dropout(0.2)(decoder_lstm)  # Adjust dropout rate as needed

# Decoder output layer
decoder_outputs = Dense(python_tokenizer.num_words, activation="softmax")(decoder_lstm)

from tensorflow.keras.utils import to_categorical

# Convert target data to one-hot encoding
decoder_target_data_one_hot = to_categorical(decoder_target_data, num_classes=python_tokenizer.num_words)


In [106]:
# Model definition
model = tf.keras.Model(inputs=[encoder_inputs, decoder_inputs], outputs=decoder_outputs)

# Compile the model (optimizer, loss function, metrics)
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])


# Early stopping to prevent overfitting
#early_stopping = EarlyStopping(monitor="val_loss", patience=3)

In [107]:
model.fit([cobol_sequences, decoder_input_data], decoder_target_data_one_hot, epochs=1000, batch_size=20, validation_split=0.15)

Epoch 1/1000
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 6s/step - accuracy: 0.0021 - loss: 8.5168 - val_accuracy: 0.0000e+00 - val_loss: 8.5108
Epoch 2/1000
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 4s/step - accuracy: 0.0290 - loss: 8.5031 - val_accuracy: 0.0153 - val_loss: 8.4193
Epoch 3/1000
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 4s/step - accuracy: 0.0170 - loss: 8.3679 - val_accuracy: 0.0153 - val_loss: 8.2644
Epoch 4/1000
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 4s/step - accuracy: 0.0189 - loss: 8.1350 - val_accuracy: 0.0153 - val_loss: 8.0479
Epoch 5/1000
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 4s/step - accuracy: 0.0189 - loss: 7.7943 - val_accuracy: 0.0153 - val_loss: 7.7851
Epoch 6/1000
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 5s/step - accuracy: 0.0171 - loss: 7.3888 - val_accuracy: 0.0153 - val_loss: 7.5127
Epoch 7/1000
[1m2/2[0m [32m━━━━━━━━━━

<keras.src.callbacks.history.History at 0x1e91e44d850>

In [112]:
python_tokenizer.index_word

{1: "'",
 2: 'print',
 3: 'input',
 4: 'name',
 5: '0',
 6: 'num',
 7: 'int',
 8: 'if',
 9: '1',
 10: "'enter",
 11: 'total',
 12: 'number',
 13: 'main',
 14: 'is',
 15: 'else',
 16: 'idx',
 17: 'num1',
 18: 'code',
 19: 'a',
 20: 'ws',
 21: 'f',
 22: 'num2',
 23: 'def',
 24: 'max',
 25: 'amount',
 26: 'i',
 27: 'the',
 28: 'in',
 29: 'para',
 30: 'avg',
 31: 'rem',
 32: 'list',
 33: 'sum',
 34: 'import',
 35: 'format',
 36: 'temp',
 37: 'inp',
 38: '2',
 39: 'quo',
 40: 'time',
 41: 'for',
 42: 'enter',
 43: 'age',
 44: '100',
 45: 'and',
 46: 'date',
 47: 'range',
 48: 'option',
 49: 'telephone',
 50: 'w',
 51: '00',
 52: 'abend',
 53: 'mark1',
 54: 'mark2',
 55: 'mark3',
 56: '5',
 57: 'grade',
 58: 'given',
 59: 'start',
 60: 'accept',
 61: 'character',
 62: 'val',
 63: 'of',
 64: 'result',
 65: 'reason',
 66: 'str',
 67: 'm',
 68: "'total",
 69: '60',
 70: '3',
 71: 'end',
 72: 'first',
 73: 'last',
 74: 'indi',
 75: 'even',
 76: 'odd',
 77: '10',
 78: 'cleanup',
 79: 'datetime',


In [110]:
# Save the model architecture, weights, and training configuration
model.save("cobol_to_python_translator.keras")


In [108]:

# Define a function to translate COBOL code to Python
def translate_cobol_to_python(cobol_code):
  # Preprocess the COBOL code
  cobol_sequence = cobol_tokenizer.texts_to_sequences([cobol_code])[0]
  cobol_sequence = pad_sequences([cobol_sequence], maxlen=max_cobol_length, padding="post")

In [122]:
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Load the saved model
translator_model = load_model("C:/Users/Jabasingh Daniel/Desktop/EGDK/modeling/cobol_to_python_translator.keras")

# Define a function to translate COBOL code to Python
def translate_cobol_to_python(cobol_code):
  # Preprocess the COBOL code
  cobol_sequence = cobol_tokenizer.texts_to_sequences([cobol_code])[0]
  cobol_sequence = pad_sequences([cobol_sequence], maxlen=max_cobol_length, padding="post")

  # Predict the Python code sequence
  predicted_python_sequence = translator_model.predict([cobol_sequence, np.zeros((1, max_python_length - 1))])[0]
  END_TOKEN = 270
  # Convert the predicted sequence to Python code
  predicted_word = None  # Initialize outside the loop

  translated_python_code = []

  for i in range(max_python_length):
    # ... (rest of your loop logic)
    translated_python_code.append(predicted_word)
    if predicted_word == END_TOKEN:
      break

  # Check if translated_python_code has elements before slicing
  if translated_python_code:
    translated_python_code = translated_python_code[:translated_python_code.index(END_TOKEN) if END_TOKEN in translated_python_code else None]
  else:
    # Handle empty sequence (e.g., print a message)
    print("Model did not predict any Python code.")

  return " ".join(translated_python_code)

# Define the end-of-sequence token (replace with the actual token index)


# Example usage
cobol_code = "IDENTIFICATION DIVISION. PROGRAM-ID. HelloWorld. PROCEDURE DIVISION. DISPLAY 'Hello, World!'. STOP RUN.."
translated_python_code = translate_cobol_to_python(cobol_code)
print(translated_python_code)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2s/step


TypeError: sequence item 0: expected str instance, NoneType found

In [1]:
###13-05-2024

In [2]:
import pandas as pd
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

# Single example of COBOL and Python code snippets
cobol_code = "IDENTIFICATION DIVISION. PROGRAM-ID. HelloWorld. PROCEDURE DIVISION. DISPLAY 'Hello, World!'. STOP RUN."
python_code = "print('Hello, World!')"

# Create a dataset (you can expand this to include more examples)
data = [(cobol_code, python_code)]

# Convert the dataset to a DataFrame
df = pd.DataFrame(data, columns=['cobol_code', 'python_code'])

# Tokenization
cobol_tokenizer = Tokenizer(filters='', char_level=True)
cobol_tokenizer.fit_on_texts(df['cobol_code'])
cobol_seq = cobol_tokenizer.texts_to_sequences(df['cobol_code'])
cobol_seq_padded = pad_sequences(cobol_seq, padding='post')

python_tokenizer = Tokenizer(filters='', char_level=True)
python_tokenizer.fit_on_texts(df['python_code'])
python_seq = python_tokenizer.texts_to_sequences(df['python_code'])
python_seq_padded = pad_sequences(python_seq, padding='post')

# Train the model (you'll need to define the model architecture and compile it)
# model.fit([cobol_seq_padded, python_seq_padded], np.expand_dims(python_seq_padded, -1), batch_size=64, epochs=10, validation_split=0.2)


In [5]:
python_tokenizer.index_word

{1: 'l',
 2: 'r',
 3: "'",
 4: 'o',
 5: 'p',
 6: 'i',
 7: 'n',
 8: 't',
 9: '(',
 10: 'h',
 11: 'e',
 12: ',',
 13: ' ',
 14: 'w',
 15: 'd',
 16: '!',
 17: ')'}