In [13]:
import numpy as np
import random
import string
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM,Dense,Embedding#lstm is a type of recurrent neural network (RNN) layer that handles sequence and time-series data
# Dense: a fully connected layer, used for the output or intermediate layers of the network.
# Embedding: converts input tokens (like words or characters) into dense vector representations.

In [14]:
##random substitution cipher
def generate_cipher_map():
    """Build a random monoalphabetic substitution cipher over a-z.

    Returns:
        tuple: ``(cipher_map, reverse_map)``. ``cipher_map`` sends each
        plaintext letter to its cipher letter; ``reverse_map`` is the inverse
        lookup (cipher letter -> plaintext letter) used for decryption.
    """
    alphabet = list(string.ascii_lowercase)
    # Shuffle a separate copy so the original ordering survives for pairing.
    permuted = list(alphabet)
    random.shuffle(permuted)
    cipher_map = dict(zip(alphabet, permuted))
    reverse_map = {encoded: decoded for decoded, encoded in cipher_map.items()}
    return cipher_map, reverse_map

In [15]:
#encrypt using cipher map
def encrypt_text(text, cipher_map):
    """Apply a substitution map to ``text`` one character at a time.

    Characters that have no entry in ``cipher_map`` (spaces, for example)
    are copied through unchanged.
    """
    translated = []
    for symbol in text:
        translated.append(cipher_map.get(symbol, symbol))
    return ''.join(translated)

In [16]:
#generate dataset plaintext-ciphertext pairs using a random substitution cipher.
def generate_dataset(num_samples=10000, max_len=20):
    """Create random plaintext/ciphertext pairs under a single substitution cipher.

    One cipher map is drawn up front and reused for every sample. Each
    plaintext is a random string of lowercase letters and spaces whose length
    is drawn uniformly from 5..max_len (inclusive).

    Args:
        num_samples: number of pairs to generate.
        max_len: largest plaintext length.

    Returns:
        tuple: ``(plaintexts, ciphertexts, reverse_map)`` where the two lists
        are index-aligned and ``reverse_map`` decrypts the cipher.
    """
    cipher_map, reverse_map = generate_cipher_map()
    symbols = string.ascii_lowercase + ' '
    plaintexts, ciphertexts = [], []
    for _ in range(num_samples):
        # Length is drawn before the characters, so RNG consumption order is fixed.
        length = random.randint(5, max_len)
        plain_text = ''.join(random.choices(symbols, k=length))
        plaintexts.append(plain_text)
        ciphertexts.append(encrypt_text(plain_text, cipher_map))
    return plaintexts, ciphertexts, reverse_map

In [17]:
#replaces each character in the text with a corresponding number based on its position in the vocabulary for lstm
def tokenize_texts(texts, vocab):
    """Convert strings to lists of integer ids using ``vocab`` order.

    Ids start at 1 (the first vocab entry gets 1) so that 0 stays free for
    padding. A character absent from ``vocab`` raises ``KeyError``.

    Returns:
        tuple: ``(sequences, tokenizer)`` where ``sequences`` holds one id list
        per input text and ``tokenizer`` is the char -> id dictionary.
    """
    tokenizer = {}
    for position, symbol in enumerate(vocab, start=1):
        tokenizer[symbol] = position
    sequences = []
    for text in texts:
        sequences.append([tokenizer[symbol] for symbol in text])
    return sequences, tokenizer

In [18]:
def pad_sequences(sequences, max_len):
    """Bring every integer sequence to length ``max_len`` for the LSTM.

    Delegates to Keras' ``pad_sequences`` with ``padding='post'``, i.e. the
    padding is appended at the end of shorter sequences.
    """
    keras_pad = tf.keras.preprocessing.sequence.pad_sequences
    return keras_pad(sequences, maxlen=max_len, padding='post')

In [20]:
def build_model(input_dim,output_dim,max_len):
    """Build and compile a per-position LSTM classifier.

    The network takes a fixed-length sequence of token ids (here: tokenized
    ciphertext) and predicts, for every position, a probability distribution
    over ``output_dim`` classes (here: tokenized plaintext).

    Args:
        input_dim: size of the embedding vocabulary (number of distinct token ids).
        output_dim: number of classes predicted at each position.
        max_len: length of the padded input sequences.

    Returns:
        A compiled ``Sequential`` model using the Adam optimizer, sparse
        categorical cross-entropy loss and accuracy as metric.
    """
    model=Sequential([
        # Declare the input shape explicitly instead of passing the deprecated
        # ``input_length`` argument to Embedding; this also builds the model
        # immediately, so model.summary() can report layer shapes.
        tf.keras.Input(shape=(max_len,)),
        Embedding(input_dim=input_dim,output_dim=64),
        # return_sequences=True keeps one output vector per input token.
        LSTM(128,return_sequences=True),
        # Softmax over the classes at every position of the sequence.
        Dense(output_dim,activation='softmax'),
    ])
    model.compile(optimizer='adam',loss='sparse_categorical_crossentropy',metrics=['accuracy'])
    return model

In [35]:
def main(seed=None):
    """Train an LSTM to undo a random substitution cipher and print one demo.

    Pipeline: generate random plaintext/ciphertext pairs, tokenize and pad
    them, split 80/20, train for 5 epochs, then decode the first held-out
    sample and print ciphertext, prediction and ground truth.

    Args:
        seed: optional integer. When given, Python's ``random``, NumPy and
            TensorFlow are seeded before any data is generated so a run can
            be repeated (as far as the backend is deterministic). The default
            ``None`` leaves all random state untouched.
    """
    if seed is not None:
        random.seed(seed)
        np.random.seed(seed)
        tf.random.set_seed(seed)

    print("Generating dataset...")
    vocab=list(string.ascii_lowercase + ' ')  # a-z and space
    num_classes=len(vocab)+1  # 26 letters + space + id 0 reserved for padding = 28
    num_samples=10000
    max_len=20
    # The reverse cipher map is not needed here: the model learns the decryption itself.
    plaintexts,ciphertexts,_=generate_dataset(num_samples, max_len)
    print("Sample Plaintext:", plaintexts[0])
    print("Sample Ciphertext:", ciphertexts[0])

    # Tokenize and pad. Both calls use the same vocab, so one char<->id table serves both.
    plaintext_seq,plain_tokenizer=tokenize_texts(plaintexts,vocab)
    ciphertext_seq,_=tokenize_texts(ciphertexts,vocab)
    X=pad_sequences(ciphertext_seq,max_len)  # model input: ciphertext ids
    y=pad_sequences(plaintext_seq,max_len)  # model target: plaintext ids
    # Add a trailing axis: targets become (num_samples, max_len, 1), one integer
    # class id per time step. Sparse categorical cross-entropy takes integer
    # class ids as targets rather than one-hot vectors.
    y=np.expand_dims(y,-1)
    X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42)

    print("Building model...")
    model=build_model(input_dim=num_classes,output_dim=num_classes,max_len=max_len)
    model.summary()

    print("Training model...")
    # NOTE: the held-out 20% doubles as validation data, and padded positions
    # (id 0) are ordinary targets, so they count towards the reported accuracy.
    model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=5, batch_size=32)

    # Decode one held-out sample. Id 0 (padding) has no entry in idx_to_char,
    # so padded positions are rendered as the empty string.
    idx_to_char={idx: char for char, idx in plain_tokenizer.items()}
    print("\nTesting model on encrypted text:")
    test_sample=X_test[0]
    predicted=model.predict(np.array([test_sample]))
    predicted_text=''.join([idx_to_char.get(np.argmax(p), '') for p in predicted[0]])
    original_text=''.join([idx_to_char.get(idx, '') for idx in y_test[0].flatten()])
    encrypted_text=''.join([idx_to_char.get(idx, '') for idx in test_sample])
    print(f"Encrypted:{encrypted_text}")
    print(f"Predicted Plaintext:{predicted_text}")
    print(f"Original Plaintext:{original_text}")

# Run the full pipeline when this cell/script is executed directly.
if __name__ == "__main__":
    main()
    # Of the 10,000 generated samples, the model trains on 80% and is tested on the remaining 20%.


Generating dataset...
Sample Plaintext: gwoperwlvrahtce
Sample Ciphertext: wvijxmvqcmotpsx
Building model...


Training model...
Epoch 1/5
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 40ms/step - accuracy: 0.5522 - loss: 2.0724 - val_accuracy: 1.0000 - val_loss: 0.0267
Epoch 2/5
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 34ms/step - accuracy: 1.0000 - loss: 0.0151 - val_accuracy: 1.0000 - val_loss: 0.0035
Epoch 3/5
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 31ms/step - accuracy: 1.0000 - loss: 0.0027 - val_accuracy: 1.0000 - val_loss: 0.0014
Epoch 4/5
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 37ms/step - accuracy: 1.0000 - loss: 0.0012 - val_accuracy: 1.0000 - val_loss: 7.4711e-04
Epoch 5/5
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 41ms/step - accuracy: 1.0000 - loss: 6.6077e-04 - val_accuracy: 1.0000 - val_loss: 4.6621e-04

Testing model on encrypted text:
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 162ms/step
Encrypted:lnhnjrptjhdl d
Predicted Plaintext:fdmdp