# 
## Created by Alenna Zweiback



In [1]:
pip install pycipher



In [2]:
#Load the training plaintext: a CSV I created of human-readable, 9-letter-long words.
#Later cells read the words from its "plaintext" column.
import pandas as pd
df = pd.read_csv("full_plaintext.csv")

In [3]:
#Caesar ciphertext encryption and class designation
from pycipher import Caesar

#The first 300 plaintext words form the Caesar-encrypted part of the data set
pt = df["plaintext"][:300]

#One cipher per shift. Words are assigned a shift in consecutive batches of 100:
#positions 0-99 use key 3, positions 100-199 use key 4, positions 200 and up use key 5
shift_ciphers = (Caesar(3), Caesar(4), Caesar(5))

#Each stored row is [plaintext, ciphertext, class]; every Caesar row receives class 0
ceasar = [
    [word, shift_ciphers[min(position // 100, 2)].encipher(word), 0]
    for position, word in enumerate(pt)
]

df_caesar = pd.DataFrame(ceasar, columns=["plaintext", "ciphertext", "class"])

#The first 300 words are now Caesar-encrypted (three different keys, one shared class label 0)

In [4]:
#Playfair ciphertext encryption and class designation
from pycipher import Playfair

#The last 200 plaintext words form the Playfair-encrypted part of the data set
pt_playfair = df["plaintext"][-200:]

#A Playfair keysquare is a 5x5 grid, so it must hold exactly 25 distinct letters (there is no J).
#The CRYPTOFUN square used to repeat U and Y (27 characters), which is not a valid square;
#the duplicates are removed here.
PLAYFAIR_KEYS = (
    "CRYPTOFUNABDEGHIKLMQSVWXZ",  #keyword CRYPTOFUN followed by the remaining letters
    "SHERLOCKABDFGIMNPQTUVWXYZ",  #keyword SHERLOCK followed by the remaining letters
)
for square in PLAYFAIR_KEYS:
    #Fail early on a malformed square instead of silently producing an inconsistent cipher
    assert len(square) == 25 and len(set(square)) == 25, "invalid Playfair keysquare: " + square

playfair = []

#This loop encrypts the first 100 words with the CRYPTOFUN key, the last 100 with the SHERLOCK key
for i, plaintext in enumerate(pt_playfair):
    if i < 100:
        key = PLAYFAIR_KEYS[0]
    else:
        key = PLAYFAIR_KEYS[1]

    ciphertext = Playfair(key=key).encipher(plaintext)
    #Each stored row is [plaintext, ciphertext, class]; every Playfair row receives class 1
    playfair.append([plaintext, ciphertext, 1])

df_playfair = pd.DataFrame(playfair, columns=["plaintext", "ciphertext", "class"])

#The last 200 words are now Playfair-encrypted (two different keys, one shared class label 1)

In [5]:
#Both cipher sets are built (class 0 = Caesar, class 1 = Playfair); stack them into a single
#data set for feature engineering and save a copy to disk
df_full = pd.concat(objs=[df_caesar, df_playfair], ignore_index=True)
df_full.to_csv(path_or_buf="full_train_data.csv", index=False)

In [6]:
#Quick look at the combined data: its dimensions first, then the first rows
#(the columns are plaintext, ciphertext and class designation)
for summary in (df_full.shape, df_full.head()):
    print(summary)

(500, 3)
   plaintext ciphertext  class
0  drivegoal  GULYHJRDO      0
1  blackteam  EODFNWHDP      0
2  partyhigh  SDUWBKLJK      0
3  hellofast  KHOORIDVW      0
4  meetsleep  PHHWVOHHS      0


In [7]:
#Feature Engineering
#The combined data frame df_full (also saved as full_train_data.csv) is the input for feature engineering
#Plaintext and ciphertext are joined into one string per row; the function defined next
#converts such strings into their equivalent integer values

df_full["merged"] = df_full["plaintext"].add(df_full["ciphertext"])

def strings_to_alphabet_values(strings):
    """Convert each string into the list of alphabet positions of its letters.

    Every string is lower-cased, then each character in 'a'..'z' is mapped to
    its zero-based position in the alphabet (a -> 0, ..., z -> 25). Any other
    character is skipped.

    Parameters
    ----------
    strings : iterable of str
        The strings to convert.

    Returns
    -------
    list of list of int
        One list of integers per input string, in input order.
    """
    base = ord('a')
    return [
        [ord(letter) - base for letter in text.lower() if 'a' <= letter <= 'z']
        for text in strings
    ]

In [8]:
#Convert every merged string into a row of numerical features (one column per letter position)
#Note: this rebinds df, which until now held the raw plaintext loaded from full_plaintext.csv
str_to_int_list = strings_to_alphabet_values(df_full["merged"])
df = pd.DataFrame(data=str_to_int_list)

In [9]:
#Append the class designation as a "label" column so features and target sit in one frame
df["label"] = df_full["class"].to_numpy()

In [10]:
#Rows with fewer letters than the longest row have missing trailing values; replace them with 0
#NOTE(review): 0 is also the value assigned to the letter 'a', so padding cannot be told
#apart from a real 'a' - confirm this is acceptable
df = df.fillna(0)

In [11]:
#Showing my new data frame: 19 feature columns plus "label" as the target variable
#Only the last expression of a cell is rendered, so the bare df.shape below produces no output
df.shape
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,label
0,3,17,8,21,4,6,14,0,11,6,20,11,24,7,9,17,3,14,0.0,0
1,1,11,0,2,10,19,4,0,12,4,14,3,5,13,22,7,3,15,0.0,0
2,15,0,17,19,24,7,8,6,7,18,3,20,22,1,10,11,9,10,0.0,0
3,7,4,11,11,14,5,0,18,19,10,7,14,14,17,8,3,21,22,0.0,0
4,12,4,4,19,18,11,4,4,15,15,7,7,22,21,14,7,7,18,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,1,11,0,2,10,7,8,6,7,12,1,1,10,2,4,12,8,4,22.0,1
496,3,17,4,0,12,19,7,8,18,8,18,17,10,8,20,17,5,4,21.0,1
497,22,7,8,19,4,15,11,0,24,7,2,19,24,7,16,17,1,25,24.0,1
498,19,4,0,12,1,11,0,2,10,16,17,1,8,12,1,1,10,6,4.0,1


In [12]:
#Preparing to build the model
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [13]:
#Display the feature frame (19 feature columns plus label) once more
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,label
0,3,17,8,21,4,6,14,0,11,6,20,11,24,7,9,17,3,14,0.0,0
1,1,11,0,2,10,19,4,0,12,4,14,3,5,13,22,7,3,15,0.0,0
2,15,0,17,19,24,7,8,6,7,18,3,20,22,1,10,11,9,10,0.0,0
3,7,4,11,11,14,5,0,18,19,10,7,14,14,17,8,3,21,22,0.0,0
4,12,4,4,19,18,11,4,4,15,15,7,7,22,21,14,7,7,18,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,1,11,0,2,10,7,8,6,7,12,1,1,10,2,4,12,8,4,22.0,1
496,3,17,4,0,12,19,7,8,18,8,18,17,10,8,20,17,5,4,21.0,1
497,22,7,8,19,4,15,11,0,24,7,2,19,24,7,16,17,1,25,24.0,1
498,19,4,0,12,1,11,0,2,10,16,17,1,8,12,1,1,10,6,4.0,1


In [14]:
#Everything except the last column is the feature matrix X; the last column ("label") is the target y
X, y = df.iloc[:, :-1], df.iloc[:, -1]

In [15]:
#Showing the feature set X (every column of df except the trailing label column)
X

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18
0,3,17,8,21,4,6,14,0,11,6,20,11,24,7,9,17,3,14,0.0
1,1,11,0,2,10,19,4,0,12,4,14,3,5,13,22,7,3,15,0.0
2,15,0,17,19,24,7,8,6,7,18,3,20,22,1,10,11,9,10,0.0
3,7,4,11,11,14,5,0,18,19,10,7,14,14,17,8,3,21,22,0.0
4,12,4,4,19,18,11,4,4,15,15,7,7,22,21,14,7,7,18,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,1,11,0,2,10,7,8,6,7,12,1,1,10,2,4,12,8,4,22.0
496,3,17,4,0,12,19,7,8,18,8,18,17,10,8,20,17,5,4,21.0
497,22,7,8,19,4,15,11,0,24,7,2,19,24,7,16,17,1,25,24.0
498,19,4,0,12,1,11,0,2,10,16,17,1,8,12,1,1,10,6,4.0


In [16]:
#Showing the target variable y (the label column of df)
y

Unnamed: 0,label
0,0
1,0
2,0
3,0
4,0
...,...
495,1
496,1
497,1
498,1


In [17]:
#Hold out 20% of the rows as a test split (fixed seed so the split is reproducible);
#the held-out rows are used to evaluate the model
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=0,
    test_size=0.2,
)

In [18]:
#Instantiate a logistic regression classifier with default settings and fit it on the training split
model = LogisticRegression()
model.fit(X=X_train, y=y_train)

In [19]:
#Predict the class of every row in the held-out test split
y_pred = model.predict(X=X_test)

In [20]:
#Score the predictions: the fraction of test rows whose predicted class matches the true label
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy:", accuracy)

Accuracy: 1.0


In [None]:
#Import the external test data; it is expected to provide "plaintext" and "ciphertext"
#columns, with the class label as its last column
test_df = pd.read_csv("test_data.csv")

#Separate the inputs (X) from the labels (y, the last column) of the imported test data
test_X = test_df.iloc[:, :-1]
test_y = test_df.iloc[:, -1]

#Merge plaintext and ciphertext into one string per row, as was done for the training data
test_df_2 = test_X["plaintext"] + test_X["ciphertext"]

#Call the feature engineering function to convert the merged strings to integers
test_str_to_int_list = strings_to_alphabet_values(test_df_2)

#Create a new dataframe with the resulting numerical features
test_feature_df = pd.DataFrame(test_str_to_int_list)

#Align the test features with what the model was trained on:
# - reindex to exactly model.n_features_in_ columns, so a test set whose longest row is shorter
#   than the training rows gains the missing columns (and any extra columns are dropped)
# - fill missing positions with 0, the same padding value used for the training features
#Without this step predict() fails on NaN values or on a column-count mismatch.
test_feature_df = test_feature_df.reindex(columns=range(model.n_features_in_)).fillna(0)

#Use the previously trained model
y_pred_test = model.predict(test_feature_df)

#Print the accuracy of the trained model on the external test data
accuracy = accuracy_score(test_y, y_pred_test)
print("Accuracy:", accuracy)