### Dependencies
____

In [14]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import upper

import pandas as pd

### Create Spark Session
_____

In [2]:
# Obtain the shared Spark session for this notebook (getOrCreate reuses an
# existing session when one is already running).
spark = (
    SparkSession.builder
    .appName('caesar-cipher')
    .getOrCreate()
)

#### Create DataFrame
______

In [5]:
# English letter-frequency table; the first CSV row is the header and the
# column types are inferred from the data.
csv_file = "../resources/letter_frequency_en_US.csv"

df = spark.read.csv(csv_file, header=True, inferSchema=True)
df.printSchema()

root
 |-- letter: string (nullable = true)
 |-- count: integer (nullable = true)



In [6]:
# Preview the table; 20 rows is show()'s default, spelled out here.
df.show(n=20)

+------+-----+
|letter|count|
+------+-----+
|     E|21912|
|     T|16587|
|     A|14810|
|     O|14003|
|     I|13318|
|     N|12666|
|     S|11450|
|     R|10977|
|     H|10795|
|     D| 7874|
|     L| 7253|
|     U| 5246|
|     C| 4943|
|     M| 4761|
|     F| 4200|
|     Y| 3853|
|     W| 3819|
|     G| 3693|
|     P| 3316|
|     B| 2715|
+------+-----+
only showing top 20 rows



In [7]:
# Keep only the rows whose letter matches the character class A-E.
(
    df.where(df['letter'].rlike('[A-E]'))
      .show()
)

+------+-----+
|letter|count|
+------+-----+
|     E|21912|
|     A|14810|
|     D| 7874|
|     C| 4943|
|     B| 2715|
+------+-----+



In [8]:
# All rows whose letter matches A-Z, most frequent first
# (show() prints only the first 20 rows).
(
    df.where(df['letter'].rlike('[A-Z]'))
      .orderBy('count', ascending=False)
      .show()
)

+------+-----+
|letter|count|
+------+-----+
|     E|21912|
|     T|16587|
|     A|14810|
|     O|14003|
|     I|13318|
|     N|12666|
|     S|11450|
|     R|10977|
|     H|10795|
|     D| 7874|
|     L| 7253|
|     U| 5246|
|     C| 4943|
|     M| 4761|
|     F| 4200|
|     Y| 3853|
|     W| 3819|
|     G| 3693|
|     P| 3316|
|     B| 2715|
+------+-----+
only showing top 20 rows



### Pandas to Spark: ranking candidate Caesar keys by letter frequency
____

In [63]:
# Input tables: letter counts of the encrypted text and of English.
freq_report = "../resources/Encrypted-1_frequency_letters.csv"
freq_en_letters = "../resources/letter_frequency_en_US.csv"

# Load the cipher-text report and upper-case its letter column.
letters = spark.read.csv(freq_report, header=True, inferSchema=True)
letters = letters.withColumn('letter', upper(letters['letter']))

# Keep the rows matching A-Z and order them by descending occurrence count.
letters = (
    letters
    .where(letters['letter'].rlike('[A-Z]'))
    .orderBy('occurrences', ascending=False)
)

# Print the ranked letters as a list of Row objects.
print(letters.select('letter').collect())
    


[Row(letter='R'), Row(letter='G'), Row(letter='B'), Row(letter='N'), Row(letter='A'), Row(letter='V'), Row(letter='F'), Row(letter='E'), Row(letter='C'), Row(letter='Y'), Row(letter='U'), Row(letter='P'), Row(letter='Q'), Row(letter='H'), Row(letter='Z'), Row(letter='T'), Row(letter='S'), Row(letter='O'), Row(letter='L'), Row(letter='J'), Row(letter='I'), Row(letter='X'), Row(letter='K'), Row(letter='W'), Row(letter='D'), Row(letter='M')]


In [99]:
# Letter-frequency report of the encrypted text (columns read below: letter, occurrences).
freq_report = "../resources/Encrypted-1_frequency_letters.csv"
# Reference letter-frequency table for English (columns read below: letter, count).
freq_en_letters = "../resources/letter_frequency_en_US.csv"

def get_keys_decode_order(freq_report, en_freq_path=None):
    """Return all 26 Caesar key candidates, most plausible first (Spark version).

    The letters of the cipher-text report and of the English reference table
    are each ranked by descending frequency.  Letters of equal rank are paired
    and the absolute distance between their code points becomes a candidate
    key.  Candidates keep their first-seen order, duplicates are dropped, and
    every key in 0..25 that was never produced is appended in ascending order.

    Parameters
    ----------
    freq_report : str
        Path to a CSV with a header row and the columns ``letter`` and
        ``occurrences`` (the cipher-text letter counts).
    en_freq_path : str, optional
        Path to a CSV with a header row and the columns ``letter`` and
        ``count``.  Defaults to the notebook-level ``freq_en_letters``.

    Returns
    -------
    list of int
        A permutation of ``range(26)``.
    """
    if en_freq_path is None:
        # Backward-compatible default: fall back to the notebook-level path.
        en_freq_path = freq_en_letters

    def ranked_letters(csv_path, count_column):
        # Read one frequency table and return its letters, most frequent first.
        table = spark.read.csv(path=csv_path, inferSchema=True, header=True)
        table = table.withColumn('letter', upper(table['letter']))
        # Anchored pattern: rlike() matches substrings, so without ^...$ a
        # multi-character value containing a capital would slip through and
        # make ord() raise further down.
        table = (
            table
            .filter(table['letter'].rlike('^[A-Z]$'))
            .orderBy(table[count_column].desc())
        )
        return [row['letter'] for row in table.select('letter').collect()]

    caesar_list = ranked_letters(freq_report, 'occurrences')
    en_letters_list = ranked_letters(en_freq_path, 'count')

    # zip() stops at the shorter ranking, so a cipher text that uses fewer
    # distinct letters than the English table no longer raises IndexError.
    # NOTE(review): abs() discards the direction of the shift, so a pair can
    # yield either k or 26 - k for the same cipher; a modular difference may
    # be intended -- confirm against the decoder before changing it.
    prefer_keys_order = [
        abs(ord(en_key) - ord(dec_key))
        for en_key, dec_key in zip(en_letters_list, caesar_list)
    ]

    # dict.fromkeys() removes duplicates while preserving first-seen order.
    decode_order = list(dict.fromkeys(prefer_keys_order))
    missing_items = [x for x in range(26) if x not in decode_order]

    return decode_order + missing_items


In [100]:
# Print the candidate Caesar keys for the encrypted-text report, in the order
# get_keys_decode_order ranks them.
print(get_keys_decode_order(freq_report))

[13, 1, 8, 5, 21, 9, 14, 20, 4, 6, 0, 2, 3, 7, 10, 11, 12, 15, 16, 17, 18, 19, 22, 23, 24, 25]


In [86]:
def old_get_keys_decode_order(freq_report, en_freq_path=None):
    """Return all 26 Caesar key candidates, most plausible first (pandas version).

    Both frequency tables are upper-cased, reduced to rows holding exactly one
    letter A-Z, and ranked by descending frequency, as get_keys_decode_order
    does.  Letters of equal rank are paired and the absolute distance between
    their code points becomes a candidate key.  Candidates keep their
    first-seen order, duplicates are dropped, and every key in 0..25 that was
    never produced is appended in ascending order.

    Parameters
    ----------
    freq_report : str
        Path to a CSV with the columns ``letter`` and ``occurrences``
        (the cipher-text letter counts).
    en_freq_path : str, optional
        Path to a CSV with the columns ``letter`` and ``count``.
        Defaults to the notebook-level ``freq_en_letters``.

    Returns
    -------
    list of int
        A permutation of ``range(26)``.
    """
    if en_freq_path is None:
        # Backward-compatible default: fall back to the notebook-level path.
        en_freq_path = freq_en_letters

    def ranked_letters(csv_path, count_column):
        # Read one frequency table and return its letters, most frequent first.
        table = pd.read_csv(csv_path)
        table = table.assign(letter=table['letter'].str.upper())
        # match() anchors at the start and \Z at the end, so only
        # single-letter cells pass; na=False keeps the mask boolean when a
        # cell is empty.
        is_letter = table['letter'].str.match(r'[A-Z]\Z', na=False)
        ranked = table[is_letter].sort_values(by=[count_column], ascending=False)
        return ranked['letter'].tolist()

    caesar_list = ranked_letters(freq_report, 'occurrences')
    en_letters_list = ranked_letters(en_freq_path, 'count')

    # zip() stops at the shorter ranking, so a cipher text that uses fewer
    # distinct letters than the English table no longer raises IndexError.
    # NOTE(review): abs() discards the direction of the shift, so a pair can
    # yield either k or 26 - k for the same cipher; a modular difference may
    # be intended -- confirm against the decoder before changing it.
    prefer_keys_order = [
        abs(ord(dec_key) - ord(en_key))
        for en_key, dec_key in zip(en_letters_list, caesar_list)
    ]

    # dict.fromkeys() removes duplicates while preserving first-seen order.
    decode_order = list(dict.fromkeys(prefer_keys_order))
    missing_items = [x for x in range(26) if x not in decode_order]

    return decode_order + missing_items


In [87]:
# Print the key order produced by the pandas implementation for the same
# report (presumably to compare it with the Spark version's output).
print(old_get_keys_decode_order(freq_report))

[13, 1, 8, 5, 21, 9, 14, 20, 4, 6, 0, 2, 3, 7, 10, 11, 12, 15, 16, 17, 18, 19, 22, 23, 24, 25]


In [96]:
# Worked example: the English letter 'A' paired with the encrypted letter 'Y'.
english, encrypt = 'A', 'Y'
key = ord(english) - ord(encrypt)

# Prints, one value per line: the signed difference, the difference taken in
# the opposite direction, and the signed difference reduced modulo 26.
print(key, -key, key % 26, sep='\n')

-24
24
2
