In [1]:
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'

import re
from difflib import SequenceMatcher
from colorama import init, Fore

from IPython.display import Image, display

# Helper Functions

In [2]:
# Get joined keystroke and classification data
def get_df_joined(instance):
    """Load one instance's keystroke and classification logs and join them.

    Both files are read from ``data/keylogging/<instance>/``. Going by the
    notes kept with this notebook, ``keystrokes.csv`` carries 'Timestamp' and
    'Keystrokes' columns and ``classification.csv`` carries 'Timestamp',
    'Text' and 'Target' columns.

    Args:
        instance: Name of the sub-folder of ``data/keylogging`` to read.

    Returns:
        The classification rows with the keystroke columns attached wherever
        the 'Timestamp' values are equal.

    Raises:
        FileNotFoundError: If either CSV file does not exist.
    """
    # Read the two logs in the same order as before: keystrokes first.
    keystroke_log = pd.read_csv(f'data/keylogging/{instance}/keystrokes.csv')
    classification_log = pd.read_csv(f'data/keylogging/{instance}/classification.csv')

    # Index the keystrokes by timestamp so they can be looked up per
    # classification row.
    keystrokes_by_timestamp = keystroke_log.set_index('Timestamp')

    return classification_log.join(keystrokes_by_timestamp, on='Timestamp')

In [3]:
# Cluster subsequent rows where target is the same
def _text_or_empty(value):
    """Return ``value`` as a string, mapping missing values (NaN/None) to ''."""
    return '' if pd.isna(value) else str(value)


def get_df_clustered(df):
    """Merge runs of consecutive rows that share the same 'Target'.

    Within a run, 'Timestamp' values are joined with ', ', 'Keystrokes' are
    concatenated, and 'Text' values are joined with ', '. Every other column
    keeps the value of the first row of the run, and the resulting row keeps
    that first row's index label.

    Args:
        df: DataFrame with 'Timestamp', 'Keystrokes', 'Text' and 'Target'
            columns. It is not modified.

    Returns:
        A new DataFrame with one row per run. An empty input yields an empty
        copy of ``df``.
    """
    # Nothing to cluster; also avoids indexing row 0 of an empty frame.
    if len(df) == 0:
        return df.copy()

    clusters = []

    for i in range(len(df)):
        row = df.iloc[i]

        if clusters and clusters[-1]['Target'] == row['Target']:
            tail = clusters[-1]
            tail['Timestamp'] = f"{tail['Timestamp']}, {row['Timestamp']}"
            # Missing cells (e.g. no keystrokes logged for a timestamp) count
            # as empty text instead of breaking string concatenation.
            tail['Keystrokes'] = _text_or_empty(tail['Keystrokes']) + _text_or_empty(row['Keystrokes'])
            tail['Text'] = f"{_text_or_empty(tail['Text'])}, {_text_or_empty(row['Text'])}"
        else:
            # Copy so that growing the cluster never writes through to `df`.
            clusters.append(row.copy())

    return pd.DataFrame(clusters)

In [36]:
# Finds longest match between keystrokes and text
def find_longest_match(s1, s2, m, n, min_len=4):
    """Find the longest substring shared by ``s1[:m]`` and ``s2[:n]``.

    The comparison is case-sensitive. When several shared substrings have the
    maximal length, the one that starts earliest in ``s1`` is returned.

    Args:
        s1: First string (the keystrokes); the result is a slice of it.
        s2: Second string (the recognised text).
        m: Number of leading characters of ``s1`` to consider. Values larger
            than ``len(s1)`` are treated as ``len(s1)``.
        n: Number of leading characters of ``s2`` to consider. Values larger
            than ``len(s2)`` are treated as ``len(s2)``.
        min_len: Shortest match that is worth reporting (default 4).

    Returns:
        ``[match]`` when the longest shared substring has at least ``min_len``
        characters, otherwise ``[]``.
    """
    head1 = s1[:max(m, 0)]
    head2 = s2[:max(n, 0)]

    # autojunk=False: the default heuristic ignores characters that are
    # frequent in long inputs, which would silently drop real matches.
    matcher = SequenceMatcher(None, head1, head2, autojunk=False)
    match = matcher.find_longest_match(0, len(head1), 0, len(head2))
    longest_match = head1[match.a:match.a + match.size]

    if len(longest_match) < min_len:
        return []
    return [longest_match]

In [29]:
# Return text with highlights for matched patterns
def highlight_patterns(text, patterns):
    """Colour every match of ``patterns`` in ``text`` red.

    Each pattern is a regular expression matched case-insensitively against
    the original ``text``, so the colour codes inserted for one pattern can
    never be matched by another. Empty patterns are ignored, and a pattern
    that is not a valid regular expression is matched literally.

    Args:
        text: The string to search.
        patterns: Iterable of regular-expression strings.

    Returns:
        A ``(highlighted_text, highlights)`` tuple. ``highlighted_text`` is
        ``text`` with every matched span wrapped in ``Fore.RED`` /
        ``Fore.RESET`` (overlapping spans are merged into one).
        ``highlights`` lists the matched substrings, grouped in pattern order.
    """
    highlights = []
    spans = []

    for pattern in patterns:
        # An empty pattern matches between every character; skip it.
        if not pattern:
            continue

        try:
            regex = re.compile(pattern, flags=re.I)
        except re.error:
            # Not a valid regex (e.g. an unbalanced parenthesis in matched
            # keystrokes): fall back to a literal search.
            regex = re.compile(re.escape(pattern), flags=re.I)

        for match in regex.finditer(text):
            if match.end() > match.start():  # ignore zero-width matches
                highlights.append(match.group(0))
                spans.append(match.span())

    # Merge overlapping spans so the colour codes are never nested.
    merged = []
    for start, end in sorted(spans):
        if merged and start < merged[-1][1]:
            merged[-1][1] = max(merged[-1][1], end)
        else:
            merged.append([start, end])

    pieces = []
    cursor = 0
    for start, end in merged:
        pieces.append(text[cursor:start])
        pieces.append(Fore.RED + text[start:end] + Fore.RESET)
        cursor = end
    pieces.append(text[cursor:])

    return ''.join(pieces), highlights

In [30]:
# Display an image with a preceding print
def show_image(text, image_name, instance):
    """Print a caption and display an image from the instance's image folder.

    Args:
        text: Caption prefix printed before the image name (e.g. "Start").
        image_name: File name of the image including its extension
            (e.g. "14:31.png").
        instance: Name of the sub-folder of ``data/keylogging`` holding the
            ``images`` directory.

    If the image cannot be read or is in a format that cannot be embedded,
    a "Could not find ..." line is printed instead of raising.
    """
    print(f"{text} Image: {image_name}")

    # image_name already carries its extension, so nothing is appended here.
    image_path = f'data/keylogging/{instance}/images/{image_name}'

    try:
        display(Image(filename=image_path))
    except (OSError, ValueError):
        # OSError covers a missing/unreadable file; ValueError is raised for
        # formats that cannot be embedded.
        print(f"Could not find {image_name}\n")

In [31]:
# Pretty print the clustered dataframe
def pprint(df, instance, patterns):
    """Print a readable report for every row of a clustered dataframe.

    For each row the covered timeframe and the classification are printed.
    For rows whose 'Target' is not 0, the first and last screenshots of the
    timeframe are shown and the keystrokes are printed with highlights for
    ``patterns`` plus the longest substring shared by the keystrokes and the
    recognised text.

    Args:
        df: Clustered dataframe with 'Timestamp' (values joined by ', '),
            'Keystrokes', 'Text' and 'Target' columns.
        instance: Name of the sub-folder of ``data/keylogging`` that holds
            the images.
        patterns: Regular-expression strings to highlight. The list is not
            modified.
    """
    target_labels = {
        1: "Found: Login Page\n",
        2: "Found: Credit Card Page\n",
    }

    for i in range(len(df)):
        print("=" * 127 + "\n")
        row = df.iloc[i]

        timestamps = str(row['Timestamp']).split(', ')
        first_timestamp, last_timestamp = timestamps[0], timestamps[-1]
        print(f"Timeframe: {first_timestamp} - {last_timestamp}\n")

        target = row['Target']

        if target == 0:
            print("Found: Nothing of Interest")
            continue

        label = target_labels.get(target)
        if label is not None:
            print(label)

        # Screenshots are named after the timestamp at which they were taken.
        show_image("Start", f"{first_timestamp}.png", instance)
        show_image("End", f"{last_timestamp}.png", instance)

        # Missing cells (NaN) are treated as empty strings.
        keystrokes = row['Keystrokes']
        keystrokes = '' if pd.isna(keystrokes) else str(keystrokes)
        text = row['Text']
        text = '' if pd.isna(text) else str(text)

        # Build a per-row pattern list: the caller's list must not grow, and a
        # match found for one row must not leak into the following rows. The
        # shared substring is literal text, so it is escaped before being
        # used as a regular expression.
        row_patterns = list(patterns)
        row_patterns += [
            re.escape(match)
            for match in find_longest_match(keystrokes, text, len(keystrokes), len(text))
        ]
        keystrokes_highlighted, highlights = highlight_patterns(keystrokes, row_patterns)

        print(f"Keystrokes: \n{keystrokes_highlighted}")
        print(f"Highlights: \n{highlights}\n")

In [32]:
# Give summary of classification alongside keystrokes and highlights
def summarize(instance, patterns):
    """Load, cluster and pretty print the data recorded for one instance.

    Args:
        instance: Name of the sub-folder of ``data/keylogging`` to summarise.
        patterns: Patterns to highlight in the printed keystrokes.

    A FileNotFoundError raised anywhere in the pipeline is reported with a
    short message instead of being propagated.
    """
    try:
        clustered = get_df_clustered(get_df_joined(instance))
        pprint(clustered, instance, patterns)
    except FileNotFoundError:
        print("Error: Could not find file")

# Summarize Test Example

In [43]:
# Build a small example dataframe, one list per column:
# two rows classified as target 1, two as target 2 and one as target 0
df_example = pd.DataFrame({
    'Timestamp': ['14:31', '14:36', '14:41', '14:46', '14:51'],
    'Keystrokes': ['Hello, my name ', 'is Bob.', 'This sentence ', 'is unrelated.', 'Word.'],
    'Text': ['Hello', 'xyz', 'xyz', 'xyz', 'word'],
    'Target': [1, 1, 2, 2, 0],
})
df_example

Unnamed: 0,Timestamp,Keystrokes,Text,Target
0,14:31,"Hello, my name",Hello,1
1,14:36,is Bob.,xyz,1
2,14:41,This sentence,xyz,2
3,14:46,is unrelated.,xyz,2
4,14:51,Word.,word,0


In [44]:
# Cluster the example dataframe: consecutive rows that share the same
# 'Target' are merged into a single row (here rows 0-1, rows 2-3 and row 4)
df_clustered = get_df_clustered(df_example)
df_clustered

Unnamed: 0,Timestamp,Keystrokes,Text,Target
0,"14:31, 14:36","Hello, my name is Bob.","Hello, xyz",1
2,"14:41, 14:46",This sentence is unrelated.,"xyz, xyz",2
4,14:51,Word.,word,0


In [45]:
# Pretty print the clustered example dataframe, highlighting the listed
# patterns (matched case-insensitively) in the keystrokes of each cluster
pprint(df_clustered, "keylogging-1", ["bob", "sentence", "word", "name"])


Timeframe: 14:31 - 14:36

Found: Login Page

Start Image: 14:31.png
Could not find 14:31.png

End Image: 14:36.png
Could not find 14:36.png

Keystrokes: 
[31mHello, [39mmy [31mname[39m is [31mBob[39m.
Highlights: 
['Bob', 'name', 'Hello, ']


Timeframe: 14:41 - 14:46

Found: Credit Card Page

Start Image: 14:41.png
Could not find 14:41.png

End Image: 14:46.png
Could not find 14:46.png

Keystrokes: 
This [31msentence[39m is unrelated.
Highlights: 
['sentence']


Timeframe: 14:51 - 14:51

Found: Nothing of Interest


# Summarize Instances

In [163]:
# Specify instance name: the sub-folder of data/keylogging/ that holds
# keystrokes.csv and classification.csv (placeholder, fill in before running)
instance = ""

# Specify patterns to highlight: regular expressions, matched
# case-insensitively (replace the empty-string placeholders)
patterns = ["", ""]

# Summarize instance with highlighted patterns
# (prints an error message when the instance's files cannot be found)
summarize(instance, patterns)

Error: Could not find file
