In [1]:
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'

from datetime import datetime

import re
from colorama import init, Fore

from IPython.display import Image, display

In [2]:
# Get joined keystroke and classification data
def get_df_joined(instance):
    # KEYSTROKE DATA
    # COL 1: 'Timestamp'
    # COL 2: 'Keystrokes'
    df_keystrokes = pd.read_csv(f'data/keylogging/{instance}/keystrokes.csv')

    # CLASSIFICATION DATA
    # COL 1: 'Timestamp'
    # COL 3: 'Text'
    # COL 4: 'Target'
    df_classification = pd.read_csv(f'data/keylogging/{instance}/classification.csv')

    # JOINED DATA
    # COL 1: 'Timestamp'
    # COL 2: 'Keystrokes'
    # COL 4: 'Text'
    # COL 5: 'Target'
    df_joined = df_classification.join(df_keystrokes.set_index('Timestamp'), on='Timestamp')
    
    return df_joined

In [3]:
# Cluster subsequent rows where target is the same
def cluster(df):
    result = [df.iloc[0]]
    
    for i in range(1, len(df)):
        tail = result[-1]
        row = df.iloc[i]
        
        if (tail['Target'] == row['Target']):
            tail['Timestamp'] += ', ' + row['Timestamp']
            tail['Keystrokes'] += row['Keystrokes']
            tail['ImageName'] += ', ' + row['ImageName']
            tail['Text'] += ', ' + row['Text']
        else:
            result.append(row)
        
    return pd.DataFrame(result)

In [4]:
# Return text with highlights for matched patterns
def match_patterns(text, patterns):
    highlights = []
    
    for pattern in patterns:
        text = re.sub(pattern, Fore.RED + r'\1' + Fore.RESET, text, flags=re.I)
        highlights += re.findall(pattern, text, flags=re.I)

    return text, highlights

# Display an image with a preceding print
def show_image(text, image_name, instance):
    try:
        print(f"{text} Image: {image_name}")
        display(Image(filename=f'data/keylogging/{instance}/images/{image_name}.csv'))
    except ValueError:
        print(f"Could not find {image_name}\n")

# Pretty print the clustered dataframe
def pprint(df, instance, patterns):
    for i in range(len(df)):
        print("=" * 127 + "\n")
        row = df.iloc[i]
        
        timestamps = row['Timestamp'].split(', ')
        print(f"Timeframe: {timestamps[0]} - {timestamps[-1]}\n")
        
        target = row['Target']
        
        if (target == 0):
            print("Found: Nothing of Interest")
        else:
            if (target == 1):
                print("Found: Login Page\n")
            
            if (target == 2): # Else
                print("Found: Credit Card Page\n")
            
            image_names = row['ImageName'].split(', ')
            image_name_start = image_names[0]
            image_name_end = image_names[-1]

            print(f"All Images: {row['ImageName']}\n")
                  
            show_image("Start", image_name_start, instance)
            show_image("End", image_name_end, instance)

            keystrokes = row['Keystrokes']
            keystrokes_highlighted, highlights = match_patterns(keystrokes, patterns)
            print(f"Keystrokes: \n{keystrokes_highlighted}")
            print(f"Highlights: \n{highlights}\n")

In [5]:

def summarize(instance, patterns):
    df_joined = get_df_joined(instance)
    df_clustered = cluster(df_joined)
    pprint(df_clustered, instance, patterns)

In [6]:
# Set up example dataframe
df_example = pd.DataFrame([['14:31', 'Hello, my name ', 'image_1.png', 'Transcribed image text 1.', 1], 
                           ['14:36', 'is Bob.', 'image_2.png', 'Transcribed image text 2.', 1], 
                           ['14:41', 'This sentence ', 'image_3.png', 'Transcribed image text 3.', 2], 
                           ['14:46', 'is unrelated.', 'image_4.png', 'Transcribed image text 4.', 2], 
                           ['14:51', 'Word.', 'image_5.png', 'Transcribed image text 5.', 0]], 
                          columns=['Timestamp', 'Keystrokes', 'ImageName', 'ImageText', 'Target'])
df_example

Unnamed: 0,Timestamp,Keystrokes,ImageName,ImageText,Target
0,14:31,"Hello, my name",image_1.png,Transcribed image text 1.,1
1,14:36,is Bob.,image_2.png,Transcribed image text 2.,1
2,14:41,This sentence,image_3.png,Transcribed image text 3.,2
3,14:46,is unrelated.,image_4.png,Transcribed image text 4.,2
4,14:51,Word.,image_5.png,Transcribed image text 5.,0


In [7]:
# Cluster example dataframe
df_clustered = cluster(df_example)
df_clustered

Unnamed: 0,Timestamp,Keystrokes,ImageName,ImageText,Target
0,"14:31, 14:36","Hello, my name is Bob.","image_1.png, image_2.png","Transcribed image text 1., Transcribed image t...",1
2,"14:41, 14:46",This sentence is unrelated.,"image_3.png, image_4.png","Transcribed image text 3., Transcribed image t...",2
4,14:51,Word.,image_5.png,Transcribed image text 5.,0


In [8]:
# Pretty print clustered dataframe
pprint(df_clustered, "keylogging-1", [r'(bob)', r'(sentence)', r'(word)', r'(name)'])


Timeframe: 14:31 - 14:36

Found: Login Page

All Images: image_1.png, image_2.png

Start Image: image_1.png
Could not find image_1.png

End Image: image_2.png
Could not find image_2.png

Keystrokes: 
Hello, my [31mname[39m is [31mBob[39m.
Highlights: 
['Bob', 'name']


Timeframe: 14:41 - 14:46

Found: Credit Card Page

All Images: image_3.png, image_4.png

Start Image: image_3.png
Could not find image_3.png

End Image: image_4.png
Could not find image_4.png

Keystrokes: 
This [31msentence[39m is unrelated.
Highlights: 
['sentence']


Timeframe: 14:51 - 14:51

Found: Nothing of Interest
