# Required Imports and Setup

In [1]:
import ast
import csv

import matplotlib.cm as cm
import numpy as np 
import pandas as pd
from matplotlib import pyplot as plt
from sklearn import ensemble  # for random forests
from sklearn import tree      # for decision trees

# different imports for different versions of scikit-learn
try:
    from sklearn.model_selection import cross_val_score
except ImportError:
    # fall back to the module that other scikit-learn versions use
    try:
        from sklearn.cross_validation import cross_val_score
    except ImportError:
        # only a failed import is expected here; anything else
        # (e.g. KeyboardInterrupt) should still propagate
        print("No cross_val_score!")
        
# read typing data into pandas dataframe
# df = pd.read_csv("new_typing_data.csv", error_bad_lines=False)

# 1. Read typing data into a list

First read data from csv into a list

In [2]:
#
# readcsv is a starting point - it returns the rows from a standard csv file...
#
def readcsv( csv_file_name ):
    """ readcsv takes as
         + input:  csv_file_name, the name of a csv file
        and returns
         + output: a list of lists, each inner list is one row of the csv
           all data items are strings; empty cells are empty strings

        If the file does not exist, a message is printed and an empty
        list is returned instead of raising.
    """
    try:
        # the with-block closes the file even if reading a row raises
        with open( csv_file_name, newline='' ) as csvfile:
            # materialise the reader while the file is still open
            return list( csv.reader( csvfile ) )
    except FileNotFoundError as e:
        print("File not found: ", e)
        return []
    
# load every row of the typing-data csv as lists of strings
# (readcsv returns an empty list if the file cannot be found)
data = readcsv("test_data.csv")

Clean data so that it correctly starts with the "." character

In [3]:
# keep only the fields between the first three and the last one of every row;
# slice assignment updates the existing list object in place
data[:] = [fields[3:-1] for fields in data]

In [4]:
# show each sample on its own line, separated by a blank line
for sample in data:
    print(sample, end="\n\n")

["('.', 6.011002074, 'pressed')", "('.', 6.123425249, 'released')", "('t', 6.210437133, 'pressed')", "('t', 6.282306537, 'released')", "('i', 6.290148006, 'pressed')", "('i', 6.358390612, 'released')", "('e', 6.376343242, 'pressed')", "('e', 6.444386616, 'released')", "('5', 6.568501488, 'pressed')", "('5', 6.640304912, 'released')", "('Key.caps_lock', 6.848932655, 'pressed')", "('Key.caps_lock', 6.848944598, 'released')", "('R', 6.930270748, 'pressed')", "('Key.caps_lock', 6.989199045, 'pressed')", "('Key.caps_lock', 6.989210938, 'released')", "('r', 6.993649665, 'released')", "('o', 7.171154254, 'pressed')", "('o', 7.27922604, 'released')", "('a', 7.291250716, 'pressed')", "('a', 7.367345513, 'released')", "('n', 7.441349145, 'pressed')", "('l', 7.521346095, 'pressed')", "('n', 7.545366845, 'released')", "('e', 7.605384269, 'pressed')", "('l', 7.623357665, 'released')", "('e', 7.669298627, 'released')"]

["('.', 8.38986342, 'pressed')", "('.', 8.490285249, 'released')", "('t', 8.5753

Visualise the order in which the keys were pressed and released

In [5]:
# character index 2 of each raw event string is the first character of its
# key name (the string starts with "('"), so one character stands for one event
stringlist = [[event[2] for event in row] for row in data]

# one line per sample, each followed by a blank line
for row in stringlist:
    output = "".join(row)
    print(output)
    print()



..ttiiee55KKRKKrooaanlnele

..ttiiee55KKRKKroaoanlnlee

..ttiiee55KKRKKroaoanlnlee

..ttiiee55KKRRKKooaanlnele



Convert str in rows to tuples

In [6]:
new_data = []

for row in data:
    new_data_row = []
    for datapoint in row:
        # ast.literal_eval only accepts Python literals, so text coming from
        # the csv file can never be executed as code (eval would run it).
        # Entries that are no longer strings are kept unchanged so that this
        # cell can be re-run after `data` has already been converted.
        if isinstance(datapoint, str):
            new_data_row.append(ast.literal_eval(datapoint))
        else:
            new_data_row.append(datapoint)
        print(datapoint)
    new_data.append(new_data_row)

# from here on every row of `data` holds parsed values instead of strings
data = new_data

('.', 6.011002074, 'pressed')
('.', 6.123425249, 'released')
('t', 6.210437133, 'pressed')
('t', 6.282306537, 'released')
('i', 6.290148006, 'pressed')
('i', 6.358390612, 'released')
('e', 6.376343242, 'pressed')
('e', 6.444386616, 'released')
('5', 6.568501488, 'pressed')
('5', 6.640304912, 'released')
('Key.caps_lock', 6.848932655, 'pressed')
('Key.caps_lock', 6.848944598, 'released')
('R', 6.930270748, 'pressed')
('Key.caps_lock', 6.989199045, 'pressed')
('Key.caps_lock', 6.989210938, 'released')
('r', 6.993649665, 'released')
('o', 7.171154254, 'pressed')
('o', 7.27922604, 'released')
('a', 7.291250716, 'pressed')
('a', 7.367345513, 'released')
('n', 7.441349145, 'pressed')
('l', 7.521346095, 'pressed')
('n', 7.545366845, 'released')
('e', 7.605384269, 'pressed')
('l', 7.623357665, 'released')
('e', 7.669298627, 'released')
('.', 8.38986342, 'pressed')
('.', 8.490285249, 'released')
('t', 8.575306964, 'pressed')
('t', 8.651273396, 'released')
('i', 8.673271067, 'pressed')
('i', 8.7

# 2. Feature Engineering

Extract the following features from the data:
1. Key hold time (total time that a key is held down)
2. Keydown-Keydown time (time between pressing of consecutive keys)
3. Keyup-Keydown time (time between the release of one key and the press of the next key)

### 1. First finding key hold times for a given data row:

In [77]:
def key_hold_times(datarow):
    """Return how long each pressed key in one data row was held down.

    datarow is a sequence of events indexed as (key, time, action), where
    action is "pressed" or "released". Every "pressed" event is paired with
    the first later "released" event for the same key. Keys are compared
    case-insensitively, so a key pressed as 'R' still pairs with a release
    reported as 'r'.

    Returns a list of (lowercased key, release time - press time) tuples,
    ordered by press event. A press with no later release of the same key
    contributes nothing.
    """
    output = []

    for i, event in enumerate(datarow):
        # only press events start a hold
        if event[2] != "pressed":
            continue

        current_char = event[0].lower()

        # search the events after the press for the matching release
        for later in datarow[i + 1:]:
            # the action check matters: a repeated "pressed" event for the
            # same key must not be mistaken for the end of the hold
            if later[2] == "released" and later[0].lower() == current_char:
                output.append((current_char, float(later[1]) - float(event[1])))
                break

    return output

Loop and print differences in key hold times between different data samples to make sure that they are small

In [107]:
hold0 = key_hold_times(data[0])
hold1 = key_hold_times(data[1])
hold2 = key_hold_times(data[2])


def print_hold_time_differences(first, second):
    """Print each key of `first` with its absolute hold-time gap to `second`.

    first and second are lists of (key, hold time) tuples. Entries are paired
    by position; zip stops at the shorter list, so samples with a different
    number of key holds are compared up to the shorter one.
    """
    for (key, first_time), (_, second_time) in zip(first, second):
        print(key, np.abs(round(first_time - second_time, 5)))


print("differences in hold times between sample 0 and sample 1")
print_hold_time_differences(hold0, hold1)

print("\ndifferences in hold times between sample 1 and sample 2")
print_hold_time_differences(hold1, hold2)

[('.', 0.11242317499999999), ('t', 0.07186940400000008), ('i', 0.0682426060000001), ('e', 0.06804337400000016), ('5', 0.0718034240000005), ('key.caps_lock', 1.1943000000513848e-05), ('r', 0.06337891700000053), ('key.caps_lock', 1.1892999999929543e-05), ('o', 0.108071786), ('a', 0.07609479699999966), ('n', 0.10401769999999999), ('l', 0.10201157000000016), ('e', 0.0639143579999999)]
differences in hold times between sample 0 and sample 1
. 0.012
t 0.0041
i 0.00833
e 0.00801
5 7e-05
key.caps_lock 0.0
r 0.02424
key.caps_lock 0.0
o 0.03309
a 0.01938
n 0.00071
l 0.01002
e 0.01616

differences in hold times between sample 1 and sample 2
. 0.00386
t 0.00012
i 0.00382
e 0.012
5 0.00388
key.caps_lock 1e-05
r 0.01132
key.caps_lock 0.0
o 0.02083
a 0.00062
n 0.01138
l 0.00794
e 0.01592


Great, now we can confirm that hold times are a repeatable characteristic of the way people type