# Required Imports and Setup

In [1]:
import csv
import pandas as pd
import numpy as np 
from sklearn import tree      # for decision trees
from sklearn import ensemble  # for random forests
from matplotlib import pyplot as plt
import matplotlib.cm as cm

# different imports for different versions of scikit-learn
try: 
    from sklearn.model_selection import cross_val_score 
except ImportError:
    try:
        from sklearn.cross_validation import cross_val_score
    except:
        print("No cross_val_score!")
        
# read typing data into pandas dataframe
# df = pd.read_csv("new_typing_data.csv", error_bad_lines=False)

# 1. Read typing data into a list

First read data from csv into a list

In [2]:
#
# readcsv is a starting point - it returns the rows from a standard csv file...
#
def readcsv( csv_file_name ):
    """ readcsv takes as
         + input:  csv_file_name, the name of a csv file
        and returns
         + output: a list of lists, each inner list is one row of the csv
           all data items are strings; empty cells are empty strings
    """
    try:
        csvfile = open( csv_file_name, newline='' )  # open for reading
        csvrows = csv.reader( csvfile )              # creates a csvrows object

        all_rows = []                                # we need to read the csv file
        for row in csvrows:                          # into our own Python data structure
            all_rows.append( row )                   # adds only the word to our list

        del csvrows                                  # acknowledge csvrows is gone!
        csvfile.close()                              # and close the file
        return all_rows                              # return the list of lists

    except FileNotFoundError as e:
        print("File not found: ", e)
        return []
    
akshay_data = readcsv("akshay.csv")

mummy_data = readcsv("mummy.csv")

test_data = readcsv("test.csv")[1]

Clean data so that it correctly starts with the "." character and doesn't include the "esc" character at the end

In [3]:
for i in range(len(akshay_data)):
    akshay_data[i] = akshay_data[i][3:-1]
    
for i in range(len(mummy_data)):
    mummy_data[i] = mummy_data[i][3:-1]
    
test_data = test_data[i][3:-1]

Let's see what the first 2 rows of Akshay's and Mummy's data look like

In [4]:
print("first 2 rows of akshay's data")
print(akshay_data[0], "\n")
print(akshay_data[1], "\n")

print("first 2 rows of mummy's data")
print(mummy_data[0], "\n")
print(mummy_data[1], "\n")

first 2 rows of akshay's data
["('.', 6.011002074, 'pressed')", "('.', 6.123425249, 'released')", "('t', 6.210437133, 'pressed')", "('t', 6.282306537, 'released')", "('i', 6.290148006, 'pressed')", "('i', 6.358390612, 'released')", "('e', 6.376343242, 'pressed')", "('e', 6.444386616, 'released')", "('5', 6.568501488, 'pressed')", "('5', 6.640304912, 'released')", "('Key.caps_lock', 6.848932655, 'pressed')", "('Key.caps_lock', 6.848944598, 'released')", "('R', 6.930270748, 'pressed')", "('Key.caps_lock', 6.989199045, 'pressed')", "('Key.caps_lock', 6.989210938, 'released')", "('r', 6.993649665, 'released')", "('o', 7.171154254, 'pressed')", "('o', 7.27922604, 'released')", "('a', 7.291250716, 'pressed')", "('a', 7.367345513, 'released')", "('n', 7.441349145, 'pressed')", "('l', 7.521346095, 'pressed')", "('n', 7.545366845, 'released')", "('e', 7.605384269, 'pressed')", "('l', 7.623357665, 'released')", "('e', 7.669298627, 'released')"] 

["('.', 8.38986342, 'pressed')", "('.', 8.4902852

Visualise the order in which the keys were pressed and released for the first 3 rows of both datasets

In [5]:
stringlist = []

stringlist += ["akshay's data"]

for i in range(3):
    stringlist += [[akshay_data[i][x][2] for x in range(len(akshay_data[i]))]]

stringlist += [""]
stringlist += ["mummy's data"]

for i in range(3):
    stringlist += [[akshay_data[i][x][2] for x in range(len(mummy_data[i]))]]

for row in stringlist:
    output = ""
    for character in row:
        output += character
    print(output)



akshay's data
..ttiiee55KKRKKrooaanlnele
..ttiiee55KKRKKroaoanlnlee
..ttiiee55KKRKKroaoanlnlee

mummy's data
..ttiiee55KKRKKrooaanlnele
..ttiiee55KKRKKroaoanlnlee
..ttiiee55KKRKKroaoanlnlee


Convert str in rows to tuples

In [6]:
# first for akshay's data
new_data = []

for row in akshay_data:
    new_data_row = []
    for datapoint in row:
        new_data_row.append(eval(datapoint))
        print(datapoint)
    new_data.append(new_data_row)
    
akshay_data = new_data 

# then for mummy's data
new_data = []

for row in mummy_data:
    new_data_row = []
    for datapoint in row:
        new_data_row.append(eval(datapoint))
        print(datapoint)
    new_data.append(new_data_row)
    
mummy_data = new_data 

('.', 6.011002074, 'pressed')
('.', 6.123425249, 'released')
('t', 6.210437133, 'pressed')
('t', 6.282306537, 'released')
('i', 6.290148006, 'pressed')
('i', 6.358390612, 'released')
('e', 6.376343242, 'pressed')
('e', 6.444386616, 'released')
('5', 6.568501488, 'pressed')
('5', 6.640304912, 'released')
('Key.caps_lock', 6.848932655, 'pressed')
('Key.caps_lock', 6.848944598, 'released')
('R', 6.930270748, 'pressed')
('Key.caps_lock', 6.989199045, 'pressed')
('Key.caps_lock', 6.989210938, 'released')
('r', 6.993649665, 'released')
('o', 7.171154254, 'pressed')
('o', 7.27922604, 'released')
('a', 7.291250716, 'pressed')
('a', 7.367345513, 'released')
('n', 7.441349145, 'pressed')
('l', 7.521346095, 'pressed')
('n', 7.545366845, 'released')
('e', 7.605384269, 'pressed')
('l', 7.623357665, 'released')
('e', 7.669298627, 'released')
('.', 8.38986342, 'pressed')
('.', 8.490285249, 'released')
('t', 8.575306964, 'pressed')
('t', 8.651273396, 'released')
('i', 8.673271067, 'pressed')
('i', 8.7

# 2. Feature Engineering

Extract the following features from the data:
a. Key hold time (total time that a key is held down)
b. Keydown-Keydown time (time between pressing of consecutive keys)
c. Keyup-Keydown time (time between the release of one key and the press of next key)


![features.png](attachment:features.png)

### a. First finding key hold times for a given data row:

In [7]:
def key_hold_times(datarow):
    output = []
    
    # loop through all tuples in data row
    for i in range(len(datarow)):
        # find only pairs for keys that are pressed
        if datarow[i][2] == "pressed":
            current_char = datarow[i][0].lower()
            # start looking for other half of pair from next index onwards
            for j in range(i + 1, len(datarow)):
                next_char = datarow[j][0].lower()
                # if they match, record differences in time
                if next_char == current_char:
                    output.append(("hold_"+ current_char, float(datarow[j][1]) - float(datarow[i][1])))
                    break
                    
    return output

Loop and print differences in key hold times between different data samples to make sure that they are small

In [8]:
# first for akshay's data
hold0 = key_hold_times(akshay_data[0])

hold1 = key_hold_times(akshay_data[1])

hold2 = key_hold_times(akshay_data[2])

print("differences in hold times between sample 0 and sample 1 in Akshay's data")
for i in range(len(hold0)):
    print(hold0[i][0], np.abs(round(hold0[i][1] - hold1[i][1], 5)))
    
print("\ndifferences in hold times between sample 1 and sample 2 in Akshay's data")
for i in range(len(hold0)):
    print(hold1[i][0], np.abs(round(hold1[i][1] - hold2[i][1], 5)))
    
print()
    
# then for mummy's data
hold0 = key_hold_times(mummy_data[0])

hold1 = key_hold_times(mummy_data[1])

hold2 = key_hold_times(mummy_data[2])

print("differences in hold times between sample 0 and sample 1 in Mummy's data")
for i in range(len(hold0)):
    print(hold0[i][0], np.abs(round(hold0[i][1] - hold1[i][1], 5)))
    
print("\ndifferences in hold times between sample 1 and sample 2 in Mummy's data")
for i in range(len(hold0)):
    print(hold1[i][0], np.abs(round(hold1[i][1] - hold2[i][1], 5)))

differences in hold times between sample 0 and sample 1 in Akshay's data
hold_. 0.012
hold_t 0.0041
hold_i 0.00833
hold_e 0.00801
hold_5 7e-05
hold_key.caps_lock 0.0
hold_r 0.02424
hold_key.caps_lock 0.0
hold_o 0.03309
hold_a 0.01938
hold_n 0.00071
hold_l 0.01002
hold_e 0.01616

differences in hold times between sample 1 and sample 2 in Akshay's data
hold_. 0.00386
hold_t 0.00012
hold_i 0.00382
hold_e 0.012
hold_5 0.00388
hold_key.caps_lock 1e-05
hold_r 0.01132
hold_key.caps_lock 0.0
hold_o 0.02083
hold_a 0.00062
hold_n 0.01138
hold_l 0.00794
hold_e 0.01592

differences in hold times between sample 0 and sample 1 in Mummy's data
hold_. 0.01216
hold_t 0.00393
hold_i 0.00388
hold_e 0.00028
hold_5 0.00378
hold_key.caps_lock 0.0
hold_r 0.00351
hold_key.caps_lock 0.0
hold_o 0.00058
hold_a 0.00422
hold_n 0.0042
hold_l 0.00019
hold_e 0.0039

differences in hold times between sample 1 and sample 2 in Mummy's data
hold_. 0.01557
hold_t 0.00408
hold_i 0.01659
hold_e 0.00448
hold_5 0.00019
hold_k

Great, now we can confirm that hold times are a repeatable charactericstic of the way people type

### b. Next finding keydown-keydown times for a given data row:

In [9]:
def keydown_times(datarow):
    output = []
    
    # loop through all tuples in data row and keep only those that are "pressed"
    pressed = [x for x in datarow if x[2] == "pressed"]
    
    # find time between keydown of current key and keydown of next key
    for i in range(len(pressed) - 1):
        output.append(("dd_" + pressed[i][0] + pressed[i + 1][0], pressed[i + 1][1] - pressed[i][1]))
        
    return output

Loop and print differences in keydown-keydown times between different data samples to make sure that they are small

In [10]:
# first for akshay's data
keydown0 = keydown_times(akshay_data[0])

keydown1 = keydown_times(akshay_data[1])

keydown2 = keydown_times(akshay_data[2])

print("differences in keydown-keydown times between sample 0 and sample 1 in Akshay's data")
for i in range(len(keydown0)):
    print(keydown0[i][0], np.abs(round(keydown0[i][1] - keydown1[i][1], 5)))
    
print("\ndifferences in keydown-keydown times between sample 1 and sample 2 in Mummy's")
for i in range(len(keydown0)):
    print(keydown1[i][0], np.abs(round(keydown1[i][1] - keydown2[i][1], 5)))
    
print()

# first for mummy's data
keydown0 = keydown_times(mummy_data[0])

keydown1 = keydown_times(mummy_data[1])

keydown2 = keydown_times(mummy_data[2])

print("differences in keydown-keydown times between sample 0 and sample 1 in Akshay's")
for i in range(len(keydown0)):
    print(keydown0[i][0], np.abs(round(keydown0[i][1] - keydown1[i][1], 5)))
    
print("\ndifferences in keydown-keydown times between sample 1 and sample 2 in Mummy's")
for i in range(len(keydown0)):
    print(keydown1[i][0], np.abs(round(keydown1[i][1] - keydown2[i][1], 5)))


differences in keydown-keydown times between sample 0 and sample 1 in Akshay's data
dd_.t 0.01399
dd_ti 0.01825
dd_ie 0.01025
dd_e5 0.22685
dd_5Key.caps_lock 0.0365
dd_Key.caps_lockR 0.00855
dd_RKey.caps_lock 0.02839
dd_Key.caps_locko 0.02642
dd_oa 0.0633
dd_an 0.01954
dd_nl 0.00013
dd_le 0.01596

differences in keydown-keydown times between sample 1 and sample 2 in Mummy's
dd_.t 0.00107
dd_ti 0.00999
dd_ie 0.01812
dd_e5 0.09201
dd_5Key.caps_lock 0.02943
dd_Key.caps_lockR 0.02339
dd_RKey.caps_lock 0.01142
dd_Key.caps_locko 0.07128
dd_oa 0.03514
dd_an 0.02603
dd_nl 0.01945
dd_le 0.00387

differences in keydown-keydown times between sample 0 and sample 1 in Akshay's
dd_.t 0.08202
dd_ti 0.01401
dd_ie 0.0348
dd_e5 0.0669
dd_5Key.caps_lock 0.01873
dd_Key.caps_lockR 0.01231
dd_RKey.caps_lock 0.0408
dd_Key.caps_locko 0.07243
dd_oa 0.0303
dd_an 0.04145
dd_nl 0.05508
dd_le 0.06534

differences in keydown-keydown times between sample 1 and sample 2 in Mummy's
dd_.t 0.01719
dd_ti 0.00993
dd_ie 0.

Awesome, now we can confirm that keydown-keydown times are a repeatable charactericstic of the way people type

### c. Finally finding keyup-keydown times for a given data row:

In [11]:
# def keyup_keydown_times(datarow):
#     output = []
    
#     # loop through all tuples in data row and keep only pairs that are consectutively "released" then "pressed"
#     up_down = []
    
#     for i in range(len(datarow) - 1):
#         if datarow[i][2] == "released" and datarow[i+1][2] == "pressed":
#             up_down.append(datarow[i])
#             up_down.append(datarow[i+1])
            
#     # find time between keyup of current key and keydown of next key
#     for i in range(len(up_down) - 1):
#         output.append(("ud_" + up_down[i][0] + up_down[i + 1][0], up_down[i + 1][1] - up_down[i][1]))    
    
#     return output

# keyup_keydown_times(akshay_data[3]) 

Loop and print differences in keyup-keydown times between different data samples to make sure that they are small

In [12]:
# keyupdown0 = keyup_keydown_times(data[0])

# keyupdown1 = keyup_keydown_times(data[1])

# keyupdown2 = keyup_keydown_times(data[2])

# keyupdown3 = keyup_keydown_times(data[3])

# print([print(x) for x in keyupdown2])

# print("\n\n")

# print([print(x) for x in keyupdown3])

# # print("differences in keydown-keydown times between sample 0 and sample 1")
# # for i in range(len(keyupdown0)):
# #     print(keyupdown0[i][0], np.abs(round(keyupdown0[i][1] - keyupdown1[i][1], 5)))
    
# # print("\ndifferences in keydown-keydown times between sample 1 and sample 2")
# # for i in range(len(keyupdown0)):
# #     print(keyupdown1[i][0], np.abs(round(keyupdown1[i][1] - keyupdown2[i][1], 5)))

### Putting it all together and extracting features for all rows in data:

In [13]:
# first for akshay's data
# extract features from all data rows and store in tupled_data[]
tupled_data = [] # data with descriptions

# loop through all rows in data
for row in akshay_data:
    # extract features
    hold = key_hold_times(row)
    keydown = keydown_times(row)

    # store features in single list
    tupled_data.append(hold + keydown)
    
# final list of lists that will be inputted into a pandas dataframe
# stripped of feature descriptions
akshay_input_data = []

for row in tupled_data:
    new_row = []
    for item in row:
        new_row += [item[1]]
            
    akshay_input_data += [new_row]
    
# then for mummy's data
# extract features from all data rows and store in tupled_data[]
tupled_data = [] # data with descriptions

# loop through all rows in data
for row in mummy_data:
    # extract features
    hold = key_hold_times(row)
    keydown = keydown_times(row)

    # store features in single list
    tupled_data.append(hold + keydown)
    
# final list of lists that will be inputted into a pandas dataframe
# stripped of feature descriptions
mummy_input_data = []

for row in tupled_data:
    new_row = []
    for item in row:
        new_row += [item[1]]
            
    mummy_input_data += [new_row]

Keep track of feature names of model in features[]

In [14]:
features = [tupled_data[0][x][0] for x in range(len(tupled_data[0]))]
# add label for classification
features += ["akshay?"]

print("model's features are:\n", features)

model's features are:
 ['hold_.', 'hold_t', 'hold_i', 'hold_e', 'hold_5', 'hold_key.caps_lock', 'hold_r', 'hold_key.caps_lock', 'hold_o', 'hold_a', 'hold_n', 'hold_l', 'hold_e', 'dd_.t', 'dd_ti', 'dd_ie', 'dd_e5', 'dd_5Key.caps_lock', 'dd_Key.caps_lockR', 'dd_RKey.caps_lock', 'dd_Key.caps_locko', 'dd_oa', 'dd_an', 'dd_nl', 'dd_le', 'akshay?']


### Label data by person who typed it

In [15]:
for row in akshay_input_data:
    row += [1]
    
for row in mummy_input_data:
    row += [0]

# 3. Loading data into Pandas dataframe

In [18]:
# combine input lists and then create a dataframe from them
input_data = akshay_input_data + mummy_input_data

print(len(input_data))

df = pd.DataFrame(input_data)

41


Let's add feature names as columns and see what the first few rows look like

In [24]:
df.columns = features

df.head()

Unnamed: 0,hold_.,hold_t,hold_i,hold_e,hold_5,hold_key.caps_lock,hold_r,hold_key.caps_lock.1,hold_o,hold_a,...,dd_e5,dd_5Key.caps_lock,dd_Key.caps_lockR,dd_RKey.caps_lock,dd_Key.caps_locko,dd_oa,dd_an,dd_nl,dd_le,akshay?
0,0.112423,0.071869,0.068243,0.068043,0.071803,1.2e-05,0.063379,1.2e-05,0.108072,0.076095,...,0.192158,0.280431,0.081338,0.058928,0.181955,0.120096,0.150098,0.079997,0.084038,1
1,0.100422,0.075966,0.059916,0.07605,0.071875,7e-06,0.087623,1.2e-05,0.074978,0.095479,...,0.41901,0.316935,0.089889,0.087317,0.155538,0.056795,0.130554,0.079863,0.099999,1
2,0.096557,0.075849,0.056091,0.064054,0.067992,1.3e-05,0.09894,1.4e-05,0.095808,0.096097,...,0.327002,0.287505,0.0665,0.098738,0.08426,0.091933,0.156586,0.06041,0.103868,1
3,0.104163,0.07202,0.060119,0.06806,0.075876,9e-06,0.063919,6e-06,0.067999,0.096095,...,0.211005,0.287728,0.056666,0.070927,0.11768,0.071932,0.126244,0.079796,0.083964,1
4,0.092102,0.076233,0.056006,0.072246,0.072044,1e-05,0.091873,1.2e-05,0.079911,0.088521,...,0.201838,0.272471,0.054487,0.082018,0.106111,0.083815,0.123122,0.080226,0.099094,1


Double check that columns have the correct datatypes

In [27]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41 entries, 0 to 40
Data columns (total 26 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   hold_.              41 non-null     float64
 1   hold_t              41 non-null     float64
 2   hold_i              41 non-null     float64
 3   hold_e              41 non-null     float64
 4   hold_5              41 non-null     float64
 5   hold_key.caps_lock  41 non-null     float64
 6   hold_r              41 non-null     float64
 7   hold_key.caps_lock  41 non-null     float64
 8   hold_o              41 non-null     float64
 9   hold_a              41 non-null     float64
 10  hold_n              41 non-null     float64
 11  hold_l              41 non-null     float64
 12  hold_e              41 non-null     float64
 13  dd_.t               41 non-null     float64
 14  dd_ti               41 non-null     float64
 15  dd_ie               41 non-null     float64
 16  dd_e5     