# Preprocessing Kirjoitustesti (Finnish typing test) data

In [1]:
import pandas as pd
import numpy as np
import csv

## Select Users

TODO: Clean BROWSER_STRING and USING_FEATURES

In [2]:
# Read the raw participant export and keep only the columns listed below.
participant_columns = [
    'PARTICIPANT_ID', 'BROWSER_STRING', 'DEVICE', 'SCREEN_W', 'SCREEN_H',
    'AGE', 'GENDER', 'HAS_TAKEN_TYPING_COURSE', 'WPM', 'ERROR_RATE',
    'NATIVE_LANGUAGE', 'KEYBOARD_TYPE', 'USING_APP', 'USING_FEATURES',
    'FINGERS', 'TIME_SPENT_TYPING', 'TYPE_TEST_LANG',
]

participants = pd.read_csv(
    'data/raw_data/typingtest_finnish_2020-06-03/raw_participants_fi_2020-06-03.csv'
)[participant_columns]

# Show the first rows and the dtypes pandas inferred for them.
participants_preview = participants.head()
print(participants_preview)
print(participants_preview.dtypes)


   PARTICIPANT_ID                                     BROWSER_STRING   DEVICE  \
0               1  Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6...  desktop   
1               2  Mozilla/5.0 (iPhone; CPU iPhone OS 12_4_1 like...   mobile   
2               3  Mozilla/5.0 (iPhone; CPU iPhone OS 12_4_1 like...   mobile   
3               4  Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6...  desktop   
4               5  Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6...  desktop   

   SCREEN_W  SCREEN_H  AGE  GENDER  HAS_TAKEN_TYPING_COURSE        WPM  \
0      1440       832   42    male                        0  55.698814   
1       320       568   40    male                        1  51.143282   
2       320       568   42    male                        0  44.001967   
3      1440       836   29  female                        0  78.853701   
4      1440       836   28  female                        0  46.384775   

   ERROR_RATE NATIVE_LANGUAGE KEYBOARD_TYPE USING_APP          USING

In [3]:
# Number of participant rows that were loaded
len(participants)

22082

In [4]:
# Select only mobile users (participants whose KEYBOARD_TYPE is 'mobile')
participants_mobile = participants.loc[participants['KEYBOARD_TYPE'] == 'mobile']

# Drop rows whose WPM equals the string 'N'
# NOTE(review): 'N' is presumably a missing-value marker in the raw export - confirm.
participants_mobile = participants_mobile.loc[participants_mobile['WPM'] != 'N']

# Change data types.
# astype() returns a new frame, so nothing is assigned into a slice of
# `participants` (the column-wise assignment used before can trigger
# pandas' SettingWithCopyWarning).
categorical_columns = ['DEVICE', 'GENDER', 'NATIVE_LANGUAGE', 'KEYBOARD_TYPE',
                       'USING_APP', 'USING_FEATURES', 'FINGERS', 'TYPE_TEST_LANG']
participants_mobile = participants_mobile.astype(
    {column: 'category' for column in categorical_columns}
)

# Keep a participant only if every condition holds:
#   0 < WPM < 200
#   ERROR_RATE < 25 (%)
#   PARTICIPANT_ID > 4 (lower IDs are testers, per the original note)
#   10 <= AGE <= 70 (most of the data is in this age group)
valid_participants = (
    (participants_mobile['WPM'] > 0.)
    & (participants_mobile['WPM'] < 200.)
    & (participants_mobile['ERROR_RATE'] < 25.)
    & (participants_mobile['PARTICIPANT_ID'] > 4)
    & participants_mobile['AGE'].between(10, 70)
)
participants_mobile = participants_mobile.loc[valid_participants]


print(participants_mobile.shape)

print(participants_mobile.head().dtypes)

(10299, 17)
PARTICIPANT_ID                int64
BROWSER_STRING               object
DEVICE                     category
SCREEN_W                      int64
SCREEN_H                      int64
AGE                           int64
GENDER                     category
HAS_TAKEN_TYPING_COURSE       int64
WPM                         float64
ERROR_RATE                  float64
NATIVE_LANGUAGE            category
KEYBOARD_TYPE              category
USING_APP                  category
USING_FEATURES             category
FINGERS                    category
TIME_SPENT_TYPING             int64
TYPE_TEST_LANG             category
dtype: object


## Save CSV

In [5]:
# Write the filtered mobile participants to CSV, leaving out the index column.
# The same path is written again further down, after participants with
# fewer than 15 test sections have been removed.
participants_mobile.to_csv(
    path_or_buf='data/processed2020/finnish/participants.csv',
    index=False,
)

# Select test_sections

In [6]:
# Read the raw test-section export into a DataFrame
test_sections = pd.read_csv(
    filepath_or_buffer='data/raw_data/typingtest_finnish_2020-06-03/raw_test_sections_fi_2020-06-03.csv'
)

In [7]:
# First rows, followed by the column dtypes, each on its own line(s)
print(test_sections.head(), test_sections.dtypes, sep='\n')

   TEST_SECTION_ID  SENTENCE_ID  PARTICIPANT_ID  \
0                1          317               1   
1                2          421               1   
2                3          545               1   
3                4           81               1   
4                5           18               1   

                                          USER_INPUT  INPUT_LENGTH  \
0  Hänen mielestään päätökset olivat silloin hyvi...            61   
1                     Norja oli vertailun paras maa.            30   
2  Yksinäisiä poikia on enemmän yläkoulussa kuin ...            58   
3                         Kohta pop up liikuntailta.            26   
4        Ei tässä tarvitse todistella yhtään mitään.            43   

   INPUT_TIME        WPM  ERROR_LENGTH  EDIT_DISTANCE  ERROR_RATE   DEVICE  
0       13906  51.776212            61              0         0.0  desktop  
1        5096  68.288854            30              0         0.0  desktop  
2       14829  46.125835            58   

## Select users

In [8]:
# IDs of the participants currently held in participants_mobile
users = participants_mobile['PARTICIPANT_ID'].to_list()

# Keep only the test sections typed by those participants ...
typed_by_selected_user = test_sections['PARTICIPANT_ID'].isin(users)
test_sections_mobile = test_sections[typed_by_selected_user]

# ... and drop the ones whose WPM equals the string 'N'
wpm_is_not_n = test_sections_mobile['WPM'].apply(lambda wpm: wpm != 'N')
test_sections_mobile = test_sections_mobile[wpm_is_not_n]

# Number of selected participants
len(users)

10299

## Remove improper test sections and users

In [9]:
# A test section is kept only if 0 < WPM < 200 and ERROR_RATE < 25.
section_wpm = test_sections_mobile['WPM']
section_error_rate = test_sections_mobile['ERROR_RATE']
plausible_sections = (
    (section_wpm > 0.)
    & (section_wpm < 200.)
    & (section_error_rate < 25.)
)
test_sections_mobile = test_sections_mobile.loc[plausible_sections]
test_sections_mobile.shape

(179576, 11)

In [10]:
# Select only users who have done at least 15 sentences.
MIN_TEST_SECTIONS = 15

# Count the remaining test sections of every participant in one pass
# (instead of filtering the whole frame once per user).
tests_per_user = test_sections_mobile['PARTICIPANT_ID'].value_counts().to_dict()

# Users with at least MIN_TEST_SECTIONS sections, in the order of `users`.
selected_users = [
    user for user in users
    if tests_per_user.get(int(user), 0) >= MIN_TEST_SECTIONS
]
# Subset of selected users with MORE than MIN_TEST_SECTIONS sections;
# their surplus sections are trimmed in a later cell.
users_over15ts = [
    user for user in selected_users
    if tests_per_user[int(user)] > MIN_TEST_SECTIONS
]

print(len(selected_users))
print(len(users_over15ts))

# Restrict both tables to the selected users.
test_sections_mobile = test_sections_mobile.loc[test_sections_mobile['PARTICIPANT_ID'].isin(selected_users)]
participants_mobile = participants_mobile.loc[participants_mobile['PARTICIPANT_ID'].isin(selected_users)]

9655
1624


In [11]:
# (rows, columns) of the test sections currently kept
test_sections_mobile.shape

(170858, 11)

In [12]:
# Remove additional test sections (if participant has more than 15)
MAX_TEST_SECTIONS = 15

# Rows belonging to the participants flagged as having surplus sections.
surplus_candidates = test_sections_mobile.loc[
    test_sections_mobile['PARTICIPANT_ID'].isin(users_over15ts)
]

# Rank every participant's sections by TEST_SECTION_ID (1 = smallest ID).
# Everything ranked above MAX_TEST_SECTIONS is surplus, i.e. the
# MAX_TEST_SECTIONS sections with the smallest IDs are kept per participant.
# One grouped pass replaces the per-user scan of the whole frame.
section_rank = (
    surplus_candidates
    .groupby('PARTICIPANT_ID')['TEST_SECTION_ID']
    .rank(method='first')
)
remove_ts_ids = surplus_candidates.loc[
    section_rank > MAX_TEST_SECTIONS, 'TEST_SECTION_ID'
].tolist()

# Remove the surplus ids from the data frame. A boolean mask builds a new
# frame instead of dropping rows in place on a sliced frame.
test_sections_mobile = test_sections_mobile.loc[
    ~test_sections_mobile['TEST_SECTION_ID'].isin(remove_ts_ids)
]
test_sections_mobile.shape

(144825, 11)

In [13]:
# Mean WPM over the kept test sections, and the frame's (rows, columns)
mean_section_wpm = test_sections_mobile['WPM'].mean()
sections_shape = test_sections_mobile.shape
print("Mean:", mean_section_wpm)
print('Number of test sections', sections_shape)

Mean: 44.218960591194424
Number of test sections (144825, 11)


## Save CSV

In [14]:
# Write the selected TEST SECTIONS to CSV, leaving out the index column
test_sections_mobile.to_csv(
    path_or_buf='data/processed2020/finnish/test_sections.csv',
    index=False,
)

In [15]:
# Write the selected PARTICIPANTS to CSV, leaving out the index column
participants_mobile.to_csv(
    path_or_buf='data/processed2020/finnish/participants.csv',
    index=False,
)

# Clean Logs


In [16]:
# IDs of the test sections that were kept; used to filter the log data
test_section_list = test_sections_mobile['TEST_SECTION_ID'].to_list()

In [22]:
# Read in chunks
# Selects only usable test sections. Keeps only keyup events.
#
# The raw log file is streamed in chunks of 1,000,000 rows. Each chunk is
# type-converted, filtered and appended to a single output CSV, so the whole
# log never has to be held in memory at once.

first_chunk = True
for chunk in pd.read_csv('data/raw_data/typingtest_finnish_2020-06-03/raw_log_data_fi_2020-06-03.csv', chunksize=1000000, low_memory=False):
    # Transform data types; TIMESTAMP, KEY and INPUT are added as converted
    # copies of EVENT_TIMESTAMP, EVENT_KEY and INPUT_TEXT.
    chunk[['LOG_DATA_ID', 'TEST_SECTION_ID']] = chunk[['LOG_DATA_ID', 'TEST_SECTION_ID']].apply(pd.to_numeric, downcast='integer')
    chunk['TIMESTAMP'] = pd.to_numeric(chunk['EVENT_TIMESTAMP'])
    chunk['EVENT_TYPE'] = chunk['EVENT_TYPE'].astype('category')
    chunk['KEY'] = chunk['EVENT_KEY'].astype('category')
    chunk['EVENT_CODE'] = chunk['EVENT_CODE'].astype('category')
    chunk['INPUT'] = chunk['INPUT_TEXT']

    # Select only wanted test sections
    chunk = chunk.loc[chunk['TEST_SECTION_ID'].isin(test_section_list)]

    # Optional: to export every event type as well, write `chunk` at this
    # point to 'data/processed2020/finnish/log_data_all.csv' using the same
    # write-then-append pattern as below.

    # Keep only keyups
    # Todo: try other event types
    chunk_filtered = chunk.loc[chunk['EVENT_TYPE'] == 'keyup']

    # Remove unnecessary columns.
    chunk_filtered = chunk_filtered[['TEST_SECTION_ID', 'LOG_DATA_ID', 'TIMESTAMP', 'KEY', 'EVENT_CODE', 'INPUT']]

    # Order by TEST_SECTION_ID and TIMESTAMP. sort_values() returns a new
    # frame, so the result has to be assigned back to take effect.
    # This orders rows within one chunk only; a global ordering still has to
    # be done after the new tables are created.
    chunk_filtered = chunk_filtered.sort_values(by=['TEST_SECTION_ID', 'TIMESTAMP'])

    # Write as CSV file: the first chunk creates/overwrites the file and
    # writes the header, every later chunk is appended without a header.
    chunk_filtered.to_csv(
        'data/processed2020/finnish/log_data.csv',
        mode='w' if first_chunk else 'a',
        header=first_chunk,
        index=False,
    )
    first_chunk = False

    