The purpose of this notebooks is to verify that the filtering that we do to remove invalid participants is not biased towards or against a certain demographic.

In [1]:
import pandas as pd
import processing

In [8]:
log_sample = pd.read_csv('./data/log_sample.csv')
log_sample = processing.log_process(log_sample)
log = processing.mark_entries(log_sample)
log = processing.get_participants_for_log(log)
participants = processing.get_participants()

  exec(code_obj, self.user_global_ns, self.user_ns)


In [9]:
log.head()

Unnamed: 0,ts_id,key,lev_dist,text_field,len_diff,iki,text_field_prev,is_rep,ite,is_forward,entry_id,participant_id
0,2710,t,1,t,-1,,,False,none,True,0.0,688
1,2710,ti,1,ti,1,148.0,t,False,none,True,0.0,688
2,2710,ti,0,ti,0,104.0,ti,False,none,True,0.0,688
3,2710,tig,1,tig,1,135.0,ti,False,none,True,0.0,688
4,2710,tig,0,tig,0,118.0,tig,False,none,True,0.0,688


In [10]:
participants.head()

Unnamed: 0,PARTICIPANT_ID,IP_ADDRESS,USER_AGENT,BROWSER_LANGUAGE,DEVICE,SCREEN_W,SCREEN_H,AGE,GENDER,HAS_TAKEN_TYPING_COURSE,...,ITE_PREDICT,ITE_AUTOCORR,ITE_SWYPE_NEW,ITE_PREDICT_NEW,ITE_AUTOCORR_NEW,browser,os,device_family,device_brand,device_model
0,629,110.54.203.10,Mozilla/5.0 (Linux; Android 5.0.2; vivo Y51 Bu...,en-US,mobile,360,640,20,male,0,...,0.034259,0.0,0.0,0.0,0.0,Facebook,Android,vivo Y51,vivo,Y51
1,632,115.164.40.232,Mozilla/5.0 (Linux; Android 7.0; vivo 1713 Bui...,en-US,mobile,360,640,24,male,0,...,0.157132,0.013333,0.054709,0.134725,0.015,Chrome Mobile WebView,Android,vivo 1713,vivo,1713
2,669,122.8.244.180,Mozilla/5.0 (Linux; Android 6.0; LG-V495 Build...,"en-US,en;q=0.8",mobile,601,962,16,female,0,...,0.037251,0.045875,0.092886,0.099632,0.051431,Chrome,Android,LG-V495,LG,V495
3,673,37.219.99.214,Mozilla/5.0 (Linux; Android 6.0; HTC One X9 du...,"en-US,en;q=0.9",mobile,360,640,27,female,0,...,0.186325,0.187377,0.0,0.155214,0.187377,Chrome Mobile,Android,HTC One X9 dual sim,HTC,One X9 dual sim
4,674,118.93.226.60,Mozilla/5.0 (iPhone; CPU iPhone OS 11_4_1 like...,en-us,mobile,375,667,24,female,0,...,0.246296,0.007407,0.073333,0.112963,0.014074,Mobile Safari,iOS,iPhone,Apple,iPhone


### Filtering the undefined keys

In [63]:
ts = pd.read_csv('./data/test_sections.csv',sep='\t')

In [97]:
participants_invalid1 = ts.loc[ts.is_valid == 0].PARTICIPANT_ID.unique()

In [98]:
participants.loc[~participants.PARTICIPANT_ID.isin(participants_invalid1),'is_valid'] = True
participants.loc[participants.PARTICIPANT_ID.isin(participants_invalid1),'is_valid'] = False

In [65]:
participants.is_valid.value_counts(normalize = True)

True     0.899165
False    0.100835
Name: is_valid, dtype: float64

In [75]:
participants.groupby('is_valid').os.value_counts(normalize=True)

is_valid  os           
False     Android          0.938819
          iOS              0.055380
          Windows Phone    0.005802
True      iOS              0.539392
          Android          0.460578
          Other            0.000030
Name: os, dtype: float64

In [66]:
participants.groupby('is_valid').browser.value_counts(normalize=True)

is_valid  browser                   
False     Samsung Internet              0.420622
          Chrome Mobile                 0.327268
          UC Browser                    0.066456
          Mobile Safari                 0.047996
          Facebook                      0.035601
          Firefox Mobile                0.027426
          Opera Mobile                  0.025053
          Chrome Mobile WebView         0.020570
          Chrome                        0.014768
          Edge Mobile                   0.005802
          Chrome Mobile iOS             0.002110
          QQ Browser Mobile             0.002110
          Crosswalk                     0.001582
          Pinterest                     0.001055
          Mobile Safari UI/WKWebView    0.000791
          Android                       0.000527
          Amazon Silk                   0.000264
True      Mobile Safari                 0.473502
          Chrome Mobile                 0.391258
          Facebook              

In [76]:
participants.loc[participants.os == 'iOS'].groupby('is_valid').browser.value_counts(normalize=True)

is_valid  browser                   
False     Mobile Safari                 0.866667
          Facebook                      0.071429
          Chrome Mobile iOS             0.038095
          Mobile Safari UI/WKWebView    0.014286
          Pinterest                     0.009524
True      Mobile Safari                 0.877844
          Chrome Mobile iOS             0.067219
          Facebook                      0.033061
          Mobile Safari UI/WKWebView    0.018093
          Pinterest                     0.002083
          Firefox iOS                   0.001206
          UC Browser                    0.000384
          Opera Mini                    0.000110
Name: browser, dtype: float64

In [84]:
participants.groupby('is_valid').device_brand.value_counts(normalize=True).nlargest(20)

is_valid  device_brand   
False     Samsung            0.550105
True      Apple              0.539408
          Generic            0.180788
False     Generic_Android    0.130274
True      Samsung            0.093100
False     Generic            0.070675
          Apple              0.055380
True      Generic_Android    0.052997
False     XiaoMi             0.033755
True      LG                 0.029545
          XiaoMi             0.025671
False     Huawei             0.025316
          LG                 0.020833
True      Huawei             0.019578
False     Tecno              0.016614
          vivo               0.015295
True      vivo               0.013249
False     ZTE                0.011603
          Motorola           0.011340
          Lenovo             0.011076
Name: device_brand, dtype: float64

**Conclusion:** The undefined key behaviour occurs to a moderate percenatage (~ 10%) of users. It is largely restricted to Android users, and to mainly Samsung brands.

### Filtering the multiple character keys

In [99]:
# Find the weird multichar behaviour
# First action of an entry is single letter AND following two actions in the entry have increasing # of chars
index_first = log.groupby(['ts_id','entry_id']).head(1).index
mask = log.index.isin(index_first)
mask &= log.key.str.len() == 1
mask &= log.key.shift(-1).str.len() > log.key.str.len()
mask &= log.key.shift(-2).str.len() > log.key.shift(-1).str.len()
mask &= log.entry_id.shift(-1) == log.entry_id
mask &= log.entry_id.shift(-2) == log.entry_id.shift(-1)

participants_invalid2 = log.loc[mask].participant_id.unique()

In [88]:
participants.loc[~participants.PARTICIPANT_ID.isin(participants_invalid2),'is_valid'] = True
participants.loc[participants.PARTICIPANT_ID.isin(participants_invalid2),'is_valid'] = False

In [89]:
participants.is_valid.value_counts(normalize = True)

True     0.955619
False    0.044381
Name: is_valid, dtype: float64

In [90]:
participants.groupby('is_valid').browser.value_counts(normalize=True)

is_valid  browser                   
False     Chrome Mobile                 0.857999
          Facebook                      0.044338
          Chrome Mobile WebView         0.027561
          Samsung Internet              0.027561
          Chrome                        0.019772
          Amazon Silk                   0.005992
          Opera Mobile                  0.005992
          Pinterest                     0.004793
          Mobile Safari                 0.003595
          UC Browser                    0.001198
          Puffin                        0.000599
          Yandex Browser                0.000599
True      Mobile Safari                 0.450427
          Chrome Mobile                 0.362829
          Samsung Internet              0.054150
          Facebook                      0.046025
          Chrome Mobile iOS             0.034338
          Chrome Mobile WebView         0.012800
          Mobile Safari UI/WKWebView    0.009266
          Chrome                

In [91]:
participants.groupby('is_valid').os.value_counts(normalize=True)

is_valid  os           
False     Android          0.996405
          iOS              0.003595
True      iOS              0.513204
          Android          0.486156
          Windows Phone    0.000612
          Other            0.000028
Name: os, dtype: float64

In [92]:
participants.groupby('is_valid').device_brand.value_counts(normalize=True).nlargest(20)

is_valid  device_brand   
True      Apple              0.513218
False     Generic            0.417615
          Samsung            0.190533
True      Generic            0.158170
          Samsung            0.136799
False     Generic_Android    0.118035
          LG                 0.082684
True      Generic_Android    0.058131
False     XiaoMi             0.046735
True      LG                 0.026158
          XiaoMi             0.025545
False     Huawei             0.025165
          Motorola           0.020971
True      Huawei             0.019924
False     vivo               0.014979
True      vivo               0.013385
False     Lenovo             0.013182
          HTC                0.011384
          ZTE                0.010785
          Infinix            0.007190
Name: device_brand, dtype: float64

**Conclusion:** The multicharacter behaviour occurs to a small percenatage (~ 5%) of users. It is restricted to Android users, and to mainly 'Generic" brands.

### Demographics

After removing all invalid groups, are the demographics affected?

In [100]:
mask = participants.PARTICIPANT_ID.isin(participants_invalid1)
mask |= participants.PARTICIPANT_ID.isin(participants_invalid2)

participants.loc[~mask,'is_valid'] = True
participants.loc[mask,'is_valid'] = False

In [136]:
participants.is_valid.value_counts(normalize=True)

True     0.854784
False    0.145216
Name: is_valid, dtype: float64

In [108]:
percentiles = [0.05,0.1,0.25,0.5,0.75,0.9,0.95]

In [101]:
participants.groupby('is_valid').GENDER.value_counts(normalize=True)

is_valid  GENDER
False     female    0.621498
          male      0.339498
          none      0.039004
True      female    0.650490
          male      0.302069
          none      0.047441
Name: GENDER, dtype: float64

In [112]:
participants.groupby('is_valid').AGE.describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
is_valid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
False,5461.0,25.683025,8.793961,6.0,20.0,24.0,30.0,61.0
True,32145.0,23.897496,8.848265,6.0,18.0,22.0,28.0,61.0


In [129]:
participants.groupby('is_valid').TYPE_ENGLISH.value_counts(normalize=True)

is_valid  TYPE_ENGLISH
False     always          0.412551
          usually         0.291496
          sometimes       0.229484
          rarely          0.053843
          never           0.012625
True      always          0.484009
          usually         0.268620
          sometimes       0.191312
          rarely          0.043223
          never           0.012836
Name: TYPE_ENGLISH, dtype: float64

In [134]:
participants.groupby('is_valid').BROWSER_LANGUAGE.value_counts(normalize=True).nlargest(10)

is_valid  BROWSER_LANGUAGE          
True      en-us                         0.443366
False     en-US,en;q=0.8                0.235488
          en-US,en;q=0.9                0.194836
True      en-US,en;q=0.9                0.193218
False     en-US                         0.096319
          en-GB,en-US;q=0.8,en;q=0.6    0.091375
True      en-GB,en-US;q=0.9,en;q=0.8    0.053414
          en-gb                         0.047099
False     en-GB,en-US;q=0.9,en;q=0.8    0.043765
          en-us                         0.030031
Name: BROWSER_LANGUAGE, dtype: float64

In [127]:
participants.groupby('is_valid').WPM.describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
is_valid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
False,5461.0,31.637835,11.787545,0.689707,23.650195,30.116277,38.059934,154.257192
True,32145.0,36.896674,13.299399,0.019106,27.418723,35.278929,44.836511,148.166012


In [124]:
participants.groupby('is_valid').ERROR_RATE.describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
is_valid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
False,5461.0,2.144371,2.074338,0.0,0.814332,1.554404,2.81155,23.824959
True,32145.0,2.371068,2.081737,0.0,1.003344,1.772526,3.114187,24.348607


**Conclusion:** Almost 15% of the data was marked as invalid. The invalid group consists of a very slightly older and more male group. This group types in English less frequently. The group exhibits a slightly lower typing speed (WPM).