In [1]:
import pandas as pd
import processing

# Filtering effects

In this section we examine how filtering the participants affects the demographic composition of the sample.

In [2]:
# `log` is the processed log; `log_raw` is the unprocessed valid log after it has
# been passed through processing.get_participants_for_log (a project helper that
# is defined outside this notebook).
log = pd.read_csv('./data/log_valid_processed.csv')
log_raw = processing.get_participants_for_log(pd.read_csv('./data/log_valid.csv'))

# Participant table, indexed by PARTICIPANT_ID so rows can be selected by id with .loc.
participants = pd.read_csv('./data/participants_processed.csv').set_index('PARTICIPANT_ID')

In [3]:
# Keep only the participants whose id occurs in `log_raw`.
raw_log_participant_ids = log_raw['participant_id'].unique()
participants = participants.loc[raw_log_participant_ids]

In [4]:
# Flag every participant whose id occurs in the processed log (`log`).
# Index.isin builds the boolean column in one step: there is no intermediate
# all-False state, the cell gives the same result when re-run, and an id in
# `log` that is absent from the `participants` index no longer raises a KeyError.
participants['is_sampled'] = participants.index.isin(log['participant_id'].unique())

In [5]:
# Number of participants outside (False) and inside (True) the processed log.
participants['is_sampled'].value_counts()

False    24862
True      8952
Name: is_sampled, dtype: int64

## Brand and OS

In [6]:
# Share of each operating system within the sampled / non-sampled group.
(
    participants
    .groupby('is_sampled')['os']
    .value_counts(normalize=True)
)

is_sampled  os     
False       Android    0.596493
            iOS        0.403467
            Other      0.000040
True        iOS        0.916890
            Android    0.083110
Name: os, dtype: float64

In [7]:
# Share of each browser within the sampled / non-sampled group.
(
    participants
    .groupby('is_sampled')['browser']
    .value_counts(normalize=True)
)

is_sampled  browser                   
False       Chrome Mobile                 0.510257
            Mobile Safari                 0.350615
            Facebook                      0.048468
            Chrome Mobile iOS             0.030448
            Chrome Mobile WebView         0.015445
            Samsung Internet              0.015325
            Chrome                        0.010096
            Mobile Safari UI/WKWebView    0.007562
            Opera Mobile                  0.005189
            Amazon Silk                   0.002856
            Pinterest                     0.001810
            UC Browser                    0.000845
            Firefox iOS                   0.000523
            Puffin                        0.000282
            Android                       0.000121
            Coc Coc                       0.000040
            Opera Mini                    0.000040
            Other                         0.000040
            Yandex Browser                0

In [8]:
# Within-group share of each device brand, then the 20 largest shares overall.
# NOTE(review): nlargest is applied to the combined (is_sampled, device_brand)
# series, not per group, so rows of both groups are interleaved and each group
# contributes a different number of rows.
brand_share = participants.groupby('is_sampled')['device_brand'].value_counts(normalize=True)
brand_share.nlargest(20)

is_sampled  device_brand   
True        Apple              0.916890
False       Apple              0.403483
            Generic            0.237802
            Samsung            0.118418
            Generic_Android    0.069144
            LG                 0.039741
            XiaoMi             0.029886
            Huawei             0.025824
True        Samsung            0.022788
            Generic            0.022453
False       vivo               0.015003
True        XiaoMi             0.013963
False       Motorola           0.009292
True        vivo               0.008378
            Generic_Android    0.008155
False       Lenovo             0.007401
            ZTE                0.007200
            Infinix            0.005229
            HTC                0.005189
            Asus               0.004948
Name: device_brand, dtype: float64

**Conclusion:** The valid, post-filtering users are much more heavily oriented towards iOS usage. They are also more likely to use Safari than Chrome.

## Demographics

In [9]:
# Percentile cut points (5th through 95th).
# NOTE(review): none of the describe() calls visible in this notebook pass this
# list, so it currently has no effect on the tables shown — confirm whether it
# is used further down or was meant to be passed as describe(percentiles=...).
percentiles = [
    0.05,
    0.1,
    0.25,
    0.5,
    0.75,
    0.9,
    0.95,
]

In [10]:
# Share of each GENDER value within the sampled / non-sampled group.
(
    participants
    .groupby('is_sampled')['GENDER']
    .value_counts(normalize=True)
)

is_sampled  GENDER
False       female    0.632290
            male      0.322943
            none      0.044767
True        female    0.697163
            male      0.248436
            none      0.054401
Name: GENDER, dtype: float64

In [11]:
# Summary statistics of AGE for each group.
# NOTE(review): the `percentiles` list defined in the previous cell is not
# passed here, so describe() reports its default quartiles only.
participants.groupby('is_sampled')['AGE'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
is_sampled,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
False,24862.0,24.359504,8.775551,6.0,19.0,23.0,28.0,61.0
True,8952.0,22.976318,9.062334,6.0,17.0,21.0,27.0,61.0


**Conclusion:** The valid, post-filtering users include a slightly higher proportion of female participants and are slightly younger.

## Typing

In [12]:
# Share of each TYPE_ENGLISH answer within the sampled / non-sampled group.
(
    participants
    .groupby('is_sampled')['TYPE_ENGLISH']
    .value_counts(normalize=True)
)

is_sampled  TYPE_ENGLISH
False       always          0.435811
            usually         0.291562
            sometimes       0.212131
            rarely          0.047297
            never           0.013199
True        always          0.809629
            usually         0.107360
            sometimes       0.058661
            rarely          0.014388
            never           0.009961
Name: TYPE_ENGLISH, dtype: float64

In [13]:
# Within-group share of each BROWSER_LANGUAGE value, then the 10 largest shares overall.
# NOTE(review): nlargest is applied to the combined series, not per group, so the
# two groups are interleaved and contribute different numbers of rows. The raw
# header strings are also not normalised (e.g. 'en-us' and 'en-US' count separately).
language_share = participants.groupby('is_sampled')['BROWSER_LANGUAGE'].value_counts(normalize=True)
language_share.nlargest(10)

is_sampled  BROWSER_LANGUAGE          
True        en-us                         0.739723
False       en-us                         0.307135
            en-US,en;q=0.9                0.275762
True        en-gb                         0.083892
False       en-GB,en-US;q=0.9,en;q=0.8    0.074089
True        en-ca                         0.041667
False       en-US                         0.036964
True        en-au                         0.034629
False       en-gb                         0.030689
True        en-US,en;q=0.9                0.024911
Name: BROWSER_LANGUAGE, dtype: float64

In [14]:
# Summary statistics of WPM for each group.
# NOTE(review): the `percentiles` list defined earlier is not passed here,
# so describe() reports its default quartiles only.
participants.groupby('is_sampled')['WPM'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
is_sampled,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
False,24862.0,35.301802,12.963875,0.019106,26.142669,33.572441,42.658034,154.257192
True,8952.0,40.425418,13.421506,0.212161,31.051003,39.433127,49.029891,135.426179


**Conclusion:** The valid, post-filtering participants type in English more frequently and type faster. They also have a higher error rate.