# Analisys

Post-processing of the data gathered in CO1111 course work 2.

## Supplimentary context-unaware functions

Import necessary libraries:

In [1]:
# embedded in python
import os
from pprint import pprint
# pip install
#from IPython.display import display, HTML
import pandas as pd
#import plotly.graph_objects as go
import yaml
# same package

Define several functions:

In [2]:
def readf( path, encoding='utf-8', n_lines=0 ):

    # Reads either the whole file or a specified number of lines from it.

    if n_lines>0:
        lines = []
        with open( path, 'r', encoding=encoding ) as f:
            for _ in range(n_lines):
                lines.append( f.readline() )
            #log.info( 'read {n_lines} lines from file {file}'.format(n_lines=n_lines,file=path) )
            return lines

    with open( path, 'r', encoding=encoding ) as f:
        text = f.read()
        #log.info( 'read file %s'%path )
        return text
    
def readf_yaml( path, encoding='utf-8' ):

    with open( path, 'r', encoding=encoding ) as f:
        data = yaml.safe_load(f)
        return data

## Obtaining master table

Read source files with gathered statistics:

In [3]:
YAMLS = {}
INDEX = {}

root = 's'
for f in os.listdir(root):
    src = os.path.join( root,f )
    name, ext = os.path.splitext(f)
    
    if name[-2:] in ['kb','ui']:
        # these are actual yaml files
        
        yamls = readf(src).replace('...','').split('---')
        rows = []
        for y in yamls:
            if len(y)==0: continue
            rows.append( yaml.safe_load(y) )
        YAMLS[src] = rows
        
    else:
        # this is stringified dictionary
        
        text = readf(src)
        INDEX[src] = eval(text)

Combine them into a dataframe:

In [4]:
dfs = []
for src, dictionary in INDEX.items():
    # obtain stats and metadata of a single user
    
    path2kb = os.path.normpath( dictionary['kb stats'] )
    path2ui = os.path.normpath( dictionary['ui stats'] )
    # read ui and kb stats into one vertical dataframe that shows how user interacted with inteface
    df = pd.concat(
        [
            pd.DataFrame( YAMLS[path2kb] ),
            pd.DataFrame( YAMLS[path2ui] )
            ], axis=0 )
    # add metadata to each row
    df['user agent'] = dictionary['user agent']
    df['user id'] = dictionary['user id']
    df['image path'] = dictionary['im']
    df['bodypart'] = dictionary['bodypart']
    
    # remember this dataframe
    dfs.append(df)
    
# concat stats and metadata of all users into one big dataframe, sort by timestamp
df = pd.concat( dfs, axis=0 )
df.sort_values( 'timestamp', inplace=True )
df.reset_index( drop=True, inplace=True )
print(df)

         key             timestamp           bt  \
0        NaN  2022.4.1_18.1.18_393       orange   
1     delete  2022.4.1_18.1.21_158          NaN   
2        NaN  2022.4.1_18.2.12_415  greenyellow   
3          e  2022.4.1_18.2.16_798          NaN   
4     delete  2022.4.1_18.2.23_158          NaN   
...      ...                   ...          ...   
1902   enter   2022.4.6_9.1.30_226          NaN   
1903     NaN    2022.4.6_9.1.3_618        black   
1904       2     2022.4.6_9.1.8_10          NaN   
1905       3    2022.4.6_9.1.8_331          NaN   
1906       2    2022.4.6_9.1.9_514          NaN   

                                             user agent  \
0     Mozilla/5.0 (Windows NT 10.0; Win64; x64) Appl...   
1     Mozilla/5.0 (Windows NT 10.0; Win64; x64) Appl...   
2     Mozilla/5.0 (Windows NT 10.0; Win64; x64) Appl...   
3     Mozilla/5.0 (Windows NT 10.0; Win64; x64) Appl...   
4     Mozilla/5.0 (Windows NT 10.0; Win64; x64) Appl...   
...                              

Export it to disk:

In [5]:
df.to_csv( 'all_stats_timeline.csv', index=False )

## Processing the master table

Number of unique users:

In [6]:
unique_users = df['user id'].unique()
len( unique_users )

6

How actively they were interacting with the application interface:

In [7]:
for unique_user in unique_users:
    mask = df['user id']==unique_user
    
    all_images_he_drew = df[mask]['image path'].unique()
    print(
        'unique user %s drew %s images'%(
            unique_user,
            len( all_images_he_drew )
            )
        )
    
    # add total image count to the dataframe
    df.loc[mask,'total images drawn'] = len( all_images_he_drew )

unique user 2022.4.1_17.58.37_970 drew 1 images
unique user 2022.4.1_18.19.10_142 drew 1 images
unique user 2022.4.1_19.55.45_778 drew 1 images
unique user 2022.4.2_17.7.43_296 drew 1 images
unique user 2022.4.6_18.0.23_65 drew 12 images
unique user 2022.4.6_9.0.59_991 drew 1 images


Which buttons were they using:

In [8]:
"""
for unique_user in ( unique_users ):
    mask = df['user id']==unique_user
    
    maskK = df['key'].notna()
    maskB = df['bt'].notna()
    print(
        'unique user %s pressed %s keyboard buttons and %s ui buttons, where'%(
            unique_user,
            len( df[ mask & maskK ] ),
            len( df[ mask & maskB ] )
            )
        )
    print( df[ mask & maskK ]['key'].value_counts() )
    print( df[ mask & maskB ]['bt'].value_counts() )
    print( '-'*10 )
"""

"\nfor unique_user in ( unique_users ):\n    mask = df['user id']==unique_user\n    \n    maskK = df['key'].notna()\n    maskB = df['bt'].notna()\n    print(\n        'unique user %s pressed %s keyboard buttons and %s ui buttons, where'%(\n            unique_user,\n            len( df[ mask & maskK ] ),\n            len( df[ mask & maskB ] )\n            )\n        )\n    print( df[ mask & maskK ]['key'].value_counts() )\n    print( df[ mask & maskB ]['bt'].value_counts() )\n    print( '-'*10 )\n"

Which buttons were they preferring:

In [9]:
normalized_counts = []
for unique_user in ( unique_users ):
    mask = df['user id']==unique_user
    
    maskK = df['key'].notna()
    maskB = df['bt'].notna()
    print(
        'unique user %s pressed %s keyboard buttons and %s ui buttons, where'%(
            unique_user,
            len( df[ mask & maskK ] ),
            len( df[ mask & maskB ] )
            )
        )
    
    countK = df[ mask & maskK ]['key'].value_counts() #/ df[mask&maskK]['total images drawn']
    countB = df[ mask & maskB ]['bt'].value_counts() #/ df[mask&maskB]['total images drawn']
    
    countKdf = pd.DataFrame()
    countBdf = pd.DataFrame()
    
    total_ims = df[mask]['total images drawn'].iloc[0]
    
    countKdf['v'] = list(countK)
    countKdf['v norm'] = countKdf['v'] / total_ims
    countKdf['k'] = list( countK.index )
    countKdf['user id'] = unique_user
    
    countBdf['v'] = list(countB)
    countBdf['v norm'] = countBdf['v'] / total_ims
    countBdf['k'] = list( countB.index )
    countBdf['user id'] = unique_user
    
    normalized_counts.append( countKdf )
    normalized_counts.append( countBdf )
    
    print( countKdf[['k','v norm']], 'were keyboard shortcuts' )
    print( countBdf[['k','v norm']], 'were ui buttons' )
    print( '-'*10 )

unique user 2022.4.1_17.58.37_970 pressed 3 keyboard buttons and 7 ui buttons, where
        k  v norm
0  delete     2.0
1       e     1.0 were keyboard shortcuts
             k  v norm
0  greenyellow     2.0
1          red     2.0
2       orange     1.0
3        black     1.0
4         save     1.0 were ui buttons
----------
unique user 2022.4.1_18.19.10_142 pressed 1 keyboard buttons and 1 ui buttons, where
        k  v norm
0  delete     1.0 were keyboard shortcuts
      k  v norm
0  save     1.0 were ui buttons
----------
unique user 2022.4.1_19.55.45_778 pressed 27 keyboard buttons and 1 ui buttons, where
         k  v norm
0  control    27.0 were keyboard shortcuts
      k  v norm
0  save     1.0 were ui buttons
----------
unique user 2022.4.2_17.7.43_296 pressed 0 keyboard buttons and 2 ui buttons, where
Empty DataFrame
Columns: [k, v norm]
Index: [] were keyboard shortcuts
       k  v norm
0   save     1.0
1  erase     1.0 were ui buttons
----------
unique user 2022.4.6_18.0.23

In [10]:
ndf = pd.concat( normalized_counts, axis=0 )
ndf.reset_index( drop=True, inplace=True )
ndf

Unnamed: 0,v,v norm,k,user id
0,2.0,2.0,delete,2022.4.1_17.58.37_970
1,1.0,1.0,e,2022.4.1_17.58.37_970
2,2.0,2.0,greenyellow,2022.4.1_17.58.37_970
3,2.0,2.0,red,2022.4.1_17.58.37_970
4,1.0,1.0,orange,2022.4.1_17.58.37_970
5,1.0,1.0,black,2022.4.1_17.58.37_970
6,1.0,1.0,save,2022.4.1_17.58.37_970
7,1.0,1.0,delete,2022.4.1_18.19.10_142
8,1.0,1.0,save,2022.4.1_18.19.10_142
9,27.0,27.0,control,2022.4.1_19.55.45_778


In [11]:
sums = ndf.groupby( by=['k'] ).sum()
sums.sort_values( by='v norm', ascending=False )

Unnamed: 0_level_0,v,v norm
k,Unnamed: 1_level_1,Unnamed: 2_level_1
e,1755.0,147.166667
control,28.0,27.083333
save,21.0,5.416667
delete,4.0,3.083333
black,12.0,2.833333
red,12.0,2.833333
greenyellow,11.0,2.75
2,4.0,2.166667
orange,10.0,1.75
enter,8.0,1.583333


In [12]:
sums['user id'] = ''
for unique_user in unique_users:
    mask = df['user id']==unique_user
    
    for k in sums.index:
        clicked = df[ mask & ( df['key'].isin([k]) | df['bt'].isin([k]) ) ]
        if len(clicked)>0:
            sums.loc[ k,'user id' ] += unique_user+', '

In [13]:
ks = ['e','control','enter',1,2,3,4,5,'delete','shift','z'] # info from html source code
sums.loc[ sums.index.isin(ks), 'role'] = 'keyboard shortcut'
sums.loc[ ~sums.index.isin(ks), 'role'] = 'UI button'
sums.sort_values( by='v norm', ascending=False )

Unnamed: 0_level_0,v,v norm,user id,role
k,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
e,1755.0,147.166667,"2022.4.1_17.58.37_970, 2022.4.6_18.0.23_65,",keyboard shortcut
control,28.0,27.083333,"2022.4.1_19.55.45_778, 2022.4.6_18.0.23_65,",keyboard shortcut
save,21.0,5.416667,"2022.4.1_17.58.37_970, 2022.4.1_18.19.10_142, ...",UI button
delete,4.0,3.083333,"2022.4.1_17.58.37_970, 2022.4.1_18.19.10_142, ...",keyboard shortcut
black,12.0,2.833333,"2022.4.1_17.58.37_970, 2022.4.6_18.0.23_65, 20...",UI button
red,12.0,2.833333,"2022.4.1_17.58.37_970, 2022.4.6_18.0.23_65,",UI button
greenyellow,11.0,2.75,"2022.4.1_17.58.37_970, 2022.4.6_18.0.23_65,",UI button
2,4.0,2.166667,"2022.4.6_18.0.23_65, 2022.4.6_9.0.59_991,",keyboard shortcut
orange,10.0,1.75,"2022.4.1_17.58.37_970, 2022.4.6_18.0.23_65,",UI button
enter,8.0,1.583333,"2022.4.6_18.0.23_65, 2022.4.6_9.0.59_991,",keyboard shortcut


Export ui usage to disk:

In [14]:
sums.to_csv( 'ui_usage.csv', index=False )