In [20]:
import numpy as np
import pandas as pd

import re
import os
from tqdm import tqdm

In [21]:
START = "**SOF**"
END = "**EOF**"
INPUT_FOLDER= "./data/01_txt"
SAVE_FOLDER = "./data/02_feathers"

data = []
file_names = os.listdir(INPUT_FOLDER)

with tqdm(total=len(file_names)) as waiter:
    for fname in file_names:
        waiter.set_description(f"reading file {fname}")
        with open(f"{INPUT_FOLDER}/{fname}") as f:
            user = os.path.splitext(fname)[0]
            sess_id = -1
            
            for token in f:
                token = token.strip()
                if token == START:
                    sess_id += 1
                elif token == END:
                    pass
                else:
                    row = [user, sess_id, token]
                    data.append(row)
        waiter.update()
        
data = pd.DataFrame(data, columns=['user', 'sess_id', 'token'])

reading file USER6.txt: 100%|██████████| 9/9 [00:00<00:00, 35.26it/s]


In [22]:
data.head()

Unnamed: 0,user,sess_id,token
0,USER8,0,X
1,USER8,0,z
2,USER8,2,cd
3,USER8,2,<1>
4,USER8,2,cd


In [23]:
data.groupby('user').apply(lambda x: x.sess_id.nunique())

user
USER0     562
USER1     488
USER2     755
USER3     484
USER4     911
USER5     546
USER6    2425
USER7    1339
USER8    1590
dtype: int64

In [24]:
ARGS1 = re.compile(r'\<.\>')
ARGS2 = re.compile(r'%.')
OPTS1 = re.compile(r'-.')
OPTS2 = ("&")
PIPE = (";","|","&&","||")


def token_type(s: str) -> str:
    if bool(ARGS1.findall(s)) or bool(ARGS2.findall(s)):
        return 'args'
    elif bool(OPTS1.findall(s)) or s in OPTS2:
        return 'opts'
    elif s in PIPE:
        return 'pipe'
    else:
        return 'cmd'

In [25]:
data['token_type'] = data.token.apply(token_type)
data['line_id'] = 0

In [26]:
data.head(300)

Unnamed: 0,user,sess_id,token,token_type,line_id
0,USER8,0,X,cmd,0
1,USER8,0,z,cmd,0
2,USER8,2,cd,cmd,0
3,USER8,2,<1>,args,0
4,USER8,2,cd,cmd,0
5,USER8,2,<1>,args,0
6,USER8,2,ll,cmd,0
7,USER8,2,vi,cmd,0
8,USER8,2,<1>,args,0
9,USER8,2,vi,cmd,0


In [27]:
def parse_token_sequence(df):
    token_sequence = df.token_type.tolist()
    line = 0
    line_ids = [line]

    for i in range(1, len(token_sequence)):
        if token_sequence[i] == 'cmd' and token_sequence[i-1] != 'pipe':
            line += 1
            line_ids.append(line)
        else:
            line_ids.append(line)
    df.line_id = line_ids
    return df

In [28]:
data = data.groupby(['user', 'sess_id']).apply(parse_token_sequence)

In [29]:
data.head(300)

Unnamed: 0,user,sess_id,token,token_type,line_id
0,USER8,0,X,cmd,0
1,USER8,0,z,cmd,1
2,USER8,2,cd,cmd,0
3,USER8,2,<1>,args,0
4,USER8,2,cd,cmd,1
5,USER8,2,<1>,args,1
6,USER8,2,ll,cmd,2
7,USER8,2,vi,cmd,3
8,USER8,2,<1>,args,3
9,USER8,2,vi,cmd,4


In [30]:
# Check the structure of session 10 of USER8 (it's a longer session and has pipes in it)

data.loc[data.user == 'USER8'].loc[data.sess_id == 10].groupby('line_id').apply(lambda x: print(x.token.tolist()))

['xbiff', '&']
['xbiff', '&']
['elm']
['bc', '-l']
['elm']
['cd', '<1>']
['cd']
['elm']
['elm']
['findall']
['elm']
['ct']
['ll']
['vi', '<1>']
['<GENSYM:1>/foo.pl']
['vi', '<1>']
['<GENSYM:1>/foo.pl']
['vi', '<1>']
['man', '<1>']
['vi', '<1>']
['<GENSYM:1>/foo.pl']
['vi', '<1>']
['<GENSYM:1>/foo.pl']
['man', '<1>']
['vi', '<1>']
['<GENSYM:1>/foo.pl']
['vi', '<1>']
['man', '<1>']
['fg']
['bg']
['vi', '<1>']
['<GENSYM:1>/foo.pl']
['cat', '<1>']
['vi', '<1>']
['<GENSYM:1>/foo.pl']
['vi', '<1>']
['<GENSYM:1>/foo.pl']
['<GENSYM:1>/foo.pl']
['cd']
['elm']
['z']


In [146]:
data.groupby('token_type').nunique()

Unnamed: 0_level_0,user,sess_id,token,token_type,line_id
token_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
args,9,2542,9,1,946
bg,9,666,2,1,294
cmd,9,2961,1915,1,1181
opts,9,1586,425,1,550
oth,5,27,4,1,31
pipe,9,551,2,1,227


In [145]:
# Number of lines per user
data.groupby(['user']).apply(lambda x: x.line_id.nunique())

user
USER0     121
USER1     251
USER2     175
USER3    1181
USER4     524
USER5     770
USER6     270
USER7     222
USER8     289
dtype: int64

In [32]:
# Unique tokens (words)
data.token.nunique()

2357

In [33]:
data.to_csv('data/04_dataset.tsv', sep='\t')