# Overview

In this notebook I will examine how to convert labeled screen-view CSV data into a Hugging Face dataset.

In [19]:
import pandas as pd
import numpy as np
import torch
from datasets import Dataset
from transformers import AutoTokenizer

In [9]:
# Load the exported screen-view records and preview the first rows.
csv_path = "data/18929485529.csv"
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,ID,User ID,Time,I,Language,Application Name,Package Name,Class Name,Context,View ID,View Depth,View Class Name,Text,Description,Seen Timestamp,Is Visible,X 1,Y 1,X 2,Y 2
0,18929485529,165559,2024-09-04T10:55:25.287,1,de,PENNY,de.penny.app,de.penny.app.main.view.MainActivity,,,0,de.penny.app.main.view.MainActivity,,,0,False,0,0,0,0
1,18929485529,165559,2024-09-04T10:55:25.287,2,de,PENNY,de.penny.app,de.penny.app.main.view.MainActivity,,android:id/content,2,android.widget.FrameLayout,,,1725440082464,True,0,0,1080,2400
2,18929485529,165559,2024-09-04T10:55:25.287,3,de,PENNY,de.penny.app,de.penny.app.main.view.MainActivity,,,11,android.widget.TextView,UVP 14.99,,1725440082464,True,339,833,498,874
3,18929485529,165559,2024-09-04T10:55:25.287,4,de,PENNY,de.penny.app,de.penny.app.main.view.MainActivity,,,11,android.widget.TextView,9.99,,1725440082464,True,356,884,482,960
4,18929485529,165559,2024-09-04T10:55:25.287,5,de,PENNY,de.penny.app,de.penny.app.main.view.MainActivity,,,10,android.widget.TextView,UVP,,1725440082464,True,63,986,125,1027


Since this sample file does not contain labels yet, random placeholder labels are added for the sake of this notebook.

In [12]:
# Placeholder label vocabulary; one entry is drawn per row below.
labels = ['RAND1', 'RAND2', 'RAND3', 'RAND4']

# Seed NumPy's global RNG first so the random assignment is reproducible.
np.random.seed(69)
df['Label'] = np.random.choice(labels, size=len(df))
df.head()

Unnamed: 0,ID,User ID,Time,I,Language,Application Name,Package Name,Class Name,Context,View ID,...,View Class Name,Text,Description,Seen Timestamp,Is Visible,X 1,Y 1,X 2,Y 2,Label
0,18929485529,165559,2024-09-04T10:55:25.287,1,de,PENNY,de.penny.app,de.penny.app.main.view.MainActivity,,,...,de.penny.app.main.view.MainActivity,,,0,False,0,0,0,0,RAND3
1,18929485529,165559,2024-09-04T10:55:25.287,2,de,PENNY,de.penny.app,de.penny.app.main.view.MainActivity,,android:id/content,...,android.widget.FrameLayout,,,1725440082464,True,0,0,1080,2400,RAND4
2,18929485529,165559,2024-09-04T10:55:25.287,3,de,PENNY,de.penny.app,de.penny.app.main.view.MainActivity,,,...,android.widget.TextView,UVP 14.99,,1725440082464,True,339,833,498,874,RAND2
3,18929485529,165559,2024-09-04T10:55:25.287,4,de,PENNY,de.penny.app,de.penny.app.main.view.MainActivity,,,...,android.widget.TextView,9.99,,1725440082464,True,356,884,482,960,RAND4
4,18929485529,165559,2024-09-04T10:55:25.287,5,de,PENNY,de.penny.app,de.penny.app.main.view.MainActivity,,,...,android.widget.TextView,UVP,,1725440082464,True,63,986,125,1027,RAND3


In [55]:
# Drop rows whose 'Text' is missing; only views with text are kept.
cleaned_df = df.dropna(subset=['Text'])

# Rows sharing the same application name and seen timestamp are grouped
# together; each group becomes one row of the resulting dataset.
grouping_columns = ['Application Name', 'Seen Timestamp']
feature_columns = ['Text', 'Label']
grouped_dfs = [group for _, group in cleaned_df.groupby(grouping_columns)]

# One {'Text': [...], 'Label': [...]} dict per group. The loop variable is
# deliberately not called `df`, so it cannot be confused with the raw frame.
data_dicts = [group[feature_columns].to_dict(orient='list') for group in grouped_dfs]

# Transpose the list of per-group dicts into one dict of columns. Iterating
# over `feature_columns` (rather than the keys of `data_dicts[0]`) keeps this
# working when there are no groups at all, yielding an empty dataset instead
# of raising IndexError.
dataset = Dataset.from_dict(
    {column: [record[column] for record in data_dicts] for column in feature_columns}
)
dataset

Dataset({
    features: ['Text', 'Label'],
    num_rows: 6
})

In [59]:
model_checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# Every dataset row is a list of view texts, so the tokenizer is told the
# input is already split into words.
screen_texts = dataset['Text']
inputs = tokenizer(screen_texts, is_split_into_words=True)

# Show the tokens of the first row in the batch (index 0 is the default).
inputs.tokens(0)

['[CLS]',
 'UV',
 '##P',
 '14',
 '.',
 '99',
 '9',
 '.',
 '99',
 'UV',
 '##P',
 'J',
 '##OH',
 '##N',
 '##NI',
 '##E',
 'WA',
 '##L',
 '##KE',
 '##R',
 'Red',
 'Label',
 'B',
 '##len',
 '##ded',
 'Scotch',
 'j',
 '##e',
 '0',
 ',',
 '7',
 'I',
 'UV',
 '##P',
 '0',
 '.',
 '99',
 '0',
 '.',
 '75',
 'UV',
 '##P',
 'SA',
 '##N',
 'MI',
 '##G',
 '##UE',
 '##L',
 'E',
 '##sp',
 '##ec',
 '##ial',
 'j',
 '##e',
 '0',
 ',',
 '5',
 'I',
 'UV',
 '##P',
 '2',
 '.',
 '99',
 '2',
 '.',
 '79',
 'UV',
 '##P',
 'F',
 '##EL',
 '##IX',
 'K',
 '##na',
 '##bbe',
 '##r',
 'Mix',
 'j',
 '##e',
 '200',
 'g',
 '3',
 '.',
 '89',
 'Pre',
 '##isk',
 '##nal',
 '##ler',
 'F',
 '##EL',
 '##IX',
 'So',
 'gut',
 'w',
 '##ie',
 'es',
 'au',
 '##ssie',
 '##ht',
 'in',
 'G',
 '##ele',
 '##e',
 'j',
 '##e',
 '12',
 'x',
 '85',
 'g',
 'Spa',
 '##ren',
 'au',
 '##f',
 'Top',
 '-',
 'Mark',
 '##en',
 'a',
 '##b',
 '05',
 '.',
 '09',
 '.',
 'bi',
 '##s',
 '07',
 '.',
 '09',
 '.',
 'Ang',
 '##eb',
 '##ote',
 'V',
 '##ort',
 '#