# Overview

This notebook shows how to convert labeled screen-view CSV data into a Hugging Face `Dataset` suitable for token classification.

In [1]:
import pandas as pd
import numpy as np
import torch
from typing import List
from datasets import Sequence, ClassLabel, Dataset
from transformers import AutoTokenizer

In [2]:
# Load one exported screen-view recording into a DataFrame.
# NOTE(review): the first column is read as 'Unnamed: 0', which looks like a
# saved row index -- consider `index_col=0` if it is not needed downstream.
csv_path = "data/18929485529.csv"
df = pd.read_csv(csv_path)

# Preview the first rows to check the column layout.
df.head()

Unnamed: 0,ID,User ID,Time,I,Language,Application Name,Package Name,Class Name,Context,View ID,View Depth,View Class Name,Text,Description,Seen Timestamp,Is Visible,X 1,Y 1,X 2,Y 2
0,18929485529,165559,2024-09-04T10:55:25.287,1,de,PENNY,de.penny.app,de.penny.app.main.view.MainActivity,,,0,de.penny.app.main.view.MainActivity,,,0,False,0,0,0,0
1,18929485529,165559,2024-09-04T10:55:25.287,2,de,PENNY,de.penny.app,de.penny.app.main.view.MainActivity,,android:id/content,2,android.widget.FrameLayout,,,1725440082464,True,0,0,1080,2400
2,18929485529,165559,2024-09-04T10:55:25.287,3,de,PENNY,de.penny.app,de.penny.app.main.view.MainActivity,,,11,android.widget.TextView,UVP 14.99,,1725440082464,True,339,833,498,874
3,18929485529,165559,2024-09-04T10:55:25.287,4,de,PENNY,de.penny.app,de.penny.app.main.view.MainActivity,,,11,android.widget.TextView,9.99,,1725440082464,True,356,884,482,960
4,18929485529,165559,2024-09-04T10:55:25.287,5,de,PENNY,de.penny.app,de.penny.app.main.view.MainActivity,,,10,android.widget.TextView,UVP,,1725440082464,True,63,986,125,1027


Since this sample file is not labeled yet, random labels are added for the sake of this notebook. They are placeholders only: randomly drawn tags do not form valid B-/I- sequences.

In [3]:
# Tag inventory used for the placeholder annotation: 'O' plus B-/I- tags for
# two dummy entity types.
labels = ClassLabel(
    names=['O', 'B-RAND1', 'I-RAND1', 'B-RAND2', 'I-RAND2'],
)

# Fix the global NumPy seed so the random labels are reproducible.
np.random.seed(69)

# Draw one class id per row, uniformly from [0, labels.num_classes).
df['Label'] = np.random.randint(low=0, high=labels.num_classes, size=len(df))
df.head()

Unnamed: 0,ID,User ID,Time,I,Language,Application Name,Package Name,Class Name,Context,View ID,...,View Class Name,Text,Description,Seen Timestamp,Is Visible,X 1,Y 1,X 2,Y 2,Label
0,18929485529,165559,2024-09-04T10:55:25.287,1,de,PENNY,de.penny.app,de.penny.app.main.view.MainActivity,,,...,de.penny.app.main.view.MainActivity,,,0,False,0,0,0,0,3
1,18929485529,165559,2024-09-04T10:55:25.287,2,de,PENNY,de.penny.app,de.penny.app.main.view.MainActivity,,android:id/content,...,android.widget.FrameLayout,,,1725440082464,True,0,0,1080,2400,1
2,18929485529,165559,2024-09-04T10:55:25.287,3,de,PENNY,de.penny.app,de.penny.app.main.view.MainActivity,,,...,android.widget.TextView,UVP 14.99,,1725440082464,True,339,833,498,874,3
3,18929485529,165559,2024-09-04T10:55:25.287,4,de,PENNY,de.penny.app,de.penny.app.main.view.MainActivity,,,...,android.widget.TextView,9.99,,1725440082464,True,356,884,482,960,2
4,18929485529,165559,2024-09-04T10:55:25.287,5,de,PENNY,de.penny.app,de.penny.app.main.view.MainActivity,,,...,android.widget.TextView,UVP,,1725440082464,True,63,986,125,1027,4


In [15]:
# Rows without text contribute no token, so drop them before grouping.
cleaned_df = df.dropna(subset=['Text'])

# Each (application, seen timestamp) pair becomes one example in the dataset;
# the rows inside a group keep their original order.
grouping_columns = ['Application Name', 'Seen Timestamp']
grouped_dfs = [group for _, group in cleaned_df.groupby(grouping_columns)]

# Fail with a clear message instead of an IndexError further down when the
# input has no usable rows.
if not grouped_dfs:
    raise ValueError("No rows with non-null 'Text' found; cannot build a dataset.")

# Per group: {'Text': [...], 'Label': [...]}. The loop variable is named
# `group` so it does not shadow the notebook-level `df`.
data_dicts = [group[['Text', 'Label']].to_dict(orient='list') for group in grouped_dfs]

# Build the columns under their final names directly instead of renaming
# them afterwards; column order stays tokens, ner_tags, id.
dataset = Dataset.from_dict({
    'tokens': [d['Text'] for d in data_dicts],
    'ner_tags': [d['Label'] for d in data_dicts],
})

# Turn the integer label ids into a sequence of ClassLabel features so the
# tag names travel with the dataset.
dataset = dataset.cast_column('ner_tags', Sequence(feature=labels))

# Running example id, 0..len(dataset)-1.
dataset = dataset.add_column('id', list(range(len(dataset))))
dataset

Casting the dataset:   0%|          | 0/6 [00:00<?, ? examples/s]

Dataset({
    features: ['tokens', 'ner_tags', 'id'],
    num_rows: 6
})

In [16]:
# Show the dataset schema to confirm that 'ner_tags' was cast to a Sequence of ClassLabel.
dataset.features

{'tokens': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None),
 'ner_tags': Sequence(feature=ClassLabel(names=['O', 'B-RAND1', 'I-RAND1', 'B-RAND2', 'I-RAND2'], id=None), length=-1, id=None),
 'id': Value(dtype='int64', id=None)}

Now the dataset is ready for further processing.