In [1]:
import pandas as pd
import numpy as np

#import package
from graphsd.datasets import load_playgroundA, load_playgroundB
from graphsd.utils import make_bins, to_dataframe
from graphsd import DigraphSDMining

## Load dataset

Here, we load the "PlaygroundA" dataset, which is divided into two datasets: *position data* and *social data*. This dataset has already been pre-processed so that it has the structure our package requires.

The *position data* should have the columns *id*, *time*, *x* and *y*. This dataset should be movement data, meaning that each line for a given *id* represents the position (*x*, *y*) of the subject with this *id* at one time instant. These lines should be ordered by id and then by time (from the earliest to the most recent timestamp).

The *social data* should include the attributes that are going to be analysed (e.g., gender, age, etc.). For the *social data*, we separate the numerical values of the dataset (e.g., age) into 3 bins, *0*, *1* and *2*, representing *low values*, *medium values* and *high values*, respectively.

Both datasets should contain a *header* row; the values should be separated by commas (,), with one record per line.

E.g.:

>id,Gender,AgeM,Emotion,ProSoc,Peer,Conduct,Hyper
>
>1,F,102,0,8,2,0,2
>
>2,F,97,2,9,0,0,1

In [2]:
# Load the PlaygroundA dataset; the loader's result is unpacked into the
# position data and the social data.
position_data, social_data = load_playgroundA()
# for playgroundB instead, uncomment the next line
#position_data, social_data = load_playgroundB()

# Bin the social data: the name `social_data` is rebound to the output of
# make_bins, so the un-binned social data is no longer available after this
# cell runs (re-run the loader above to get it back).
social_data = make_bins(social_data)

## Create Graph from dataset


In [3]:
# Fix the random seed so that this experiment gives the same results on every run.
SEED = 1234

# Number of seconds the algorithm skips when calculating the interactions:
# if the dataset has 100 positions, one per second, a time step of 10 means
# the interactions will be calculated every 10 seconds.
TIME_STEP = 10

# Create the digraph miner.
dig = DigraphSDMining(random_state=SEED)

# It is also possible to create a multidigraph instead, by using
# dig = MultiDigraphSDMining(random_state=SEED)
# NOTE(review): MultiDigraphSDMining is not among the imports at the top of
# this notebook; presumably it also comes from `graphsd` -- confirm and import
# it before uncommenting.

# Give attributes to the edges, based on both datasets.
counter = dig.read_data(position_data, social_data, time_step=TIME_STEP)

## Run subgroup discovery for the "to" version (wrt edges' attributes)

Running subgroup discovery on the digraph.

Possible modes are *to*, *from* and *comparison*.

Possible quality measures are *qP* and *qS*.

Possible metrics are *mean* and *var* (for variation)

*min_support* is used to obtain the frequent itemsets with the Orange3-Associate package, whose documentation states:
> minimal support: a minimal ratio of data instances that must support (contain) the itemset for it to be generated. For large data sets it is normal to set a lower minimal support (e.g. between 2%-0.01%)


In [4]:
# Run subgroup discovery in "to" mode with the mean metric and the qP
# quality measure.
subgroups_to = dig.subgroup_discovery(
    mode="to",
    min_support=0.20,
    metric="mean",
    quality_measure="qP",
)

# Last expression of the cell, so the resulting table is displayed.
to_dataframe(subgroups_to)

Unnamed: 0,Pattern,Nodes,in,out,Edges,Mean Weight,Score
0,"((ProSoc, 1), (Peer, 0))",15,4,15,41,4.9,2.0
1,"((Gender, M), (Emotion, 0), (Peer, 0))",16,4,15,35,5.0,1.8
2,"((Gender, M), (Peer, 0))",16,6,16,54,4.6,1.7
3,"((Peer, 0), (Conduct, 1))",16,3,16,34,4.7,1.5
4,"((ProSoc, 1),)",15,5,15,51,4.4,1.5
...,...,...,...,...,...,...,...
80,"((Conduct, 0), (Hyper, 0), (ProSoc, 2))",16,3,16,32,2.5,-2.1
81,"((Conduct, 0), (Gender, F), (ProSoc, 2))",16,3,16,32,2.5,-2.1
82,"((Conduct, 0), (ProSoc, 2))",16,3,16,32,2.5,-2.3
83,"((Conduct, 0), (Gender, F))",16,3,16,32,2.5,-2.4


## Run subgroup discovery for the "from" version (wrt edges' attributes)

In [5]:
# Run subgroup discovery in "from" mode with the mean metric and the qP
# quality measure.
subgroups_from = dig.subgroup_discovery(
    mode="from",
    min_support=0.20,
    metric="mean",
    quality_measure="qP",
)

# Last expression of the cell, so the resulting table is displayed.
to_dataframe(subgroups_from)

Unnamed: 0,Pattern,Nodes,in,out,Edges,Mean Weight,Score
0,"((Peer, 0),)",16,16,10,94,4.5,3.0
1,"((Gender, M), (Peer, 0))",16,16,6,52,4.9,2.9
2,"((ProSoc, 1), (Peer, 0))",15,15,4,38,5.2,2.8
3,"((Gender, M), (Emotion, 0), (Peer, 0))",16,15,4,34,5.2,2.5
4,"((AgeM, 2), (Peer, 0))",16,16,4,41,5.0,2.3
...,...,...,...,...,...,...,...
66,"((Gender, M), (AgeM, 0))",15,15,5,45,3.1,-1.7
67,"((AgeM, 0), (ProSoc, 0))",15,15,5,45,3.1,-1.7
68,"((AgeM, 0), (Peer, 1))",15,15,3,32,2.7,-1.8
69,"((Peer, 1), (Hyper, 2))",15,15,3,32,2.7,-2.0


## Run subgroup discovery for the "comparison" version (wrt edges' attributes)

In [6]:
# Run subgroup discovery in "comparison" mode with the qP quality measure.
# No `metric` argument is passed here, so the library's default applies.
subgroups_comp = dig.subgroup_discovery(
    mode="comparison",
    min_support=0.20,
    quality_measure="qP",
)

# Last expression of the cell, so the resulting table is displayed.
to_dataframe(subgroups_comp)

Unnamed: 0,Pattern,Nodes,in,out,Edges,Mean Weight,Score
0,"((Gender, ('M', 'M')),)",8,8,8,32,5.8,3.1
1,"((Conduct, EQ),)",16,16,16,53,4.8,2.7
2,"((Peer, EQ),)",15,15,15,73,4.3,1.7
3,"((AgeM, >),)",16,12,9,53,4.5,1.7
4,"((Emotion, EQ),)",16,16,16,54,4.5,1.5
5,"((Hyper, <),)",16,9,11,46,4.5,1.3
6,"((Gender, ('F', 'F')),)",8,8,8,49,4.2,0.8
7,"((ProSoc, <),)",16,10,11,55,3.9,0.1
8,"((Peer, <),)",16,6,13,40,3.8,0.0
9,"((ProSoc, >),)",16,11,10,52,3.8,-0.1
