In [1]:
import numpy as np
import pandas as pd
from sklearn.tree import export_graphviz, plot_tree
from graphviz import Source,Digraph
from sklearn.tree import DecisionTreeClassifier

### Generator - creates dataset 
#### n_stages - number of nodes in graph
#### n_ways - number of nodes in one action
#### size - number of objects in log

In [2]:
class Generator():
    def __init__(self, n_stages=6,  n_ways=4, size=100):
        self.df = None        
        self.n_stages = n_stages
        self.n_ways = n_ways
        self.size = size
    
    def make_df(self):
        df_len = self.n_ways * self.size
        data = pd.DataFrame(np.arange(df_len), columns=['id'])
        data['stages'] = [0] * df_len
        ways = []
       
        # creating random ways
        while len(ways) != self.n_ways:
            way = [1] # start
            way_len = np.random.randint(2, self.n_stages)
            for _ in range(way_len):
                stage = np.random.randint(2, self.n_stages)
                way.append(stage)
            way.append(self.n_stages)
            way = pd.unique(way).tolist()
            if way not in ways and len(way) > 2:
                ways.append(way)                
        ways = np.repeat(ways, self.size)
        data['stages'] = ways            
        self.df = data
        return self    
    
    def make_features(self):
        data = self.df
        data = data.explode('stages')
        data['stages'] = data['stages'].astype(str)
        data['from'] = data.groupby(['id'])['stages'].shift(1)          
        edges = data.loc[:, ['from', "stages"]].drop_duplicates().dropna().values
        edges = [list(map(str, edge)) for edge in edges]                
        feature_cols = []
        i = 0
        for stage in data['stages'].unique():               
            transitions = list(filter(lambda x: stage in x[0], edges))
            if len(transitions) > 1:                
                stage_from = stage
                stages_to = [transition[1] for transition in transitions]

                train_mask = data['from'] == stage_from

                feat_name = 'X_{}'.format(i)
                for feat, stage_to in enumerate(stages_to):
                    mask = (data["stages"] == stage_to) & (data['from'] == stage_from)
                    data.loc[mask, feat_name] = (feat + 1) 
                    data.loc[mask, feat_name] += 1.1 * np.random.random(data.loc[mask, feat_name].shape[0]) - 0.55
                feature_cols.append(feat_name)
            i += 1
        data[data.columns[3:]] = data[data.columns[3:]].fillna(0)
        self.df = data
        return feature_cols, data        

In [3]:
G = Generator(1000)
G.make_df()

<__main__.Generator at 0x1a1b130e90>

In [4]:
G.df.head()

Unnamed: 0,id,stages
0,0,"[1, 763, 396, 796, 475, 881, 36, 483, 641, 383..."
1,1,"[1, 763, 396, 796, 475, 881, 36, 483, 641, 383..."
2,2,"[1, 763, 396, 796, 475, 881, 36, 483, 641, 383..."
3,3,"[1, 763, 396, 796, 475, 881, 36, 483, 641, 383..."
4,4,"[1, 763, 396, 796, 475, 881, 36, 483, 641, 383..."


In [5]:
feature_cols, data = G.make_features()

In [6]:
data.head(10)

Unnamed: 0,id,stages,from,X_0,X_1,X_2,X_3,X_4,X_5,X_6,...,X_807,X_808,X_809,X_810,X_811,X_812,X_813,X_873,X_879,X_897
0,0,1,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0,0,763,1.0,388.487807,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0,0,396,763.0,0.0,0.784429,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0,0,796,396.0,0.0,0.0,1.214728,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0,0,475,796.0,0.0,0.0,0.0,0.618631,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0,0,881,475.0,0.0,0.0,0.0,0.0,1.446626,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0,0,36,881.0,0.0,0.0,0.0,0.0,0.0,0.490542,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0,0,483,36.0,0.0,0.0,0.0,0.0,0.0,0.0,0.461592,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0,0,641,483.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0,0,383,641.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
data.shape

(172200, 613)

### what are these features 

In [8]:
feature_cols

['X_0',
 'X_1',
 'X_2',
 'X_3',
 'X_4',
 'X_5',
 'X_6',
 'X_7',
 'X_8',
 'X_9',
 'X_10',
 'X_11',
 'X_13',
 'X_14',
 'X_15',
 'X_16',
 'X_17',
 'X_18',
 'X_19',
 'X_20',
 'X_21',
 'X_22',
 'X_23',
 'X_24',
 'X_25',
 'X_26',
 'X_27',
 'X_28',
 'X_29',
 'X_30',
 'X_31',
 'X_32',
 'X_33',
 'X_34',
 'X_35',
 'X_36',
 'X_37',
 'X_38',
 'X_39',
 'X_40',
 'X_41',
 'X_42',
 'X_43',
 'X_44',
 'X_45',
 'X_46',
 'X_48',
 'X_49',
 'X_50',
 'X_51',
 'X_52',
 'X_53',
 'X_54',
 'X_57',
 'X_58',
 'X_59',
 'X_60',
 'X_61',
 'X_62',
 'X_63',
 'X_64',
 'X_65',
 'X_66',
 'X_67',
 'X_68',
 'X_69',
 'X_70',
 'X_71',
 'X_72',
 'X_73',
 'X_74',
 'X_75',
 'X_76',
 'X_77',
 'X_78',
 'X_79',
 'X_80',
 'X_81',
 'X_82',
 'X_83',
 'X_84',
 'X_85',
 'X_86',
 'X_87',
 'X_88',
 'X_89',
 'X_90',
 'X_91',
 'X_92',
 'X_94',
 'X_95',
 'X_96',
 'X_97',
 'X_98',
 'X_99',
 'X_100',
 'X_101',
 'X_102',
 'X_104',
 'X_105',
 'X_106',
 'X_108',
 'X_109',
 'X_111',
 'X_113',
 'X_114',
 'X_115',
 'X_116',
 'X_117',
 'X_118',
 'X_1

In [9]:
class RuleFinder():
    def __init__(self, df, id_col, stage_col, feature_cols):
        self.df = df
        self.id_col = id_col
        self.stage_col = stage_col
        self.feature_cols = feature_cols
        self.graph = None
        
    # creating graph with stages as nodes and ways as edges
    def create_graph(self):
        stages = self.df[self.stage_col].unique()
        graph = Digraph(comment='Process', format = 'pdf')
        for stage in stages:
            graph.node(str(stage))
        self.df['from'] = self.df.groupby([self.id_col])[self.stage_col].shift(1)
        edges = self.df.loc[:, ['from', self.stage_col]].drop_duplicates().dropna().values
        edges = [list(map(str, edge)) for edge in edges]
        graph.edges(edges)
        self.graph = graph
        return graph
    
    def _save_dot(self, trees):
        for num, tree in trees.items():
            if tree != None:
                export_graphviz(trees[num], out_file='tree{}.dot'.format(num),
                    feature_names=self.feature_cols,node_ids=True)
    
#     def _make_features(self):
#         feature_cols = []
#         i = 0
#         for stage in self.df[self.stage_col].unique():
#             transitions_str = list(filter(lambda x: '\t{} -> '.format(stage) in x, self.graph.body))
#             if len(transitions_str) > 1:
#                 feature_cols.append('X_{}'.format(i))
#                 self.df['X_{}'.format(i)] = np.zeros(shape=self.df.shape[0])
                
#                 transitions = list(map(lambda x: x.strip().split(" -> "), transitions_str))

#                 stage_from = stage
#                 stages_to = [transition[1] for transition in transitions]

#                 train_mask = self.df['from'] == stage_from

#                 for feat, stage_to in enumerate(stages_to):
#                     mask = (self.df[self.stage_col] == stage_to) & (self.df['from'] == stage_from)
#                     self.df.loc[mask, 'X_{}'.format(i)] = feat + 1
#                     self.df.loc[mask, 'X_{}'.format(i)] += 1.5*np.random.random(
#                         self.df.loc[mask, 'X_{}'.format(i)].shape[0]) -0.75
#                 i += 1
#         self.feature_cols = feature_cols
#         return self.df
    
    def compute_trees(self,min_impurity_decrease = 0.0015):
        trees = {}
        for stage in self.df[self.stage_col].unique():
            transitions_str = list(filter(lambda x: '\t{} -> '.format(stage) in x, self.graph.body))
            if len(transitions_str) > 1:
                transitions = list(map(lambda x: x.strip().split(" -> "), transitions_str))

                stage_from = stage
                stages_to = [transition[1] for transition in transitions]

                train_mask = self.df['from'] == stage_from

                for stage_to in stages_to:
                    mask = (self.df[self.stage_col] == stage_to) & (self.df['from'] == stage_from)
                    self.df.loc[mask, '_target_'] = stage_to

                X = self.df.loc[train_mask, self.feature_cols].values
                y = self.df.loc[train_mask, '_target_']
                
                min_class_ratio = 0.3 * np.min(y.value_counts()) / y.shape[0]
                tree = DecisionTreeClassifier(
                    min_samples_leaf=min_class_ratio, 
                    min_impurity_split=0.2, 
                    min_impurity_decrease=min_impurity_decrease)
                tree.fit(X, y)
                
                trees[stage] = tree
            else:
                trees[stage] = None
        self.df.drop('_target_', inplace=True, axis=1)
        
        self._save_dot(trees)
        return trees
    
    

In [10]:
F = RuleFinder(data, 'id', 'stages', feature_cols)

In [11]:
%%time
graph = F.create_graph()

CPU times: user 54.3 ms, sys: 23.8 ms, total: 78.1 ms
Wall time: 77.3 ms


In [None]:
graph

In [None]:
graph.body

graph.view()

In [None]:
gen = Generator(n_stages=5, n_ways=10).make_df()
feature_cols, data = gen.make_features()

In [None]:
rule_finder = RuleFinder(data, 'id', 'stages', feature_cols)
graph = rule_finder.create_graph()
trees = rule_finder.compute_trees()

In [None]:
trees

In [None]:
file = open('tree{}.dot'.format('2'), 'r')
text=file.read()
Source(text)

In [None]:
graph.view()