In [217]:
%load_ext google.cloud.bigquery

import numpy as np
import pandas as pd
from operator import mul
from functools import reduce
from datetime import datetime

The google.cloud.bigquery extension is already loaded. To reload it, use:
  %reload_ext google.cloud.bigquery


#### Fake dataset 1 - prosty graf

In [185]:
fake1 = pd.DataFrame([
     {'prev':'START', 'next':'A', 'prob':0.3}
    ,{'prev':'START', 'next':'B', 'prob':0.7}
    ,{'prev':'A', 'next':'C', 'prob':0.2}
    ,{'prev':'A', 'next':'D', 'prob':0.8}
    ,{'prev':'B', 'next':'D', 'prob':1.0}
    ,{'prev':'D', 'next':'C', 'prob':0.2}
    ,{'prev':'C', 'next':'A', 'prob':0.3}
    ,{'prev':'C', 'next':'N_CONV', 'prob':0.7}
    ,{'prev':'D', 'next':'N_CONV', 'prob':0.1}
    ,{'prev':'D', 'next':'CONV', 'prob':0.7}
])

print(fake1)

    prev    next  prob
0  START       A   0.3
1  START       B   0.7
2      A       C   0.2
3      A       D   0.8
4      B       D   1.0
5      D       C   0.2
6      C       A   0.3
7      C  N_CONV   0.7
8      D  N_CONV   0.1
9      D    CONV   0.7


In [224]:
class Crawler:
    graph = pd.DataFrame()
    active_crawlers = []
    paths = []
    probs_conv = []
    probs_n_conv = []
    current_index = 0
    verbose = False
    treshold = 3
    step_count = 0
    
    def __init__(self, history, probs):
        if Crawler.graph.empty:
            Crawler.message('Before using Crawler you must load a graph! (Crawler.load_graph())')
        else:
            Crawler.active_crawlers.append(self)
            self.index = Crawler.current_index
            Crawler.current_index += 1
            self.history = history
            self.probs = probs
            Crawler.message('Created crawler{}, active crawlers: {} ({})'.format(self.index, len(Crawler.active_crawlers), self.history))
    
    def _step(self):
        current_node = self.history[-1]
        exits = Crawler.graph[Crawler.graph['prev'] == current_node]['next'].to_list()
        for e in exits:
            if self.history.count(e) > Crawler.treshold:
                exits.remove(e)
        if len(exits) == 0:
            Crawler.message('Crawler{} finished'.format(self.index))
            Crawler.message('\tCrawler{}\'s path: {} '.format(self.index, self.history))
            Crawler.message('\tCrawler{}\'s probs: {} '.format(self.index, self.probs))
            if current_node == 'CONV':
                Crawler.paths.append(self.history)
                Crawler.probs_conv.append(self.probs)
            if current_node == 'N_CONV':
                Crawler.paths.append(self.history)
                Crawler.probs_n_conv.append(self.probs)

        else:
            for e in exits:             
                p = Crawler.graph.loc[(Crawler.graph['prev'] == current_node) & (Crawler.graph['next'] == e)]['prob'].to_list()
                new = Crawler(self.history.copy(), self.probs.copy())
                new.history.append(e)
                new.probs += p
                new._step()
        idx = self.index
        Crawler.active_crawlers.remove(self)
        Crawler.message('Removed crawler{}, active crawlers: {}'.format(idx, len(Crawler.active_crawlers)))
    
    @staticmethod
    def message(txt):
        if Crawler.verbose:
            Crawler.step_count += 1
            step = Crawler.step_count
            t = Crawler.get_time()
            print('{}: ({}) {}'.format(step, t, txt))
            
    @staticmethod
    def set_verbose(verbose=True):
        Crawler.verbose = verbose
        
    
    @staticmethod
    def set_treshold(treshold=3):
        Crawler.treshold = treshold
    
    @staticmethod
    def start():
        c0 = Crawler(['START'], [])
        c0._step()
            
    @staticmethod
    def load_graph(graph):
        Crawler.graph = graph
        
    @staticmethod
    def reset():
        Crawler.graph = pd.DataFrame()
        Crawler.active_crawlers = []
        Crawler.paths = []
        Crawler.probs_conv = []
        Crawler.probs_n_conv = []
        Crawler.current_index = 0
        Crawler.verbose = False
        Crawler.treshold = 3
        Crawler.step_count = 0
        
    @staticmethod
    def get_time():
        return(datetime.now().strftime("%H:%M:%S.%f"))
            
    @staticmethod
    def result():
        conv = 0
        n_conv = 0
        for plist in Crawler.probs_conv:
            conv += reduce(mul, plist)
        for plist in Crawler.probs_n_conv:
            n_conv += reduce(mul, plist)
        return(conv, n_conv)
    

In [230]:
Crawler.reset()
Crawler.load_graph(fake1)
Crawler.set_treshold(3)
#Crawler.set_verbose()
Crawler.start()
print(Crawler.result())

(0.7046793093222399, 0.29526082916352)
