## Load modules

In [1]:
from itertools import combinations

## Read file

In [2]:
fileRDD = sc.textFile("account_operations.csv")
fileRDD

account_operations.csv MapPartitionsRDD[1] at textFile at NativeMethodAccessorImpl.java:-2

## Show the data

In [3]:
fileRDD.take(5)

[u'20.07.2015',
 u'"Rank,""Frequency"",""% of Total Paths"",""Path"""',
 u'"1,""680"",""0.23726448011165388"",""payment_start->payment details overlay->payment verify overlay->payment confirmation overlay->payment_complete"""',
 u'"2,""370"",""0.12909979064898813"",""payment hub - payment cards->payment hub - payment verification->payment hub - payment confirmation"""',
 u'"3,""77"",""0.026866713189113746"",""payment_start->payment details overlay->payment verify overlay->payment confirmation overlay->payment_complete->accounts homepage->payment_start->payment details overlay->payment verify overlay->payment confirmation overlay->payment_complete"""']

## Split columns

In [4]:
splited_RDD = fileRDD.map(lambda line: line.split(","))
splited_RDD.take(3)

[[u'20.07.2015'],
 [u'"Rank', u'""Frequency""', u'""% of Total Paths""', u'""Path"""'],
 [u'"1',
  u'""680""',
  u'""0.23726448011165388""',
  u'""payment_start->payment details overlay->payment verify overlay->payment confirmation overlay->payment_complete"""']]

In [5]:
# choose the needed columns
pathRDD = splited_RDD.filter(lambda line: len(line)>1).map(lambda line: (line[3],line[1]))
pathRDD.collect()

[(u'""Path"""', u'""Frequency""'),
 (u'""payment_start->payment details overlay->payment verify overlay->payment confirmation overlay->payment_complete"""',
  u'""680""'),
 (u'""payment hub - payment cards->payment hub - payment verification->payment hub - payment confirmation"""',
  u'""370""'),
 (u'""payment_start->payment details overlay->payment verify overlay->payment confirmation overlay->payment_complete->accounts homepage->payment_start->payment details overlay->payment verify overlay->payment confirmation overlay->payment_complete"""',
  u'""77""'),
 (u'""payment date entry->payment stored->payment stored->payment amount->payment amount->payment confirm->payment success confirmation"""',
  u'""71""'),
 (u'""payment hub - payment cards->payment hub - payment account->payment hub -payment account verification->payment hub - payment account confirmation->payment hub - payment cards->payment hub - payment verification->payment hub - payment confirmation"""',
  u'""58""'),
 (u'""pa

## Drop head

In [6]:
header = pathRDD.first()
data_pathRDD = pathRDD.filter(lambda line: line != header)
data_pathRDD.take(3)

[(u'""payment_start->payment details overlay->payment verify overlay->payment confirmation overlay->payment_complete"""',
  u'""680""'),
 (u'""payment hub - payment cards->payment hub - payment verification->payment hub - payment confirmation"""',
  u'""370""'),
 (u'""payment_start->payment details overlay->payment verify overlay->payment confirmation overlay->payment_complete->accounts homepage->payment_start->payment details overlay->payment verify overlay->payment confirmation overlay->payment_complete"""',
  u'""77""')]

## Drop quotes 

In [7]:
cleaned_pathRDD = data_pathRDD.map(lambda x: (x[0][2:-3], x[1][2:-2]))
cleaned_pathRDD.take(3)

[(u'payment_start->payment details overlay->payment verify overlay->payment confirmation overlay->payment_complete',
  u'680'),
 (u'payment hub - payment cards->payment hub - payment verification->payment hub - payment confirmation',
  u'370'),
 (u'payment_start->payment details overlay->payment verify overlay->payment confirmation overlay->payment_complete->accounts homepage->payment_start->payment details overlay->payment verify overlay->payment confirmation overlay->payment_complete',
  u'77')]

## Sum of frequencies

In [8]:
frec = cleaned_pathRDD.map(lambda x: int(x[1]))
frec.take(10)

[680, 370, 77, 71, 58, 58, 44, 42, 35, 22]

In [9]:
sum_frec = frec.sum()
sum_frec

1471

In [10]:
n_paths = frec.count()
n_paths

13

## Find patterns

In [11]:
def patterns(text):
    text = text.split('->')
    output = [list(combinations(text, i)) for i in range(len(text) + 1)]
    paths = []
    for item in output:
        for i in range(len(item)):
            if len(item[i])>1 and len(item[i])<6:
                paths.append('->'.join(item[i]))
    return paths

In [12]:
patt = 'A->B->C->D'
patt_spl = patt.split('->')
patt_spl

['A', 'B', 'C', 'D']

In [13]:
output = [list(combinations(patt_spl, i)) for i in range(len(patt_spl) + 1)]
output

[[()],
 [('A',), ('B',), ('C',), ('D',)],
 [('A', 'B'), ('A', 'C'), ('A', 'D'), ('B', 'C'), ('B', 'D'), ('C', 'D')],
 [('A', 'B', 'C'), ('A', 'B', 'D'), ('A', 'C', 'D'), ('B', 'C', 'D')],
 [('A', 'B', 'C', 'D')]]

In [14]:
paths = []
for item in output:
    for i in range(len(item)):
        if len(item[i])>1 and len(item[i])<6:
            paths.append('->'.join(item[i]))
                
paths

['A->B',
 'A->C',
 'A->D',
 'B->C',
 'B->D',
 'C->D',
 'A->B->C',
 'A->B->D',
 'A->C->D',
 'B->C->D',
 'A->B->C->D']

In [15]:
sub_patterns = cleaned_pathRDD.map(lambda (x,y): (patterns(x), y))
sub_patterns.take(2)

[([u'payment_start->payment details overlay',
   u'payment_start->payment verify overlay',
   u'payment_start->payment confirmation overlay',
   u'payment_start->payment_complete',
   u'payment details overlay->payment verify overlay',
   u'payment details overlay->payment confirmation overlay',
   u'payment details overlay->payment_complete',
   u'payment verify overlay->payment confirmation overlay',
   u'payment verify overlay->payment_complete',
   u'payment confirmation overlay->payment_complete',
   u'payment_start->payment details overlay->payment verify overlay',
   u'payment_start->payment details overlay->payment confirmation overlay',
   u'payment_start->payment details overlay->payment_complete',
   u'payment_start->payment verify overlay->payment confirmation overlay',
   u'payment_start->payment verify overlay->payment_complete',
   u'payment_start->payment confirmation overlay->payment_complete',
   u'payment details overlay->payment verify overlay->payment confirmation 

## Find weights for each pattern

In [16]:
def pair(x, y):
    w = []
    for i in x:
        w.append((i,float(int(y))/sum_frec))
    return w        

In [17]:
p_weights =  sub_patterns.map(lambda x: pair(x[0], x[1]))
p_weights.take(2)

[[(u'payment_start->payment details overlay', 0.46227056424201224),
  (u'payment_start->payment verify overlay', 0.46227056424201224),
  (u'payment_start->payment confirmation overlay', 0.46227056424201224),
  (u'payment_start->payment_complete', 0.46227056424201224),
  (u'payment details overlay->payment verify overlay', 0.46227056424201224),
  (u'payment details overlay->payment confirmation overlay',
   0.46227056424201224),
  (u'payment details overlay->payment_complete', 0.46227056424201224),
  (u'payment verify overlay->payment confirmation overlay',
   0.46227056424201224),
  (u'payment verify overlay->payment_complete', 0.46227056424201224),
  (u'payment confirmation overlay->payment_complete', 0.46227056424201224),
  (u'payment_start->payment details overlay->payment verify overlay',
   0.46227056424201224),
  (u'payment_start->payment details overlay->payment confirmation overlay',
   0.46227056424201224),
  (u'payment_start->payment details overlay->payment_complete',
   0.4

## Collect al subpatterns. 

In [18]:
p_all = p_weights.flatMap(lambda x: x)
p_all.take(10)

[(u'payment_start->payment details overlay', 0.46227056424201224),
 (u'payment_start->payment verify overlay', 0.46227056424201224),
 (u'payment_start->payment confirmation overlay', 0.46227056424201224),
 (u'payment_start->payment_complete', 0.46227056424201224),
 (u'payment details overlay->payment verify overlay', 0.46227056424201224),
 (u'payment details overlay->payment confirmation overlay',
  0.46227056424201224),
 (u'payment details overlay->payment_complete', 0.46227056424201224),
 (u'payment verify overlay->payment confirmation overlay',
  0.46227056424201224),
 (u'payment verify overlay->payment_complete', 0.46227056424201224),
 (u'payment confirmation overlay->payment_complete', 0.46227056424201224)]

In [19]:
p_grouped = p_all.reduceByKey(lambda a,b: a+b).cache()
p_grouped.take(5)

[(u'payment details overlay->payment details overlay->payment verify overlay->payment confirmation overlay',
  0.07613868116927261),
 (u'payment_start->payment verify overlay->payment confirmation overlay->payment_complete->payment details overlay',
  0.07613868116927261),
 (u'payment hub -payment account verification->payment hub - payment verification->payment hub - payment confirmation',
  0.04486743711760707),
 (u'payment verify overlay->payment confirmation overlay->payment_complete->payment_start',
  0.07613868116927261),
 (u'payment_start->payment details overlay->accounts homepage',
  0.07613868116927261)]

## Take top patterns

In [20]:
top_20 = p_grouped.takeOrdered(20, lambda s: -1 * s[1])
top_20

[(u'payment_start->payment details overlay->payment verify overlay->payment confirmation overlay->payment_complete',
  0.9191026512576483),
 (u'payment_start->payment verify overlay->payment confirmation overlay->payment_complete',
  0.8429639700883756),
 (u'payment_start->payment details overlay->payment verify overlay->payment confirmation overlay',
  0.8429639700883756),
 (u'payment details overlay->payment verify overlay->payment confirmation overlay->payment_complete',
  0.8429639700883756),
 (u'payment_start->payment details overlay->payment verify overlay->payment_complete',
  0.8429639700883756),
 (u'payment_start->payment details overlay->payment confirmation overlay->payment_complete',
  0.8429639700883756),
 (u'payment_start->payment verify overlay->payment_complete',
  0.766825288919103),
 (u'payment_start->payment details overlay->payment verify overlay',
  0.766825288919103),
 (u'payment_start->payment details overlay->payment confirmation overlay',
  0.766825288919103),


## Save all results to file

In [69]:
p_grouped.saveAsTextFile("subpaterns_result.csv")