### Frequent Sequence 를 찾은 후 Feature로 이용하는 모듈 (faster ver)

달라진 점: 기존 161024 version에서는 pymining package로 support가 일정 이상인, frequent sequence를 찾은 후 information gain 계산을 위해 data frame 전체에 검색해서 TP, TN, FP, FN를 계산했는데 이 방법으로는 한 사이클에 30분 이상이 걸렸었다.

이 version에서는 frequent sequence를 찾으면서, 해당 sequence의 revisit intention을 동시에 기록해 놓아 information gain 계산을 하는데 걸렸던 검색 시간을 없애서 한 사이클에 20초 내외로 끝낼 수 있었다.

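To-Do: Scikit-learn에서 10-fold cross validation을 제공해 주는데, 보통 feature들을 다 뽑은 후에 cross validation을 적용하기 때문에, 현재는 frequent sequence가 test data까지 모두 합한 전체 데이터를 기반으로 찾아짐 (test data의 정보가 feature 생성에 새어 들어감). 각 fold의 training data만으로 frequent sequence를 찾도록 고쳐야 함.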

In [2]:
import pandas as pd
from pymining import seqmining
import numpy as np
from scipy.special import entr
from collections import defaultdict

In [3]:
# Load the pre-built trajectory DataFrame (per the path, the balanced set for place 786).
# NOTE: unpickling executes code from the file, so only load pickles you created yourself.
trajs_combined_balanced = pd.read_pickle("../data/786/786_trajs_combined_balanced.p")

In [6]:
# Cast the label to int so the miner can add it up as a 0/1 count.
trajs_combined_balanced['revisit_intention'] = trajs_combined_balanced['revisit_intention'].astype(int)
# One (traj, revisit_intention) tuple per row: the input format freq_seq_enum iterates over.
seqs = trajs_combined_balanced.apply(lambda x: (x['traj'], x['revisit_intention']), axis=1)

In [80]:
seqs.shape[0]

8886

In [7]:
# Mine every sequence whose support is at least 200 (the second argument is min_support).
freq_seqs = freq_seq_enum(seqs, 200)

In [81]:
freq_seqs

{(('1f',), 6509, 3359),
 (('1f', '1f'), 617, 431),
 (('1f', '1f', '1f-inner'), 201, 141),
 (('1f', '1f', '1f-left'), 225, 163),
 (('1f', '1f', '1f-right'), 341, 236),
 (('1f', '1f-inner'), 1827, 1005),
 (('1f', '1f-inner', '1f'), 207, 143),
 (('1f', '1f-inner', '1f-left'), 610, 372),
 (('1f', '1f-inner', '1f-left', '1f-right'), 229, 142),
 (('1f', '1f-inner', '1f-right'), 657, 390),
 (('1f', '1f-inner', '1f-right', '1f-left'), 203, 130),
 (('1f', '1f-inner', '2f'), 432, 265),
 (('1f', '1f-inner', '2f-right'), 211, 122),
 (('1f', '1f-inner', 'out'), 407, 210),
 (('1f', '1f-left'), 1823, 1069),
 (('1f', '1f-left', '1f-inner'), 729, 429),
 (('1f', '1f-left', '1f-inner', '1f-right'), 206, 132),
 (('1f', '1f-left', '1f-right'), 717, 423),
 (('1f', '1f-left', '1f-right', '1f-inner'), 217, 134),
 (('1f', '1f-left', '2f'), 453, 282),
 (('1f', '1f-left', '2f-right'), 218, 124),
 (('1f', '1f-left', 'out'), 405, 237),
 (('1f', '1f-right'), 2608, 1486),
 (('1f', '1f-right', '1f'), 301, 218),
 (('1

In [82]:
# Order the mined (sequence, support, revisit_support) tuples by support, largest first.
freq_seqs_sorted = sorted(freq_seqs, key=lambda tup: tup[1], reverse=True)

In [11]:
# Print every mined tuple whose sequence contains at least four items.
for x in freq_seqs_sorted:
    if (len(x[0]) >= 4):
#         if (x[0][0] != 'out') & (x[0][0] != 'in'):
        print(x)


(('out', 'in', '1f', '1f-right'), 2607, 1486)
(('out', 'in', '1f', '1f-inner'), 1827, 1005)
(('out', 'in', '1f', '1f-left'), 1823, 1069)
(('out', 'in', '1f', '2f'), 1612, 883)
(('out', 'in', '1f', 'out'), 1541, 832)
(('out', 'out', 'in', '1f'), 1390, 771)
(('out', 'in', '1f-right', '1f-left'), 1136, 700)
(('out', 'in', '2f', '1f'), 1094, 666)
(('out', 'in', '2f', '2f-right'), 1032, 564)
(('out', 'in', '1f-right', '1f-inner'), 1002, 589)
(('out', 'in', '2f', '2f-left'), 995, 548)
(('out', 'in', '2f', '1f-right'), 914, 609)
(('out', 'in', '1f', '1f-right', '1f-left'), 898, 556)
(('out', '1f', '1f-right', '1f-left'), 898, 556)
(('in', '1f', '1f-right', '1f-left'), 898, 556)
(('out', 'in', '1f-right', '1f'), 884, 546)
(('out', 'in', '1f-left', '1f-right'), 872, 508)
(('out', 'in', '1f-left', '1f-inner'), 838, 485)
(('out', 'in', '1f-right', '2f'), 836, 486)
(('out', 'in', '1f', '2f-right'), 806, 437)
(('in', '1f', '1f-right', '1f-inner'), 805, 477)
(('out', '1f', '1f-right', '1f-inner'), 8

In [84]:
# Keep only the mined tuples whose sequence has four or more items, then report how many remain.
freq_seqs_sample = [entry for entry in freq_seqs_sorted if len(entry[0]) >= 4]
print(len(freq_seqs_sample))

491


In [16]:
# Group the sequences by their (support, revisit_support) pair: sequences sharing
# both counts end up in the same list.
freq_seqs_sample2 = {}
for pattern, support, revisit_support in freq_seqs_sample:
    group_key = (support, revisit_support)
    if group_key not in freq_seqs_sample2:
        freq_seqs_sample2[group_key] = []
    freq_seqs_sample2[group_key].append(pattern)
freq_seqs_sample2

{(200, 115): [('out', 'in', '1f-left', '2f-inner')],
 (201, 127): [('out', 'in', '1f', '1f-right', '1f-inner', '2f'),
  ('1f', '1f-right', '1f-inner', '2f'),
  ('in', '1f', '1f-right', '1f-inner', '2f'),
  ('out', '1f', '1f-right', '1f-inner', '2f')],
 (201, 141): [('out', '1f', '1f', '1f-inner'),
  ('in', '1f', '1f', '1f-inner'),
  ('out', 'in', '1f', '1f', '1f-inner')],
 (202, 114): [('out', 'in', '1f', '2f', '2f-right', '2f-left'),
  ('out', '1f', '2f', '2f-right', '2f-left'),
  ('1f', '2f', '2f-right', '2f-left'),
  ('in', '1f', '2f', '2f-right', '2f-left')],
 (202, 131): [('out', 'in', '1f', '1f-right', '1f-left', 'out'),
  ('in', '1f', '1f-right', '1f-left', 'out'),
  ('out', '1f', '1f-right', '1f-left', 'out'),
  ('1f', '1f-right', '1f-left', 'out')],
 (202, 140): [('out', 'in', 'out', 'in', '1f-right'),
  ('out', 'in', 'out', '1f-right'),
  ('in', 'out', 'in', '1f-right')],
 (203, 130): [('1f', '1f-inner', '1f-right', '1f-left'),
  ('out', '1f', '1f-inner', '1f-right', '1f-left

In [17]:
def recursivelyFindLongestSequence(aabaaba, new_list):
    """Extend ``new_list`` with the sequences of ``aabaaba`` it does not already cover.

    Walks ``aabaaba`` in order and appends a sequence to ``new_list`` unless it
    is a subsequence (per ``is_subseq``) of something already in ``new_list``,
    including sequences appended earlier in this same call. The kept sequences
    are then removed from ``aabaaba``.

    The previous implementation recursed until ``list.remove`` raised and hid
    that (and every other error) behind a bare ``except: pass``. A second pass
    can never add anything, because every remaining item is already covered,
    so a single explicit pass is sufficient.

    :param aabaaba: mutable list of candidate sequences; modified in place.
    :param new_list: list of sequences kept so far; modified in place.
    :return: None. The result is delivered through ``new_list``.
    """
    for item in aabaaba:
        # Keep the item only when none of the kept sequences contains it.
        if not any(is_subseq(item, kept) for kept in new_list):
            new_list.append(item)

    # Drop the kept sequences from the candidate list, as the old code did.
    aabaaba[:] = [item for item in aabaaba if item not in new_list]

In [19]:
# Within each (support, revisit_support) group, keep only the sequences that
# recursivelyFindLongestSequence retains (starting from the longest one), and
# pair every kept sequence with the group's counts.
freqfreqfreq = []

for counts, candidates in freq_seqs_sample2.items():
    if len(candidates) <= 1:
        # A single candidate has nothing to be merged with.
        freqfreqfreq.append((candidates[0], counts))
        continue
    ordered = sorted(candidates, key=len, reverse=True)
    kept = [ordered[0]]
    recursivelyFindLongestSequence(ordered, kept)
    freqfreqfreq.extend((sequence, counts) for sequence in kept)

# Largest (support, revisit_support) pair first.
freqfreqfreq.sort(key=lambda pair: pair[1], reverse=True)
print(len(freqfreqfreq))
freqfreqfreq

    

217


[(('out', 'in', '1f', '1f-right'), (2607, 1486)),
 (('out', 'in', '1f', '1f-inner'), (1827, 1005)),
 (('out', 'in', '1f', '1f-left'), (1823, 1069)),
 (('out', 'in', '1f', '2f'), (1612, 883)),
 (('out', 'in', '1f', 'out'), (1541, 832)),
 (('out', 'out', 'in', '1f'), (1390, 771)),
 (('out', 'in', '1f-right', '1f-left'), (1136, 700)),
 (('out', 'in', '2f', '1f'), (1094, 666)),
 (('out', 'in', '2f', '2f-right'), (1032, 564)),
 (('out', 'in', '1f-right', '1f-inner'), (1002, 589)),
 (('out', 'in', '2f', '2f-left'), (995, 548)),
 (('out', 'in', '2f', '1f-right'), (914, 609)),
 (('out', 'in', '1f', '1f-right', '1f-left'), (898, 556)),
 (('out', 'in', '1f-right', '1f'), (884, 546)),
 (('out', 'in', '1f-left', '1f-right'), (872, 508)),
 (('out', 'in', '1f-left', '1f-inner'), (838, 485)),
 (('out', 'in', '1f-right', '2f'), (836, 486)),
 (('out', 'in', '1f', '2f-right'), (806, 437)),
 (('out', 'in', '1f', '1f-right', '1f-inner'), (805, 477)),
 (('out', 'in', '1f-inner', '1f-right'), (754, 449)),
 

In [20]:
# Drop the counts and keep only the sequences with at least four items.
freqfreqfreqfreq = [pair[0] for pair in freqfreqfreq if len(pair[0]) >= 4]
freqfreqfreqfreq

[('out', 'in', '1f', '1f-right'),
 ('out', 'in', '1f', '1f-inner'),
 ('out', 'in', '1f', '1f-left'),
 ('out', 'in', '1f', '2f'),
 ('out', 'in', '1f', 'out'),
 ('out', 'out', 'in', '1f'),
 ('out', 'in', '1f-right', '1f-left'),
 ('out', 'in', '2f', '1f'),
 ('out', 'in', '2f', '2f-right'),
 ('out', 'in', '1f-right', '1f-inner'),
 ('out', 'in', '2f', '2f-left'),
 ('out', 'in', '2f', '1f-right'),
 ('out', 'in', '1f', '1f-right', '1f-left'),
 ('out', 'in', '1f-right', '1f'),
 ('out', 'in', '1f-left', '1f-right'),
 ('out', 'in', '1f-left', '1f-inner'),
 ('out', 'in', '1f-right', '2f'),
 ('out', 'in', '1f', '2f-right'),
 ('out', 'in', '1f', '1f-right', '1f-inner'),
 ('out', 'in', '1f-inner', '1f-right'),
 ('out', 'in', '1f', '2f-left'),
 ('out', 'in', '1f', '1f-left', '1f-inner'),
 ('out', 'in', '1f', '1f-left', '1f-right'),
 ('out', 'in', '2f', '2f-inner'),
 ('out', 'in', '1f-inner', '1f-left'),
 ('out', 'in', '1f', '1f-right', '2f'),
 ('out', 'in', '2f', 'out'),
 ('out', 'in', '1f', '2f-inne

In [45]:
# Class sizes: how many rows have revisit_intention == 1 and == 0.
label_counts = trajs_combined_balanced.revisit_intention.value_counts()
num1 = label_counts.loc[1]
num0 = label_counts.loc[0]

In [67]:
# Turn each (sequence, (support, revisit_support)) pair into the four cells of
# the sequence-vs-label table plus the resulting information gain. No scan of
# the data frame is needed: the cells follow from the mined counts and the
# class sizes num1 / num0.
freqfreqfreq2 = []
for sequence, counts in freqfreqfreq:
    with_seq_revisit = counts[1]
    with_seq_no_revisit = counts[0] - counts[1]
    without_seq_revisit = num1 - with_seq_revisit
    without_seq_no_revisit = num0 - with_seq_no_revisit
    gain = informationGain(
        with_seq_revisit,
        with_seq_no_revisit,
        without_seq_revisit,
        without_seq_no_revisit,
    )
    stats = [
        with_seq_revisit,
        with_seq_no_revisit,
        without_seq_revisit,
        without_seq_no_revisit,
        gain,
    ]
    freqfreqfreq2.append((sequence, stats))
freqfreqfreq2

[(('out', 'in', '1f', '1f-right'),
  [1486, 1121, 2957, 3322, 0.0058854516458606287]),
 (('out', 'in', '1f', '1f-inner'),
  [1005, 822, 3438, 3621, 0.0018756587139771019]),
 (('out', 'in', '1f', '1f-left'),
  [1069, 754, 3374, 3689, 0.005581544098560598]),
 (('out', 'in', '1f', '2f'), [883, 729, 3560, 3714, 0.0014608174348673675]),
 (('out', 'in', '1f', 'out'), [832, 709, 3611, 3734, 0.00096504113499285005]),
 (('out', 'out', 'in', '1f'), [771, 619, 3672, 3824, 0.0016022306616736426]),
 (('out', 'in', '1f-right', '1f-left'),
  [700, 436, 3743, 4007, 0.0057564452877962857]),
 (('out', 'in', '2f', '1f'), [666, 428, 3777, 4015, 0.0048271636700197851]),
 (('out', 'in', '2f', '2f-right'),
  [564, 468, 3879, 3975, 0.00082124520261728229]),
 (('out', 'in', '1f-right', '1f-inner'),
  [589, 413, 3854, 4030, 0.0028415885563155552]),
 (('out', 'in', '2f', '2f-left'),
  [548, 447, 3895, 3996, 0.00093863777824565453]),
 (('out', 'in', '2f', '1f-right'),
  [609, 305, 3834, 4138, 0.009307784296493815

In [74]:
# sequence -> [four table cells..., information gain]
igdict = dict(freqfreqfreq2)

# Rank by information gain (the last element of each stats list), highest first.
sortE = sorted(igdict.items(), key=lambda entry: entry[1][-1], reverse=True)

In [75]:
sortE

[(('out', 'in', '2f', '1f-right'),
  [609, 305, 3834, 4138, 0.0093077842964938151]),
 (('out', 'in', '1f', '1f'), [431, 185, 4012, 4258, 0.0087958376956862594]),
 (('out', 'in', 'in', '1f'), [347, 144, 4096, 4299, 0.0074204189040372182]),
 (('out', 'in', '2f', '1f', '1f-right'),
  [381, 172, 4062, 4271, 0.0069998582169473744]),
 (('out', 'in', '2f', '1f-left'),
  [414, 197, 4029, 4246, 0.0068569278804576639]),
 (('out', 'in', '1f', '2f', '1f-right'),
  [350, 156, 4093, 4287, 0.0065599673506357981]),
 (('out', 'in', '2f-right', '1f-right'),
  [321, 141, 4122, 4302, 0.006158808644885605]),
 (('out', 'in', '2f', '1f-inner'),
  [345, 158, 4098, 4285, 0.0061199911937140916]),
 (('out', 'in', '2f', '2f-right', '1f-right'),
  [270, 108, 4173, 4335, 0.0060731090524346332]),
 (('out', 'in', '1f', '1f-right'),
  [1486, 1121, 2957, 3322, 0.0058854516458606287]),
 (('out', 'in', '1f', '2f', '1f'),
  [251, 98, 4192, 4345, 0.0058569702627397913]),
 (('out', 'in', '1f-right', '1f-left'),
  [700, 436,

In [17]:
def is_subseq(x, y):
    """Return True when the items of ``x`` occur in ``y`` in the same order.

    The items do not have to be adjacent in ``y``. An empty ``x`` is a
    subsequence of anything.
    """
    remaining = iter(y)
    for wanted in x:
        # Consume ``y`` until the wanted item shows up; later items of ``x``
        # continue searching from that position.
        for candidate in remaining:
            if candidate is wanted or candidate == wanted:
                break
        else:
            # ``y`` ran out before the item was found.
            return False
    return True

assert is_subseq('india', 'indonesia')
assert is_subseq('oman', 'romania')
assert is_subseq('mali', 'malawi')
assert is_subseq((''), ('a', 'b', 'c', 'd', 'e'))
assert not is_subseq('mali', 'banana')
assert not is_subseq('ais', 'indonesia')
assert not is_subseq('ca', 'abc')

In [14]:
xc = ('out', '1f')

In [64]:
def entropy(prob1, prob2):
    """Return the Shannon entropy, in bits, of a two-outcome distribution.

    Uses the convention ``0 * log2(0) == 0``. The plain formula evaluates
    that term to NaN, which made the entropy of a pure partition
    (probabilities 1 and 0) NaN instead of 0.

    :param prob1: probability of the first outcome (scalar or numpy array).
    :param prob2: probability of the second outcome (scalar or numpy array).
    :return: ``-prob1*log2(prob1) - prob2*log2(prob2)`` with zero-probability
        terms contributing 0.
    """
    def _term(prob):
        prob = np.asarray(prob, dtype=float)
        # log2(0) is still evaluated inside np.where, so silence its warnings;
        # the result for prob == 0 is replaced by 0.0 anyway.
        with np.errstate(divide='ignore', invalid='ignore'):
            return np.where(prob == 0, 0.0, -prob * np.log2(prob))

    return _term(prob1) + _term(prob2)

In [62]:
def informationGain(a, b, c, d):
    """Information gain of splitting the data on "contains the subsequence".

    The four arguments are the cells of the subsequence-vs-label table:

    a = (True, 1.0)  - subsequence present, revisit intention 1
    b = (True, 0.0)  - subsequence present, revisit intention 0
    c = (False, 1.0) - subsequence absent, revisit intention 1
    d = (False, 0.0) - subsequence absent, revisit intention 0

    An empty partition (``a + b == 0`` or ``c + d == 0``) has weight zero and
    is skipped instead of dividing by zero; an entirely empty table yields a
    gain of 0.0.

    :return: entropy of the label before the split minus the size-weighted
        entropy of the two partitions after the split.
    """
    total = a + b + c + d
    if total == 0:
        # Nothing to split.
        return 0.0

    # Entropy before
    prob_revisit = (a + c) / total
    entropy_before = entropy(prob_revisit, 1 - prob_revisit)

    # Entropy after sequence
    entropy_after = 0.0
    for revisit, no_revisit in ((a, b), (c, d)):
        size = revisit + no_revisit
        if size == 0:
            # Empty partition: zero weight, and its entropy is undefined.
            continue
        prob = revisit / size
        entropy_after += size / total * entropy(prob, 1 - prob)

    return entropy_before - entropy_after

In [23]:
# def countforentropy(x, traj):
#     c = is_subseq(traj, x['traj']), x['revisit_intention']
#     print (c)
#     return c

# # countforentropy(x, traj)

In [26]:
# Information gain of every candidate sequence, computed by scanning the whole frame.
igdict = {}

for traj in freqfreqfreqfreq:
    c = trajs_combined_balanced.apply(lambda x: (is_subseq(traj, x['traj']), x['revisit_intention']), axis=1)
    # Count each (contains sequence, revisit_intention) combination by key.
    # The old positional lookup into sorted value_counts() was only right when
    # all four combinations occurred; a missing one shifted the others.
    cc = defaultdict(int)
    for contains, revisit in c:
        cc[(bool(contains), int(revisit))] += 1
    IG = informationGain(cc[(True, 1)], cc[(True, 0)], cc[(False, 1)], cc[(False, 0)])
    igdict[traj] = IG


In [27]:
# Rank the (sequence, information gain) pairs by gain, highest first.
sortE = sorted(igdict.items(), key=lambda value: value[1], reverse=True)

In [31]:
sortE[:30] ### Information Gain 순으로 뽑은 sequence feature (descending order)

[(('out', 'in', '2f', '1f-right'), 0.0093077842964938151),
 (('out', 'in', '1f', '1f'), 0.0087958376956862594),
 (('out', 'in', 'in', '1f'), 0.0074204189040372182),
 (('out', 'in', '2f', '1f', '1f-right'), 0.0069998582169473744),
 (('out', 'in', '2f', '1f-left'), 0.0068569278804576639),
 (('out', 'in', '1f', '2f', '1f-right'), 0.0065599673506357981),
 (('out', 'in', '2f-right', '1f-right'), 0.006158808644885605),
 (('out', 'in', '2f', '1f-inner'), 0.0061199911937140916),
 (('out', 'in', '2f', '2f-right', '1f-right'), 0.0060731090524346332),
 (('out', 'in', '1f', '1f-right'), 0.0058854516458606287),
 (('out', 'in', '1f', '2f', '1f'), 0.0058569702627397913),
 (('out', 'in', '1f-right', '1f-left'), 0.0057564452877962857),
 (('out', 'in', '1f', 'in'), 0.0057384269724725101),
 (('out', 'in', '1f', '1f-left'), 0.005581544098560598),
 (('out', 'in', '1f', 'in', '1f'), 0.005472580436154173),
 (('out', 'in', '1f', '2f', '1f-left'), 0.0053320649724375446),
 (('out', 'in', 'out', 'in'), 0.0051750

In [76]:
# For now, just use these sequences as features;
# later also try adding dwell time (e.g. 1f-long, 1f-short).
#
# The 30 best-ranked sequences, without their scores.
seqE = [entry[0] for entry in sortE[:30]]
len(seqE)
    

30

In [77]:
seqE

[('out', 'in', '2f', '1f-right'),
 ('out', 'in', '1f', '1f'),
 ('out', 'in', 'in', '1f'),
 ('out', 'in', '2f', '1f', '1f-right'),
 ('out', 'in', '2f', '1f-left'),
 ('out', 'in', '1f', '2f', '1f-right'),
 ('out', 'in', '2f-right', '1f-right'),
 ('out', 'in', '2f', '1f-inner'),
 ('out', 'in', '2f', '2f-right', '1f-right'),
 ('out', 'in', '1f', '1f-right'),
 ('out', 'in', '1f', '2f', '1f'),
 ('out', 'in', '1f-right', '1f-left'),
 ('out', 'in', '1f', 'in'),
 ('out', 'out', 'out', 'out'),
 ('out', 'in', '1f', '1f-left'),
 ('out', 'in', '1f', 'in', '1f'),
 ('out', 'in', '1f', '1f-right', '1f'),
 ('out', 'in', '1f', '2f', '1f-left'),
 ('out', 'in', 'out', 'in'),
 ('out', 'in', '2f-left', '1f-right'),
 ('out', 'out', 'out', 'in'),
 ('out', 'in', '2f', '1f'),
 ('out', 'in', 'in', '1f-right'),
 ('out', 'in', '1f', '1f-right', '1f-left'),
 ('out', 'in', '2f', '1f', '1f-left'),
 ('out', 'in', '1f', '2f', '1f', '1f-right'),
 ('out', 'in', '2f', '2f-left', '1f-right'),
 ('out', 'in', '1f-right', '1f

In [35]:
# for row in trajs_combined_balanced.iterrows():
#     for log in row[1]['logs']:
#         print (log)
#     break

In [36]:
# 포함/불포함 여부로 binary variable 만들면 됨.

def relatedfeatures(traj, seqE):
    """Return the feature ids of the sequences in ``seqE`` that ``traj`` contains.

    Sequence number ``k`` (0-based) in ``seqE`` has feature id ``2001 + k``.
    An id is included when that sequence is a subsequence of ``traj``.
    """
    return [
        feature_id
        for feature_id, seq in enumerate(seqE, start=2001)
        if is_subseq(seq, traj)
    ]

def generateIGFeatureColumns(df, seqE):
    """Add one 0/1 indicator column per sequence in ``seqE`` to ``df``, in place.

    Columns are labelled 2001, 2002, ... (one per sequence, in order) and
    start at 0. For every row, the columns listed in that row's
    ``'seq_ig_ft'`` value are set to 1.

    ``DataFrame.set_value`` was deprecated and later removed from pandas, so
    the cells are written with the ``.at`` accessor instead.

    :param df: DataFrame with a ``'seq_ig_ft'`` column holding, per row, an
        iterable of column labels to switch on.
    :param seqE: the selected sequences; only their number is used here.
    :return: None. ``df`` is modified in place.
    """
    first_id = 2001
    for offset, _ in enumerate(seqE):
        df[first_id + offset] = 0
    for row_label, feature_ids in df['seq_ig_ft'].items():
        for feature_id in feature_ids:
            df.at[row_label, feature_id] = 1
    
    

def add_indoor_sequence_IG_features(df, seqE):
    """Attach the sequence-based indicator features for ``seqE`` to ``df``.

    Adds a ``'seq_ig_ft'`` column (the feature ids matched by each row's
    ``'traj'``) and then the indicator columns built from it.

    NOTE: no copy is made. ``df`` itself is modified and the same object is
    returned.
    """
    df['seq_ig_ft'] = df.apply(lambda row: relatedfeatures(row['traj'], seqE), axis=1)
    generateIGFeatureColumns(df, seqE)
    return df
    

newdf = add_indoor_sequence_IG_features(trajs_combined_balanced, seqE)


### Verified: the result is correct.
# `.ix` was deprecated and later removed from pandas; `.loc` does the same label lookup.
newdf_samplerow = newdf.loc['16991_ebe19542cfead37db10244c163fbf76c']
print(newdf_samplerow)
print(newdf_samplerow['seq_ig_ft'])

logs                [568504, 567964, 567693, 567669, 567667, 56766...
traj                [out, out, out, in, 1f, 1f-inner, 2f, 2f-right...
ts                  [1468031077, 1468033418, 1468034623, 146803473...
dwell_time          [326, 42, 1070, 955, 710, 124, 526, 488, 394, ...
hour_start               [11, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12]
time_start          [11:24:37, 12:03:38, 12:23:43, 12:25:38, 12:25...
ts_end              [1468031403, 1468033460, 1468035693, 146803569...
hour_end                 [11, 12, 12, 12, 12, 12, 12, 12, 13, 13, 13]
device_id                            ebe19542cfead37db10244c163fbf76c
date                                                            16991
out                                                              1832
in                                                               1349
1f                                                               1104
1f-inner                                                          124
1f-left             

In [40]:
# Remove helper columns from the frame.
# NOTE(review): 'traj2' is not created in any of the visible cells — confirm it exists before running.
del newdf['traj2'] 
del newdf['seq_ig_ft']

--------------------------------------

내 스크립트 파일을 라이브러리처럼 import해서 써보자.

In [None]:
%load ../code/code/preprocessing.py

In [5]:
from code import preprocessing

import pickle
import pandas as pd
import numpy as np
from numpy import inf

ImportError: cannot import name 'preprocessing'

In [46]:
# Pickle locations for one place; every file lives under ../code/data/<placeNum>/
# and its name starts with <placeNum>.
placeNum = str(786)

_place_prefix = "../code/data/" + placeNum + "/" + placeNum
rawdata_picklePath = _place_prefix + ".p"
reindexed_picklePath = _place_prefix + "_mpframe.p"
reindexed_picklePath2 = _place_prefix + "_mpframe2.p"
statistical_picklePath = _place_prefix + "_mpframe3.p"

In [47]:
# Load the pickle stored at rawdata_picklePath (only unpickle files from a trusted source).
df = pd.read_pickle(rawdata_picklePath)

In [48]:
# Repeat the same experiment ten times and average the cross-validation score
# of three feature sets:
#   1 - the balanced frame as is,
#   2 - plus indoor temporal movement features,
#   3 - plus frequent sequence features.
mpframe3 = pd.read_pickle(statistical_picklePath)

result1, result2, result3 = [], [], []

for i in range(10):

    print('initial shape of the data frame: ', mpframe3.shape)
    mpframe4 = preprocessing.label_balancing(mpframe3, 90, 10)
    print('Label balancing has been done: ', mpframe4.shape)
    mpframe5 = featuregenerator.add_indoor_temporal_movement_features(mpframe4)
    print('Indoor temporal movement features has been added: ', mpframe5.shape)
    # The second argument is 2% of the number of rows, rounded to an integer.
    min_support = int(round(mpframe5.shape[0]*0.02))
    mpframe6 = sequencefeaturegenerator.add_frequent_sequence_features(mpframe5, min_support)
    print('Frequent sequence features has been added: ', mpframe6.shape)

    df_learning1 = preprocessing.finalprocessing(mpframe4)
    df_learning2 = preprocessing.finalprocessing(mpframe5)
    df_learning3 = preprocessing.finalprocessing(mpframe6)

    for label, frame, scores in (
        ("Result 1: ", df_learning1, result1),
        ("Result 2: ", df_learning2, result2),
        ("Result 3: ", df_learning3, result3),
    ):
        data = np.asarray(frame)
        # Infinite values are replaced by 0 before training.
        data[data == inf] = 0
        # Columns 11 .. second-to-last are the inputs; the last column is the label.
        X = data[:, 11:-1]
        y = data[:, -1].astype(int)
        cvresults = predict.basicDecisionTree(X, y)
        mean_score = np.mean(cvresults)
        print(label, mean_score)
        scores.append(mean_score)


for title, frame, scores in (
    ("Average results for exp 1", df_learning1, result1),
    ("Average results for exp 2", df_learning2, result2),
    ("Average results for exp 3", df_learning3, result3),
):
    print(title, frame.shape, ": ", np.mean(scores))

initial shape of the data frame:  (98993, 42)


NameError: name 'preprocessing' is not defined


```
(py35) Sundongui-MacBook-Pro:code sundong$ python main.py 
========================================
0:00:01.960 - Start Program
========================================
initial shape of the data frame:  (98993, 42)
label_balancing function took 484.636 ms
Label balancing has been done:  (38442, 41)
add_indoor_temporal_movement_features function took 5400.785 ms
Indoor temporal movement features has been added:  (38442, 68)
pymining package has been done
longest subsequence has been calcualated
ig calculation has been done
generate_sortE function took 2120932.245 ms
generate_seqE function took 0.244 ms
Frequent sequence features has been added:  (38442, 837)
Result 1:  0.679699180683
Result 2:  0.680115819617
Result 3:  0.680635407518
initial shape of the data frame:  (98993, 42)
label_balancing function took 507.315 ms
Label balancing has been done:  (38442, 41)
add_indoor_temporal_movement_features function took 5223.394 ms
Indoor temporal movement features has been added:  (38442, 66)
pymining package has been done
longest subsequence has been calcualated
ig calculation has been done
generate_sortE function took 1797211.518 ms
generate_seqE function took 0.134 ms
Frequent sequence features has been added:  (38442, 835)
Result 1:  0.679777116163
Result 2:  0.679438872664
Result 3:  0.679569446045
initial shape of the data frame:  (98993, 42)
label_balancing function took 438.450 ms
Label balancing has been done:  (38442, 41)
add_indoor_temporal_movement_features function took 5038.465 ms
Indoor temporal movement features has been added:  (38442, 67)
pymining package has been done
longest subsequence has been calcualated
ig calculation has been done
generate_sortE function took 1849163.265 ms
generate_seqE function took 0.138 ms
Frequent sequence features has been added:  (38442, 836)
Result 1:  0.681000071428
Result 2:  0.681650787363
Result 3:  0.681806184838
initial shape of the data frame:  (98993, 42)
label_balancing function took 456.566 ms
Label balancing has been done:  (38442, 41)
add_indoor_temporal_movement_features function took 5016.674 ms
Indoor temporal movement features has been added:  (38442, 67)
pymining package has been done
longest subsequence has been calcualated
ig calculation has been done
generate_sortE function took 1826511.238 ms
generate_seqE function took 0.155 ms
Frequent sequence features has been added:  (38442, 836)
Result 1:  0.680791887243
Result 2:  0.679413385693
Result 3:  0.679439035002
initial shape of the data frame:  (98993, 42)
label_balancing function took 451.288 ms
Label balancing has been done:  (38442, 41)
add_indoor_temporal_movement_features function took 5368.173 ms
Indoor temporal movement features has been added:  (38442, 66)
pymining package has been done
longest subsequence has been calcualated
ig calculation has been done
generate_sortE function took 1913510.662 ms
generate_seqE function took 0.152 ms
Frequent sequence features has been added:  (38442, 835)
Result 1:  0.681156023556
Result 2:  0.681286502241
Result 3:  0.680349774865
initial shape of the data frame:  (98993, 42)
label_balancing function took 438.086 ms
Label balancing has been done:  (38442, 41)
add_indoor_temporal_movement_features function took 5313.705 ms
Indoor temporal movement features has been added:  (38442, 67)
pymining package has been done
longest subsequence has been calcualated
ig calculation has been done
generate_sortE function took 1784981.903 ms
generate_seqE function took 0.143 ms
Frequent sequence features has been added:  (38442, 836)
Result 1:  0.681883809171
Result 2:  0.68154616091
Result 3:  0.681754236871
initial shape of the data frame:  (98993, 42)
label_balancing function took 499.207 ms
Label balancing has been done:  (38442, 41)
add_indoor_temporal_movement_features function took 5214.580 ms
Indoor temporal movement features has been added:  (38442, 67)
pymining package has been done
longest subsequence has been calcualated
ig calculation has been done
generate_sortE function took 1889672.010 ms
generate_seqE function took 0.139 ms
Frequent sequence features has been added:  (38442, 836)
Result 1:  0.682352463714
Result 2:  0.681389992332
Result 3:  0.681597811259
initial shape of the data frame:  (98993, 42)
label_balancing function took 439.690 ms
Label balancing has been done:  (38442, 41)
add_indoor_temporal_movement_features function took 5143.135 ms
Indoor temporal movement features has been added:  (38442, 67)
pymining package has been done
longest subsequence has been calcualated
ig calculation has been done
generate_sortE function took 1748855.295 ms
generate_seqE function took 0.140 ms
Frequent sequence features has been added:  (38442, 836)
Result 1:  0.683835564661
Result 2:  0.683367085984
Result 3:  0.682690910134
initial shape of the data frame:  (98993, 42)
label_balancing function took 443.224 ms
Label balancing has been done:  (38442, 41)
add_indoor_temporal_movement_features function took 5342.338 ms
Indoor temporal movement features has been added:  (38442, 66)
pymining package has been done
longest subsequence has been calcualated
ig calculation has been done
generate_sortE function took 1894269.483 ms
generate_seqE function took 0.143 ms
Frequent sequence features has been added:  (38442, 835)
Result 1:  0.680688451263
Result 2:  0.680739668713
Result 3:  0.680765493887
initial shape of the data frame:  (98993, 42)
label_balancing function took 495.460 ms
Label balancing has been done:  (38442, 41)
add_indoor_temporal_movement_features function took 5133.643 ms
Indoor temporal movement features has been added:  (38442, 67)
pymining package has been done
longest subsequence has been calcualated
ig calculation has been done
generate_sortE function took 1801389.487 ms
generate_seqE function took 0.142 ms
Frequent sequence features has been added:  (38442, 836)
Result 1:  0.680426966298
Result 2:  0.682508361729
Result 3:  0.681884431465
Average results for exp 1 (38442, 41) :  0.681161153418
Average results for exp 2 (38442, 836) :  0.681145663725
Average results for exp 3 (38442, 836) :  0.681049273188
========================================
5:33:57.272 - End Program
Elapsed time: 5:33:55.311
========================================
(py35) Sundongui-MacBook-Pro:code sundong$ 
```

In [4]:
### 이 코드에서 계산중에 같이 collect할 수 있을거라 생각했는데 , 포기.
from collections import defaultdict

def freq_seq_enum(sequences, min_support):
    '''Enumerate all frequent sequences together with their label counts.

       :param sequences: An iterable of (sequence, revisit_intention) entries.
       :param min_support: The minimal support of a sequence to be included.
       :rtype: A set of (frequent_sequence, support, revisit_support) tuples,
               where revisit_support is the sum of revisit_intention over the
               entries counted in support.
    '''
    found = set()
    _freq_seq(sequences, (), 0, 0, min_support, found)
    return found


def _freq_seq(sdb, prefix, prefix_support, revisit_support, min_support, freq_seqs):
    '''Record ``prefix`` and recursively grow it by every locally frequent item.

       :param sdb: The database projected on ``prefix``: (sequence, label) entries.
       :param prefix: Tuple of items mined so far (empty at the top level).
       :param prefix_support: Support of ``prefix``.
       :param revisit_support: Sum of the labels of the entries supporting ``prefix``.
       :param min_support: The minimal support of an item to extend the prefix.
       :param freq_seqs: Output set; receives (prefix, support, revisit_support).
    '''
    if prefix:
        freq_seqs.add((prefix, prefix_support, revisit_support))
    # No frequent item means nothing to iterate over, which ends the recursion.
    for item, item_support, item_revisit in _local_freq_items(sdb, prefix, min_support):
        extended = prefix + (item,)
        _freq_seq(_project(sdb, extended), extended,
                  item_support, item_revisit, min_support, freq_seqs)


def _local_freq_items(sdb, prefix, min_support):
    items = defaultdict(int)  ## for support
    items2 = defaultdict(int)  ## for revisit_intention_support
    freq_items = []
    for entry in sdb:
        visited = set()
        for element in entry[0]:
#             print (element)
            if element not in visited:
                items[element] += 1
                items2[element] += entry[1]
                visited.add(element)
    # Sorted is optional. Just useful for debugging for now.
    for item in items:
        support = items[item] ## support
        support2 = items2[item] ## revisit_intention
        if support >= min_support:
            freq_items.append((item, support, support2))
    return freq_items


def _project(sdb, prefix):
    new_sdb = []
    if not prefix:
        return sdb
    current_prefix_item = prefix[-1]
    for entry in sdb:
        j = 0
        projection = None
        for item in entry[0]:
            if item == current_prefix_item:
                projection = (entry[0][j + 1:], entry[1])
                break
            j += 1
        if projection:
            new_sdb.append(projection)
    return new_sdb




In [92]:
trajs_combined_balanced['revisit_intention'].astype(int)

16767_25e9517c47319ab64486403f94348dd6    0
16955_53f0a6276486cc80405cf534d913fcbb    0
16734_5b37d580c76a8c8c245cad3418349acb    0
16736_70968e4fb8894b74997f6f4e89b32bee    1
16847_e0b87d079ff087efd100c423d88d0fbd    0
16706_3352c133a2ee0a29ed3351ab69dae032    0
16760_b210865acfd2674ae0017b11271afc33    1
16866_ec0ff57c50c68919aa1b5d3bd9424028    0
16724_8eb18e0cbf9c29ea1e5ed4f98cb05171    1
16798_20ee02428cacb74eee06f0a155933689    0
16833_c751fe946b3c00c90874339929e99f06    0
16935_a176960c74009d94369ede0614a5288f    0
16764_3d2f176265bea59caaff155857c2b2e8    0
16732_a6a7fce63f5da450cccee8a0099d58e2    0
16869_56d15da75690ce19d9e8cf5f82282ce9    0
16941_a7bdc8d8b2b694c6f25f81f7382f41b5    0
16744_fe1acb395a7475844a1eb29a618ec566    0
16831_3e5edaf2d72ac48a405c280c54eebdee    0
16991_ebe19542cfead37db10244c163fbf76c    1
16813_7d9dc6f5f24a5ee36bbc7f1b1ecfe027    1
16855_7ea0e50b8a95c9c29595b323c7ce649b    1
16714_f59239d966989d77610087d5e9b9b802    1
16721_197b75b623c8e0d837d878a6b8

In [93]:
# Same preparation as for `seqs` earlier in the notebook: int label, then one
# (traj, revisit_intention) tuple per row.
trajs_combined_balanced['revisit_intention'] = trajs_combined_balanced['revisit_intention'].astype(int)
yopoup = trajs_combined_balanced.apply(lambda x: (x['traj'], x['revisit_intention']), axis=1)

In [94]:
# Tiny hand-made database of (trajectory, revisit_intention) entries for checking the helpers.
sample = [(['1f-left', '1f-inner'],1),(['1f-left', '1f-right', '1f-inner'],0), (['1f-left'],1)]

sample[1]
_project(sample, ('1f-left', '1f-inner',))
# _local_freq_items does not use its prefix argument, so this counts items over all of `sample`.
_local_freq_items(sample, ('1f-inner',), 1 )

[('1f-inner', 2, 1), ('1f-right', 1, 0), ('1f-left', 3, 2)]

In [95]:
freq_seq_enum(yopoup, 200)

{(('1f',), 6509, 3359),
 (('1f', '1f'), 617, 431),
 (('1f', '1f', '1f-inner'), 201, 141),
 (('1f', '1f', '1f-left'), 225, 163),
 (('1f', '1f', '1f-right'), 341, 236),
 (('1f', '1f-inner'), 1827, 1005),
 (('1f', '1f-inner', '1f'), 207, 143),
 (('1f', '1f-inner', '1f-left'), 610, 372),
 (('1f', '1f-inner', '1f-left', '1f-right'), 229, 142),
 (('1f', '1f-inner', '1f-right'), 657, 390),
 (('1f', '1f-inner', '1f-right', '1f-left'), 203, 130),
 (('1f', '1f-inner', '2f'), 432, 265),
 (('1f', '1f-inner', '2f-right'), 211, 122),
 (('1f', '1f-inner', 'out'), 407, 210),
 (('1f', '1f-left'), 1823, 1069),
 (('1f', '1f-left', '1f-inner'), 729, 429),
 (('1f', '1f-left', '1f-inner', '1f-right'), 206, 132),
 (('1f', '1f-left', '1f-right'), 717, 423),
 (('1f', '1f-left', '1f-right', '1f-inner'), 217, 134),
 (('1f', '1f-left', '2f'), 453, 282),
 (('1f', '1f-left', '2f-right'), 218, 124),
 (('1f', '1f-left', 'out'), 405, 237),
 (('1f', '1f-right'), 2608, 1486),
 (('1f', '1f-right', '1f'), 301, 218),
 (('1

In [None]:
def entropy(data, target_attr):
    """Return the Shannon entropy, in bits, of ``target_attr`` over ``data``.

    NOTE(review): this shares its name with the two-argument
    ``entropy(prob1, prob2)`` helper defined in an earlier cell, which
    ``informationGain`` calls; running this cell rebinds the name.

    The previous body used ``dict.has_key`` (Python 2 only) and ``math``
    without importing it, so it could not run on Python 3.

    :param data: sized collection of records indexable by ``target_attr``.
    :param target_attr: key (or index) of the attribute to measure.
    :return: the entropy as a float; 0.0 for empty ``data``.
    """
    import math
    from collections import Counter

    total = len(data)
    if total == 0:
        return 0.0

    # Calculate the frequency of each of the values in the target attr
    val_freq = Counter(record[target_attr] for record in data)

    # Calculate the entropy of the data for the target attribute
    data_entropy = 0.0
    for freq in val_freq.values():
        prob = freq / total
        data_entropy -= prob * math.log(prob, 2)

    return data_entropy


# Scratch check: compute the entropy of three random distributions in several ways.
x = np.random.rand(3, 10)
# Normalise each row so that it sums to 1.
p = x/x.sum(axis=1, keepdims=True)
print(p)
print(p.shape)
print(p.sum(axis=1))

# scipy's entr(p) is -p*log(p) with the natural log, so the first sum is in nats;
# dividing by log(2) converts it to bits and should match the log2 version below.
print(entr(p).sum(axis=1))
print(entr(p).sum(axis=1)/np.log(2))
print((-p*np.log2(p)).sum(axis=1))

In [62]:
# `.ix` was deprecated and later removed from pandas; `.loc` does the same label lookup.
yyy = trajs_combined_balanced.loc['16767_25e9517c47319ab64486403f94348dd6']

In [68]:
def asdf(yyy, i):
    """Return the ``i``-th entries of ``yyy['logs']`` and ``yyy['traj']`` as a pair."""
    log_entry = yyy['logs'][i]
    traj_entry = yyy['traj'][i]
    return log_entry, traj_entry

In [69]:
asdf(yyy, 2)

(3145074, '2f')

In [66]:
yyy['logs'][2]

3145074

In [67]:
yyy['traj'][2]

'2f'

-----------

In [98]:
### Leave long enough trajs
# Keep the rows whose trajectory has more than 8 elements. `.loc` has to be
# indexed with a per-row boolean mask; the old code called `.loc(...)` with
# `len(column) > 8`, which is one bool for the whole column, not a row filter.
tatt2 = trajs_combined_balanced.loc[trajs_combined_balanced['traj'].str.len() > 8]
dfff = trajs_combined_balanced

In [119]:
# Boolean mask: True for rows whose 'traj' has more than 5 elements.
mask = dfff['traj'].str.len() > 5
# Keep only those rows.
dfff2 = dfff.loc[mask]

In [120]:
dfff2.shape

(3231, 37)

In [121]:
dfff2.traj

16706_3352c133a2ee0a29ed3351ab69dae032            [out, in, 3f, 2f-right, 2f, 1f, 1f-right]
16760_b210865acfd2674ae0017b11271afc33                 [out, in, 2f, 1f, 2f-left, 2f-inner]
16724_8eb18e0cbf9c29ea1e5ed4f98cb05171    [out, in, 1f, 1f-right, 2f, 3f, out, in, 1f-le...
16833_c751fe946b3c00c90874339929e99f06           [out, in, 1f, 1f-inner, 1f-right, 1f-left]
16935_a176960c74009d94369ede0614a5288f            [out, in, 2f, 2f-inner, 2f-left, 1f, out]
16732_a6a7fce63f5da450cccee8a0099d58e2                [out, in, 1f, 1f-right, 1f-left, out]
16831_3e5edaf2d72ac48a405c280c54eebdee           [out, in, 1f-right, 1f, 1f-left, 1f-inner]
16991_ebe19542cfead37db10244c163fbf76c    [out, out, out, in, 1f, 1f-inner, 2f, 2f-right...
16714_f59239d966989d77610087d5e9b9b802    [out, in, 1f, 1f-right, 1f-left, 2f-right, 2f,...
16927_dedf509af2003d829b2c3dd8e718bae7                     [out, in, 2f, 2f-inner, 1f, out]
16889_2dae56b75aed7823db01f5d6fe878fa5           [out, in, 1f-right, 1f, 1f-left

-----------

In [129]:
type(dfff['traj'][0])

list

In [125]:
dfff['tt'] = 1

In [145]:
import random
def createRandomTrajs():
    """Return a random trajectory: 'out', 'in', then random zone labels.

    The total length is drawn uniformly from 10..19; every element after the
    first two is chosen independently from the nine zone labels.
    """
    zones = ['1f', '2f', '3f', '1f-left', '1f-inner', '1f-right', '2f-inner', '2f-left', '2f-right']
    total_length = random.randrange(10) + 10
    # The length is drawn before the zones, so the random stream is consumed
    # in the same order as before.
    return ['out', 'in'] + [random.choice(zones) for _ in range(total_length - 2)]
    

In [154]:
np.array([createRandomTrajs()])

array([['out', 'in', '3f', '2f-inner', '3f', '1f-inner', '1f-left',
        '2f-right', '2f-left', '1f-right', '1f', '1f-inner']], 
      dtype='<U8')

In [166]:
# Build 50000 synthetic samples: a random trajectory and, for each, a label
# drawn uniformly from {0, 1}.
listt = []
listt2 = []
for i in range(50000):
    listt.append(createRandomTrajs())
    listt2.append(random.choice([0, 1]))

In [174]:
d = {'traj': listt, 'r_i': listt2}
dfyo = pd.DataFrame(data=d, columns=['traj', 'r_i'])

In [236]:
dfyo.head(5000)

Unnamed: 0,traj,r_i
0,"[out, in, 2f-inner, 2f-left, 1f, 2f, 1f-inner,...",0
1,"[out, in, 1f, 1f, 1f, 2f-inner, 2f-right, 1f-i...",1
2,"[out, in, 2f-right, 1f, 1f-right, 2f-inner, 3f...",1
3,"[out, in, 3f, 1f-left, 1f-inner, 1f-inner, 1f-...",1
4,"[out, in, 1f-inner, 2f-right, 2f, 2f-right, 1f...",1
5,"[out, in, 2f-right, 2f-inner, 2f, 3f, 1f-inner...",0
6,"[out, in, 2f-left, 2f, 1f-left, 1f-left, 1f-in...",1
7,"[out, in, 1f-inner, 2f, 1f-right, 2f-inner, 1f...",1
8,"[out, in, 2f, 2f, 2f, 1f, 2f-right, 2f-left, 2...",0
9,"[out, in, 1f-right, 2f-inner, 1f-right, 1f-inn...",1


In [147]:


dfff['tt'] = createRandomTrajs()

ValueError: Length of values does not match length of index

In [207]:
length = 15
# A list 0, 1, 2, ... whose size is drawn uniformly from 10..24.
fakedata1 = list(range(random.randrange(length) + 10))

In [208]:

len(fakedata1)

19

In [209]:


prob_revisit = (len(fakedata1)-10)/(length)
prob_revisit

0.6

In [227]:
# Draw 10000 samples that are 1 with probability prob_revisit and count the
# ones; the count should come out near 10000 * prob_revisit.
s = 0
for i in range(10000):
    s += np.random.choice([0, 1], p=[1-prob_revisit, prob_revisit])
print (s)

5958
