In [1]:
import pandas as pd
import numpy as np
from functools import reduce
from operator import add
%matplotlib inline

# Raise the column display width so long string cells (e.g. `Sequence`) are not cut off.
pd.options.display.max_colwidth = 500

In [2]:
# Load the training table; the `Id` column becomes the row index.
train = pd.read_csv(
    '../data/train.csv',
    index_col='Id',
)

In [3]:
# Map each row Id to its sequence, split on commas into a Series of string tokens.
seqs = {
    row_id: pd.Series(raw_sequence.split(','))
    for row_id, raw_sequence in zip(train.index, train['Sequence'])
}

# Size

In [4]:
# Number of terms in each sequence (values are taken in `seqs` iteration order).
train['Size'] = [terms.size for terms in seqs.values()]

In [5]:
# Peek at three of the four-term sequences (positions 15-17 within that subset).
four_term_sequences = train.loc[train['Size'] == 4, 'Sequence']
four_term_sequences.iloc[15:18]

Id
5487                             1,9,81,150094635296999121
5614    81647160420,170655787050,211212209880,227961624450
5698                                            1,2,8,7073
Name: Sequence, dtype: object

# ValuesSizeMean/Max/Min

In [6]:
# Character length of every term, computed once per sequence with the
# vectorized string accessor and shared by the three summary features.
length_stats = [
    (lengths.mean(), lengths.max(), lengths.min())
    for lengths in (seq.str.len() for seq in seqs.values())
]
train['ValuesSizeMean'] = [mean_len for mean_len, max_len, min_len in length_stats]
train['ValuesSizeMax'] = [max_len for mean_len, max_len, min_len in length_stats]
train['ValuesSizeMin'] = [min_len for mean_len, max_len, min_len in length_stats]

In [7]:
# Peek at two sequences whose longest term has exactly four characters.
max_len_four = train.loc[train['ValuesSizeMax'] == 4, 'Sequence']
max_len_four.iloc[13:15]

Id
202             1,1,1,1,1,1,2,2,3,3,4,4,6,6,9,9,13,13,19,19,28,28,41,41,60,60,88,88,129,129,189,189,277,277,406,406,595,595,872,872,1278,1278,1873,1873,2745,2745,4023,4023,5896,5896,8641
205    3255,3257,3258,3259,3260,3261,3263,3264,3265,3266,3267,3269,3270,3271,3272,3273,3301,3302,3303,3304,3305,3307,3308,3309,3315,3316,3317,3319,3320,3321,3322,3323,3326,3327,3328,3329
Name: Sequence, dtype: object

# ValuesMode and ValuesModeFreq
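ValuesMode is the most frequent value of the sequence, set only when that value is unique (no tie for most frequent); otherwise it is left empty. ValuesModeFreq is the number of times that value occurs, and is left empty whenever ValuesMode is.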

In [8]:
# ValuesMode: the most frequent term, kept only when `mode()` returns exactly one value.
sequence_modes = [seq.mode() for seq in seqs.values()]
train['ValuesMode'] = [
    modes[0] if modes.size == 1 else None
    for modes in sequence_modes
]

# ValuesModeFreq: how often the most frequent term occurs ...
train['ValuesModeFreq'] = [seq.value_counts().max() for seq in seqs.values()]

# ... blanked out wherever no single mode was recorded.
mode_missing = train['ValuesMode'].isnull()
train.loc[mode_missing, 'ValuesModeFreq'] = None

In [9]:
# Example row for the mode features.
mode_example_columns = ['Sequence', 'ValuesMode', 'ValuesModeFreq']
train.loc[[2256], mode_example_columns]

Unnamed: 0_level_0,Sequence,ValuesMode,ValuesModeFreq
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2256,35037670,0,4


# LastValue

In [10]:
# Final term of each sequence (still a string token).
train['LastValue'] = [terms.iat[-1] for terms in seqs.values()]

In [11]:
# Example row for the LastValue feature.
last_value_columns = ['Sequence', 'LastValue']
train.loc[[5698], last_value_columns]

Unnamed: 0_level_0,Sequence,LastValue
Id,Unnamed: 1_level_1,Unnamed: 2_level_1
5698,1287073,7073


# LastValue == ValuesMode

In [12]:
# True where the last term is also the sequence's unique mode.
# Vectorized column comparison instead of a Python-level row-by-row apply;
# rows with no recorded ValuesMode compare as False.
train['LastValueEqValuesMode'] = train['LastValue'] == train['ValuesMode']

In [14]:
# Example row among those whose last term equals the mode.
last_is_mode = train.loc[train['LastValueEqValuesMode']]
last_is_mode.loc[[2176], ['Sequence', 'LastValue']]

Unnamed: 0_level_0,Sequence,LastValue
Id,Unnamed: 1_level_1,Unnamed: 2_level_1
2176,11141111041141113714141111041114111,1


# NUniqueValues

In [15]:
# Count of distinct terms in each sequence.
train['NUniqueValues'] = [terms.nunique() for terms in seqs.values()]

# NPrefixValues
Number of values that are a prefix of the next value in the sequence (values are compared as strings).

In [16]:
# For each sequence, count the adjacent pairs (current, next) where the
# current term's text is a prefix of the next term's text.
# Terms are pulled into a plain list so neighbours can be paired with zip
# (no label-based Series lookups), and the booleans are totalled with sum().
train['NPrefixValues'] = [
    sum(nxt.startswith(cur) for cur, nxt in zip(terms, terms[1:]))
    for terms in (seq.tolist() for seq in seqs.values())
]

# NPrefixValues == Size - 1
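Pattern: every value is a prefix of the next one, i.e. each term is the previous term with extra characters appended: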

x(0) = base

x(t+1) = concat(x(t), c(t+1))

In [17]:
# True when every adjacent pair counted in NPrefixValues matched,
# i.e. the count equals the number of adjacent pairs (Size - 1).
train['NPrefixValuesEqSizeMinus1'] = train['NPrefixValues'].eq(train['Size'] - 1)

In [18]:
# Two small examples of the prefix-chain pattern: fewer than 10 terms,
# each shorter than 5 characters.
is_prefix_chain = train['NPrefixValuesEqSizeMinus1']
has_few_terms = train['Size'] < 10
has_short_terms = train['ValuesSizeMax'] < 5
train[is_prefix_chain & has_few_terms & has_short_terms].head(2)

Unnamed: 0_level_0,Sequence,Size,ValuesSizeMean,ValuesSizeMax,ValuesSizeMin,ValuesMode,ValuesModeFreq,LastValue,LastValueEqValuesMode,NUniqueValues,NPrefixValues,NPrefixValuesEqSizeMinus1
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
156118,2232357,3,2.333333,4,1,,,2357,False,3,2,True
191625,2212122122,4,2.5,4,1,,,2122,False,4,3,True


In [19]:
# (rows, columns) of the subset that follows the prefix-chain pattern.
prefix_chain_rows = train.loc[train['NPrefixValuesEqSizeMinus1']]
prefix_chain_rows.shape

(246, 12)

In [20]:
# Show two random rows with all engineered features.
# A fixed random_state keeps the displayed rows identical across re-runs.
train.sample(2, random_state=0)

Unnamed: 0_level_0,Sequence,Size,ValuesSizeMean,ValuesSizeMax,ValuesSizeMin,ValuesMode,ValuesModeFreq,LastValue,LastValueEqValuesMode,NUniqueValues,NPrefixValues,NPrefixValuesEqSizeMinus1
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
108461,11212311241123511123611122371111223811112224911111222410,55,1.018182,2,1,1.0,26.0,10,False,10,23,False
94079,91827364554637282838485868788899099108117126135144153163164165166167168169170171180189198207216225234244245246247248249250251252261,50,2.62,3,1,,,261,False,50,0,False
