In [1]:
import pandas as pd
import numpy as np
import time
import multiprocessing as mp

import feat_builder #custom function

In [2]:
cores = mp.cpu_count()

def parallelize_dataframe(df, times, func, n_cores = cores):
    times_split = np.array_split(times, n_cores)
    pool = mp.Pool(n_cores)
    results = np.vstack(pool.starmap(func,[(split, df) for split in times_split]))
    pool.close()
    pool.join()
    return results

In [3]:
df_data = pd.read_csv('raw_data.csv', sep = ' ', skiprows = [0], names = ['ProfileID', 'SiteID', 'ActionTime', 'ActionID', 'EmailID'])
df_data.drop(['SiteID','EmailID'], axis=1, inplace=True)
df_data = df_data.query('ActionID == [0 , 3 , 4, 5]')
df_data = df_data.sort_values(by = ['ProfileID', 'ActionTime'], ignore_index = True)
df_data

Unnamed: 0,ProfileID,ActionTime,ActionID
0,-9078563994714573724,1593244722,3
1,-9078563994714573724,1593314378,3
2,-9078563994714573724,1593319295,3
3,-9078563994714573724,1593320015,3
4,-9078563994714573724,1593408914,3
...,...,...,...
9542,8981967240768651675,1593924505,3
9543,8981967240768651675,1593934271,3
9544,8981967240768651675,1593942352,3
9545,8981967240768651675,1593945988,3


In [4]:
df_data['ActionTimeDt'] = pd.to_datetime(df_data['ActionTime'], unit = 's').dt.strftime('%y %m %d')
df_data.head(20)

Unnamed: 0,ProfileID,ActionTime,ActionID,ActionTimeDt
0,-9078563994714573724,1593244722,3,20 06 27
1,-9078563994714573724,1593314378,3,20 06 28
2,-9078563994714573724,1593319295,3,20 06 28
3,-9078563994714573724,1593320015,3,20 06 28
4,-9078563994714573724,1593408914,3,20 06 29
5,-9078563994714573724,1593411880,3,20 06 29
6,-9078563994714573724,1593507258,3,20 06 30
7,-9078563994714573724,1593507454,3,20 06 30
8,-9078563994714573724,1595834401,0,20 07 27
9,-9071552367607809971,1595835307,0,20 07 27


In [5]:
timesT = df_data[df_data['ActionID'] == 3].index
len(timesT)

7209

In [6]:
#no_parallelization
time_start = time.perf_counter()

res_1_9 = []
for oneT in timesT:
    day1 = df_data[(df_data['ActionTime'] > (df_data['ActionTime'][oneT] - 86400))&
                     (df_data['ActionTime'] < df_data['ActionTime'][oneT])&
                     (df_data['ProfileID'] == df_data['ProfileID'][oneT])]
    day3 = df_data[(df_data['ActionTime'] < (df_data['ActionTime'][oneT] - 86400))&
                     (df_data['ActionTime'] > (df_data['ActionTime'][oneT]-259200))&
                     (df_data['ProfileID'] == df_data['ProfileID'][oneT])]
    did_action = df_data[(df_data['ProfileID'] == df_data['ProfileID'][oneT])&
                     (df_data['ActionTime'] < df_data['ActionTime'][oneT])&
                     (df_data['ActionID'].isin([0, 4, 5]))]
    if len(did_action.index) < 2:
        feat8 = float('NaN')
    else:
        feat8 = int((did_action.iloc[-1]['ActionTime'] - did_action.iloc[0]['ActionTime'])/86400)

    if len(did_action.index) == 0:
        feat9 = float('NaN')
    else:
        feat9 = int((df_data['ActionTime'][oneT] - did_action.iloc[-1]['ActionTime'])/86400)

    res_1_9.append([
            #feature 1. доставки за 24 часа
            len(day1[df_data['ActionID'] == 3].index),
            #feature 2. открытия за 24 часа
            len(day1[df_data['ActionID'] == 4].index),
            #feature 3. клики за 24 часа
            len(day1[df_data['ActionID'] == 5].index),
            #feature 4. доставки за 3 дня
            len(day3[df_data['ActionID'] == 3].index),
            #feature 5. открытия за 3 дня
            len(day3[df_data['ActionID'] == 4].index),
            #feature 6. клики за 3 дня
            len(day3[df_data['ActionID'] == 5].index),
            #feature 7. количество уникальных дней активности - надо доработать
            len(did_action['ActionTimeDt'].unique()),
            #feature 8. макс дней между действиями
            feat8,
            #feature 9. дней между действием и доставкой
            feat9
    ])
time_end = time.perf_counter()
print('Заняло %f секунды'%(time_end-time_start))

  len(day1[df_data['ActionID'] == 3].index),
  len(day1[df_data['ActionID'] == 4].index),
  len(day1[df_data['ActionID'] == 5].index),
  len(day3[df_data['ActionID'] == 3].index),
  len(day3[df_data['ActionID'] == 4].index),
  len(day3[df_data['ActionID'] == 5].index),


Заняло 45.580436 секунды


In [7]:
time_start = time.perf_counter()
res_1_9 = parallelize_dataframe(df_data, timesT, feat_builder.feat_var, mp.cpu_count())
time_end = time.perf_counter()
print('Заняло %f секунды'%(time_end-time_start))

Заняло 13.789109 секунды


In [8]:
res_1_9

array([[ 0.,  0.,  0., ...,  0., nan, nan],
       [ 1.,  0.,  0., ...,  0., nan, nan],
       [ 2.,  0.,  0., ...,  0., nan, nan],
       ...,
       [13.,  2.,  0., ...,  6.,  5.,  0.],
       [13.,  2.,  0., ...,  6.,  5.,  0.],
       [13.,  2.,  0., ...,  6.,  5.,  0.]])

In [9]:
df_final = df_data[['ProfileID', 'ActionTime']].iloc[timesT]
df_final[['feat_1', 'feat_2', 'feat_3', 'feat_4', 'feat_5', 'feat_6', 'feat_7', 'feat_8', 'feat_9']] = pd.DataFrame(res_1_9, index = timesT)
df_final.iloc[:, 2:9] = df_final.iloc[:, 2:9].astype(int)
df_final

Unnamed: 0,ProfileID,ActionTime,feat_1,feat_2,feat_3,feat_4,feat_5,feat_6,feat_7,feat_8,feat_9
0,-9078563994714573724,1593244722,0,0,0,0,0,0,0,,
1,-9078563994714573724,1593314378,1,0,0,0,0,0,0,,
2,-9078563994714573724,1593319295,2,0,0,0,0,0,0,,
3,-9078563994714573724,1593320015,3,0,0,0,0,0,0,,
4,-9078563994714573724,1593408914,0,0,0,4,0,0,0,,
...,...,...,...,...,...,...,...,...,...,...,...
9541,8981967240768651675,1593919306,22,2,0,7,4,1,6,5.0,0.0
9542,8981967240768651675,1593924505,19,2,0,11,4,1,6,5.0,0.0
9543,8981967240768651675,1593934271,13,2,0,18,4,1,6,5.0,0.0
9544,8981967240768651675,1593942352,13,2,0,19,4,1,6,5.0,0.0


In [10]:
df_final.to_csv('feat_1_9.csv', index = False)

In [11]:
pd.read_csv('feat_1_9.csv')

Unnamed: 0,ProfileID,ActionTime,feat_1,feat_2,feat_3,feat_4,feat_5,feat_6,feat_7,feat_8,feat_9
0,-9078563994714573724,1593244722,0,0,0,0,0,0,0,,
1,-9078563994714573724,1593314378,1,0,0,0,0,0,0,,
2,-9078563994714573724,1593319295,2,0,0,0,0,0,0,,
3,-9078563994714573724,1593320015,3,0,0,0,0,0,0,,
4,-9078563994714573724,1593408914,0,0,0,4,0,0,0,,
...,...,...,...,...,...,...,...,...,...,...,...
7204,8981967240768651675,1593919306,22,2,0,7,4,1,6,5.0,0.0
7205,8981967240768651675,1593924505,19,2,0,11,4,1,6,5.0,0.0
7206,8981967240768651675,1593934271,13,2,0,18,4,1,6,5.0,0.0
7207,8981967240768651675,1593942352,13,2,0,19,4,1,6,5.0,0.0
