In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
from IPython.core.interactiveshell import InteractiveShell
import matplotlib as mpl
import matplotlib.pyplot as plt

In [2]:
# 配置项
# 这个要放到设置中文之前否则还是小方框
plt.style.use("seaborn")

# 指定默认字体 用来正常显示中文标签
plt.rcParams['font.sans-serif'] = ['SimHei']
# 解决保存图像是负号'-'显示为方块的问题
plt.rcParams['axes.unicode_minus'] = False

# #全部行都能输出
InteractiveShell.ast_node_interactivity = "all"

In [4]:
action_info = pd.read_csv('/Users/JQC/Desktop/data2/rawdata/table_3.csv')
action_info.head()

Unnamed: 0,userid,actionType,actionTime
0,100000000013,1,1474300753
1,100000000013,5,1474300763
2,100000000013,6,1474300874
3,100000000013,5,1474300911
4,100000000013,6,1474300936


In [5]:
action_info.shape

(1334856, 3)

In [7]:
# 无缺失值
action_info.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1334856 entries, 0 to 1334855
Data columns (total 3 columns):
userid        1334856 non-null int64
actionType    1334856 non-null int64
actionTime    1334856 non-null int64
dtypes: int64(3)
memory usage: 30.6 MB


In [15]:
action_info.actionType.value_counts()

5    479227
1    384875
6    227223
3     78034
4     45386
2     44862
8     28779
7     28083
9     18387
Name: actionType, dtype: int64

In [10]:
# 每个用户点击次数 (事件交互次数)
action_num = action_info.groupby(['userid'])['actionType'].count()
action_num = pd.DataFrame(action_num)
action_num.columns = ['F3.1']
action_num.head()

Unnamed: 0_level_0,F3.1
userid,Unnamed: 1_level_1
100000000013,143
100000000111,3
100000000127,6
100000000231,44
100000000379,84


In [18]:
def sum_action(df,args,scale):
    value_count = pd.value_counts(df.loc[:,args])
    sum_c = 0
    for i in value_count.index:
        if i in scale:
            sum_c += value_count[i]
    return sum_c

# actionType 1, 2, 3, 4为非支付动作  5, 6, 7, 8, 9为支付动作
action_1234 = action_info.groupby('userid').apply(sum_action, args='actionType', scale=[1, 2, 3, 4])
action_56789 = action_info.groupby('userid').apply(sum_action, args='actionType', scale=[5, 6, 7, 8, 9])

In [19]:
action_1234 = pd.DataFrame(action_1234)
action_56789 = pd.DataFrame(action_56789)
# 非支付动作
action_1234.columns = ['F3.2']
# 支付动作
action_56789.columns = ['F3.3']


In [24]:
feature = pd.merge(action_num, action_1234, on='userid')
feature = feature.merge(action_56789, on='userid')
feature.head()

Unnamed: 0_level_0,F3.1,F3.2,F3.3
userid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
100000000013,143,85,58
100000000111,3,1,2
100000000127,6,2,4
100000000231,44,28,16
100000000379,84,58,26


In [25]:
# 筛选出每个动作,并建立新表
action_1 = action_info[action_info.actionType.isin([1])]
action_2 = action_info[action_info.actionType.isin([2])]
action_3 = action_info[action_info.actionType.isin([3])]
action_4 = action_info[action_info.actionType.isin([4])]
action_5 = action_info[action_info.actionType.isin([5])]
action_6 = action_info[action_info.actionType.isin([6])]
action_7 = action_info[action_info.actionType.isin([7])]
action_8 = action_info[action_info.actionType.isin([8])]
action_9 = action_info[action_info.actionType.isin([9])]

In [26]:
action_1.head()

Unnamed: 0,userid,actionType,actionTime
0,100000000013,1,1474300753
10,100000000013,1,1474479934
11,100000000013,1,1474566968
13,100000000013,1,1474657473
14,100000000013,1,1475062132


In [29]:
# 统计每个用户每个动作的次数
# 统计动作1的次数
feature['F3.4'] = action_1.groupby('userid')['actionType'].count()
# 统计动作2的次数
feature['F3.5'] = action_2.groupby('userid')['actionType'].count()
feature['F3.6'] = action_3.groupby('userid')['actionType'].count()
feature['F3.7'] = action_4.groupby('userid')['actionType'].count()
feature['F3.8'] = action_5.groupby('userid')['actionType'].count()
feature['F3.9'] = action_6.groupby('userid')['actionType'].count()
feature['F3.10'] = action_7.groupby('userid')['actionType'].count()
feature['F3.11'] = action_8.groupby('userid')['actionType'].count()
feature['F3.12'] = action_9.groupby('userid')['actionType'].count()

In [30]:
# 统计非支付占比
feature['F3.13'] = feature['F3.2'] / feature['F3.1']
# 统计支付占比
feature['F3.14'] = feature['F3.3'] / feature['F3.1']
# 动作1占比
feature['F3.15'] = feature['F3.4'] / feature['F3.1']
feature['F3.16'] = feature['F3.5'] / feature['F3.1']
feature['F3.17'] = feature['F3.6'] / feature['F3.1']
feature['F3.18'] = feature['F3.7'] / feature['F3.1']
feature['F3.19'] = feature['F3.8'] / feature['F3.1']
feature['F3.20'] = feature['F3.9'] / feature['F3.1']
feature['F3.21'] = feature['F3.10'] / feature['F3.1']
feature['F3.22'] = feature['F3.11'] / feature['F3.1']
# 动作9占比
feature['F3.23'] = feature['F3.12'] / feature['F3.1']


In [42]:
feature = feature.fillna(0)
feature.head()

Unnamed: 0_level_0,F3.1,F3.2,F3.3,F3.4,F3.5,F3.6,F3.7,F3.8,F3.9,F3.10,...,F3.16,F3.17,F3.18,F3.19,F3.20,F3.21,F3.22,F3.23,F3.24,F3.25
userid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100000000013,143,85,58,79.0,1.0,4.0,1.0,32.0,18.0,1.0,...,0.006993,0.027972,0.006993,0.223776,0.125874,0.006993,0.027972,0.020979,203785.8,889258900000.0
100000000111,3,1,2,1.0,0.0,0.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.333333,0.333333,0.0,0.0,0.0,23.0,200.0
100000000127,6,2,4,2.0,0.0,0.0,0.0,2.0,0.0,2.0,...,0.0,0.0,0.0,0.333333,0.0,0.333333,0.0,0.0,1423920.0,3682693000000.0
100000000231,44,28,16,15.0,6.0,2.0,5.0,10.0,6.0,0.0,...,0.136364,0.045455,0.113636,0.227273,0.136364,0.0,0.0,0.0,412180.6,1089375000000.0
100000000379,84,58,26,42.0,3.0,7.0,6.0,14.0,11.0,1.0,...,0.035714,0.083333,0.071429,0.166667,0.130952,0.011905,0.0,0.0,361943.5,522407900000.0


In [52]:
# 计算时间间隔均值
def time_gap_mean(df,args):
    t = df.loc[:,args]
    d = t.diff().dropna() # diff计算时间间隔，dropna删除空值
    return d.mean()

actiontime_gap_mean = action_info.groupby('userid').apply(time_gap_mean, args='actionTime')

In [53]:
actiontime_gap_mean = pd.DataFrame(actiontime_gap_mean)
actiontime_gap_mean.columns = ['F3.24']
feature = feature.merge(actiontime_gap_mean, on='userid', how='left')
feature.head()

Unnamed: 0_level_0,F3.1,F3.2,F3.3,F3.4,F3.5,F3.6,F3.7,F3.8,F3.9,F3.10,...,F3.15,F3.16,F3.17,F3.18,F3.19,F3.20,F3.21,F3.22,F3.23,F3.24
userid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100000000013,143,85,58,79.0,1.0,4.0,1.0,32.0,18.0,1.0,...,0.552448,0.006993,0.027972,0.006993,0.223776,0.125874,0.006993,0.027972,0.020979,203785.8
100000000111,3,1,2,1.0,0.0,0.0,0.0,1.0,1.0,0.0,...,0.333333,0.0,0.0,0.0,0.333333,0.333333,0.0,0.0,0.0,23.0
100000000127,6,2,4,2.0,0.0,0.0,0.0,2.0,0.0,2.0,...,0.333333,0.0,0.0,0.0,0.333333,0.0,0.333333,0.0,0.0,1423920.0
100000000231,44,28,16,15.0,6.0,2.0,5.0,10.0,6.0,0.0,...,0.340909,0.136364,0.045455,0.113636,0.227273,0.136364,0.0,0.0,0.0,412180.6
100000000379,84,58,26,42.0,3.0,7.0,6.0,14.0,11.0,1.0,...,0.5,0.035714,0.083333,0.071429,0.166667,0.130952,0.011905,0.0,0.0,361943.5


In [54]:
# 计算时间间隔方差
def time_gap_var(df,args):
    t = df.loc[:,args]
    d = t.diff().dropna() # diff计算时间间隔，dropna删除空值
    return d.var()

actiontime_gap_var = action_info.groupby('userid').apply(time_gap_var, args='actionTime')

In [55]:
actiontime_gap_var = pd.DataFrame(actiontime_gap_var)
actiontime_gap_var.columns = ['F3.25']
feature = feature.merge(actiontime_gap_var, on='userid', how='left')
feature.head()

Unnamed: 0_level_0,F3.1,F3.2,F3.3,F3.4,F3.5,F3.6,F3.7,F3.8,F3.9,F3.10,...,F3.16,F3.17,F3.18,F3.19,F3.20,F3.21,F3.22,F3.23,F3.24,F3.25
userid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100000000013,143,85,58,79.0,1.0,4.0,1.0,32.0,18.0,1.0,...,0.006993,0.027972,0.006993,0.223776,0.125874,0.006993,0.027972,0.020979,203785.8,889258900000.0
100000000111,3,1,2,1.0,0.0,0.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.333333,0.333333,0.0,0.0,0.0,23.0,200.0
100000000127,6,2,4,2.0,0.0,0.0,0.0,2.0,0.0,2.0,...,0.0,0.0,0.0,0.333333,0.0,0.333333,0.0,0.0,1423920.0,3682693000000.0
100000000231,44,28,16,15.0,6.0,2.0,5.0,10.0,6.0,0.0,...,0.136364,0.045455,0.113636,0.227273,0.136364,0.0,0.0,0.0,412180.6,1089375000000.0
100000000379,84,58,26,42.0,3.0,7.0,6.0,14.0,11.0,1.0,...,0.035714,0.083333,0.071429,0.166667,0.130952,0.011905,0.0,0.0,361943.5,522407900000.0


In [56]:
# 计算时间间隔最小值
def time_gap_min(df,args):
    t = df.loc[:,args]
    d = t.diff().dropna() # diff计算时间间隔，dropna删除空值
    return d.min()

actiontime_gap_min = action_info.groupby('userid').apply(time_gap_min, args='actionTime')

In [57]:
actiontime_gap_min = pd.DataFrame(actiontime_gap_min)
actiontime_gap_min.columns = ['F3.26']
feature = feature.merge(actiontime_gap_min, on='userid', how='left')
feature.head()

Unnamed: 0_level_0,F3.1,F3.2,F3.3,F3.4,F3.5,F3.6,F3.7,F3.8,F3.9,F3.10,...,F3.17,F3.18,F3.19,F3.20,F3.21,F3.22,F3.23,F3.24,F3.25,F3.26
userid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100000000013,143,85,58,79.0,1.0,4.0,1.0,32.0,18.0,1.0,...,0.027972,0.006993,0.223776,0.125874,0.006993,0.027972,0.020979,203785.8,889258900000.0,2.0
100000000111,3,1,2,1.0,0.0,0.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.333333,0.333333,0.0,0.0,0.0,23.0,200.0,13.0
100000000127,6,2,4,2.0,0.0,0.0,0.0,2.0,0.0,2.0,...,0.0,0.0,0.333333,0.0,0.333333,0.0,0.0,1423920.0,3682693000000.0,46.0
100000000231,44,28,16,15.0,6.0,2.0,5.0,10.0,6.0,0.0,...,0.045455,0.113636,0.227273,0.136364,0.0,0.0,0.0,412180.6,1089375000000.0,3.0
100000000379,84,58,26,42.0,3.0,7.0,6.0,14.0,11.0,1.0,...,0.083333,0.071429,0.166667,0.130952,0.011905,0.0,0.0,361943.5,522407900000.0,2.0


In [58]:
# 计算时间间隔最大值
def time_gap_max(df,args):
    t = df.loc[:,args]
    d = t.diff().dropna() # diff计算时间间隔，dropna删除空值
    return d.max()

actiontime_gap_max = action_info.groupby('userid').apply(time_gap_max, args='actionTime')

In [59]:
actiontime_gap_max = pd.DataFrame(actiontime_gap_max)
actiontime_gap_max.columns = ['F3.27']
feature = feature.merge(actiontime_gap_max, on='userid', how='left')
feature.head()

Unnamed: 0_level_0,F3.1,F3.2,F3.3,F3.4,F3.5,F3.6,F3.7,F3.8,F3.9,F3.10,...,F3.18,F3.19,F3.20,F3.21,F3.22,F3.23,F3.24,F3.25,F3.26,F3.27
userid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100000000013,143,85,58,79.0,1.0,4.0,1.0,32.0,18.0,1.0,...,0.006993,0.223776,0.125874,0.006993,0.027972,0.020979,203785.8,889258900000.0,2.0,6648889.0
100000000111,3,1,2,1.0,0.0,0.0,0.0,1.0,1.0,0.0,...,0.0,0.333333,0.333333,0.0,0.0,0.0,23.0,200.0,13.0,33.0
100000000127,6,2,4,2.0,0.0,0.0,0.0,2.0,0.0,2.0,...,0.0,0.333333,0.0,0.333333,0.0,0.0,1423920.0,3682693000000.0,46.0,3766778.0
100000000231,44,28,16,15.0,6.0,2.0,5.0,10.0,6.0,0.0,...,0.113636,0.227273,0.136364,0.0,0.0,0.0,412180.6,1089375000000.0,3.0,5072943.0
100000000379,84,58,26,42.0,3.0,7.0,6.0,14.0,11.0,1.0,...,0.071429,0.166667,0.130952,0.011905,0.0,0.0,361943.5,522407900000.0,2.0,4051593.0


In [61]:
# 最后一个时间间隔
def last_time_gap(df, args):
    t = df.loc[:, args]
    d = t.diff().dropna()
    return d.iloc[-1] if len(d) > 1 else 0

action_last_time_gap = action_info.groupby('userid').apply(last_time_gap, args='actionTime')

In [67]:
action_last_time_gap = pd.DataFrame(action_last_time_gap)
action_last_time_gap.columns = ['F3.28']
feature = feature.merge(action_last_time_gap, on='userid', how='left')
feature.head()

Unnamed: 0_level_0,F3.1,F3.2,F3.3,F3.4,F3.5,F3.6,F3.7,F3.8,F3.9,F3.10,...,F3.19,F3.20,F3.21,F3.22,F3.23,F3.24,F3.25,F3.26,F3.27,F3.28
userid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100000000013,143,85,58,79.0,1.0,4.0,1.0,32.0,18.0,1.0,...,0.223776,0.125874,0.006993,0.027972,0.020979,203785.8,889258900000.0,2.0,6648889.0,240330.0
100000000111,3,1,2,1.0,0.0,0.0,0.0,1.0,1.0,0.0,...,0.333333,0.333333,0.0,0.0,0.0,23.0,200.0,13.0,33.0,33.0
100000000127,6,2,4,2.0,0.0,0.0,0.0,2.0,0.0,2.0,...,0.333333,0.0,0.333333,0.0,0.0,1423920.0,3682693000000.0,46.0,3766778.0,3266886.0
100000000231,44,28,16,15.0,6.0,2.0,5.0,10.0,6.0,0.0,...,0.227273,0.136364,0.0,0.0,0.0,412180.6,1089375000000.0,3.0,5072943.0,5.0
100000000379,84,58,26,42.0,3.0,7.0,6.0,14.0,11.0,1.0,...,0.166667,0.130952,0.011905,0.0,0.0,361943.5,522407900000.0,2.0,4051593.0,242804.0


In [75]:
# 倒数第二个时间间隔
def last2_time_gap(df, args):
    t = df.loc[:, args]
    d = t.diff().dropna()
    return d.iloc[-2] if len(d) > 2 else 0

action_last2_time_gap = action_info.groupby('userid').apply(last2_time_gap, args='actionTime')

In [76]:
action_last2_time_gap = pd.DataFrame(action_last2_time_gap)
action_last2_time_gap.columns = ['F3.29']
feature = feature.merge(action_last2_time_gap, on='userid', how='left')
feature.head()

Unnamed: 0_level_0,F3.1,F3.2,F3.3,F3.4,F3.5,F3.6,F3.7,F3.8,F3.9,F3.10,...,F3.23,F3.24,F3.25,F3.26,F3.27,F3.28,F3.29_x,F3.30,F3.29_y,F3.29
userid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100000000013,143,85,58,79.0,1.0,4.0,1.0,32.0,18.0,1.0,...,0.020979,203785.8,889258900000.0,2.0,6648889.0,240330.0,240330.0,240330.0,69377.0,69377.0
100000000111,3,1,2,1.0,0.0,0.0,0.0,1.0,1.0,0.0,...,0.0,23.0,200.0,13.0,33.0,33.0,33.0,33.0,13.0,0.0
100000000127,6,2,4,2.0,0.0,0.0,0.0,2.0,0.0,2.0,...,0.0,1423920.0,3682693000000.0,46.0,3766778.0,3266886.0,3266886.0,3266886.0,3766778.0,3766778.0
100000000231,44,28,16,15.0,6.0,2.0,5.0,10.0,6.0,0.0,...,0.0,412180.6,1089375000000.0,3.0,5072943.0,5.0,5.0,5.0,55.0,55.0
100000000379,84,58,26,42.0,3.0,7.0,6.0,14.0,11.0,1.0,...,0.0,361943.5,522407900000.0,2.0,4051593.0,242804.0,242804.0,242804.0,41.0,41.0


In [77]:
# 倒数第三个时间间隔
def last3_time_gap(df, args):
    t = df.loc[:, args]
    d = t.diff().dropna()
    return d.iloc[-3] if len(d) > 3 else 0

action_last3_time_gap = action_info.groupby('userid').apply(last3_time_gap, args='actionTime')

In [79]:
action_last3_time_gap = pd.DataFrame(action_last3_time_gap)
action_last3_time_gap.columns = ['F3.30']
feature = feature.merge(action_last3_time_gap, on='userid', how='left')

In [81]:
feature.head()

Unnamed: 0_level_0,F3.1,F3.2,F3.3,F3.4,F3.5,F3.6,F3.7,F3.8,F3.9,F3.10,...,F3.21,F3.22,F3.23,F3.24,F3.25,F3.26,F3.27,F3.28,F3.29,F3.30
userid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100000000013,143,85,58,79.0,1.0,4.0,1.0,32.0,18.0,1.0,...,0.006993,0.027972,0.020979,203785.8,889258900000.0,2.0,6648889.0,240330.0,69377.0,180836.0
100000000111,3,1,2,1.0,0.0,0.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,23.0,200.0,13.0,33.0,33.0,0.0,0.0
100000000127,6,2,4,2.0,0.0,0.0,0.0,2.0,0.0,2.0,...,0.333333,0.0,0.0,1423920.0,3682693000000.0,46.0,3766778.0,3266886.0,3766778.0,46.0
100000000231,44,28,16,15.0,6.0,2.0,5.0,10.0,6.0,0.0,...,0.0,0.0,0.0,412180.6,1089375000000.0,3.0,5072943.0,5.0,55.0,9781.0
100000000379,84,58,26,42.0,3.0,7.0,6.0,14.0,11.0,1.0,...,0.011905,0.0,0.0,361943.5,522407900000.0,2.0,4051593.0,242804.0,41.0,3.0


In [82]:
# 最后一个动作行为
def last_type(df, args):
    t = list(df.loc[:, args])
    return t[-1]

action_last_type = action_info.groupby('userid').apply(last_type, args='actionType')

In [83]:
action_last_type = pd.DataFrame(action_last_type)
action_last_type.columns = ['F3.31']
feature = feature.merge(action_last_type, on='userid', how='left')
feature.head()

Unnamed: 0_level_0,F3.1,F3.2,F3.3,F3.4,F3.5,F3.6,F3.7,F3.8,F3.9,F3.10,...,F3.22,F3.23,F3.24,F3.25,F3.26,F3.27,F3.28,F3.29,F3.30,F3.31
userid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100000000013,143,85,58,79.0,1.0,4.0,1.0,32.0,18.0,1.0,...,0.027972,0.020979,203785.8,889258900000.0,2.0,6648889.0,240330.0,69377.0,180836.0,6
100000000111,3,1,2,1.0,0.0,0.0,0.0,1.0,1.0,0.0,...,0.0,0.0,23.0,200.0,13.0,33.0,33.0,0.0,0.0,6
100000000127,6,2,4,2.0,0.0,0.0,0.0,2.0,0.0,2.0,...,0.0,0.0,1423920.0,3682693000000.0,46.0,3766778.0,3266886.0,3766778.0,46.0,7
100000000231,44,28,16,15.0,6.0,2.0,5.0,10.0,6.0,0.0,...,0.0,0.0,412180.6,1089375000000.0,3.0,5072943.0,5.0,55.0,9781.0,2
100000000379,84,58,26,42.0,3.0,7.0,6.0,14.0,11.0,1.0,...,0.0,0.0,361943.5,522407900000.0,2.0,4051593.0,242804.0,41.0,3.0,1


In [90]:
# 倒数第二个动作行为
def last2_type(df, args):
    t = list(df.loc[:, args])
    if len(t) > 1:
        return t[-2]
    return 0

action_last2_type = action_info.groupby('userid').apply(last2_type, args='actionType')

In [91]:
action_last2_type = pd.DataFrame(action_last2_type)
action_last2_type.columns = ['F3.32']
feature = feature.merge(action_last2_type, on='userid', how='left')
feature.head()

Unnamed: 0_level_0,F3.1,F3.2,F3.3,F3.4,F3.5,F3.6,F3.7,F3.8,F3.9,F3.10,...,F3.23,F3.24,F3.25,F3.26,F3.27,F3.28,F3.29,F3.30,F3.31,F3.32
userid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100000000013,143,85,58,79.0,1.0,4.0,1.0,32.0,18.0,1.0,...,0.020979,203785.8,889258900000.0,2.0,6648889.0,240330.0,69377.0,180836.0,6,1
100000000111,3,1,2,1.0,0.0,0.0,0.0,1.0,1.0,0.0,...,0.0,23.0,200.0,13.0,33.0,33.0,0.0,0.0,6,5
100000000127,6,2,4,2.0,0.0,0.0,0.0,2.0,0.0,2.0,...,0.0,1423920.0,3682693000000.0,46.0,3766778.0,3266886.0,3766778.0,46.0,7,7
100000000231,44,28,16,15.0,6.0,2.0,5.0,10.0,6.0,0.0,...,0.0,412180.6,1089375000000.0,3.0,5072943.0,5.0,55.0,9781.0,2,2
100000000379,84,58,26,42.0,3.0,7.0,6.0,14.0,11.0,1.0,...,0.0,361943.5,522407900000.0,2.0,4051593.0,242804.0,41.0,3.0,1,6


In [92]:
# 倒数第三个动作行为
def last3_type(df, args):
    t = list(df.loc[:, args])
    if len(t) > 2:
        return t[-3]
    return 0

action_last3_type = action_info.groupby('userid').apply(last3_type, args='actionType')

In [93]:
action_last3_type = pd.DataFrame(action_last3_type)
action_last3_type.columns = ['F3.33']
feature = feature.merge(action_last3_type, on='userid', how='left')
feature.head()

Unnamed: 0_level_0,F3.1,F3.2,F3.3,F3.4,F3.5,F3.6,F3.7,F3.8,F3.9,F3.10,...,F3.24,F3.25,F3.26,F3.27,F3.28,F3.29,F3.30,F3.31,F3.32,F3.33
userid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100000000013,143,85,58,79.0,1.0,4.0,1.0,32.0,18.0,1.0,...,203785.8,889258900000.0,2.0,6648889.0,240330.0,69377.0,180836.0,6,1,1
100000000111,3,1,2,1.0,0.0,0.0,0.0,1.0,1.0,0.0,...,23.0,200.0,13.0,33.0,33.0,0.0,0.0,6,5,1
100000000127,6,2,4,2.0,0.0,0.0,0.0,2.0,0.0,2.0,...,1423920.0,3682693000000.0,46.0,3766778.0,3266886.0,3766778.0,46.0,7,7,5
100000000231,44,28,16,15.0,6.0,2.0,5.0,10.0,6.0,0.0,...,412180.6,1089375000000.0,3.0,5072943.0,5.0,55.0,9781.0,2,2,1
100000000379,84,58,26,42.0,3.0,7.0,6.0,14.0,11.0,1.0,...,361943.5,522407900000.0,2.0,4051593.0,242804.0,41.0,3.0,1,6,5


In [94]:
# 最后三个动作的平均时间间隔
feature["F3.34"] = (feature["F3.28"]+feature["F3.29"]+feature["F3.30"])/3

In [96]:
# 最后三个动作的时间方差
feature["F3.35"] = ((feature["F3.28"]-feature["F3.34"])**2 + (feature["F3.29"]-feature["F3.34"])**2 + (feature["F3.30"]-feature["F3.34"])**2)/3

In [97]:
last = action_info.groupby("userid")["actionTime"].max()
last1 = action_1.groupby("userid")["actionTime"].max()
last2 = action_2.groupby("userid")["actionTime"].max()
last3 = action_3.groupby("userid")["actionTime"].max()
last4 = action_4.groupby("userid")["actionTime"].max()
last5 = action_5.groupby("userid")["actionTime"].max()
last6 = action_6.groupby("userid")["actionTime"].max()
last7 = action_7.groupby("userid")["actionTime"].max()
last8 = action_8.groupby("userid")["actionTime"].max()
last9 = action_9.groupby("userid")["actionTime"].max()

In [98]:
last = pd.DataFrame(last)
last1 = pd.DataFrame(last1)
last2 = pd.DataFrame(last2)
last3 = pd.DataFrame(last3)
last4 = pd.DataFrame(last4)
last5 = pd.DataFrame(last5)
last6 = pd.DataFrame(last6)
last7 = pd.DataFrame(last7)
last8 = pd.DataFrame(last8)
last9 = pd.DataFrame(last9)
last.columns = ["time"]
last1.columns = ["time"]
last2.columns = ["time"]
last3.columns = ["time"]
last4.columns = ["time"]
last5.columns = ["time"]
last6.columns = ["time"]
last7.columns = ["time"]
last8.columns = ["time"]
last9.columns = ["time"]

In [99]:
feature["F3.36"] = last["time"]-last1["time"]
feature["F3.37"] = last["time"]-last2["time"]
feature["F3.38"] = last["time"]-last3["time"]
feature["F3.39"] = last["time"]-last4["time"]
feature["F3.40"] = last["time"]-last5["time"]
feature["F3.41"] = last["time"]-last6["time"]
feature["F3.42"] = last["time"]-last7["time"]
feature["F3.43"] = last["time"]-last8["time"]
feature["F3.44"] = last["time"]-last9["time"]

In [100]:
action_1_mean = action_1.groupby("userid").apply(time_gap_mean,args="actionTime")
action_1_var = action_1.groupby("userid").apply(time_gap_var,args="actionTime")
action_1_min = action_1.groupby("userid").apply(time_gap_min,args="actionTime")
action_1_max = action_1.groupby("userid").apply(time_gap_max,args="actionTime")

In [101]:
action_2_mean = action_2.groupby("userid").apply(time_gap_mean,args="actionTime")
action_2_var = action_2.groupby("userid").apply(time_gap_var,args="actionTime")
action_2_min = action_2.groupby("userid").apply(time_gap_min,args="actionTime")
action_2_max = action_2.groupby("userid").apply(time_gap_max,args="actionTime")

In [102]:
action_3_mean = action_3.groupby("userid").apply(time_gap_mean,args="actionTime")
action_3_var = action_3.groupby("userid").apply(time_gap_var,args="actionTime")
action_3_min = action_3.groupby("userid").apply(time_gap_min,args="actionTime")
action_3_max = action_3.groupby("userid").apply(time_gap_max,args="actionTime")

In [103]:
action_4_mean = action_4.groupby("userid").apply(time_gap_mean,args="actionTime")
action_4_var = action_4.groupby("userid").apply(time_gap_var,args="actionTime")
action_4_min = action_4.groupby("userid").apply(time_gap_min,args="actionTime")
action_4_max = action_4.groupby("userid").apply(time_gap_max,args="actionTime")

In [104]:
action_5_mean = action_5.groupby("userid").apply(time_gap_mean,args="actionTime")
action_5_var = action_5.groupby("userid").apply(time_gap_var,args="actionTime")
action_5_min = action_5.groupby("userid").apply(time_gap_min,args="actionTime")
action_5_max = action_5.groupby("userid").apply(time_gap_max,args="actionTime")

In [105]:
action_6_mean = action_6.groupby("userid").apply(time_gap_mean,args="actionTime")
action_6_var = action_6.groupby("userid").apply(time_gap_var,args="actionTime")
action_6_min = action_6.groupby("userid").apply(time_gap_min,args="actionTime")
action_6_max = action_6.groupby("userid").apply(time_gap_max,args="actionTime")

In [106]:
action_7_mean = action_7.groupby("userid").apply(time_gap_mean,args="actionTime")
action_7_var = action_7.groupby("userid").apply(time_gap_var,args="actionTime")
action_7_min = action_7.groupby("userid").apply(time_gap_min,args="actionTime")
action_7_max = action_7.groupby("userid").apply(time_gap_max,args="actionTime")

In [107]:
action_8_mean = action_8.groupby("userid").apply(time_gap_mean,args="actionTime")
action_8_var = action_8.groupby("userid").apply(time_gap_var,args="actionTime")
action_8_min = action_8.groupby("userid").apply(time_gap_min,args="actionTime")
action_8_max = action_8.groupby("userid").apply(time_gap_max,args="actionTime")

In [108]:
action_9_mean = action_9.groupby("userid").apply(time_gap_mean,args="actionTime")
action_9_var = action_9.groupby("userid").apply(time_gap_var,args="actionTime")
action_9_min = action_9.groupby("userid").apply(time_gap_min,args="actionTime")
action_9_max = action_9.groupby("userid").apply(time_gap_max,args="actionTime")

In [109]:
df1_1 = pd.DataFrame(action_1_mean)
df1_2 = pd.DataFrame(action_1_var)
df1_3 = pd.DataFrame(action_1_min)
df1_4 = pd.DataFrame(action_1_max)
df1_1.columns = ["F3.45"]
df1_2.columns = ["F3.46"]
df1_3.columns = ["F3.47"]
df1_4.columns = ["F3.48"]

In [110]:
df2_1 = pd.DataFrame(action_2_mean)
df2_2 = pd.DataFrame(action_2_var)
df2_3 = pd.DataFrame(action_2_min)
df2_4 = pd.DataFrame(action_2_max)
df2_1.columns = ["F3.49"]
df2_2.columns = ["F3.50"]
df2_3.columns = ["F3.51"]
df2_4.columns = ["F3.52"]

In [111]:
df3_1 = pd.DataFrame(action_3_mean)
df3_2 = pd.DataFrame(action_3_var)
df3_3 = pd.DataFrame(action_3_min)
df3_4 = pd.DataFrame(action_3_max)
df3_1.columns = ["F3.53"]
df3_2.columns = ["F3.54"]
df3_3.columns = ["F3.55"]
df3_4.columns = ["F3.56"]


In [112]:
df4_1 = pd.DataFrame(action_4_mean)
df4_2 = pd.DataFrame(action_4_var)
df4_3 = pd.DataFrame(action_4_min)
df4_4 = pd.DataFrame(action_4_max)
df4_1.columns = ["F3.57"]
df4_2.columns = ["F3.58"]
df4_3.columns = ["F3.59"]
df4_4.columns = ["F3.60"]

In [113]:
df5_1 = pd.DataFrame(action_5_mean)
df5_2 = pd.DataFrame(action_5_var)
df5_3 = pd.DataFrame(action_5_min)
df5_4 = pd.DataFrame(action_5_max)
df5_1.columns = ["F3.61"]
df5_2.columns = ["F3.62"]
df5_3.columns = ["F3.63"]
df5_4.columns = ["F3.64"]

In [114]:
df6_1 = pd.DataFrame(action_6_mean)
df6_2 = pd.DataFrame(action_6_var)
df6_3 = pd.DataFrame(action_6_min)
df6_4 = pd.DataFrame(action_6_max)
df6_1.columns = ["F3.65"]
df6_2.columns = ["F3.66"]
df6_3.columns = ["F3.67"]
df6_4.columns = ["F3.68"]

In [115]:
df7_1 = pd.DataFrame(action_7_mean)
df7_2 = pd.DataFrame(action_7_var)
df7_3 = pd.DataFrame(action_7_min)
df7_4 = pd.DataFrame(action_7_max)
df7_1.columns = ["F3.69"]
df7_2.columns = ["F3.70"]
df7_3.columns = ["F3.71"]
df7_4.columns = ["F3.72"]

In [116]:
df8_1 = pd.DataFrame(action_8_mean)
df8_2 = pd.DataFrame(action_8_var)
df8_3 = pd.DataFrame(action_8_min)
df8_4 = pd.DataFrame(action_8_max)
df8_1.columns = ["F3.73"]
df8_2.columns = ["F3.74"]
df8_3.columns = ["F3.75"]
df8_4.columns = ["F3.76"]

In [117]:
df9_1 = pd.DataFrame(action_9_mean)
df9_2 = pd.DataFrame(action_9_var)
df9_3 = pd.DataFrame(action_9_min)
df9_4 = pd.DataFrame(action_9_max)
df9_1.columns = ["F3.77"]
df9_2.columns = ["F3.78"]
df9_3.columns = ["F3.79"]
df9_4.columns = ["F3.80"]

In [118]:
feature = feature.merge(df1_1,on="userid",how="left")
feature = feature.merge(df1_2,on="userid",how="left")
feature = feature.merge(df1_3,on="userid",how="left")
feature = feature.merge(df1_4,on="userid",how="left")
feature = feature.merge(df2_1,on="userid",how="left")
feature = feature.merge(df2_2,on="userid",how="left")
feature = feature.merge(df2_3,on="userid",how="left")
feature = feature.merge(df2_4,on="userid",how="left")
feature = feature.merge(df3_1,on="userid",how="left")
feature = feature.merge(df3_2,on="userid",how="left")
feature = feature.merge(df3_3,on="userid",how="left")
feature = feature.merge(df3_4,on="userid",how="left")
feature = feature.merge(df4_1,on="userid",how="left")
feature = feature.merge(df4_2,on="userid",how="left")
feature = feature.merge(df4_3,on="userid",how="left")
feature = feature.merge(df4_4,on="userid",how="left")
feature = feature.merge(df5_1,on="userid",how="left")
feature = feature.merge(df5_2,on="userid",how="left")
feature = feature.merge(df5_3,on="userid",how="left")
feature = feature.merge(df5_4,on="userid",how="left")
feature = feature.merge(df6_1,on="userid",how="left")
feature = feature.merge(df6_2,on="userid",how="left")
feature = feature.merge(df6_3,on="userid",how="left")
feature = feature.merge(df6_4,on="userid",how="left")
feature = feature.merge(df7_1,on="userid",how="left")
feature = feature.merge(df7_2,on="userid",how="left")
feature = feature.merge(df7_3,on="userid",how="left")
feature = feature.merge(df7_4,on="userid",how="left")
feature = feature.merge(df8_1,on="userid",how="left")
feature = feature.merge(df8_2,on="userid",how="left")
feature = feature.merge(df8_3,on="userid",how="left")
feature = feature.merge(df8_4,on="userid",how="left")
feature = feature.merge(df9_1,on="userid",how="left")
feature = feature.merge(df9_2,on="userid",how="left")
feature = feature.merge(df9_3,on="userid",how="left")
feature = feature.merge(df9_4,on="userid",how="left")

In [119]:
feature.to_csv('F3.csv')