In [1]:
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)
import matplotlib.pyplot as plt
# import seaborn as sns
import pickle
import time
import gc
from tqdm import tqdm, tqdm_notebook

%matplotlib inline

#每次可以输出多个变量
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

from pylab import rcParams
rcParams['figure.figsize'] = 14, 6

import warnings
warnings.filterwarnings("ignore")

#中文字体
import matplotlib
matplotlib.use('qt4agg')
#指定默认字体
matplotlib.rcParams['font.sans-serif'] = ['SimHei']
matplotlib.rcParams['font.family'] = 'sans-serif'
#解决负号'-'显示为方块的问题
matplotlib.rcParams['axes.unicode_minus'] = False

In [2]:
df_user_reg = pd.read_csv('./data/b/user_register_log.txt',sep='\t',header=None,names=['user_id','register_day','register_type','device_type'])
df_app_launch = pd.read_csv('./data/b/app_launch_log.txt',sep='\t',header=None,names=['user_id','day'])
df_video_create = pd.read_csv('./data/b/video_create_log.txt',sep='\t',header=None,names=['user_id','day'])
df_user_activity = pd.read_csv('./data/b/user_activity_log.txt',sep='\t',header=None,names=['user_id','day','page','video_id','author_id','action_type'])

## 粉丝群

In [3]:
author = df_user_activity.groupby(['author_id'])
fan_author = author.user_id.count().sort_values(ascending=False)[:3800]

In [4]:
fan_author[:5]

author_id
178632     27620
391186     21876
306604     21424
373087     16469
1249639    16052
Name: user_id, dtype: int64

In [5]:
df_user_activity = df_user_activity[df_user_activity.day < 24]

In [6]:
def get_fan_author(x):
    return list(df_user_activity[df_user_activity.user_id == x.user_id].author_id.unique())
def get_fan_author_list(x):
    return list(df_user_activity[df_user_activity.user_id == x.user_id].author_id)

In [7]:
%%time
df_user_reg['fan_author'] = df_user_reg.apply(get_fan_author, axis=1)
print('finish set')
df_user_reg['fan_author_list'] = df_user_reg.apply(get_fan_author_list, axis=1)
print('finish list')

finish set
finish list
CPU times: user 38min 40s, sys: 745 ms, total: 38min 41s
Wall time: 38min 41s


In [8]:
df_user_reg['set_len'] = df_user_reg['fan_author'].apply(lambda x:len(x))
df_user_reg['list_len'] = df_user_reg['fan_author_list'].apply(lambda x:len(x))

In [9]:
df_user_reg.head()

Unnamed: 0,user_id,register_day,register_type,device_type,fan_author,fan_author_list,set_len,list_len
0,167777,1,4,270,[],[],0,0
1,886972,1,0,5,"[980589, 1053611, 262128]","[980589, 980589, 980589, 980589, 980589, 98058...",3,28
2,921231,1,0,0,"[574167, 992903, 935597, 509753, 131355, 67238...","[574167, 574167, 574167, 574167, 574167, 57416...",25,103
3,904908,1,1,49,"[801356, 329977, 1070990, 602664, 141229, 5303...","[801356, 329977, 1070990, 602664, 602664, 1412...",426,594
4,460291,2,0,72,"[323908, 871685, 1225904, 77426, 127022, 33167...","[323908, 871685, 1225904, 77426, 77426, 127022...",1412,1875


In [10]:
df_user_reg.set_len.max()
df_user_reg.list_len.max()
len(df_user_activity.author_id.unique())

5530

13761

922332

In [11]:
def multi_hot(x):
    tmp = np.zeros(len(fan_author.index))
    for i, genre in enumerate(fan_author.index):
        if genre in x:
            tmp[i] = 1
    return list(map(int, tmp))

def multi_cnt_hot(x):
    tmp = np.zeros(len(fan_author.index))
    for i, genre in enumerate(fan_author.index):
        if genre in x:
            tmp[i] = x.count(genre)
    return list(map(int, tmp))

In [12]:
%%time
df_user_reg['fan_author_multi_hot'] = df_user_reg.fan_author.apply(multi_hot)
df_user_reg['fan_author_list_multi_hot'] = df_user_reg.fan_author_list.apply(multi_cnt_hot)

CPU times: user 1h 25min 38s, sys: 2.35 s, total: 1h 25min 41s
Wall time: 1h 25min 41s


## 相似用户群

In [13]:
user_act_cnt = df_user_activity.groupby(['user_id', 'day'], as_index=False)['user_id'].agg({'cnt':'count'})
user_act_cnt = user_act_cnt.sort_values(by=['user_id', 'day'])
user_act_cnt.shape
user_act_cnt.head()

(133749, 3)

Unnamed: 0,user_id,day,cnt
0,8,9,21
1,8,10,39
2,8,11,5
3,8,13,31
4,8,18,83


In [14]:
user_act_route = df_user_reg[['user_id']]
for i in range(1, 31):
    user_act_route['day_cnt_' + str(i)] = list(np.zeros([user_act_route.shape[0], ]).astype('int16'))
user_act_route = user_act_route.sort_values(by=['user_id'])

In [15]:
for i in tqdm_notebook(range(user_act_cnt.shape[0])):
    user_id = user_act_cnt.iloc[i].user_id
    day = user_act_cnt.iloc[i].day
    user_act_route['day_cnt_' + str(day)][user_act_route.user_id == user_id] = user_act_cnt.iloc[i].cnt




In [16]:
user_act_route.shape
user_act_route.head()

(51480, 31)

Unnamed: 0,user_id,day_cnt_1,day_cnt_2,day_cnt_3,day_cnt_4,day_cnt_5,day_cnt_6,day_cnt_7,day_cnt_8,day_cnt_9,day_cnt_10,day_cnt_11,day_cnt_12,day_cnt_13,day_cnt_14,day_cnt_15,day_cnt_16,day_cnt_17,day_cnt_18,day_cnt_19,day_cnt_20,day_cnt_21,day_cnt_22,day_cnt_23,day_cnt_24,day_cnt_25,day_cnt_26,day_cnt_27,day_cnt_28,day_cnt_29,day_cnt_30
25589,8,0,0,0,0,0,0,0,0,21,39,5,0,31,0,0,0,0,83,0,0,0,4,0,0,0,0,0,0,0,0
14698,45,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1766,73,0,0,0,5,31,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
19867,129,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0
26656,194,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


## KMeans

In [18]:
from sklearn.cluster import KMeans

In [19]:
%%time
kmeans = KMeans(n_clusters=15, n_jobs=-1, random_state=2018).fit(user_act_route.iloc[:, :-7].drop(['user_id'], axis=1))
user_act_route['group'] = kmeans.labels_

CPU times: user 4.16 s, sys: 406 ms, total: 4.56 s
Wall time: 7.22 s


In [22]:
user_act_route.groupby(['group']).user_id.count()

group
0     42107
1       919
2       159
3      2386
4       501
5       140
6       306
7      1277
8        72
9       241
10      764
11      923
12      710
13      543
14      432
Name: user_id, dtype: int64

In [23]:
user_act_route[['user_id', 'group']].to_csv('./features/group_noleak_b.csv', index=None)

In [16]:
df_user_reg[['user_id', 'fan_author_multi_hot', 'fan_author_list_multi_hot']].to_csv('./features/set_list_multi_hot_noleak.csv', index=None)

In [24]:
set_group = df_user_reg[['user_id', 'fan_author_multi_hot']].copy()
list_group = df_user_reg[['user_id', 'fan_author_list_multi_hot']].copy()

set_group.shape
list_group.shape

(51480, 2)

(51480, 2)

In [25]:
for i in range(3800):
    set_group['fan_author_multi_hot_' + str(i)] = set_group['fan_author_multi_hot'].apply(lambda x:x.pop())
    list_group['fan_author_list_multi_hot_' + str(i)] = list_group['fan_author_list_multi_hot'].apply(lambda x:x.pop())

set_group.shape
list_group.shape

(51480, 3802)

(51480, 3802)

In [66]:
# set_group.to_csv('./features/set_multi_hot.csv', index=None)
# list_group.to_csv('./features/list_multi_hot.csv', index=None)

In [41]:
%%time
kmeans_set = KMeans(n_clusters=4, n_jobs=-1, random_state=2018).fit(set_group.drop(['user_id', 'fan_author_multi_hot'], axis=1))

CPU times: user 46.4 s, sys: 4.47 s, total: 50.9 s
Wall time: 1min 32s


In [42]:
df_user_reg['set_group'] = kmeans_set.labels_
kmeans_set.cluster_centers_

array([[2.73224044e-02, 8.65209472e-02, 7.92349727e-02, ...,
        1.83060109e-01, 5.30965392e-01, 5.68306011e-01],
       [4.16011833e-04, 6.93353055e-04, 1.27114727e-03, ...,
        7.30331885e-03, 8.36646020e-03, 2.84505870e-02],
       [1.13293051e-02, 1.51057402e-02, 9.21450151e-02, ...,
        8.38368580e-02, 1.03474320e-01, 4.69788520e-01],
       [6.39032815e-03, 1.70984456e-02, 2.93609672e-02, ...,
        9.44732297e-02, 1.72538860e-01, 3.72193437e-01]])

In [28]:
# %%time
# kmeans_list = KMeans(n_clusters=3, n_jobs=-1, random_state=2018).fit(list_group.drop(['user_id', 'fan_author_list_multi_hot'], axis=1))

CPU times: user 40.2 s, sys: 3.99 s, total: 44.2 s
Wall time: 1min


In [29]:
# df_user_reg['list_group'] = kmeans_list.labels_
# kmeans_list.cluster_centers_

array([[0.00382688, 0.01350091, 0.01301527, ..., 0.25439994, 0.28192626,
        0.31526089],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [43]:
df_user_reg.groupby(['set_group']).user_id.count()
# df_user_reg.groupby(['list_group']).user_id.count()

set_group
0     1098
1    43268
2     1324
3     5790
Name: user_id, dtype: int64

In [44]:
df_user_reg[['user_id', 'set_group', 'list_group']].head()

Unnamed: 0,user_id,set_group,list_group
0,167777,1,0
1,886972,1,0
2,921231,1,0
3,904908,3,0
4,460291,2,0


In [45]:
df_user_reg[['user_id', 'set_group']].to_csv('./features/set_group_noleak_b.csv', index=None)