# 1.读取数据

In [20]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import random

In [21]:
# Load the user table from the CSV file in the working directory.
user_data = pd.read_csv(
    filepath_or_buffer='tianchi_mobile_recommend_train_user.csv',
    encoding='utf-8',
)
# Bare expression so the notebook renders a preview of the frame.
user_data

Unnamed: 0,user_id,item_id,behavior_type,user_geohash,item_category,time
0,98047837,232431562,1,,4245,2014-12-06 02
1,97726136,383583590,1,,5894,2014-12-09 20
2,98607707,64749712,1,,2883,2014-12-18 11
3,98662432,320593836,1,96nn52n,6562,2014-12-06 10
4,98145908,290208520,1,,13926,2014-12-16 21
...,...,...,...,...,...,...
12256901,93812622,378365755,1,95q6d6a,11,2014-12-13 21
12256902,93812622,177724753,1,,12311,2014-12-14 21
12256903,93812622,234391443,1,,8765,2014-12-11 16
12256904,93812622,26452000,1,95q6dqc,7951,2014-12-08 22


In [22]:
# Load the item table from the CSV file in the working directory.
item_data = pd.read_csv(
    filepath_or_buffer='tianchi_mobile_recommend_train_item.csv',
    encoding='utf-8',
)
# Bare expression so the notebook renders a preview of the frame.
item_data

Unnamed: 0,item_id,item_geohash,item_category
0,312051294,,8270
1,99999754,,7393
2,131746128,,7876
3,385731330,,10544
4,100004415,,3064
...,...,...,...
480718,312331852,,10431
480719,258547749,,9618
480720,131745195,,11991
480721,31233101,,8099


# 2.数据预处理

### 2.1删除不需要的列

In [23]:
# Remove the 'user_geohash' and 'item_geohash' columns.
# inplace=True mutates the original frames rather than returning copies.
user_data.drop(columns=['user_geohash'], inplace=True)
item_data.drop(columns=['item_geohash'], inplace=True)

### 2.2商品列表去重

In [24]:
# De-duplicate the item table: keep the first occurrence of every fully
# identical row and discard the later repeats.
item_data = item_data.loc[~item_data.duplicated(keep='first')]
item_data

Unnamed: 0,item_id,item_category
0,312051294,8270
1,99999754,7393
2,131746128,7876
3,385731330,10544
4,100004415,3064
...,...,...
480718,312331852,10431
480719,258547749,9618
480720,131745195,11991
480721,31233101,8099


### 2.3合并用户和商品表

In [25]:
# Inner-join the user table with the item table on the columns the two frames
# have in common; user rows with no matching item row are discarded.
data = user_data.merge(right=item_data, how='inner')
data

Unnamed: 0,user_id,item_id,behavior_type,item_category,time
0,98047837,232431562,1,4245,2014-12-06 02
1,98047837,232431562,1,4245,2014-12-03 20
2,98047837,232431562,1,4245,2014-12-07 23
3,98047837,232431562,1,4245,2014-12-03 20
4,98047837,232431562,1,4245,2014-12-06 02
...,...,...,...,...,...
1403717,62126989,106796394,1,2284,2014-12-15 01
1403718,62126989,122790346,1,10431,2014-12-12 23
1403719,76314785,6627230,1,10467,2014-11-19 00
1403720,76314785,322071819,1,6648,2014-12-12 00


### 2.4数据映射

In [26]:
# Translate the numeric behavior_type codes into readable labels:
# 1 -> pv, 2 -> collect, 3 -> cart, 4 -> buy.
# A single replace() builds the relabelled column in one pass. Writing strings
# into the integer column through .loc (as done previously) is an
# incompatible-dtype assignment that recent pandas versions deprecate.
# Codes outside 1-4 are left untouched, exactly as before.
behavior_labels = {1: 'pv', 2: 'collect', 3: 'cart', 4: 'buy'}
data['behavior_type'] = data['behavior_type'].replace(behavior_labels)
data

Unnamed: 0,user_id,item_id,behavior_type,item_category,time
0,98047837,232431562,pv,4245,2014-12-06 02
1,98047837,232431562,pv,4245,2014-12-03 20
2,98047837,232431562,pv,4245,2014-12-07 23
3,98047837,232431562,pv,4245,2014-12-03 20
4,98047837,232431562,pv,4245,2014-12-06 02
...,...,...,...,...,...
1403717,62126989,106796394,pv,2284,2014-12-15 01
1403718,62126989,122790346,pv,10431,2014-12-12 23
1403719,76314785,6627230,pv,10467,2014-11-19 00
1403720,76314785,322071819,pv,6648,2014-12-12 00


### 2.5数据离散化

In [27]:
# One-hot encode the behavior_type column: one integer 0/1 indicator column
# per distinct label (buy, cart, collect, pv in the data shown).
behavior_type = pd.get_dummies(data=data['behavior_type'], dtype=int)
behavior_type

Unnamed: 0,buy,cart,collect,pv
0,0,0,0,1
1,0,0,0,1
2,0,0,0,1
3,0,0,0,1
4,0,0,0,1
...,...,...,...,...
1403717,0,0,0,1
1403718,0,0,0,1
1403719,0,0,0,1
1403720,0,0,0,1


In [28]:
# Swap the single behavior_type column for its indicator columns by
# concatenating the two frames side by side.
data = pd.concat(
    objs=[data.drop(columns=['behavior_type']), behavior_type],
    axis='columns',
)
data

Unnamed: 0,user_id,item_id,item_category,time,buy,cart,collect,pv
0,98047837,232431562,4245,2014-12-06 02,0,0,0,1
1,98047837,232431562,4245,2014-12-03 20,0,0,0,1
2,98047837,232431562,4245,2014-12-07 23,0,0,0,1
3,98047837,232431562,4245,2014-12-03 20,0,0,0,1
4,98047837,232431562,4245,2014-12-06 02,0,0,0,1
...,...,...,...,...,...,...,...,...
1403717,62126989,106796394,2284,2014-12-15 01,0,0,0,1
1403718,62126989,122790346,10431,2014-12-12 23,0,0,0,1
1403719,76314785,6627230,10467,2014-11-19 00,0,0,0,1
1403720,76314785,322071819,6648,2014-12-12 00,0,0,0,1


### 2.6 time列数据映射

In [29]:
# Parse the time strings into pandas timestamps.
data['time'] = pd.to_datetime(data['time'])
# ISO week-of-year for every row, computed with the vectorised .dt accessor
# instead of a Python-level loop over each timestamp.
iso_week = data['time'].dt.isocalendar().week.astype('int64')
# Relabel ISO weeks 47-51 as the strings '1'-'5' (divide_week matches on string
# labels). One replace() builds the column in a single pass and avoids writing
# strings into an integer column through .loc; any other week number is kept
# as-is, exactly as before.
week_labels = {47: '1', 48: '2', 49: '3', 50: '4', 51: '5'}
data['week'] = iso_week.replace(week_labels)
data

Unnamed: 0,user_id,item_id,item_category,time,buy,cart,collect,pv,week
0,98047837,232431562,4245,2014-12-06 02:00:00,0,0,0,1,3
1,98047837,232431562,4245,2014-12-03 20:00:00,0,0,0,1,3
2,98047837,232431562,4245,2014-12-07 23:00:00,0,0,0,1,3
3,98047837,232431562,4245,2014-12-03 20:00:00,0,0,0,1,3
4,98047837,232431562,4245,2014-12-06 02:00:00,0,0,0,1,3
...,...,...,...,...,...,...,...,...,...
1403717,62126989,106796394,2284,2014-12-15 01:00:00,0,0,0,1,5
1403718,62126989,122790346,10431,2014-12-12 23:00:00,0,0,0,1,4
1403719,76314785,6627230,10467,2014-11-19 00:00:00,0,0,0,1,1
1403720,76314785,322071819,6648,2014-12-12 00:00:00,0,0,0,1,4


### 2.7 item_category列重新编码

In [30]:
# Re-encode item_category as consecutive integer codes, numbered in order of
# first appearance (0 for the first distinct category seen, 1 for the next, ...).
# Distinct original category ids, in order of first appearance.
item_category = data['item_category'].drop_duplicates()
# factorize() derives every code from the ORIGINAL values in one pass.
# The previous loop rewrote the column in place while still comparing against
# it, so an original id that equals an already-assigned code (ids below the
# number of categories, e.g. 11) also matched rows re-encoded earlier and
# silently merged two categories. It also scanned the whole column once per
# category.
data['item_category'] = pd.factorize(data['item_category'])[0]
data

Unnamed: 0,user_id,item_id,item_category,time,buy,cart,collect,pv,week
0,98047837,232431562,0,2014-12-06 02:00:00,0,0,0,1,3
1,98047837,232431562,0,2014-12-03 20:00:00,0,0,0,1,3
2,98047837,232431562,0,2014-12-07 23:00:00,0,0,0,1,3
3,98047837,232431562,0,2014-12-03 20:00:00,0,0,0,1,3
4,98047837,232431562,0,2014-12-06 02:00:00,0,0,0,1,3
...,...,...,...,...,...,...,...,...,...
1403717,62126989,106796394,60,2014-12-15 01:00:00,0,0,0,1,5
1403718,62126989,122790346,9,2014-12-12 23:00:00,0,0,0,1,4
1403719,76314785,6627230,403,2014-11-19 00:00:00,0,0,0,1,1
1403720,76314785,322071819,18,2014-12-12 00:00:00,0,0,0,1,4


# 3.特征构造 划分训练集和测试集

### 3.1 封装方法

In [31]:
# Select one week of data and total the behaviour counts per user and category.
def divide_week(data,week):
    """Aggregate one week's behaviour counts per (user_id, item_category).

    Args:
        data: DataFrame with the columns 'user_id', 'item_category', 'buy',
            'cart', 'collect', 'pv' and 'week'.
        week: Week label to select; compared as a string, so 1 and '1' are
            equivalent.

    Returns:
        DataFrame with one row per (user_id, item_category) pair seen in that
        week and the columns 'user_id', 'item_category', 'uc_buy_sum',
        'uc_cart_sum', 'uc_collect_sum', 'uc_pv_sum'.
    """
    week_label = str(week)
    # Exact comparison on the label. The former str.contains() was a substring
    # (regex) match, so week 1 would also have matched a label such as '11',
    # and non-string entries produced a NaN mask that cannot be used to index.
    in_week = data['week'].astype(str) == week_label
    # Total number of each behaviour a user performed on each category.
    counts = data.loc[in_week, ['user_id','item_category','buy','cart','collect','pv']]
    counts = counts.groupby(by=['user_id','item_category'],as_index=False).sum()
    # Rename the aggregated columns.
    counts.columns = ['user_id','item_category','uc_buy_sum','uc_cart_sum','uc_collect_sum','uc_pv_sum']
    return counts

In [32]:
# Feature construction: turn one week's aggregated table into per-user x / y.
def process_data(user_category_feature, num_categories=991):
    """Build per-user feature and label vectors from a weekly aggregate.

    Args:
        user_category_feature: DataFrame with the columns 'user_id',
            'item_category', 'uc_buy_sum', 'uc_cart_sum', 'uc_collect_sum'
            and 'uc_pv_sum' (the output of divide_week). 'item_category' is
            used directly as a position in the vectors, so it must hold
            integer codes below ``num_categories``.
        num_categories: Length of each per-category vector. Defaults to 991,
            the value that used to be hard-coded.

    Returns:
        Tuple ``(x, y)`` of dicts keyed by user id:
        ``x[uid]`` is ``[cart, collect, pv]``, three lists of length
        ``num_categories``; ``y[uid]`` is the buy vector of length
        ``num_categories + 1`` whose last entry is 0 when the user bought
        anything in the week and 1 otherwise.
    """
    # Inputs for the week, keyed by user id.
    x = {}
    # Outputs for the week, keyed by user id.
    y = {}
    for uid, rows in user_category_feature.groupby(by='user_id'):
        # One slot per item category for each behaviour.
        cart_data = np.zeros(num_categories)
        collect_data = np.zeros(num_categories)
        pv_data = np.zeros(num_categories)
        buy_data = np.zeros(num_categories)
        # Scatter this user's counts into the vectors, using item_category as
        # the position.
        for category, cart, collect, pv, buy in zip(
            rows['item_category'],
            rows['uc_cart_sum'],
            rows['uc_collect_sum'],
            rows['uc_pv_sum'],
            rows['uc_buy_sum'],
        ):
            cart_data[category] = cart
            collect_data[category] = collect
            pv_data[category] = pv
            buy_data[category] = buy
        # Extra trailing label: 0 for users who bought something this week,
        # 1 for users who bought nothing.
        bought_anything = bool((rows['uc_buy_sum'] > 0).any())
        buy_data = np.append(buy_data, 0 if bought_anything else 1)
        x[uid] = [cart_data.tolist(), collect_data.tolist(), pv_data.tolist()]
        y[uid] = buy_data.tolist()
    return x,y

In [33]:
# Helpers that pair one week's features with another week's labels.
# Module-level accumulators that the notebook passes into the helpers below.
train_x = []
train_y = []
test_x = []
test_y = []


def _append_shared_users(x, y, out_x, out_y):
    """Append x[uid] / y[uid] for every uid present in both dicts.

    Users are visited in the insertion order of ``x``. Iterating a set
    intersection (as done previously) gives an arbitrary order, which makes
    the order of the saved samples hard to reproduce.
    """
    for uid in x:
        if uid in y:
            out_x.append(x[uid])
            out_y.append(y[uid])
    return out_x, out_y


def train_data(x,y,train_x,train_y):
    """Extend the training lists with the users found in both x and y.

    Args:
        x: dict mapping user id -> feature vectors.
        y: dict mapping user id -> label vector.
        train_x: list that receives x[uid]; mutated in place.
        train_y: list that receives y[uid]; mutated in place.

    Returns:
        The same ``(train_x, train_y)`` list objects that were passed in.
    """
    return _append_shared_users(x, y, train_x, train_y)


def test_data(x,y,test_x,test_y):
    """Extend the test lists with the users found in both x and y.

    Same contract as train_data; kept as a separate name so that call sites
    read naturally.
    """
    return _append_shared_users(x, y, test_x, test_y)


# The name starts with "test_", so pytest would try to collect this helper as
# a test case; opt out explicitly.
test_data.__test__ = False

### 3.2调用方法

In [34]:
# Aggregate each of the five relabelled weeks into its own table.
week_1, week_2, week_3, week_4, week_5 = (
    divide_week(data, week_no) for week_no in (1, 2, 3, 4, 5)
)
# Turn every weekly table into per-user feature (x_n) and label (y_n) dicts.
(x_1, y_1), (x_2, y_2), (x_3, y_3), (x_4, y_4), (x_5, y_5) = (
    process_data(weekly)
    for weekly in (week_1, week_2, week_3, week_4, week_5)
)

In [35]:
taobao_train = {}
taobao_test = {}
# Build the training and test sets by pairing one week's behaviour (x) with
# the following week's purchases (y):
#   training set, three pairs:  x_1 -> y_2
#                               x_2 -> y_3
#                               x_3 -> y_4
#   test set, one pair:         x_4 -> y_5
# Training set. Start from fresh lists so that re-running this cell does not
# append the same samples again to lists left over from an earlier run.
train_x, train_y = [], []
train_x,train_y = train_data(x_1,y_2,train_x,train_y)
train_x,train_y = train_data(x_2,y_3,train_x,train_y)
train_x,train_y = train_data(x_3,y_4,train_x,train_y)
taobao_train['train_x'] = train_x
taobao_train['train_y'] = train_y

# Test set: built with the dedicated test_data helper, again from fresh lists.
test_x, test_y = test_data(x_4, y_5, [], [])
taobao_test['test_x'] = test_x
taobao_test['test_y'] = test_y
# Sample counts, rendered as the cell output.
len(train_x),len(train_y),len(test_x),len(test_y)

(13047, 13047, 4027, 4027)

### 3.3保存数据为json格式

In [36]:
import json
# Write the training set to local disk as JSON.
with open(r'd:\data\taobao_sum\train.json', mode='w') as out_file:
    payload = json.dumps(taobao_train)
    out_file.write(payload)
    print('train.json保存成功')

train.json保存成功


In [37]:
# Write the test set to local disk as JSON.
with open(r'd:\data\taobao_sum\test.json', mode='w') as out_file:
    payload = json.dumps(taobao_test)
    out_file.write(payload)
    print('test.json保存成功')

test.json保存成功


In [19]:
# Reference notes kept as bare string literals (they are evaluated but never
# printed). The first describes the layout of train_x when built from the
# first week only: one entry per user, each holding three rows
# (cart, collect, pv) of 991 per-category values.
# NOTE(review): the note quotes 5392 samples in the shape but 3971 users in
# its closing sentence - confirm which figure is current.
'''
仅仅第一周
train_x数据格式：(5392, 3, 991)
[[[0., 0., 0., ..., 0., 0., 0.],   # cart
  [0., 0., 0., ..., 0., 0., 0.],   # collect
  [0., 0., 0., ..., 0., 0., 0.]],  # pv

 [[0., 0., 0., ..., 0., 0., 0.],
  [0., 0., 0., ..., 0., 0., 0.],
  [0., 0., 0., ..., 0., 0., 0.]],

  [[0., 0., 0., ..., 0., 0., 0.],
  [0., 0., 0., ..., 0., 0., 0.],
  [0., 0., 0., ..., 0., 0., 0.]],

  ...,

 [[0., 0., 0., ..., 0., 0., 0.],
  [0., 0., 0., ..., 0., 0., 0.],
  [0., 0., 0., ..., 0., 0., 0.]],

 [[0., 0., 0., ..., 0., 0., 0.],
  [0., 0., 0., ..., 0., 0., 0.],
  [0., 0., 0., ..., 0., 0., 0.]],

 [[0., 0., 0., ..., 0., 0., 0.],
  [0., 0., 0., ..., 0., 0., 0.],
  [0., 0., 0., ..., 0., 0., 0.]]]
每三行表示一个用户,总共3971用户，3种行为，关于991商品种类 
'''
# Second note: layout of train_y, one row of per-category purchase counts per
# user.
# NOTE(review): process_data appends one extra trailing "no purchase" flag to
# every y vector (np.insert at position 991), so each row has 992 entries; the
# trailing dimension of 991 quoted below looks stale - confirm.
'''
train_y数据格式： (5392, 991)
array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])
每一行表示一个用户的购买,总共3971用户，关于991商品种类
'''
# Prints an empty line; as the last statement it also keeps the notebook from
# echoing the string literal above as the cell output.
print( )


