In [82]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler

In [83]:
# 读取数据库
df = pd.read_csv('./data/tianchi_ali/tianchi_mobile_recommend_train_user.csv', usecols=[0, 2, 4, 5])
df.columns = ['userId', 'behav', 'itemCat', 'timestp']
df['userId'] = df['userId'].astype('uint32')
df['behav'] = df['behav'].astype('uint8')
df['itemCat'] = df['itemCat'].astype('uint32')
df['timestp'] = df['timestp'].str.replace('-','').str.replace(' ', '').astype(int).astype('uint32') // 100

clone = df.copy()

clone.dtypes

userId     uint32
behav       uint8
itemCat    uint32
timestp    uint32
dtype: object

In [84]:
# 计算内存占用，单位MB
df = clone.copy()

df.memory_usage() // 1024 // 1024

Index       0
userId     46
behav      11
itemCat    46
timestp    46
dtype: int64

In [85]:
# 数据分片，此处暂未分片
df

Unnamed: 0,userId,behav,itemCat,timestp
0,98047837,1,4245,20141206
1,97726136,1,5894,20141209
2,98607707,1,2883,20141218
3,98662432,1,6562,20141206
4,98145908,1,13926,20141216
...,...,...,...,...
12256901,93812622,1,11,20141213
12256902,93812622,1,12311,20141214
12256903,93812622,1,8765,20141211
12256904,93812622,1,7951,20141208


In [86]:
# 提取数据各类别全集，同时也作为后续数据的索引列表
userId = list(df['userId'].unique())
itemCat = list(df['itemCat'].unique())
timestp = sorted(list(df['timestp'].unique()))
print(f'userId:  [{len(userId)}]\nitemCat: [{len(itemCat)}]\ntimestp: [{len(timestp)}]')

userId:  [10000]
itemCat: [8916]
timestp: [31]


In [87]:
# 构建三维数组
shape = (len(userId), len(itemCat), len(timestp))
input = np.zeros(shape, dtype=np.float32)
size = input.nbytes
print(f'input array storage: {size} B = {size // 1024} KB = {size // 1024 // 1024} MB = {size // 1024 // 1024 // 1024} GB')

input array storage: 11055840000 B = 10796718 KB = 10543 MB = 10 GB


In [88]:
# 定义时间衰减函数
alpha = 0.23 # 超参数，衰减因子，控制每一天的衰减程度，0.23大约是3日便会衰减一半
def decay_func(x):
    return np.exp(-x * alpha)

In [89]:
# 提取最后一天的交互
last_day = timestp[-1]

interact = df[df['timestp'] == last_day] # 提取df中最后一天的所有交互
interact = interact.drop(columns=['behav', 'timestp']).drop_duplicates() # 删除behav，timestp列

# 在df中剔除最后一天的交互
df = df[df['timestp'] != last_day]

df

Unnamed: 0,userId,behav,itemCat,timestp
0,98047837,1,4245,20141206
1,97726136,1,5894,20141209
3,98662432,1,6562,20141206
4,98145908,1,13926,20141216
5,93784494,1,3979,20141203
...,...,...,...,...
12256901,93812622,1,11,20141213
12256902,93812622,1,12311,20141214
12256903,93812622,1,8765,20141211
12256904,93812622,1,7951,20141208


In [90]:
import time

# 记录开始前的时间
last_time = int(time.time())
# 填充input数组
for index, row in df.iterrows():
    # 进度输出
    if index % 100000 == 0:
        this_time = int(time.time())
        print(f'loop {index}/{len(df)} || time_spent: [{this_time-last_time}s]')
        last_time = this_time
    
    # input[u][c][t]
    u = userId.index(row.userId)
    c = itemCat.index(row.itemCat)
    t = timestp.index(row.timestp)
    behav = row.behav
    if behav == 1:
        score = 10
    elif behav == 2 or behav == 3:
        score = 20
    elif behav == 4:
        score = 40
    # print(f'user: {u}, cata: {c}, time: {t}, score: {score}')

    # 将数据填入数组
    input[u, c, t] = input[u, c, t] + score

    # 计算时间衰减
    for day in range(t+1, len(timestp)):
        input[u, c, day] = input[u, c, day] + score * decay_func(day-t)

loop 0/11881309 || time_spent: [0s]
loop 100000/11881309 || time_spent: [10s]
loop 200000/11881309 || time_spent: [11s]
loop 300000/11881309 || time_spent: [12s]
loop 400000/11881309 || time_spent: [13s]
loop 500000/11881309 || time_spent: [15s]
loop 600000/11881309 || time_spent: [15s]
loop 800000/11881309 || time_spent: [34s]
loop 900000/11881309 || time_spent: [18s]
loop 1000000/11881309 || time_spent: [20s]
loop 1100000/11881309 || time_spent: [23s]
loop 1200000/11881309 || time_spent: [25s]
loop 1300000/11881309 || time_spent: [19s]
loop 1400000/11881309 || time_spent: [17s]
loop 1500000/11881309 || time_spent: [17s]
loop 1600000/11881309 || time_spent: [16s]
loop 1700000/11881309 || time_spent: [15s]
loop 1800000/11881309 || time_spent: [17s]
loop 1900000/11881309 || time_spent: [15s]
loop 2000000/11881309 || time_spent: [16s]
loop 2100000/11881309 || time_spent: [16s]
loop 2200000/11881309 || time_spent: [17s]
loop 2300000/11881309 || time_spent: [15s]
loop 2400000/11881309 || t

In [123]:
# 测试
slice = input[0, :, :]
slice = pd.DataFrame(slice)
slice = slice.loc[~(slice == 0).all(axis=1)]

slice

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,21,22,23,24,25,26,27,28,29,30
0,0.0,0.000000,0.000000,0.000000,20.000000,25.890671,20.571009,16.344358,12.986142,10.317925,...,194.945236,154.890472,123.065651,107.779793,85.634727,68.039642,54.059795,42.952312,34.127075,27.115105
1,20.0,15.890672,12.625673,10.031522,7.970381,6.332736,45.031570,55.779095,44.318367,95.212433,...,107.678963,85.554558,67.975967,54.009186,122.912094,117.657799,93.483070,74.275452,59.014351,46.888874
6,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,20.000000,15.890672,12.625673,10.031522,7.970381,6.332736,5.031571
12,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,3.176348,2.523716,2.005177,1.593180,1.265835,1.005749,0.799101,0.634913,0.504460,0.400810
14,20.0,15.890672,12.625673,10.031522,7.970381,6.332736,5.031571,43.997753,34.957691,27.775061,...,23.981340,19.053982,15.139028,12.028467,29.557024,33.484043,26.604202,21.137930,16.794796,13.344030
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3305,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,20.000000,15.890672,12.625673,10.031522,7.970381,6.332736
3398,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,12.625673,10.031522,7.970381,6.332736,5.031571,3.997752,3.176348,2.523716,2.005177,1.593180
4327,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,6.312837,5.015761,3.985190,3.166368,2.515785,1.998876,1.588174,1.261858,1.002588,0.796590
5084,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,12.625673,10.031522,7.970381,6.332736,5.031571,3.997752,3.176348,2.523716,2.005177,1.593180


In [114]:
input.shape

(10000, 8916, 31)

In [133]:
# 构建dataframe对象
four_tuples = pd.DataFrame(columns=["Dim1", "Dim2", "Dim3", "Value"])

four_tuples.shape

(0, 4)

In [141]:
# 将数据输出为文档
# 构建dataframe对象
four_tuples = pd.DataFrame(columns=["Dim1", "Dim2", "Dim3", "Value"])

# 遍历每个元素并添加到 DataFrame
for i in range(input.shape[0]):
    print(f'processing [{i+1}/{input.shape[0]}]')
    # 将第一维度i对应的二维数组进行切片，并清除空行，转换为dataframe
    slice = pd.DataFrame(input[i, :, :])
    slice = slice.loc[slice[30] != 0]
    # 遍历slice中的每一行
    for index, row in slice.iterrows():
        j = index
        for k in range(31):
            value = row[k]
            four_tuples.loc[len(four_tuples)] = [i, j, k, value]

# 清除value为0的数据
four_tuples = four_tuples[four_tuples['Value'] != 0]

# 将 DataFrame 保存为 CSV 文件
df.to_csv("./data/tianchi_ali/preprocessed/user_interest_data.csv", index=False)

four_tuples

KeyboardInterrupt: 

In [142]:
# 将数据输出为文档
# 将三维数组转换为二维数组
shape = input.shape
my_2d_array = input.reshape(-1, shape[-1])

# 创建多级索引
index_tuples = [(i, j, k) for i in range(shape[0]) for j in range(shape[1]) for k in range(shape[2])]
multi_index = pd.MultiIndex.from_tuples(index_tuples, names=['Dim1', 'Dim2', 'Dim3'])

# 创建 DataFrame
four_tuples_df = pd.DataFrame(my_2d_array, columns=['Value'], index=multi_index)

# 保存为 CSV 文件
output_path = './data/tianchi_ali/preprocessed/user_interest_data.csv'
four_tuples_df.to_csv(output_path)

four_tuples_df

KeyboardInterrupt: 