In [1]:
import cudf
import pandas as pd
from tqdm import tqdm

In [7]:
# Retained from an earlier chunked-reader experiment; the pandas loads below
# read each file in one piece and do not use it.
chunksize = 100000000

# Column schema shared by both headerless, tab-separated dump files.
BEHAVIOR_COLUMNS = ['user_id', 'item_id', 'behavior_type', 'user_geohash', 'item_category', 'time']
BEHAVIOR_FILES = [
    'dataset/tianchi_fresh_comp_train_user_online_partA.txt',
    'dataset/tianchi_fresh_comp_train_user_online_partB.txt',
]

# Load part A and part B and stack them into one frame.
# ignore_index=True gives the result a single 0..n-1 row index; without it the
# row labels of part B would repeat those of part A.
behavior_data = pd.concat(
    [
        pd.read_csv(path, sep='\t', header=None, names=BEHAVIOR_COLUMNS)
        for path in BEHAVIOR_FILES
    ],
    ignore_index=True,
)

In [8]:
# For every (user_id, item_id) pair, count how often each behavior_type code
# occurs, pivot the codes into columns (0 where a code never occurred for the
# pair), and give the code columns 1-4 readable names.
user_behavior = (
    behavior_data
    .groupby(['user_id', 'item_id'])['behavior_type']
    .value_counts()
    .unstack(fill_value=0)
    .rename(columns={
        1: 'view_count',
        2: 'collect_count',
        3: 'cart_count',
        4: 'buy_count',
    })
)

In [3]:
# Number of rows (one per user/item pair) in the pivoted table.
print(user_behavior.shape[0])

444526713


In [12]:
# NOTE(review): `feather` is not referenced by any cell visible here; the write
# below goes through DataFrame.to_feather. Confirm before removing the import.
import feather

# Turn the index into ordinary columns before writing, then persist the table
# so later cells can reload it instead of recomputing the pivot.
(
    user_behavior
    .reset_index()
    .to_feather('feature/user_behavior.feather')
)

In [4]:
# Per-user and per-item totals of views, collects, cart-adds and purchases.
#
# user_behavior is walked in row slices; each slice is converted to a cudf
# DataFrame, grouped there, and its partial sums are folded into running totals.
# NOTE(review): the groupbys address 'user_id' / 'item_id' by label, so
# user_behavior must expose both names here -- confirm which form of the table
# (pivoted in memory vs. reloaded from feather) this cell is meant to receive.
N_STAT_CHUNKS = 10

if len(user_behavior) == 0:
    raise ValueError('user_behavior is empty; nothing to aggregate')

# Rows per slice. Floor division is 0 for fewer than N_STAT_CHUNKS rows and
# range() rejects a zero step, so clamp to at least one row.
chunk_size = max(1, len(user_behavior) // N_STAT_CHUNKS)

# Running totals; set from the first slice, then accumulated.
user_stats = None
item_stats = None

counts = ['view_count','collect_count','cart_count','buy_count']

for i in range(0, len(user_behavior), chunk_size):
    # Take one slice of rows and convert it from pandas to cudf.
    chunk = user_behavior.iloc[i:i + chunk_size]
    cudf_chunk = cudf.from_pandas(chunk)
    # Partial sums of this slice, grouped by user and by item.
    user_group = cudf_chunk.groupby('user_id')[counts].sum()
    item_group = cudf_chunk.groupby('item_id')[counts].sum()
    if user_stats is None:
        user_stats = user_group
        item_stats = item_group
    else:
        # The same key can show up in more than one slice; fill_value=0 makes
        # a key that is missing on one side contribute zero instead of null.
        user_stats = user_stats.add(user_group, fill_value=0)
        item_stats = item_stats.add(item_group, fill_value=0)

# Turn the group key back into a column and prefix the count columns so the
# user-level and item-level totals stay distinguishable after merging.
user_stats = user_stats.reset_index().rename(columns={elem: f'user_{elem}' for elem in counts})
item_stats = item_stats.reset_index().rename(columns={elem: f'item_{elem}' for elem in counts})


In [7]:
# Persist the per-user and per-item totals, one feather file each.
for _stats_frame, _stats_path in (
    (user_stats, 'feature/user_feature.feather'),
    (item_stats, 'feature/item_feature.feather'),
):
    _stats_frame.to_feather(_stats_path)



In [2]:
# Reload the per-(user, item) behavior counts written earlier as a pandas frame.
user_behavior = pd.read_feather(path='feature/user_behavior.feather')

In [None]:
# NOTE(review): this cell carries no execution count (In [None]), i.e. it was
# not run in the saved session. It left-joins the stats onto the full
# user_behavior frame in pandas and rebinds user_behavior to the result; the
# chunked cudf cell further down performs the same two left joins into
# user_item_feature. If this cell is executed first, user_behavior would
# already contain the user_*/item_* columns when that later cell joins them
# again -- confirm whether this cell should be kept at all.
# NOTE(review): pd.merge needs pandas inputs; when user_stats / item_stats come
# straight from the aggregation cell they look like cudf frames -- verify.
# Merge the per-user statistics into the original table
user_behavior = pd.merge(user_behavior, user_stats, on='user_id', how='left')
# Merge the per-item statistics into the original table
user_behavior = pd.merge(user_behavior, item_stats, on='item_id', how='left')

In [6]:
# Make sure both stats tables are cudf DataFrames before the GPU merge.
# cudf.from_pandas only accepts pandas objects, so convert only when a table is
# still a pandas DataFrame. This keeps the cell safe to re-run and safe to run
# right after the aggregation cell, which already leaves cudf frames behind.
if isinstance(user_stats, pd.DataFrame):
    user_stats = cudf.from_pandas(user_stats)
if isinstance(item_stats, pd.DataFrame):
    item_stats = cudf.from_pandas(item_stats)

In [10]:
# Attach the per-user and per-item totals to every (user, item) row.
#
# user_behavior is processed in row slices: each slice is moved to cudf,
# left-joined with user_stats on 'user_id' and with item_stats on 'item_id',
# and brought back to pandas.
N_MERGE_CHUNKS = 10

# Rows per slice; clamped to >= 1 because range() rejects a zero step when
# there are fewer rows than slices.
chunk_size = max(1, len(user_behavior) // N_MERGE_CHUNKS)

# Collect the merged slices and concatenate once at the end; concatenating
# inside the loop would recopy the accumulated frame on every iteration.
merged_chunks = []
for i in range(0, len(user_behavior), chunk_size):
    chunk = user_behavior.iloc[i:i + chunk_size]
    cudf_chunk = cudf.from_pandas(chunk)
    merged = cudf_chunk.merge(user_stats, on='user_id', how='left')
    merged = merged.merge(item_stats, on='item_id', how='left')
    merged_chunks.append(merged.to_pandas())

# ignore_index=True gives the combined frame one continuous 0..n-1 index
# instead of repeating each slice's own row labels. With no rows at all the
# result stays None, as before the loop.
user_item_feature = pd.concat(merged_chunks, ignore_index=True) if merged_chunks else None

# Drop the list so the per-slice frames do not stay alive next to the result.
del merged_chunks

In [12]:
# Persist the merged feature table.
# The frame is assembled by concatenating per-slice results, so its row labels
# may repeat; feather output expects a default 0..n-1 index (older pandas
# releases raise on anything else), hence the reset before writing.
user_item_feature.reset_index(drop=True).to_feather('feature/user_item_feature.feather')

In [14]:
# Reload the merged (user, item) feature table from disk.
user_item_feature = pd.read_feather(path='feature/user_item_feature.feather')

In [16]:
# Buy-to-view ratios at user level and at item level.
# A zero view count would make the plain division yield +inf (or NaN for 0/0).
# where() masks zero denominators to NaN first, so the rate is uniformly NaN
# ("undefined") whenever there are no views to divide by.
user_item_feature['user_buy_view_rate'] = (
    user_item_feature['user_buy_count']
    / user_item_feature['user_view_count'].where(user_item_feature['user_view_count'] != 0)
)
user_item_feature['item_buy_view_rate'] = (
    user_item_feature['item_buy_count']
    / user_item_feature['item_view_count'].where(user_item_feature['item_view_count'] != 0)
)

In [18]:
# Write the feature table, now including the two buy/view rate columns.
user_item_feature.to_feather(path='feature/user_item_feature_with_rate.feather')