In [1]:
import pandas as pd
from tqdm import tqdm
import feather

In [None]:
# Load the item table: tab-separated, no header row, three named columns.
item_data = pd.read_csv(
    'dataset\\tianchi_fresh_comp_train_item_online.txt',
    sep='\t',
    header=None,
    names=['item_id', 'item_geohash', 'item_category'],
)
# Preview the first rows.
print(item_data.head())
# Column names, non-null counts and dtypes (info() prints itself and returns None).
print(item_data.info())

In [None]:
# Count the rows whose item_geohash is missing and report their share of all rows.
item_geohash_null = item_data['item_geohash'].isna().sum()
print(
    "item_geohash缺失：{:} {:.1f}%".format(
        item_geohash_null,
        item_geohash_null / len(item_data) * 100,
    )
)

In [3]:
chunksize = 20000000

# Column names shared by both behaviour log files (the files have no header row).
BEHAVIOR_COLUMNS = ['user_id', 'item_id', 'behavior_type', 'user_geohash', 'item_category', 'time']


class ReiterableChunkReader:
    """Chunked reader for one tab-separated behaviour log that can be looped over repeatedly.

    ``pd.read_csv(..., chunksize=...)`` returns a one-shot iterator: after the
    first complete pass it is exhausted and every later ``for chunk in reader``
    silently yields nothing. This wrapper opens a fresh reader each time it is
    iterated, so several cells can each make a full pass over the same file.

    The file is opened lazily, on the first iteration, not at construction.

    Parameters
    ----------
    path : str
        Location of the tab-separated log file.
    chunksize : int
        Number of rows per yielded DataFrame.
    names : list of str, optional
        Column names; defaults to ``BEHAVIOR_COLUMNS``.
    """

    def __init__(self, path, chunksize, names=None):
        self.path = path
        self.chunksize = chunksize
        self.names = list(names) if names is not None else list(BEHAVIOR_COLUMNS)

    def __iter__(self):
        # The context manager closes the file even when the consumer stops early.
        with pd.read_csv(
            self.path,
            sep='\t',
            header=None,
            names=self.names,
            chunksize=self.chunksize,
        ) as reader:
            yield from reader


# One re-iterable source per part of the behaviour log.
readers = [
    ReiterableChunkReader('dataset\\tianchi_fresh_comp_train_user_online_partA.txt', chunksize),
    ReiterableChunkReader('dataset\\tianchi_fresh_comp_train_user_online_partB.txt', chunksize),
]


以下内容用以前期数据统计

In [None]:
# Running totals over the whole behaviour log.
behavior_num = 0
behavior_geohash_null = 0
# Number of purchase events, i.e. rows with behavior_type == 4.
behavior_buy_num = 0

for reader in readers:
    for chunk in tqdm(reader):
        behavior_num += len(chunk)
        behavior_geohash_null += chunk['user_geohash'].isnull().sum()
        # Summing the boolean mask avoids materialising the filtered frame.
        behavior_buy_num += (chunk['behavior_type'] == 4).sum()

# Guard the percentages so an empty pass (no rows read) does not divide by zero.
geohash_null_pct = behavior_geohash_null / behavior_num * 100 if behavior_num else 0.0
buy_pct = behavior_buy_num / behavior_num * 100 if behavior_num else 0.0

print("用户行为总数：", behavior_num)
print("user_geohash缺失：{:} {:.1f}%".format(behavior_geohash_null, geohash_null_pct))
# The purchase count was accumulated above; report it alongside the other totals.
print("购买行为总数：{:} {:.1f}%".format(behavior_buy_num, buy_pct))


以下内容用以从行为数据中提取特征

In [None]:
import pandas as pd

# Toy example: stacking two frames and summing per index label merges the
# overlapping keys ('b', 'c') while keeping the keys unique to one side ('a', 'd').

# Stand-in for previously accumulated data.
old_data = pd.DataFrame(
    {'value_col1': [1, 2, 3], 'value_col2': [4, 5, 6]},
    index=pd.Index(['a', 'b', 'c'], name='index_col'),
)

# Stand-in for a newly arrived batch.
new_data = pd.DataFrame(
    {'value_col1': [7, 8, 9], 'value_col2': [10, 11, 12]},
    index=pd.Index(['b', 'c', 'd'], name='index_col'),
)

# Stack row-wise, then collapse duplicate index labels by summation.
stacked = pd.concat([old_data, new_data], axis=0)
result_df = stacked.groupby(level='index_col').sum()

print(result_df)

In [None]:
# Per (user_id, item_id) pair: number of views, favourites, cart additions and purchases.
# Mapping from behavior_type code to the output column name.
behavior_columns = {1: 'view_count', 2: 'collect_count', 3: 'cart_count', 4: 'buy_count'}
pair_keys = ['user_id', 'item_id']

# Start from None rather than an empty frame: an empty seed frame that carries
# user_id/item_id as ordinary (all-NaN) columns makes the later groupby drop every row.
user_behavior = None

for reader in readers:
    for chunk in tqdm(reader):
        # Count each behaviour type per pair; one column per behaviour type.
        behavior_count = (
            chunk.groupby(pair_keys)['behavior_type']
            .value_counts()
            .unstack(fill_value=0)
            .rename(columns=behavior_columns)
            # A chunk may lack some behaviour types; keep the column set stable.
            .reindex(columns=list(behavior_columns.values()), fill_value=0)
        )
        if user_behavior is None:
            user_behavior = behavior_count
        else:
            # Stack the running totals with the new chunk and re-sum per pair
            # (pairs live in the index, so group on the index levels).
            user_behavior = (
                pd.concat([user_behavior, behavior_count], axis=0)
                .groupby(level=pair_keys)
                .sum()
            )

if user_behavior is None:
    # No chunk was read: fall back to an empty table with the expected layout.
    user_behavior = pd.DataFrame(
        columns=list(behavior_columns.values()),
        index=pd.MultiIndex.from_arrays([[], []], names=pair_keys),
    )


In [None]:
# Drop the reference to the last per-chunk table, then persist the aggregated
# counts; reset_index() moves the index into ordinary columns before writing.
behavior_count = None
user_behavior.reset_index().to_feather(path='feature/user_behavior_halfdata.feather')

In [None]:
# Build per-user and per-item behaviour counts in a single pass over the log.
# (Collecting the user ids in one loop and counting in a second loop over the
# same `readers` leaves the second loop with nothing to read when the readers
# are one-shot iterators.)

# behavior_type code -> column name, for users and for items.
user_columns = {1: 'user_view_count', 2: 'user_collect_count', 3: 'user_cart_count', 4: 'user_buy_count'}
item_columns = {1: 'item_view_count', 2: 'item_collect_count', 3: 'item_cart_count', 4: 'item_buy_count'}

# User features (views, favourites, cart additions, purchases per user):
# filled from the first chunk onwards, so every user seen in the log gets a row.
user_feature = None
# Item features (same four counts per item): start at zero for every item in the
# item table; unique() keeps one row per item_id even if the table repeats an id.
item_feature = pd.DataFrame(
    0,
    index=pd.Index(item_data['item_id'].unique(), name='item_id'),
    columns=list(item_columns.values()),
)

for reader in readers:
    for chunk in tqdm(reader):
        # Per-user counts for this chunk, with a stable column set.
        user_count = (
            chunk.groupby('user_id')['behavior_type']
            .value_counts()
            .unstack(fill_value=0)
            .rename(columns=user_columns)
            .reindex(columns=list(user_columns.values()), fill_value=0)
        )
        if user_feature is None:
            user_feature = user_count
        else:
            # fill_value=0 treats ids present on only one side as zero instead of NaN.
            user_feature = user_feature.add(user_count, fill_value=0)

        # Per-item counts for this chunk, with a stable column set.
        item_count = (
            chunk.groupby('item_id')['behavior_type']
            .value_counts()
            .unstack(fill_value=0)
            .rename(columns=item_columns)
            .reindex(columns=list(item_columns.values()), fill_value=0)
        )
        item_feature = item_feature.add(item_count, fill_value=0)

if user_feature is None:
    # No chunk was read: keep an empty table with the expected layout.
    user_feature = pd.DataFrame(
        columns=list(user_columns.values()),
        index=pd.Index([], name='user_id'),
    )

# All distinct user ids seen in the log, and how many there are.
user_ids = set(user_feature.index)
print(len(user_ids))

# Both tables keep their id as the index; reset_index() only builds the frame
# shown as this cell's output and does not modify item_feature.
item_feature.reset_index()
        

In [None]:
import feather
# Reload the user and item feature tables from their feather files.
user_feature = pd.read_feather(path='user_feature.feather')
item_feature = pd.read_feather(path='item_feature.feather')

In [8]:
# Cast every column of both feature tables to integer.
user_feature, item_feature = (
    frame.astype(int) for frame in (user_feature, item_feature)
)

In [None]:
# Five users with the most purchases, then five items with the most purchases.
top_users = user_feature.sort_values(by='user_buy_count', ascending=False).head()
print(top_users)
top_items = item_feature.sort_values(by='item_buy_count', ascending=False).head()
print(top_items)

In [None]:
# Drop users and items that were never viewed. .copy() makes each result an
# independent frame, so the column assignments below do not write into a slice
# of the previous frame (which triggers SettingWithCopyWarning).
user_feature = user_feature[user_feature['user_view_count'] > 0].copy()
item_feature = item_feature[item_feature['item_view_count'] > 0].copy()
# Report how many rows remain after filtering.
print("用户数：", len(user_feature))
print("商品数：", len(item_feature))

# Purchases per view for each user (view counts are > 0 after the filter above).
user_feature['user_buy_view_rate'] = user_feature['user_buy_count'] / user_feature['user_view_count']
# Purchases per view for each item.
item_feature['item_buy_view_rate'] = item_feature['item_buy_count'] / item_feature['item_view_count']

In [None]:
# Five users with the highest purchase-per-view rate.
top_rate_users = user_feature.sort_values(by='user_buy_view_rate', ascending=False).head()
print(top_rate_users)
# Five items with the highest purchase-per-view rate.
top_rate_items = item_feature.sort_values(by='item_buy_view_rate', ascending=False).head()
print(top_rate_items)

In [12]:
import feather
# Persist both feature tables; reset_index() turns the index into a column first.
user_feature.reset_index().to_feather(path='user_feature.feather')
item_feature.reset_index().to_feather(path='item_feature.feather')

In [None]:
# Users with more than 10 purchases.
user_buy_count_10 = len(user_feature.loc[user_feature['user_buy_count'] > 10])
print("购买数大于10的用户数：", user_buy_count_10)
# Users with more than 100 purchases.
user_buy_count_100 = len(user_feature.loc[user_feature['user_buy_count'] > 100])
print("购买数大于100的用户数：", user_buy_count_100)
# Items purchased more than 10 times.
item_buy_count_10 = len(item_feature.loc[item_feature['item_buy_count'] > 10])
print("购买数大于10的商品数：", item_buy_count_10)
# Items purchased more than 100 times.
item_buy_count_100 = len(item_feature.loc[item_feature['item_buy_count'] > 100])
print("购买数大于100的商品数：", item_buy_count_100)

In [None]:
# Keep users with more than 100 purchases.
user_feature = user_feature.loc[user_feature['user_buy_count'] > 100]
# Keep items purchased more than 200 times.
item_feature = item_feature.loc[item_feature['item_buy_count'] > 200]
# Pair every remaining user with every remaining item (Cartesian product).
cross_feature = pd.merge(user_feature, item_feature, how='cross')

print(len(cross_feature))

In [None]:
# Number of users above 100 purchases and of items above 300 purchases.
print(len(user_feature.loc[user_feature['user_buy_count'] > 100]))
print(len(item_feature.loc[item_feature['item_buy_count'] > 300]))

In [5]:
# One row per prediction day, 2014-12-17 through 2014-12-18.
date_range = pd.date_range('2014-12-17', '2014-12-18', freq='D')
pred_date_df = pd.DataFrame({'pred_date': date_range})
# Repeat every user-item pair once per prediction day.
cross_feature = pd.merge(cross_feature, pred_date_df, how='cross')
# Labels start at zero.
cross_feature['label'] = 0

In [None]:
# Move the key columns (user_id, item_id, pred_date) to the front and keep the
# remaining columns in their existing order.
leading_columns = ['user_id', 'item_id', 'pred_date']
trailing_columns = [col for col in cross_feature.columns if col not in leading_columns]
cross_feature = cross_feature[leading_columns + trailing_columns]

print(cross_feature.head())

In [None]:
# Remove the 'index_x'/'index_y' columns left over from the cross merge when
# both inputs carried an 'index' column. errors='ignore' keeps the cell usable
# when those columns are absent (e.g. the inputs had no 'index' column, or the
# cell is run a second time).
cross_feature = cross_feature.drop(columns=['index_x', 'index_y'], errors='ignore')
print(cross_feature.head())

In [None]:
# Release the in-memory table before loading the saved copy from disk.
cross_feature = None
# pd.read_feather is the reader used for the other feather files in this notebook.
cross_feature = pd.read_feather('cross_feature.feather')
print(cross_feature.head())

In [23]:
# Persist the user-item-date cross table.
cross_feature.to_feather(path='cross_feature.feather')

In [None]:
# Key columns only: user_id, item_id and pred_date of every cross row.
key_columns = ['user_id', 'item_id', 'pred_date']
base_feature = cross_feature[key_columns]
# Accumulator for the windowed counts, indexed by the three keys.
res_feature = base_feature.copy().set_index(key_columns)
print(res_feature)

In [5]:
# Look-back windows: (length in days, column-name prefix).
time_ranges = [
    (1, 'one_day'),
    (3, 'three_day'),
    (7, 'seven_day'),
    (15, 'fifteen_day'),
    (30, 'thirty_day')
]

# Prediction days, 2014-12-17 through 2014-12-18.
date_range = pd.date_range('2014-12-17', '2014-12-18', freq='D')

# Initialise one zero column per (window, behaviour) in res_feature. The column
# names do not depend on the prediction day, so no loop over date_range is needed.
for days, prefix in time_ranges:
    for behavior_name in ['view', 'collect', 'cart', 'buy']:
        res_feature[f'{prefix}_{behavior_name}'] = 0
res_feature['label'] = 0

In [None]:
# Accumulate, per (user_id, item_id, pred_date), the number of views,
# favourites, cart additions and purchases inside each look-back window, plus
# a label telling whether the pair was purchased on the prediction day itself.
pair_keys = ['user_id', 'item_id']
row_keys = ['user_id', 'item_id', 'pred_date']

for reader in readers:
    for chunk in reader:
        # Parse the timestamps once per chunk rather than once per window.
        chunk['time'] = pd.to_datetime(chunk['time'])
        purchases = chunk[chunk['behavior_type'] == 4]
        for day in tqdm(date_range):
            # Rows of the cross table whose prediction day is `day`.
            result_df = base_feature[base_feature['pred_date'] == day]
            is_before_day = chunk['time'] < day
            elapsed_days = (day - chunk['time']).dt.days
            for days, prefix in time_ranges:
                # NOTE(review): .dt.days floors to whole days, so `<= days` also admits
                # events slightly more than `days` days old. Left unchanged so these
                # features match the prediction-time features - confirm the intent.
                time_range_df = chunk[is_before_day & (elapsed_days <= days)]
                for behavior, behavior_name in zip([1, 2, 3, 4], ['view', 'collect', 'cart', 'buy']):
                    behavior_count = (
                        time_range_df[time_range_df['behavior_type'] == behavior]
                        .groupby(pair_keys)
                        .size()
                        .reset_index(name=f'{prefix}_{behavior_name}')
                    )
                    result_df = result_df.merge(behavior_count, on=pair_keys, how='left')
            # Purchases made on the prediction day itself become the label.
            # normalize() truncates to midnight so the comparison is day-to-day, and
            # the merge aligns on (user_id, item_id) instead of on row position.
            bought_on_day = (
                purchases[purchases['time'].dt.normalize() == day]
                .groupby(pair_keys)
                .size()
                .reset_index(name='label')
            )
            result_df = result_df.merge(bought_on_day, on=pair_keys, how='left')
            result_df = result_df.fillna(0).set_index(row_keys)
            # Add inside the day loop so every prediction day is accumulated,
            # not just the last one.
            res_feature = res_feature.add(result_df, fill_value=0)

# Collapse purchase counts into a 0/1 label.
res_feature['label'] = (res_feature['label'] > 0).astype(int)
# Drop any 'label' column already on cross_feature so the merge yields a single
# 'label' column instead of 'label_x'/'label_y'.
final_feature = cross_feature.drop(columns=['label'], errors='ignore').merge(
    res_feature, on=row_keys, how='left'
)
    

In [10]:
# Attach the windowed counts and label to the cross table. Any 'label' column
# already on cross_feature is dropped first; otherwise the merge would produce
# 'label_x'/'label_y' and no plain 'label' column.
final_feature = cross_feature.drop(columns=['label'], errors='ignore').merge(
    res_feature, on=['user_id', 'item_id', 'pred_date'], how='left'
)

In [14]:
# Cast every windowed count column to integer. The column names do not depend
# on the prediction day, so one conversion suffices (no loop over date_range).
window_columns = [
    f'{prefix}_{behavior_name}'
    for days, prefix in time_ranges
    for behavior_name in ['view', 'collect', 'cart', 'buy']
]
final_feature = final_feature.astype({column: int for column in window_columns})

In [16]:
# Persist the assembled training feature table.
final_feature.to_feather(path='final_feature.feather')

In [None]:
# Ratio of positive to negative samples, printed as 1:N.
label_num = final_feature['label'].value_counts()
# .get() tolerates a class that does not occur at all.
positive_num = label_num.get(1, 0)
negative_num = label_num.get(0, 0)
if positive_num:
    print("正负样本比例：1:{:.0f}".format(negative_num / positive_num))
else:
    # Without positive samples the ratio is undefined; report the counts instead.
    print("正负样本比例：无正样本（负样本数 {:}）".format(negative_num))

以下用于统计预测数据特征，用于结果输出

In [4]:
# Reload the saved user and item feature tables.
user_feature = pd.read_feather('user_feature.feather')
item_feature = pd.read_feather('item_feature.feather')
# Keep users with more than 100 purchases.
user_feature = user_feature[user_feature['user_buy_count'] > 100]
# Keep items purchased more than 100 times.
item_feature = item_feature[item_feature['item_buy_count'] > 100]
# Pair every remaining user with every remaining item (Cartesian product).
cross_feature = user_feature.merge(item_feature, how='cross')
# Remove the 'index_x'/'index_y' columns if the saved tables carried an 'index'
# column; errors='ignore' avoids a KeyError when they did not.
cross_feature = cross_feature.drop(columns=['index_x', 'index_y'], errors='ignore')

In [None]:
# Load the saved cross table (pd.read_feather is the reader used for the other
# feather files in this notebook).
cross_feature = pd.read_feather('cross_feature.feather')

# Keep one row per user-item pair by selecting a single pred_date, then drop the
# date column. Dropping on the chained result (not in place on a filtered slice)
# avoids SettingWithCopyWarning.
cross_feature = cross_feature[cross_feature['pred_date'] == '2014-12-18'].drop(columns='pred_date')


In [None]:
# Key columns only: user_id and item_id of every cross row.
pair_columns = ['user_id', 'item_id']
base_feature = cross_feature[pair_columns]
# Accumulator for the windowed counts, indexed by the pair.
res_feature = base_feature.copy().set_index(pair_columns)
print(res_feature)

In [6]:
# Look-back windows: (length in days, column-name prefix).
time_ranges = [
    (1, 'one_day'),
    (3, 'three_day'),
    (7, 'seven_day'),
    (15, 'fifteen_day'),
    (30, 'thirty_day'),
]

# Initialise res_feature with one zero column per (window, behaviour).
for days, prefix in time_ranges:
    for behavior_name in ('view', 'collect', 'cart', 'buy'):
        column_name = f'{prefix}_{behavior_name}'
        res_feature[column_name] = 0

In [None]:
# Accumulate, per (user_id, item_id), the number of views, favourites, cart
# additions and purchases inside each look-back window ending at `day`.
day = pd.to_datetime('2014-12-19')
for reader in readers:
    for chunk in tqdm(reader):
        # Parse the timestamps and compute the age of each event once per chunk
        # rather than once per window.
        chunk['time'] = pd.to_datetime(chunk['time'])
        elapsed_days = (day - chunk['time']).dt.days
        # Start from the full list of user-item pairs.
        result_df = base_feature.copy()
        for days, prefix in time_ranges:
            # Events at most `days` whole days old.
            time_range_df = chunk[elapsed_days <= days]
            for behavior, behavior_name in zip([1, 2, 3, 4], ['view', 'collect', 'cart', 'buy']):
                behavior_count = (
                    time_range_df[time_range_df['behavior_type'] == behavior]
                    .groupby(['user_id', 'item_id'])
                    .size()
                    .reset_index(name=f'{prefix}_{behavior_name}')
                )
                result_df = result_df.merge(behavior_count, on=['user_id', 'item_id'], how='left')

        # Pairs without events in a window get 0, then the chunk is added to the totals.
        result_df = result_df.fillna(0).set_index(['user_id', 'item_id'])
        res_feature = res_feature.add(result_df, fill_value=0)
pred_feature = cross_feature.merge(res_feature, on=['user_id', 'item_id'], how='left')

In [None]:
# Bare expression: the notebook renders pred_feature as this cell's output.
pred_feature

In [10]:
# Persist the prediction feature table.
pred_feature.to_feather(path='pred_feature_100_100.feather')