In [2]:
import re
import json

def extract_fields(log_line):
    # 使用正则表达式提取JSON部分
    json_match = re.search(r'\{.*\}', log_line)
    if not json_match:
        return None, 0

    try:
        # 解析JSON数据
        data = json.loads(json_match.group())
        
        # 提取role_id
        role_id = data.get('role_id')
        
        # 提取vip，如果不存在则默认为0
        vip = data.get('vip', 0)
        
        return role_id, vip
    except json.JSONDecodeError:
        return None, 0

# 测试函数
log_sample = '1068 1529597015396 g60-database-380.i.nease.net /home/g60/gamedata/log/g60_GameStatistic_20180622.log 380_game02 G60_GameStatistic 1529596878_35 [2018-06-22 00:01:18][MercLevelUp],{"merc_capacity":2739,"cur_level":70,"account_id":"aebfr2ualeolqplc@ad.netease.win.163.com","type_id":307,"udid":"66e0b0278fbef358","ip":"101.90.127.224","role_sex":1,"app_channel":"netease","server":"380","role_occ":4,"role_id":"698382","vip":6,"pre_level":69,"guid":6882678726665895936,"role_level":80}'

role_id, vip = extract_fields(log_sample)
print(f"提取的role_id: {role_id}")
print(f"提取的vip: {vip}")


提取的role_id: 698382
提取的vip: 6


In [12]:
import pandas as pd

# 读取文件，假设无表头，使用tab分隔符
df = pd.read_csv('src_rec_action_day.txt', sep='\t', header=None, names=['t_when', 'role_id', 'action', 'itemid'])

# 转换时间列为datetime格式
df['t_when'] = pd.to_datetime(df['t_when'])

# 提取 rec、look、buy 行为的记录
rec_df = df[df['action'] == 'rec']
look_df = df[df['action'] == 'look']
buy_df = df[df['action'] == 'buy']

# 将推荐和购买记录进行合并，确保推荐时间在购买时间之前
rec_buy_df = pd.merge(rec_df, buy_df, on=['role_id', 'itemid'], suffixes=('_rec', '_buy'))
rec_buy_df = rec_buy_df[rec_buy_df['t_when_rec'] < rec_buy_df['t_when_buy']]

# 找到在 rec 和 buy 之间的 look 记录
valid_look_df = pd.merge(rec_buy_df, look_df, on=['role_id', 'itemid'])
valid_look_df = valid_look_df[
    (valid_look_df['t_when'] > valid_look_df['t_when_rec']) &
    (valid_look_df['t_when'] < valid_look_df['t_when_buy'])
]

# 对每个成功推荐的rec-buy对，找到第一条look记录
first_look_df = valid_look_df.sort_values('t_when').groupby(['role_id', 'itemid', 't_when_rec', 't_when_buy']).first().reset_index()

# 选择需要的字段，重命名为输出格式
result_df = first_look_df[['t_when', 'role_id', 'itemid']]
result_df['action'] = 'look'

# 输出结果
print(result_df)

# 如果需要保存为文件
result_df.to_csv('successful_rec_first_look.csv', index=False, sep='\t', header=False)

               t_when  role_id  itemid action
0 2020-01-01 12:10:00      110   10001   look
1 2020-01-01 12:10:00      110   10001   look
2 2020-01-01 08:20:00      110   10002   look
3 2020-01-01 13:20:00      110   10003   look


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  result_df['action'] = 'look'


In [10]:
result_df



Unnamed: 0,t_when,role_id,action,itemid
0,2020-01-01 12:10:00,110,look,10001
1,2020-01-01 12:10:00,110,look,10001
2,2020-01-01 08:20:00,110,look,10002
3,2020-01-01 13:20:00,110,look,10003


In [15]:
import pandas as pd

# 读取文件，假设无表头，使用tab分隔符
df = pd.read_csv('src_rec_action_day.txt', sep='\t', header=None, names=['t_when', 'role_id', 'action', 'itemid'])

# 转换时间列为datetime格式
df['t_when'] = pd.to_datetime(df['t_when'])

# 提取 rec、look、buy 行为的记录
rec_df = df[df['action'] == 'rec']
look_df = df[df['action'] == 'look']
buy_df = df[df['action'] == 'buy']

# 将推荐和购买记录进行合并，确保推荐时间在购买时间之前
rec_buy_df = pd.merge(rec_df, buy_df, on=['role_id', 'itemid'], suffixes=('_rec', '_buy'))
rec_buy_df = rec_buy_df[rec_buy_df['t_when_rec'] < rec_buy_df['t_when_buy']]

# 找到在 rec 和 buy 之间的 look 记录
valid_look_df = pd.merge(rec_buy_df, look_df, on=['role_id', 'itemid'])
valid_look_df = valid_look_df[
    (valid_look_df['t_when'] > valid_look_df['t_when_rec']) &
    (valid_look_df['t_when'] < valid_look_df['t_when_buy'])
]

# 对每个成功推荐的rec-buy对，找到第一条look记录
first_look_df = valid_look_df.sort_values('t_when').groupby(['role_id', 'itemid', 't_when_rec', 't_when_buy']).first().reset_index()

# 选择需要的字段，重命名为输出格式
result_df = first_look_df[['t_when', 'role_id', 'itemid']]
result_df['action'] = 'look'

# 输出结果
print(result_df)

# 如果需要保存为文件
result_df.to_csv('successful_rec_first_look.csv', index=False, sep='\t', header=False)

               t_when  role_id  itemid action
0 2020-01-01 12:10:00      110   10001   look
1 2020-01-01 12:10:00      110   10001   look
2 2020-01-01 08:20:00      110   10002   look
3 2020-01-01 13:20:00      110   10003   look


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  result_df['action'] = 'look'


In [16]:
import pandas as pd

# 读取文件，假设无表头，使用tab分隔符
df = pd.read_csv('src_rec_action_day.txt', sep='\t', header=None, names=['t_when', 'role_id', 'action', 'itemid'])

# 转换时间列为datetime格式
df['t_when'] = pd.to_datetime(df['t_when'])

# 提取 rec、look、buy 行为的记录
rec_df = df[df['action'] == 'rec']
look_df = df[df['action'] == 'look']
buy_df = df[df['action'] == 'buy']

# 将推荐和购买记录进行合并，确保推荐时间在购买时间之前
rec_buy_df = pd.merge(rec_df, buy_df, on=['role_id', 'itemid'], suffixes=('_rec', '_buy'))
rec_buy_df = rec_buy_df[rec_buy_df['t_when_rec'] < rec_buy_df['t_when_buy']]

# 找到在 rec 和 buy 之间的 look 记录
valid_look_df = pd.merge(rec_buy_df, look_df, on=['role_id', 'itemid'])
valid_look_df = valid_look_df[
    (valid_look_df['t_when'] > valid_look_df['t_when_rec']) &
    (valid_look_df['t_when'] < valid_look_df['t_when_buy'])
]

# 对每个成功推荐的rec-buy对，找到唯一的第一条look记录，先按时间排序再groupby
first_look_df = valid_look_df.sort_values('t_when').drop_duplicates(subset=['role_id', 'itemid', 't_when_rec'], keep='first')

# 选择需要的字段，重命名为输出格式
result_df = first_look_df[['t_when', 'role_id', 'itemid']].copy()
result_df['action'] = 'look'

# 按照时间排序
result_df = result_df.sort_values('t_when').reset_index(drop=True)

# 输出结果
print(result_df)

# 如果需要保存为文件
result_df.to_csv('successful_rec_first_look.csv', index=False, sep='\t', header=False)

               t_when  role_id  itemid action
0 2020-01-01 08:20:00      110   10002   look
1 2020-01-01 12:10:00      110   10001   look
2 2020-01-01 13:20:00      110   10003   look
