In [16]:
import numpy as np
import pandas as pd
from pprint import pprint
from tqdm import tqdm
from collections import Counter
import json
import math
import copy
import os, sys

import jieba
from gensim import corpora
from gensim.models import LdaModel, CoherenceModel

import torch
import torch.nn as nn 
import torch.nn.functional as F
from torch.autograd import Variable
from torch.utils.data import DataLoader, Dataset
from sklearn.metrics import roc_auc_score, f1_score
from scipy.special import softmax

## 读取entity_map

In [17]:
entity_map = {}
with open('./data/rec_data/all_content.txt') as f:
    for line in f:
        if len(line.strip()):
            js = json.loads(line)
            entity_map[int(js['id'])] = js['entity']
print(len(entity_map))

146791


## lda_model

In [18]:
def lda_model(clean_data, dictionary, corpus, num_topics):
    # 使用主题一致性作为评价指标
    lda = LdaModel(corpus, num_topics=num_topics, id2word=dictionary, passes=10)
    lda_cm = CoherenceModel(model=lda, texts=clean_data, dictionary=dictionary, coherence='c_v')
#     lda.save(fr'model/model-{num_topics}')
    return lda_cm.get_coherence()

def get_topicNum(documents_list):
    if len(documents_list) < 10:
        return -1
    coherence = 0
    best_topic = 0
    dictionary = corpora.Dictionary(documents_list)
    corpus = [dictionary.doc2bow(text) for text in documents_list]
    y_label = []
    for i in range(1, int(len(documents_list)/2)):
        tmp_coherence = lda_model(documents_list, dictionary, corpus, i)
        y_label.append(tmp_coherence)
        if coherence < tmp_coherence:
            coherence = tmp_coherence
            best_topic = i
    return best_topic

## BM25_Model

In [19]:
class BM25_Model(object):
    def __init__(self, documents_list, k1=2, k2=1, b=0.75):
        self.documents_list = documents_list
        self.documents_number = len(documents_list)
        self.avg_documents_len = sum([len(document) for document in documents_list]) / (self.documents_number+1)
        self.f = []
        self.idf = {}
        self.k1 = k1
        self.k2 = k2
        self.b = b
        self.init()

    def init(self):
        df = {}
        for document in self.documents_list:
            temp = {}
            for word in document:
                temp[word] = temp.get(word, 0) + 1
            self.f.append(temp)
            for key in temp.keys():
                df[key] = df.get(key, 0) + 1
        for key, value in df.items():
            self.idf[key] = np.log((self.documents_number - value + 0.5) / (value + 0.5))

    def get_score(self, index, query):
        score = 0.0
        document_len = len(self.f[index])
        qf = Counter(query)
        for q in query:
            if q not in self.f[index]:
                continue
            score += self.idf[q] * (self.f[index][q] * (self.k1 + 1) / (
                    self.f[index][q] + self.k1 * (1 - self.b + self.b * document_len / self.avg_documents_len))) * (
                             qf[q] * (self.k2 + 1) / (qf[q] + self.k2))

        return score

    def get_documents_score(self, query):
        score_list = []
        for i in range(self.documents_number):
            score_list.append(self.get_score(i, query))
        return score_list

## 历史信息统计

In [20]:
entityCounter = Counter()
itemCounter = Counter()

In [25]:
def generate_historyFeature(feature):
    print(f'总长={len(feature)}')
    out = {'curEntitys':[],'historyEntitys':[],'historyItems':[],'historyLogTs':[],'bm25':[]}
    wrongHisCnt = 0
#     for item in tqdm(feature.sample(1000)[['itemId','logTs','seq', 'pvId']].itertuples()):
    for item in tqdm(feature[['itemId','logTs','seq', 'pvId']].itertuples()):
#         print(item.label)
        out['curEntitys'].append(entity_map[item.itemId])
        curTime = int(item.logTs)
        seqArr = item.seq
        if len(seqArr) == 1:  # 无历史信息
            out['historyEntitys'].append([])
            out['historyItems'].append([])
            out['historyLogTs'].append([])
            out['bm25'].append(-1)
            continue
        seqArr = json.loads(seqArr.replace('\'', '\"'))
        assert len(seqArr)%2 == 0
        entitys = []
        timeGaps = []
        items = []
        for i in range(0,len(seqArr),2):
            timeGap = int((curTime-int(seqArr[i+1]))/1000/3600)
            itemId = int(seqArr[i])
            if len(entity_map[itemId]) == 0 or timeGap < 0:
                wrongHisCnt += 1
                continue
            itemCounter[itemId] += 1
            entitys.append(entity_map[itemId])
            timeGaps.append(timeGap)
            items.append(itemId)
            for entity in entity_map[itemId]:
                entityCounter[entity] += 1
#             print(1111, entity_map[itemId], itemId, timeGap)
        bm25 = BM25_Model(entitys)
#         print(2222, entity_map[item.itemId])
#         print(3333, bm25.get_documents_score(entity_map[item.itemId]))
#         pprint(entitys)
        out['historyEntitys'].append(entitys)
        out['historyLogTs'].append(timeGaps)
        out['historyItems'].append(items)
        bm25_score = sum(bm25.get_documents_score(entity_map[item.itemId]))
        out['bm25'].append(min(99,int(bm25_score+0.5))) # 范围0~99
#         break
    print(f'错误历史数据数量={wrongHisCnt}')
    print(f'历史item词典大小={len(itemCounter)}')
    print(f'历史实体词典大小={len(entityCounter)}')
    return out
        

### 保存历史实体信息

In [26]:
for name in ['newTrain', 'newTest', 'test']:
    path = f'./data/feature/{name}_all_feature.csv'
    print(path)
    data = pd.read_csv(path)
    history_df = generate_historyFeature(data)
    history_df = pd.DataFrame(history_df)
    history_df.to_csv(f'./data/feature/{name}_history.csv', index=False)

./data/feature/newTrain_all_feature.csv
总长=1399019


1399019it [06:13, 3741.45it/s]


错误历史数据数量=1717814
历史item词典大小=37287
历史实体词典大小=107327
./data/feature/newTest_all_feature.csv
总长=1405338


1405338it [06:21, 3687.09it/s]


错误历史数据数量=1833569
历史item词典大小=58548
历史实体词典大小=158326
./data/feature/test_all_feature.csv
总长=1422218


1422218it [06:13, 3808.02it/s]


错误历史数据数量=1502333
历史item词典大小=76670
历史实体词典大小=202183


In [33]:
path = f'./temp.csv'
print(path)
data = pd.read_csv(path)
history_df = generate_historyFeature(data)

./temp.csv
总长=4233804


4233804it [19:05, 3694.85it/s]


错误历史数据数量=4150527
历史item词典大小=126766
历史实体词典大小=312983


### 保存历史文章频次和实体频次信息

In [34]:
his_item_cnt = pd.DataFrame({'itemId':list(itemCounter.keys()), 'his_item_cnt':list(itemCounter.values())})
his_item_cnt.to_csv('./data/feature/his_item_cnt.csv', index=False)
his_item_cnt

Unnamed: 0,itemId,his_item_cnt
0,10126829,215281
1,10130460,180247
2,10124534,95483
3,10123331,311842
4,10119437,86634
...,...,...
126761,10036831,15
126762,10021120,15
126763,10036785,15
126764,10016904,15


In [35]:
his_entity_cnt = pd.DataFrame({'entity':list(entityCounter.keys()), 'his_entity_cnt':list(entityCounter.values())})
his_entity_cnt.to_csv('./data/feature/his_entity_cnt.csv', index=False)
his_entity_cnt

Unnamed: 0,entity,his_entity_cnt
0,梅朗雄,256246
1,北约,2250698
2,总统,2899768
3,朗雄,235299
4,吕克·梅,235299
...,...,...
312978,天华学校,15
312979,方一博,15
312980,刘晚桂,15
312981,钟霞兰,15


## BM25计算

In [36]:
def cal_bm25(feature):
    out = []
    for item in tqdm(feature.itertuples()):
    # for item in testHistory[testHistory['bm25']!=-1].sample(1).itertuples():
    # def cal_bm25(item_bm25, item_curEntitys, item_historyEntitys, item_historyLogTs):
        if item.bm25 == -1:
            out.append([-1, 0, -1, 0]) 
            continue
        try:
            curTemp = json.loads(item.curEntitys.replace('\'', '\"'))
            hisTemp = json.loads(item.historyEntitys.replace('\'', '\"'))
            historyLogTs = json.loads(item.historyLogTs)
        except:
            out.append([-1, 0, -1, 0]) 
            continue
        curEntitys = []
        for entity in curTemp:
            if len(entity) > 5:
                words = jieba.cut(entity, cut_all=False)
                curEntitys.extend(words)
            else:
                curEntitys.append(entity)
        historyEntitys = []
        for group in hisTemp:
            tempGroup = []
            for entity in group:
                if len(entity) > 5:
                    words = jieba.cut(entity, cut_all=False)
                    tempGroup.extend(words)
                else:
                    tempGroup.append(entity)
            historyEntitys.append(tempGroup)

        bm25Model = BM25_Model(historyEntitys)
        bm25Arr = bm25Model.get_documents_score(curEntitys)
        bm25Arr = np.abs(bm25Arr)
        bm25 = int(sum(bm25Arr)+0.5) # query 与历史的bm25相似度


        co_scoreArr = []
        for i in range(len(historyEntitys)):
            for j in range(i+1,len(historyEntitys)):
                tempScore = bm25Model.get_score(j, historyEntitys[i])
                co_scoreArr.append(tempScore)
        co_scoreArr = np.abs(co_scoreArr)
        co_score = int(sum(co_scoreArr)+0.5)
        out.append([bm25, len(bm25Arr), co_score, len(co_scoreArr)]) 
    return out
# #     topicNum = get_topicNum(historyEntitys)
#     print(f'bm25之和={int(sum(bm25Arr)+0.5)} | 有效历史长度={len(bm25Arr)} | 最佳主题数=')
#     print(curEntitys)
#     for score, history, gap in zip(bm25Arr, historyEntitys, historyLogTs):
#         print(f'score={score}|timeGap={gap}|', history)
    
#     print()

In [44]:
for name in ['newTrain', 'newTest', 'test']:
    path = f'./data/feature/{name}_history.csv'
    data = pd.read_csv(path)
    print(path, len(data))
    bm25_df = cal_bm25(data)
    bm25_df = pd.DataFrame(bm25_df, columns=['bm25','bm25Len','co_bm25','co_bm25Len'])
    data = pd.read_csv(f'./data/feature/{name}_all_feature.csv')
    del data['seq']
    merge_df = pd.concat([data, bm25_df],axis=1)
    merge_df.to_csv(f'./data/feature/{name}_all_feature.csv', index=False)

./data/feature/newTrain_history.csv 1399019


1399019it [34:37, 673.30it/s] 


./data/feature/newTest_history.csv 1405338


1405338it [35:46, 654.65it/s]


./data/feature/test_history.csv 1422218


1422218it [34:50, 680.45it/s]


### 情感值计算

In [41]:
# data = pd.read_csv('./data/feature/newTest_all_feature.csv')

# allData = pd.concat([data,data],axis=1)
# allData

In [104]:
print(len(itemCounter), len(entityCounter))
test_history = generate_historyFeature(test_feature)
for key in test_history.keys():
    print(key, len(test_history[key]))

76075 200362
总长=1422218


1422218it [06:33, 3618.42it/s]

错误历史数据数量=1502333
历史item词典大小=93073
历史实体词典大小=240073
curEntitys 1422218
historyEntitys 1422218
historyItems 1422218
historyLogTs 1422218
bm25 1422218





Unnamed: 0,itemId,his_item_cnt
0,10094375,66013
1,10080793,164484
2,10085633,1024
3,10077096,12425
4,10081852,137332
...,...,...
93068,10046424,10
93069,10056259,10
93070,10059001,10
93071,10093255,10


Unnamed: 0,entity,his_entity_cnt
0,大马尼拉地区,66041
1,防疫指挥部,66013
2,以色列,316934
3,路透社,529271
4,法国公共卫生署,66207
...,...,...
240068,蜗牛的家,10
240069,黄一山大骂,10
240070,黄一山照,10
240071,新浪新闻,10


In [119]:
train_history = pd.DataFrame(train_history)
train_history

Unnamed: 0,curEntitys,historyEntitys,historyItems,historyLogTs,bm25
0,"[湖北省政协, 咸宁市, 武汉市委, 省人大常委会, 宜昌, 李玉妹, 甘肃省委, 咸宁, ...","[[大马尼拉地区, 防疫指挥部, 以色列, 路透社, 法国公共卫生署, 德斯特, 美国, 俄...","[10094375, 10080793, 10085633, 10077096, 10081...","[33, 56, 57, 33, 33, 21, 56, 12, 12, 0, 56, 33...",0
1,"[美国国民警卫队, 台湾淡江大学外交与国际关系学系, 美国, 美国国会, 国台办, 陈一新,...","[[大马尼拉地区, 防疫指挥部, 以色列, 路透社, 法国公共卫生署, 德斯特, 美国, 俄...","[10094375, 10080793, 10085633, 10077096, 10081...","[33, 56, 57, 33, 33, 21, 56, 12, 12, 0, 56, 33...",20
2,"[南非茨瓦内, 新闻周刊, 于德尔塔, 卫报, 病例, 南非]","[[大马尼拉地区, 防疫指挥部, 以色列, 路透社, 法国公共卫生署, 德斯特, 美国, 俄...","[10094375, 10080793, 10085633, 10077096, 10081...","[33, 56, 57, 33, 33, 21, 56, 12, 12, 0, 56, 33...",5
3,"[总统, 卡德罗夫, 杜达耶夫, 普京, 俄罗斯, 俄联邦军队, 老卡德罗夫, 车臣委员会,...","[[大马尼拉地区, 防疫指挥部, 以色列, 路透社, 法国公共卫生署, 德斯特, 美国, 俄...","[10094375, 10080793, 10085633, 10077096, 10081...","[33, 56, 57, 33, 33, 21, 56, 12, 12, 0, 56, 33...",10
4,"[清华大学建筑学院, 巴基斯坦, 西安咸阳机场航站楼, 抗疫路, 西安咸阳国际机场, 西安,...","[[大马尼拉地区, 防疫指挥部, 以色列, 路透社, 法国公共卫生署, 德斯特, 美国, 俄...","[10094375, 10080793, 10085633, 10077096, 10081...","[33, 56, 57, 33, 33, 21, 56, 12, 12, 0, 56, 33...",2
...,...,...,...,...,...
4233799,"[中国, 布伦丹·卡尔, 美国电话电报公司, 路透社, 联邦通信委员会, 美国, 美国联邦航...","[[血管], [陈女士, 萧山市, 滨江儿保医院, 北派出所, 赵李, 李云霄, 金一路口,...","[10115131, 10112494, 10115949, 10119476, 10107...","[21, 17, 22, 9, 1, 1, 1, 7, 1, 1, 17, 1, 1, 1,...",8
4233800,"[蓝大仙人, 京东商城, 商报君, 电商, 京东联盟, 佣金, 订单, 孙世建, 刘柱]","[[血管], [陈女士, 萧山市, 滨江儿保医院, 北派出所, 赵李, 李云霄, 金一路口,...","[10115131, 10112494, 10115949, 10119476, 10107...","[21, 17, 22, 9, 1, 1, 1, 7, 1, 1, 17, 1, 1, 1,...",0
4233801,"[蚂蚁集团, 中国信达, 业务, 信贷, 债权收益权融资70亿元, 银保监会, 渝富资本]","[[血管], [陈女士, 萧山市, 滨江儿保医院, 北派出所, 赵李, 李云霄, 金一路口,...","[10115131, 10112494, 10115949, 10119476, 10107...","[21, 17, 22, 9, 1, 1, 1, 7, 1, 1, 17, 1, 1, 1,...",0
4233802,"[祖晓麟, 巴沟, 北京, 周璨, 蒲连, 北京安贞医院, 高海, 葛师傅]","[[血管], [陈女士, 萧山市, 滨江儿保医院, 北派出所, 赵李, 李云霄, 金一路口,...","[10115131, 10112494, 10115949, 10119476, 10107...","[21, 17, 22, 9, 1, 1, 1, 7, 1, 1, 17, 1, 1, 1,...",0


In [120]:
train_history.to_csv('./data/feature/train_history.csv', index=False)

In [116]:
test_all_feature = pd.read_csv('./data/feature/test_all_feature.csv')
test_all_feature

Unnamed: 0,pvId,suv,itemId,logTs,logTs_gap,operator,deviceType,osType,province,city,...,Min,seqLen,sum,mean,historyCnt,allCnt,entity_cnt,mean_emotion_gap,topic,bm25
0,1640655422697H3zq4ze,16412563805927MMCH5,10113821,1641257054454,236,2224589,2326891,3585570,3826939,2414137,...,524,0,0,0,21289,2098,0,1.000000,1,-1
1,1640655422697H3zq4ze,16412563805927MMCH5,10123383,1641257039412,221,2224589,2326891,3585570,3826939,2414137,...,523,0,0,0,86997,35176,0,1.000000,8,-1
2,1640655422697H3zq4ze,16412563805927MMCH5,10113964,1641256862448,44,2224589,2326891,3585570,3826939,2414137,...,521,0,0,0,581581,24340,0,1.000000,7,-1
3,1640655422697H3zq4ze,16412563805927MMCH5,10124534,1641256863017,44,2224589,2326891,3585570,3826939,2414137,...,521,0,0,0,44543,15857,0,1.000000,5,-1
4,1640655422697H3zq4ze,16412563805927MMCH5,10107295,1641256818096,0,2224589,2326891,3585570,3826939,2414137,...,520,0,0,0,311551,48760,0,1.000000,11,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1422213,1641312084289gjEbHAy,1626561703969AfPhwD,10120628,1641311927935,4,2203920,2326891,3585570,3260970,3247054,...,1438,200,220,10,9270,1339,5,0.020000,7,4
1422214,1641312084289gjEbHAy,1626561703969AfPhwD,10127149,1641311923748,0,2203920,2326891,3585570,3260970,3247054,...,1438,200,300,14,8092,9129,8,0.106250,7,15
1422215,1641312084289gjEbHAy,1626561703969AfPhwD,10113797,1641311927935,4,2203920,2326891,3585570,3260970,3247054,...,1438,200,160,7,4143,4032,6,0.000000,7,13
1422216,1641312084289gjEbHAy,1626561703969AfPhwD,10132834,1641311923198,0,2203920,2326891,3585570,3260970,3247054,...,1438,200,260,12,902,972,7,0.178571,7,21


In [None]:
curEntitys 1422218
historyEntitys 1422218
historyItems 1422218
historyLogTs 1422218
bm25 1422218