### 注意: data中数据文件只包括少量数据，用于示例，请从天池下载完整数据后训练

In [1]:
# 引入头文件
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import warnings 
import re
import tools # 工具实现在tools.py文件中

warnings.filterwarnings("ignore") 

In [2]:
# 读训练数据
data = pd.read_csv('data/weibo_train_data.txt',sep='\t',header=None)
data = tools.prepare(data)
data = tools.add_features(data)

In [3]:
# 切分训练集和验证集
train,val = train_test_split(data, test_size=0.2, random_state=0)
val = val.reset_index(drop=True)

In [4]:
# 按用户分组
if True:
    grp = train.groupby('uid')
    user_data = pd.DataFrame()
    user_data['f'] = grp['f'].mean()
    user_data['c'] = grp['c'].mean()
    user_data['l'] = grp['l'].mean()
else:
    user_data = pd.read_csv("train_user.csv")
user_data_2 = user_data.rename(columns={'l':'avg_l','c':'avg_c','f':'avg_f'})
print(user_data.head())

                                    f    c    l
uid                                            
000c663a24a2f91f4ba156fcd4f8b9f2  0.0  0.0  0.0
001e00fddab72bf7e6be3455e199904a  0.0  0.0  0.0
00629276bf87e3b0ffb8930d658d21bd  0.0  0.0  0.0
00c8986de0a9e2e8de08d9b7a315c690  0.0  0.0  0.0
00e77096cebd1a5c5e88b2603429866c  0.0  0.0  0.0


In [14]:
# 模型训练
import xgboost as xgb

def testme_f(preds,dtrain):
    labels=pd.Series(dtrain.get_label())
    tmp = pd.DataFrame()
    d = ((preds - labels)/(labels + 5.0)).apply(lambda x: abs(x))
    count_i = labels
    precision = 1 - d
    sign = np.sign(precision - 0.8).apply(lambda x: 0 if x == -1 else 1)
    count_i[count_i > 50] = 50
    count_1 = sum((count_i + 1) * sign)
    count_2 = sum(count_i + 1)
    return 'testme', 1 - count_1/count_2

def testme_lc(preds,dtrain):
    labels=pd.Series(dtrain.get_label())
    tmp = pd.DataFrame()
    d = ((preds - labels)/(labels + 3.0)).apply(lambda x: abs(x))
    count_i = labels
    precision = 1 - d
    sign = np.sign(precision - 0.8).apply(lambda x: 0 if x == -1 else 1)
    count_i[count_i > 25] = 25
    count_1 = sum((count_i + 1) * sign)
    count_2 = sum(count_i + 1)
    return 'testme', 1 - count_1/count_2

def calc(grp1, grp2, features, key, params, feval):
    train_X = grp1[features]
    train_Y = grp1[key]
    val_X = grp2[features]
    val_Y = grp2[key]
    dtrain = xgb.DMatrix(train_X, train_Y)
    dval = xgb.DMatrix(val_X, val_Y)
    watchlist  = [(dtrain,'train'),(dval,'val')]
    model = xgb.train(params,dtrain, evals=watchlist, feval=feval, 
                      num_boost_round=200, early_stopping_rounds=10)
    model.save_model('model_'+key)
    dic = model.get_fscore()
    dic2= sorted(dic.items(), key=lambda d:d[1], reverse = True)
    print("feature importance", dic2)
    return model

params={
    'max_depth':7,
    'subsample':0.7,
    'eta': 0.05,
    'seed':5,
    'objective':'reg:linear'
}

In [15]:
# 提取关键词
import jieba

# tmp=data.sample(n = 100000) # adjust
tmp = data.sample(n = 1000)
arr = tmp['content'].unique()
print(len(arr))
arr_all = []
for i in arr:
    arr = jieba.lcut(i, cut_all=True)
    arr_zh = [i for i in arr if len(re.findall(r"^[#\+a-z0-9A-Z\\-_]+$",i,re.M)) == 0 and len(i) > 1]
    arr_all.extend(arr_zh)
print(len(arr_all))

result = pd.value_counts(arr_all)
arr_word = []
for key,value in result.items():
    if value > 5:
        arr_word.append(key)
print(arr_word)

986
16032
['红包', '分享', '一个', '打车', '数据', '什么', '可以', '技术', '自己', '一起', '我们', '联网', '安全', '就是', '互联', '网络', '开发', '没有', '互联网', '现金', '今天', '试试', '这个', '服务', '学习', '手机', '中国', '信息', '问题', '代金', '代金券', '已经', '来试', '快来', '用户', '下载', '系统', '智能', '生活', '文章', '公司', '应用', '一种', '第一', '开始', '视频', '发出', '试手', '不要', '推荐', '看看', '市场', '手气', '使用', '如何', '刚刚', '正在', '真的', '企业', '起来', '工作', '大家', '现在', '管理', '发红包', '产品', '还是', '发红', '科技', '个人', '快乐', '时候', '通过', '这里', '移动', '项目', '支付', '博客', '如果', '知识', '更多', '行业', '觉得', '客户', '时间', '活动', '不是', '程序', '阅读', '知道', '能力', '幸福', '世界', '城市', '礼包', '好运', '一次', '你们', '关心', '体验', '网站', '设备', '发布', '内容', '进行', '免费', '北京', '喜欢', '大学', '小伙', '抢到', '支付宝', '平台', '需要', '来自', '感觉', '音乐', '代码', '运动', '看到', '伙伴', '中心', '文件', '怎么', '机器', '不能', '以及', '设计', '哈哈', '客户端', '支持', '更新', '计算', '投资', '空间', '链接', '希望', '成为', '基础', '出来', '不错', '学生', '抢红包', '有人', '功能', '硬件', '提供', '环境', '还有', '实现', '钱包', '时代', '每天', '这些', '专业', '创新', '资源', '很多', '小伙伴', '电脑', '因为', '最新', '错过', '小财神

In [16]:
# 从文字中提取特征
from scipy import stats

def get_dic(arr_word, dst, count, data):
    print(len(arr_word))
    dic_key = {}
    for idx,i in enumerate(arr_word):
        df1 = data[data['content'].str.contains(i)==False]
        df2 = data[data['content'].str.contains(i)==True]
        ret2 = stats.levene(df1[dst], df2[dst])
        if ret2[1] < 0.05:
            dic_key[i] = [ret2[1], df2[dst].mean(), len(df2)]
            print(idx, i, dic_key[i], len(dic_key))
            if len(dic_key) > count:
                break
    return dic_key

dic_key_f = get_dic(arr_word, 'f', 100, data[:100000])
dic_key_c = get_dic(arr_word, 'c', 50, data[:100000])
dic_key_l = get_dic(arr_word, 'l', 100, data[:100000])


537
1 分享 [3.082848305651958e-05, 16.245960502693, 557] 1
54 如何 [0.00520579922219767, 23.354545454545455, 110] 2
59 起来 [8.027846753915509e-08, 43.739583333333336, 96] 3
61 大家 [8.875207952819432e-16, 55.70634920634921, 126] 4
89 知道 [0.03718461527945237, 18.256637168141594, 113] 5
102 发布 [0.014234816405484923, 22.921348314606742, 89] 6
109 小伙 [2.4263845761329027e-38, 112.58108108108108, 74] 7
113 需要 [7.123769961612894e-20, 72.07446808510639, 94] 8
120 伙伴 [2.1299206990320807e-36, 106.92307692307692, 78] 9
158 小伙伴 [3.906941480410842e-43, 126.22727272727273, 66] 10
161 最新 [0.02284157318557919, 31.0, 38] 11
172 软件 [5.362786164083642e-09, 49.95294117647059, 85] 12
196 开发者 [0.007818458014427756, 38.935483870967744, 31] 13
200 方法 [2.3215041709751946e-07, 62.90243902439025, 41] 14
213 不知 [0.011825257223198257, 26.432835820895523, 67] 15
245 真正 [0.0020993951143830785, 58.588235294117645, 17] 16
277 工具 [0.027052104617776296, 28.363636363636363, 44] 17
330 主要 [0.010142145345259306, 36.72727272727273

In [19]:
val = pd.merge(val, user_data_2, on='uid', how='left')
train = pd.merge(train, user_data_2, on='uid', how='left')

In [20]:
# 生成新模型
def calc_dic(train, val, dst, dic):
    train_new = train.copy()
    for key in dic.keys():
        #print(key)
        train_new[key] = train['content'].str.contains(key).apply(lambda x: 1 if x else 0)
    val_new = val.copy()
    for key in dic.keys():
        val_new[key] = val['content'].str.contains(key).apply(lambda x: 1 if x else 0)
    features = ['weekday', 'hour',
           'c_has_link', 'c_has_at', 'c_has_ex', 'c_has_video', 'c_has_ads',
           'c_has_share', 'c_has_it', 'avg_l', 'avg_c', 'avg_f', 'c_has_topic']
    features_new = features + list(dic.keys())
    model = calc(train_new, val_new, features_new, dst, params, testme_f)
    return model

model_f = calc_dic(train, val, 'f', dic_key_f)
model_c = calc_dic(train, val, 'c', dic_key_c)
model_l = calc_dic(train, val, 'l', dic_key_l)

[0]	train-rmse:79.1155	val-rmse:14.868	train-testme:0.618992	val-testme:0.612693
Multiple eval metrics have been passed: 'val-testme' will be used for early stopping.

Will train until val-testme hasn't improved in 10 rounds.
[1]	train-rmse:78.3617	val-rmse:15.1642	train-testme:0.610059	val-testme:0.610252
[2]	train-rmse:76.394	val-rmse:16.1338	train-testme:0.606564	val-testme:0.592352
[3]	train-rmse:74.0633	val-rmse:18.3971	train-testme:0.601709	val-testme:0.594793
[4]	train-rmse:71.9015	val-rmse:19.9814	train-testme:0.596271	val-testme:0.593979
[5]	train-rmse:69.7289	val-rmse:22.7618	train-testme:0.596854	val-testme:0.586249
[6]	train-rmse:67.65	val-rmse:25.7422	train-testme:0.596757	val-testme:0.593572
[7]	train-rmse:67.0117	val-rmse:26.3773	train-testme:0.588795	val-testme:0.58991
[8]	train-rmse:65.1006	val-rmse:29.3537	train-testme:0.583261	val-testme:0.586249
[9]	train-rmse:63.1097	val-rmse:31.1847	train-testme:0.576075	val-testme:0.58869
[10]	train-rmse:61.4073	val-rmse:34.062	t

[16]	train-rmse:20.8199	val-rmse:37.2579	train-testme:0.470681	val-testme:0.486607
[17]	train-rmse:20.7175	val-rmse:37.2179	train-testme:0.457125	val-testme:0.484127
[18]	train-rmse:20.6213	val-rmse:37.1746	train-testme:0.443674	val-testme:0.485119
[19]	train-rmse:20.1237	val-rmse:37.4385	train-testme:0.431379	val-testme:0.488095
[20]	train-rmse:19.6186	val-rmse:37.7455	train-testme:0.41467	val-testme:0.488095
[21]	train-rmse:19.6031	val-rmse:37.7056	train-testme:0.406894	val-testme:0.486607
[22]	train-rmse:19.0869	val-rmse:37.9837	train-testme:0.393232	val-testme:0.487599
[23]	train-rmse:19.0307	val-rmse:37.9332	train-testme:0.381883	val-testme:0.479663
[24]	train-rmse:18.5831	val-rmse:38.2128	train-testme:0.375158	val-testme:0.479663
[25]	train-rmse:18.1105	val-rmse:38.4903	train-testme:0.364439	val-testme:0.479663
[26]	train-rmse:17.5909	val-rmse:38.825	train-testme:0.355612	val-testme:0.479663
[27]	train-rmse:17.6616	val-rmse:38.7662	train-testme:0.346574	val-testme:0.475198
[28]	t

In [21]:
# 保存模型
dic = {}
dic['model_f'] = model_f
dic['model_c'] = model_c
dic['model_l'] = model_l
dic['dic_key_f'] = dic_key_f
dic['dic_key_c'] = dic_key_c
dic['dic_key_l'] = dic_key_l
dic['user_data_2'] = user_data_2

from sklearn.externals import joblib
joblib.dump(dic, 'model.pkl')

['model.pkl']

In [22]:
def do_pred(model, val, dic):
    val_new = val.copy()
    for key in dic.keys():
        val_new[key] = val['content'].str.contains(key).apply(lambda x: 1 if x else 0)
    features = ['weekday', 'hour',
           'c_has_link', 'c_has_at', 'c_has_ex', 'c_has_video', 'c_has_ads',
           'c_has_share', 'c_has_it', 'avg_l', 'avg_c', 'avg_f', 'c_has_topic']
    features_new = features + list(dic.keys())
    tmp = val_new[features_new]
    dtest = xgb.DMatrix(tmp)
    out = model.predict(dtest)
    out = pd.Series(out).apply(lambda x:int(x))
    return out

def do_pred_all(df):
    out = df.copy()
    out['f'] = do_pred(model_f, df, dic_key_f)
    out['l'] = do_pred(model_l, df, dic_key_l)
    out['c'] = do_pred(model_c, df, dic_key_c)
    return out

# 对验证集预测
out = do_pred_all(val)
print(tools.do_score(val, out))

# 预测并生成提交数据
test = pd.read_csv('data/weibo_predict_data.txt',sep='\t',header=None)
test = tools.prepare(test)
test = tools.add_features(test)
test = pd.merge(test, user_data_2, on='uid', how='left')
test = test.fillna(0)
out = do_pred_all(test)
out['ss'] = out['f'].astype(str) + "," + out['c'].astype(str) + ',' + out['l'].astype(str)
out = out[['uid','mid','ss']]
print(out.shape)
print(out.head())
out.to_csv("result_190624.txt", index=False, header=None, sep='\t')

0.28978036754818465
(177923, 3)
                                uid                               mid     ss
0  c01014739c046cd31d6f1b4fb71b440f  0cd5ef13eb11ed0070f7625b14136ec9  0,0,0
1  fa5aed172c062c61e196eac61038a03b  7cce78a4ad39a91ec1f595bcc7fb5eba  0,0,0
2  77fc723c196a45203e70f4d359c96946  a3494d8cf475a92739a2ffd421640ddf  5,3,4
3  e4097b07f34366399b623b94f174f60c  6b89aea5aa7af093dde0894156c49dd3  0,0,0
4  d43f7557c303b84070b13aa4eeeb21d3  0bdeff19392e15737775abab46dc5437  0,0,0
