### 注意：代码目录只包含用于示例的部分数据，实际数据请从天池竞赛平台下载
### https://tianchi.aliyun.com/competition/gameList/activeList

In [1]:
import pandas as pd
import numpy as np
import datetime
from pandas.api.types import is_numeric_dtype # 用于判断特征类型
from sklearn.model_selection import cross_val_score, train_test_split # 切分数据集
from sklearn.metrics import mean_squared_error # 评价函数

data = pd.read_csv('data/happiness_train_min.csv', encoding='gb2312')
test = pd.read_csv('data/happiness_test_min.csv', encoding='gb2312')

In [2]:
# 特征工程

def get_mean(fea, data, test): # 同时变换训练集和测试集
    arr1 = data[fea].unique()
    arr2 = test[fea].unique()
    arr3 = list(arr1)
    arr3.extend(arr2) # 有的数据只出现在训练集或测试集中
    arr4 = list(set(arr3))
    dic = {}
    for x in arr4:
        dic[x] = data[data[fea] == x][label].mean() # 取其因变量均值
    data[fea] = data[fea].apply(lambda x: dic[x]) # 数据替换
    test[fea] = test[fea].apply(lambda x: dic[x])
    return data,test

label = 'happiness' # 目标变量
features = []

data, test = get_mean('city', data, test)
data, test = get_mean('invest_other', data, test)
data, test = get_mean('province', data, test)

for col in data.columns:
    if not is_numeric_dtype(data[col]): # 非数值型特征
        continue
    elif col != label and col != 'id' and col not in ['public_service_7']: # 去掉干扰特征
        features.append(col)
        data[col] = data[col].apply(lambda x: np.nan if x < 0 else x) # 优化点一
        test[col] = test[col].apply(lambda x: np.nan if x < 0 else x)

data_all = pd.concat([data,test]) # 优化点二
data = data[data['happiness'] > 0] # 去掉因变量缺失的数据
x = data[features] # 自变量
y = data[label] # 目标变量
x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=0.25, random_state=0)
x_train = x_train.fillna(data_all[features].mean()) # 空值填充训练集
x_val = x_val.fillna(data_all[features].mean()) # 空值填充验证集
x_test = test.fillna(data_all[features].mean()) # 空值填充测试集
x = x.fillna(data_all[features].mean()) # 空值填充全集

TypeError: '<' not supported between instances of 'str' and 'int'

In [None]:
# 训练模型

import xgboost as xgb
from sklearn.cross_validation import KFold
import numpy as np

def my_eval(preds, train): # 自定义评价函数
    score = mean_squared_error(train.get_label(), preds)
    return 'myeval', score

my_params = {"booster":'gbtree','eta': 0.005, 'max_depth': 6, 'subsample': 0.7, 
              'colsample_bytree': 0.8, 'objective': 'reg:linear', 'eval_metric': 'rmse', 
              'silent': True, 'nthread': 4} # 模型参数

train_preds = np.zeros(len(data)) # 用于保存预测结果
test_preds = np.zeros(len(test))
kf = KFold(len(data), n_folds = 5, shuffle=True, random_state=0) # 5折交叉验证
for fold, (trn_idx, val_idx) in enumerate(kf):
    print("fold {}".format(fold+1))
    train_data = xgb.DMatrix(data[features].iloc[trn_idx], data[label].iloc[trn_idx]) # 训练集
    val_data = xgb.DMatrix(data[features].iloc[val_idx], data[label].iloc[val_idx]) # 验证集
    watchlist = [(train_data, 'train'), (val_data, 'valid_data')]
    clf = xgb.train(dtrain=train_data, num_boost_round=5000, evals=watchlist, 
               early_stopping_rounds=200, verbose_eval=100, 
               params=my_params,feval = my_eval)
    train_preds[val_idx] = clf.predict(xgb.DMatrix(data[features].iloc[val_idx]),
               ntree_limit=clf.best_ntree_limit)
    test_preds += clf.predict(xgb.DMatrix(test[features]), 
               ntree_limit=clf.best_ntree_limit) / kf.n_folds
print("CV score: {:<8.8f}".format(mean_squared_error(train_preds, data[label])))

df = pd.DataFrame() # 生成提交结果
df['id'] = test.id
df['happiness'] = test_preds
df.to_csv('out/submit_{}.csv'.format(datetime.datetime.now().strftime('%Y%m%d_%H%M%S')),index=False)


In [None]:
import matplotlib.pyplot as plt

fig,ax = plt.subplots()
fig.set_size_inches(40,6)
xgb.plot_tree(clf, ax=ax, num_trees=0) # 显示模型中的第一棵树
plt.savefig('tmp.png',dpi=300)

In [None]:
# 检测干扰变量

from sklearn.ensemble import GradientBoostingRegressor

baseline = 0.4887 # 误差baseline
for i in features:
    features_new = [x for x in features if x != i]
    clf = GradientBoostingRegressor(criterion='mse', random_state=0)
    clf.fit(x_train[features_new], y_train)
    mse = mean_squared_error(y_val, [round(i) for i in clf.predict(x_val[features_new])])
    if mse < baseline:
        print("remove", i, "MSE: %.4f" % mse)