### 注意：代碼目錄只包含用於示例的部分數據，實際數據請從天池競賽平臺下載
### https://tianchi.aliyun.com/competition/gameList/activeList

In [1]:
import pandas as pd
import numpy as np
import datetime
from pandas.api.types import is_numeric_dtype # 用於判斷特徵類型
from sklearn.model_selection import cross_val_score, train_test_split # 切分數據集
from sklearn.metrics import mean_squared_error # 評價函數

# Load the sample training and test tables; both CSV files are read as GB2312.
data, test = (
    pd.read_csv(csv_path, encoding='gb2312')
    for csv_path in ('data/happiness_train_min.csv', 'data/happiness_test_min.csv')
)

In [2]:
# 特徵工程

def get_mean(fea, data, test, target=None): # transforms the training and test sets together
    """Mean-encode column ``fea`` in both ``data`` and ``test``.

    Every value of ``fea`` is replaced by the mean of the target column taken
    over the rows of ``data`` that carry that value.  Values that have no
    matching row in ``data`` (for example categories that only occur in
    ``test``, or missing values) become NaN.

    Args:
        fea: Name of the column to encode; must exist in both frames.
        data: Training DataFrame; must also contain the target column.
        test: Test DataFrame.
        target: Name of the target column.  When omitted, the module-level
            ``label`` is used (looked up at call time).

    Returns:
        The same ``(data, test)`` objects, with ``fea`` overwritten in place.

    NOTE(review): the means are taken over every row of ``data`` as passed in,
    so any placeholder target values still present in it are included.
    """
    if target is None:
        target = label  # module-level target name, resolved when the function is called
    # One grouped pass instead of one boolean-mask scan per distinct value.
    means = data.groupby(fea)[target].mean()
    # Series.map yields NaN for keys absent from `means` rather than raising.
    data[fea] = data[fea].map(means)
    test[fea] = test[fea].map(means)
    return data, test

label = 'happiness' # target variable
features = []

# Mean-encode these three columns; get_mean rewrites both frames.
data, test = get_mean('city', data, test)
data, test = get_mean('invest_other', data, test)
data, test = get_mean('province', data, test)

# Collect the numeric feature columns and turn their negative entries into NaN.
for col in data.columns:
    if not is_numeric_dtype(data[col]): # skip non-numeric features
        continue
    if col == label or col == 'id' or col in ['public_service_7']: # drop the noisy feature
        continue
    features.append(col)
    # Optimization 1: negative values become NaN (vectorised, no per-element lambda).
    data[col] = data[col].mask(data[col] < 0)
    test[col] = test[col].mask(test[col] < 0)

# Optimization 2: fill statistics come from train and test combined.
# sort=False keeps the column order and avoids the pandas FutureWarning about
# sorting a non-aligned axis; only name-based column selection is used below.
data_all = pd.concat([data, test], sort=False)
fill_values = data_all[features].mean() # per-feature means, computed once and reused
data = data[data['happiness'] > 0] # drop rows whose target is missing
x = data[features] # independent variables
y = data[label] # target variable
x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=0.25, random_state=0)
x_train = x_train.fillna(fill_values) # fill NaN in the training split
x_val = x_val.fillna(fill_values) # fill NaN in the validation split
x_test = test.fillna(fill_values) # fill NaN in the test set
x = x.fillna(fill_values) # fill NaN in the full feature table

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




In [3]:
# 訓練模型

import xgboost as xgb
# sklearn.cross_validation was removed in scikit-learn 0.20; KFold lives in
# sklearn.model_selection, which this file already imports from.
from sklearn.model_selection import KFold
import numpy as np

def my_eval(preds, train): # custom evaluation function
    """Return ``('myeval', mse)`` for predictions ``preds`` against the labels of ``train``.

    ``train`` must provide ``get_label()``; the score is the mean squared
    error between those labels and ``preds``.
    """
    score = mean_squared_error(train.get_label(), preds)
    return 'myeval', score

my_params = {"booster":'gbtree','eta': 0.005, 'max_depth': 6, 'subsample': 0.7,
              'colsample_bytree': 0.8, 'objective': 'reg:linear', 'eval_metric': 'rmse',
              'silent': True, 'nthread': 4} # model parameters
# NOTE(review): 'reg:linear', 'silent', feval and ntree_limit/best_ntree_limit are
# tied to older xgboost releases - confirm against the installed version.

train_preds = np.zeros(len(data)) # out-of-fold predictions for the training rows
test_preds = np.zeros(len(test)) # test predictions averaged over the folds
test_matrix = xgb.DMatrix(test[features]) # built once; identical for every fold
kf = KFold(n_splits=5, shuffle=True, random_state=0) # 5-fold cross-validation
for fold, (trn_idx, val_idx) in enumerate(kf.split(data)):
    print("fold {}".format(fold+1))
    train_data = xgb.DMatrix(data[features].iloc[trn_idx], data[label].iloc[trn_idx]) # training fold
    val_data = xgb.DMatrix(data[features].iloc[val_idx], data[label].iloc[val_idx]) # validation fold
    watchlist = [(train_data, 'train'), (val_data, 'valid_data')]
    clf = xgb.train(dtrain=train_data, num_boost_round=5000, evals=watchlist,
                    early_stopping_rounds=200, verbose_eval=100,
                    params=my_params, feval=my_eval)
    # Predict the held-out rows with the best iteration found by early stopping.
    train_preds[val_idx] = clf.predict(xgb.DMatrix(data[features].iloc[val_idx]),
                                       ntree_limit=clf.best_ntree_limit)
    test_preds += clf.predict(test_matrix,
                              ntree_limit=clf.best_ntree_limit) / kf.n_splits
print("CV score: {:<8.8f}".format(mean_squared_error(data[label], train_preds)))

df = pd.DataFrame() # build the submission file
df['id'] = test.id
df['happiness'] = test_preds
df.to_csv('out/submit_{}.csv'.format(datetime.datetime.now().strftime('%Y%m%d_%H%M%S')),index=False)


fold 1
[0]	train-rmse:3.44227	valid_data-rmse:3.47826	train-myeval:11.8492	valid_data-myeval:12.0983
Multiple eval metrics have been passed: 'valid_data-myeval' will be used for early stopping.

Will train until valid_data-myeval hasn't improved in 200 rounds.


  if getattr(data, 'base', None) is not None and \


[100]	train-rmse:2.15998	valid_data-rmse:2.19651	train-myeval:4.66551	valid_data-myeval:4.82466
[200]	train-rmse:1.40795	valid_data-rmse:1.44959	train-myeval:1.98231	valid_data-myeval:2.10133
[300]	train-rmse:0.983552	valid_data-rmse:1.03569	train-myeval:0.967374	valid_data-myeval:1.07265
[400]	train-rmse:0.756499	valid_data-rmse:0.823961	train-myeval:0.57229	valid_data-myeval:0.678911
[500]	train-rmse:0.641039	valid_data-rmse:0.724438	train-myeval:0.410932	valid_data-myeval:0.524811
[600]	train-rmse:0.581517	valid_data-rmse:0.680049	train-myeval:0.338162	valid_data-myeval:0.462467
[700]	train-rmse:0.548352	valid_data-rmse:0.660345	train-myeval:0.30069	valid_data-myeval:0.436055
[800]	train-rmse:0.52708	valid_data-rmse:0.651311	train-myeval:0.277813	valid_data-myeval:0.424206
[900]	train-rmse:0.510916	valid_data-rmse:0.646678	train-myeval:0.261035	valid_data-myeval:0.418193
[1000]	train-rmse:0.497112	valid_data-rmse:0.644209	train-myeval:0.247121	valid_data-myeval:0.415006
[1100]	train

[300]	train-rmse:0.979591	valid_data-rmse:1.04016	train-myeval:0.959598	valid_data-myeval:1.08194
[400]	train-rmse:0.751583	valid_data-rmse:0.839116	train-myeval:0.564877	valid_data-myeval:0.704116
[500]	train-rmse:0.635863	valid_data-rmse:0.747358	train-myeval:0.404321	valid_data-myeval:0.558544
[600]	train-rmse:0.57695	valid_data-rmse:0.708381	train-myeval:0.332872	valid_data-myeval:0.501804
[700]	train-rmse:0.544043	valid_data-rmse:0.691651	train-myeval:0.295982	valid_data-myeval:0.478381
[800]	train-rmse:0.523189	valid_data-rmse:0.684391	train-myeval:0.273727	valid_data-myeval:0.468391
[900]	train-rmse:0.506777	valid_data-rmse:0.680831	train-myeval:0.256823	valid_data-myeval:0.463531
[1000]	train-rmse:0.493576	valid_data-rmse:0.67878	train-myeval:0.243617	valid_data-myeval:0.460742
[1100]	train-rmse:0.481373	valid_data-rmse:0.677462	train-myeval:0.23172	valid_data-myeval:0.458955
[1200]	train-rmse:0.469686	valid_data-rmse:0.676994	train-myeval:0.220605	valid_data-myeval:0.45832
[13

In [4]:
import matplotlib.pyplot as plt

# Draw the first tree of the booster currently bound to `clf` on a wide canvas
# and write the figure to tmp.png.
fig, ax = plt.subplots(figsize=(40, 6))
xgb.plot_tree(clf, num_trees=0, ax=ax) # num_trees=0 selects the first tree
fig.savefig('tmp.png', dpi=300)

In [6]:
# 檢測干擾變量

from sklearn.ensemble import GradientBoostingRegressor

# Leave-one-feature-out screen: refit the regressor without each feature in turn
# and report every feature whose removal puts the validation MSE below `baseline`.
baseline = 0.4887 # reference error to beat
for i in features:
    features_new = [name for name in features if name != i]
    clf = GradientBoostingRegressor(criterion='mse', random_state=0)
    clf.fit(x_train[features_new], y_train)
    # Predictions are rounded to whole numbers before scoring.
    rounded_preds = np.round(clf.predict(x_val[features_new]))
    mse = mean_squared_error(y_val, rounded_preds)
    if not mse < baseline:
        continue
    print("remove", i, "MSE: {:.4f}".format(mse))

  from numpy.core.umath_tests import inner1d


remove survey_type MSE: 0.4712
remove province MSE: 0.4732
remove city MSE: 0.4807
remove county MSE: 0.4697
remove gender MSE: 0.4812
remove birth MSE: 0.4762
remove nationality MSE: 0.4687
remove religion MSE: 0.4772
remove religion_freq MSE: 0.4702
remove edu MSE: 0.4707
remove edu_status MSE: 0.4787
remove edu_yr MSE: 0.4737
remove income MSE: 0.4637
remove political MSE: 0.4727
remove join_party MSE: 0.4702
remove floor_area MSE: 0.4752
remove property_0 MSE: 0.4717
remove property_1 MSE: 0.4707
remove property_2 MSE: 0.4707
remove property_3 MSE: 0.4707
remove property_4 MSE: 0.4707
remove property_5 MSE: 0.4707
remove property_6 MSE: 0.4687
remove property_7 MSE: 0.4707
remove property_8 MSE: 0.4712
remove height_cm MSE: 0.4757
remove weight_jin MSE: 0.4722
remove health MSE: 0.4712
remove health_problem MSE: 0.4757
remove hukou MSE: 0.4712
remove hukou_loc MSE: 0.4707
remove media_1 MSE: 0.4707
remove media_2 MSE: 0.4712
remove media_3 MSE: 0.4752
remove media_4 MSE: 0.4717
rem