### 注意：代碼目錄只包含用於示例的部分數據，實際數據請從天池競賽平臺下載
### https://tianchi.aliyun.com/competition/gameList/activeList

In [1]:
import pandas as pd
import numpy as np
import datetime
from pandas.api.types import is_numeric_dtype # 用於判斷特徵類型
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier #分類模型
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor # 迴歸模型
from sklearn.model_selection import cross_val_score, train_test_split # 切分數據集
from sklearn.metrics import mean_squared_error # 評價函數

data = pd.read_csv('data/happiness_train_min.csv', encoding='gb2312')
test = pd.read_csv('data/happiness_test_min.csv', encoding='gb2312')

print(data.columns.tolist()) # 查看所有特徵
print(data.dtypes) # 查看各特徵類型

  from numpy.core.umath_tests import inner1d


['id', 'happiness', 'survey_type', 'province', 'city', 'county', 'survey_time', 'gender', 'birth', 'nationality', 'religion', 'religion_freq', 'edu', 'edu_other', 'edu_status', 'edu_yr', 'income', 'political', 'join_party', 'floor_area', 'property_0', 'property_1', 'property_2', 'property_3', 'property_4', 'property_5', 'property_6', 'property_7', 'property_8', 'property_other', 'height_cm', 'weight_jin', 'health', 'health_problem', 'depression', 'hukou', 'hukou_loc', 'media_1', 'media_2', 'media_3', 'media_4', 'media_5', 'media_6', 'leisure_1', 'leisure_2', 'leisure_3', 'leisure_4', 'leisure_5', 'leisure_6', 'leisure_7', 'leisure_8', 'leisure_9', 'leisure_10', 'leisure_11', 'leisure_12', 'socialize', 'relax', 'learn', 'social_neighbor', 'social_friend', 'socia_outing', 'equity', 'class', 'class_10_before', 'class_10_after', 'class_14', 'work_exper', 'work_status', 'work_yr', 'work_type', 'work_manage', 'insur_1', 'insur_2', 'insur_3', 'insur_4', 'family_income', 'family_m', 'family_st

In [2]:
# 特徵工程

features = []
label = 'happiness' # 目標變量

for col in data.columns:
    if not is_numeric_dtype(data[col]): # 非數值型特徵
        print(col, data[col].dtype)
        print(data[col].unique()[:5])
    elif col != label and col != 'id': # 加入可直接代入模型的特徵
        features.append(col)
        
x = data[features] # 自變量
y = data[label] # 目標變量
x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=0.25, random_state=0)
x_train = x_train.fillna(x.mean()) # 空值填充訓練集
x_val = x_val.fillna(x.mean()) # 空值填充驗證集
x_test = test.fillna(x.mean()) # 空值填充測試集
x = x.fillna(x.mean()) # 空值填充全集

survey_time object
['2015/8/4 14:18' '2015/7/21 15:04' '2015/7/21 13:24' '2015/7/25 17:33'
 '2015/8/10 9:50']
edu_other object
[nan '夜校']
property_other object
[nan '無產權' '已購買，但未過戶' '家庭共同所有' '待辦']
invest_other object
[nan '理財產品' '民間借貸' '銀行理財' '儲蓄存款']


In [3]:
# 訓練模型生成提交數據

#clf = RandomForestRegressor(criterion='mse', random_state=0) # 隨機森林迴歸
#clf = GradientBoostingClassifier(criterion='mse',random_state=0) # GBDT分類
clf = GradientBoostingRegressor(criterion='mse', random_state=0) # GBDT迴歸

if True: # 用於本地測試
    clf.fit(x_train, y_train)
    mse = mean_squared_error(y_val, [round(i) for i in clf.predict(x_val)])
    print("MSE: %.4f" % mse)
else: # 用於遠程提交
    clf.fit(x, y) # 全量數據訓練
    df = pd.DataFrame()
    df['id'] = test.id
    df['happiness'] = clf.predict(x_test[features])
    df.to_csv('out/submit_{}.csv'.format(datetime.datetime.now().strftime('%Y%m%d_%H%M%S')),index=False)

MSE: 0.7085
