In [34]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
import lightgbm as lgb
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import KFold, RepeatedKFold
from scipy import sparse
# Show every column when a DataFrame is displayed (no column truncation)
pd.set_option('display.max_columns', None)
# Show every row when a DataFrame is displayed (no row truncation)
pd.set_option('display.max_rows', None)
from datetime import datetime

In [35]:
# Load the training and test tables from the working directory.
Train_data, Test_data = (
    pd.read_csv(csv_name)
    for csv_name in ('happiness_train_abbr.csv', 'happiness_test_abbr.csv')
)

In [36]:
# Inspect the data: dimensions (rows, columns) of the training set
Train_data.shape

In [None]:
# Dimensions (rows, columns) of the test set
Test_data.shape

In [None]:
# Preview the first rows of the training set
Train_data.head()

In [None]:
# Preview the first rows of the test set
Test_data.head()

In [None]:
# Column dtypes and non-null counts of the test set
Test_data.info()

In [None]:
# Number of missing values per column in the training set
Train_data.isnull().sum()

In [None]:
# Number of missing values per column in the test set
Test_data.isnull().sum()

In [None]:
# Summary statistics of the test set (describe() covers the numeric columns by default)
Test_data.describe()

In [None]:
# -8 is an invalid value in the target column. Per the author's note only a few
# rows carry it, so it is replaced by 4, the value the author identified as the mode.
Train_data['happiness'] = Train_data['happiness'].replace(-8, 4)
Train_data['happiness'].value_counts()

In [None]:
'''Group by gender and happiness and count the respondents in every (gender, happiness) combination.'''
import matplotlib.pyplot as plt
import seaborn as sns
Train_data.groupby(['gender','happiness'])['happiness'].count()

In [None]:
# Grouped count plot: `hue` adds a second, nested categorical variable, so the bars
# are first grouped by gender and, inside each group, one bar per happiness level
# shows how many respondents fall into it.
f, ax = plt.subplots(figsize=(10, 8))
# The column must be passed as the keyword `x`: seaborn >= 0.12 accepts only `data`
# positionally, so the old positional form collides with `data=` and raises TypeError.
# `ax=ax` pins the plot to the axes created above instead of relying on the current axes.
sns.countplot(x='gender', hue='happiness', data=Train_data, ax=ax)
ax.set_title('Sex:happiness')

In [None]:
# Parse survey_time as a timestamp and keep only its year (via .dt.year) in both data sets.
timestamp_format = '%Y-%m-%d %H:%M:%S'
Train_data['survey_time'] = pd.to_datetime(Train_data['survey_time'], format=timestamp_format).dt.year
Test_data['survey_time'] = pd.to_datetime(Test_data['survey_time'], format=timestamp_format).dt.year

In [None]:
# Respondent age at interview time = survey year - birth year.
Train_data['Age'] = Train_data['survey_time'].sub(Train_data['birth'])
Test_data['Age'] = Test_data['survey_time'].sub(Test_data['birth'])

In [None]:
'''The shape now shows one additional column: the newly added Age column.'''
Train_data.shape

In [None]:
# Histogram of the raw respondent ages in the training set.
figure, ax = plt.subplots()
Train_data['Age'].plot(kind='hist', ax=ax, color='blue')
plt.show()

In [None]:
combine=[Train_data,Test_data] # both frames in one list (references, not a concatenation) so the same transformation can be looped over each

In [None]:
# Encode age into six ordinal bands:
#   <=16 -> 0, (16, 32] -> 1, (32, 48] -> 2, (48, 64] -> 3, (64, 80] -> 4, >80 -> 5
#
# The band is derived from the raw age (survey year minus birth year) on every run
# instead of rewriting 'Age' in place one band at a time. The in-place version gave
# the right result only on its first execution: the codes 0-5 all satisfy
# `Age <= 16`, so running the cell a second time collapsed every row to band 0.
age_bin_edges = [-np.inf, 16, 32, 48, 64, 80, np.inf]
for dataset in combine:
    raw_age = dataset['survey_time'] - dataset['birth']
    # pd.cut uses right-closed bins (a, b]; labels=False returns the band index 0..5
    dataset['Age'] = pd.cut(raw_age, bins=age_bin_edges, labels=False)

In [None]:
# Print the Age values of the first rows of the training set
print(Train_data['Age'].head())

In [None]:
# Count plot of the age bands, with each band split by happiness level.
# The column is passed as the keyword `x`: seaborn >= 0.12 accepts only `data`
# positionally, so the old positional form collides with `data=` and raises TypeError.
sns.countplot(x='Age', hue='happiness', data=Train_data)

In [None]:
# Dimensions of the training set at this point
Train_data.shape

In [None]:
# One histogram per column on a 23 x 6 grid of subplots, 3 x 3 inches per panel.
n_rows, n_cols = 23, 6
Train_data.hist(figsize=(n_cols * 3, n_rows * 3), layout=(n_rows, n_cols), bins=12)
plt.show()

In [None]:
# Income banding
def income_cut(x):
    """Map a raw income value to an ordinal income band.

    Bands:
        x < 0               -> 0  (negative values; presumably invalid/missing
                                   answer codes -- confirm against the codebook)
        0 <= x < 1200       -> 1
        1200 <= x <= 10000  -> 2
        10000 < x < 24000   -> 3
        24000 <= x < 40000  -> 4
        x >= 40000          -> 5

    The previous version used strict inequalities on both sides of 1200 and of
    24000, so exactly those two incomes matched no branch and silently came
    back as None (NaN after ``Series.map``). The bands are now contiguous; all
    other inputs map to the same band as before.

    Args:
        x: income as a number.

    Returns:
        The band index 0-5, or None when ``x`` is NaN (every comparison with
        NaN is False, so no band applies).
    """
    if x < 0:
        return 0
    if x < 1200:
        return 1
    if x <= 10000:
        return 2
    if x < 24000:
        return 3
    if x < 40000:
        return 4
    if x >= 40000:
        return 5
    # Only reached for NaN: keep it missing rather than inventing a band.
    return None


# Derive the ordinal income band column for both data sets.
Train_data["income_cut"] = Train_data["income"].apply(income_cut)
Test_data["income_cut"] = Test_data["income"].apply(income_cut)

In [None]:
# Training-set dimensions after adding the income_cut column
Train_data.shape

In [None]:
# Test-set dimensions after adding the income_cut column
Test_data.shape

本想人工区分数值特征与类别特征，但特征数量太多，逐一区分不方便，因此改用下面的方法：绘制候选特征与 happiness 的相关系数热力图，据此筛选建模特征。

In [None]:
# Correlation heat map of the target and the candidate features.
#   - DataFrame.corr() computes the pairwise correlation matrix (Pearson by default)
#   - annot (bool) controls whether the coefficient is written inside each cell
#   - cmap selects the colour gradient
#   - linewidths is the width of the lines separating the cells
heatmap_columns = [
    'happiness', 'Age', 'inc_ability', 'gender', 'status_peer', 'family_status',
    'health', 'equity', 'class', 'work_exper', 'health_problem', 'family_m',
    'house', 'depression', 'learn', 'relax', 'edu',
]
corr_matrix = Train_data[heatmap_columns].corr()
sns.heatmap(corr_matrix, annot=True, cmap='RdYlGn', linewidths=0.2)
fig = plt.gcf()               # current figure, i.e. the one the heat map was drawn on
fig.set_size_inches(15, 15)   # set the figure width and height in inches
plt.show()

In [None]:
# Count plot of the age bands, with each band split by work experience.
figure, ax = plt.subplots(1, 1, figsize=(14, 6))
# The column is passed as the keyword `x`: seaborn >= 0.12 accepts only `data`
# positionally, so the old positional form collides with `data=` and raises TypeError.
sns.countplot(x='Age', hue='work_exper', data=Train_data, ax=ax)
plt.show()

In [None]:
# Final feature selection (descriptions are the author's reading of each column).
features = [
    'Age',             # age band
    'inc_ability',     # whether the income is considered reasonable
    'gender',          # gender
    'status_peer',     # income situation compared with peers of the same age
    'work_exper',      # work experience / employment situation
    'family_status',   # family income situation
    'health',          # physical health
    'equity',          # whether society is perceived as fair
    'class',           # social class the respondent feels they belong to
    'health_problem',  # how much health problems affect daily life
    'family_m',        # number of family members
    'house',           # number of properties owned
    'depression',      # degree of stress / depression
    'learn',           # whether the respondent spends time learning
    'relax',           # leisure and relaxation
    'edu',             # education level
]

In [None]:
# Model inputs for the XGBoost models: y_train is the label (happiness),
# X_train the training features and X_test the test-set features.
y_train = Train_data['happiness']
X_train, X_test = Train_data[features], Test_data[features]

In [None]:
# Baseline: XGBoost regressor with default hyper-parameters, fitted on the selected features
from sklearn.metrics import mean_squared_error
from xgboost.sklearn import XGBRegressor
model = XGBRegressor()
model.fit(X_train, y_train)

In [None]:
# Predict on the same rows the baseline model was fitted on (in-sample predictions)
train_predict = model.predict(X_train)

In [None]:
# Mean squared error on the training data itself -- an in-sample figure, not a held-out estimate
train_mse = mean_squared_error(y_train, train_predict)
train_mse

简单建模后效果一般（注意：上面的 MSE 是在训练集上计算的，不能反映泛化能力），因此下面对模型进行调参，并用交叉验证的方式评估。

In [None]:
# Hand-tuned XGBoost regressor.
# Deprecated parameter spellings were replaced by their current equivalents:
#   objective 'reg:linear' -> 'reg:squarederror' (same squared-error loss, renamed)
#   silent=True            -> verbosity=0
#   nthread=8              -> n_jobs=8
# NOTE(review): the original also passed eta=0.005. `eta` is an alias of
# `learning_rate`, so the two settings contradicted each other; only
# learning_rate=0.069 is kept -- confirm this is the intended step size.
model_2 = XGBRegressor(
    base_score=0.5,
    booster='gbtree',
    n_estimators=400,
    max_depth=5,
    learning_rate=0.069,
    gamma=0.11,
    max_delta_step=0,
    colsample_bytree=0.8,
    colsample_bylevel=0.1,
    objective='reg:squarederror',
    eval_metric='rmse',
    verbosity=0,
    n_jobs=8,
)

In [None]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation of model_2. The scorer is the *negative* MSE,
# so the sign has to be flipped before the values are read as errors.
scores = cross_val_score(model_2, X_train, y_train,
                         scoring="neg_mean_squared_error", cv=5)

In [None]:
# Report the MSE of every fold; the scores are negative MSE, hence the sign flip.
for fold_no, neg_mse in enumerate(scores, start=1):
    print('第{0}次mse为{1}'.format(fold_no, -neg_mse))

In [None]:
# Fit the tuned model on the full training set and predict the test set
model_2.fit(X_train, y_train)
X_predict = model_2.predict(X_test)
# Export the predictions: the test file is read again (presumably to start from the
# unmodified rows -- confirm), the predictions are attached, and only id + happiness are written
test = pd.read_csv('happiness_test_abbr.csv')
test['happiness'] = X_predict
submit = test[['id','happiness']]
submit.to_csv("submit20230625.csv",index=False)