In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import scale
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.datasets import make_regression

In [3]:
df=pd.read_csv('E:/UMICH/Courses/stats 507/employment-data-project/dataset_for_ml_models.zip')
df.head()

Unnamed: 0,YYYY,YYYYMM,INCOME,HOMEOWN,AGE,INFL_HOMEAMT,INFL_INVAMT,REGION_Northeast,REGION_South,REGION_West,...,EHSGRD_NA,EHSGRD_No,EHSGRD_Yes,ECLGRD_NA,ECLGRD_No,ECLGRD_Yes,POLAFF_Democrat,"POLAFF_Independent, No Preference",POLAFF_NA,POLAFF_Republican
0,2006,200609,52000,1,52,386127.458739,166892.013889,0,0,1,...,0,0,1,0,0,1,1,0,0,0
1,2006,200609,150000,1,70,386127.458739,96283.854167,1,0,0,...,0,0,1,0,0,1,0,1,0,0
2,2006,200609,38000,1,60,386127.458739,437017.193297,0,1,0,...,0,0,1,0,1,0,0,1,0,0
3,2006,200609,175000,1,53,386127.458739,962838.541667,0,0,1,...,0,0,1,0,0,1,0,0,0,1
4,2006,200609,24000,2,59,386127.458739,6420.207396,0,1,0,...,0,0,1,0,0,1,0,0,0,1


In [4]:
data=df
data['AGE']=scale(data['AGE'])
data['INFL_HOMEAMT']=scale(data['INFL_HOMEAMT'])
data['INFL_INVAMT']=scale(data['INFL_INVAMT'])
data['INCOME']=scale(data['INCOME'])
data['YYYY'] = LabelEncoder().fit_transform(data['YYYY'])
data.head()

Unnamed: 0,YYYY,YYYYMM,INCOME,HOMEOWN,AGE,INFL_HOMEAMT,INFL_INVAMT,REGION_Northeast,REGION_South,REGION_West,...,EHSGRD_NA,EHSGRD_No,EHSGRD_Yes,ECLGRD_NA,ECLGRD_No,ECLGRD_Yes,POLAFF_Democrat,"POLAFF_Independent, No Preference",POLAFF_NA,POLAFF_Republican
0,0,200609,-0.56235,1,0.007788,-3.488944e-15,-0.360122,0,0,1,...,0,0,1,0,0,1,1,0,0,0
1,0,200609,0.514108,1,1.07795,-3.488944e-15,-0.454185,1,0,0,...,0,0,1,0,0,1,0,1,0,0
2,0,200609,-0.71613,1,0.483415,-3.488944e-15,-0.000262,0,1,0,...,0,0,1,0,1,0,0,1,0,0
3,0,200609,0.788715,1,0.067241,-3.488944e-15,0.700235,0,0,1,...,0,0,1,0,0,1,0,0,0,1
4,0,200609,-0.86991,2,0.423962,-3.488944e-15,-0.573901,0,1,0,...,0,0,1,0,0,1,0,0,0,1


In [74]:
X,y=data.drop(['INCOME','YYYYMM'],axis=1),data["INCOME"]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3,random_state=1
)


In [9]:



# define griding space
param_grid = {'n_estimators': [50, 100, 200], 
              'learning_rate': [0.1, 0.01, 0.001],
              'max_depth': [3, 5, 7]}

# XGBoost regressor
gb = GradientBoostingRegressor(random_state=42)

# use GridSearchCV to tune the parameters
grid_search = GridSearchCV(gb, param_grid, cv=5)
grid_search.fit(X_train, y_train)

print("Best parameters:", grid_search.best_params_)
print("Best cross-validation score:", grid_search.best_score_)
print("Test set score:", grid_search.score(X_test, y_test))


Best parameters: {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 100}
Best cross-validation score: 0.5362809217099855
Test set score: 0.5389610296387719


Best parameters: {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 100}
Best cross-validation score: 0.5362809217099855
Test set score: 0.5389610296387719

In [15]:
reg = GradientBoostingRegressor(**grid_search.best_params_)

In [16]:
model_xgboost=reg.fit(X_train, y_train)

In [17]:
mse = mean_squared_error(y_test, reg.predict(X_test))
rmse = mean_squared_error(y_test, reg.predict(X_test),squared=False)
print("The mean squared error (MSE) on test set: {:.4f}".format(mse))
print("The RMSE on test set: {:.4f}".format(rmse))
print("The R_square on train set: {:.4f}".format(model_xgboost.score(X_train, y_train)))
print("The R_square on test set: {:.4f}".format(model_xgboost.score(X_test, y_test)))

The mean squared error (MSE) on test set: 0.4401
The R_square on train set: 0.5936
The R_square on test set: 0.5388


In [46]:
new_data = dict(X_test.iloc[0,:])
from sklearn.preprocessing import StandardScaler

# 使用原始数据计算均值和标准差
scaler = StandardScaler()
scaler.fit(X_train[['AGE', 'INFL_HOMEAMT', 'INFL_INVAMT','INCOME']])

# 将数据转换为 Pandas DataFrame
new_df = pd.DataFrame([new_data])

# 对新数据进行标准化，使用相同的均值和标准差
new_data_scaled = scaler.transform(new_df[['AGE', 'INFL_HOMEAMT', 'INFL_INVAMT']])

# 将标准化后的新数据赋值回原数据的相应列
new_df[['AGE', 'INFL_HOMEAMT', 'INFL_INVAMT']] = new_data_scaled
new_df['YYYY'] = LabelEncoder().fit_transform(new_df['YYYY'])


In [56]:
# 创建 StandardScaler 对象，用于对 INCOME 变量进行标准化
scaler_y = StandardScaler()

# 对 INCOME 变量进行标准化，并将结果保存在一个新的变量 y_scaled 中
y_scaled=scaler_y.fit_transform(np.array(y_train).reshape(-1, 1))

In [57]:
y_scaled

array([[ 1.58789417],
       [ 0.28233838],
       [-0.31604137],
       ...,
       [ 0.17354206],
       [-0.04405057],
       [-0.92965259]])

In [51]:
# 使用训练好的模型对标准化后的新数据进行预测
income_pred = reg.predict(new_df)

# 将预测结果反向标准化
income_pred_unscaled = scaler_y.inverse_transform(income_pred.reshape(-1, 1))

In [52]:
income_pred_unscaled

array([[0.1405234]])

In [58]:
import pickle
filehandler=open("model_xgboost.pkl",'wb')
pickle.dump(model_xgboost,filehandler)


In [60]:
# filehandler2=open("model_xgboost.pkl",'rb')
# model_xgboost2=pickle.load(filehandler2)

EOFError: Ran out of input