In [1]:
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.preprocessing import StandardScaler


In [2]:
import feather
df = feather.read_dataframe('feature/train_feature_2day.feather')

In [3]:
# 分离特征和标签
X = df.drop(['user_id', 'item_id', 'pred_date','label'], axis=1).values
y = df['label'].values

# 特征选择（使用 SelectKBest 和 f_classif）
selector = SelectKBest(score_func = f_classif, k = 10)
X = selector.fit_transform(X, y)

# 数据预处理（标准化）
scaler = StandardScaler()
X = scaler.fit_transform(X)

# 划分训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [4]:
# 初始化 XGBoost 分类器并设置 GPU 相关参数
model = xgb.XGBClassifier(
    # tree_method = 'gpu_hist',
    # predictor = 'gpu_predictor',
    max_depth = 3,
    learning_rate = 0.1,
    n_estimators = 100,
    objective = 'binary:logistic',
    random_state = 42,
    use_label_encoder = False
)

In [5]:
# 训练模型
model.fit(X_train, y_train)



XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.1, max_delta_step=0,
              max_depth=3, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=12,
              num_parallel_tree=1, predictor='auto', random_state=42,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='approx', use_label_encoder=False,
              validate_parameters=1, verbosity=None)

In [6]:
# 进行预测
y_pred = model.predict(X_test)

# 评估模型
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')

Accuracy: 0.9990517116207677


In [7]:
from datetime import datetime
# 保存模型
model.save_model('model/XGBoost/model_{:}'.format(datetime.now().strftime('%m%d%H%M')))
# 保存特征选择器
import pickle
with open('model/XGBoost/selector_{:}.pkl'.format(datetime.now().strftime('%m%d%H%M')), 'wb') as f:
    pickle.dump(selector, f)

In [16]:
import feather
\
# 读取预测数据
df = feather.read_dataframe('feature/pred_feature_100_200.feather')
df_id = df[['user_id', 'item_id']]
df_X = df.drop(columns=['user_id', 'item_id'])
if 'index_x' in df.columns:
    df_X = df_X.drop(columns=['index_x', 'index_y'])
df_X = df_X.values

In [17]:
# 对预测数据进行特征选择和数据预处理
df_X = selector.transform(df_X)
df_X = scaler.transform(df_X)

# 进行预测
df_y = model.predict(df_X)


In [18]:

# 将正样本对应的df_id保存到文件
df_id['label'] = df_y
df_id = df_id[df_id['label'] == 1]
df_id = df_id.drop(columns=['label'])
df_id.to_csv('result/XGBoost/result_{:}.txt'.format(datetime.now().strftime('%m%d%H%M')), index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [14]:
df_id.to_csv('result/XGBoost/result_{:}.txt'.format(datetime.now().strftime('%m%d%H%M')), index=False)