In [2]:
import pandas as pd
import cudf
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.preprocessing import StandardScaler
import numpy as np

In [3]:
import feather

# Load the pre-computed training feature table from disk.
train_feature_file = 'feature/train_feature.feather'
df = feather.read_dataframe(train_feature_file)

In [4]:
# Remove samples with infinite feature values: +/-inf is first mapped to NaN,
# then every row that contains a NaN is dropped.
df = df.replace([np.inf, -np.inf], np.nan).dropna()

In [None]:
# df = cudf.from_pandas(df)
pred_dates = df['pred_date']
# Hold out the rows dated 2014-12-18 as the validation set.
df_check = df[pred_dates == pd.Timestamp('2014-12-18')]
# Keep the rows dated on or before 2014-12-17 as the training set.
df = df[pred_dates <= pd.Timestamp('2014-12-17')]

In [5]:
# 分离特征和标签
X = df.drop(['user_id', 'item_id', 'pred_date','label'], axis=1).values
y = df['label'].values
# X_check = df_check.drop(['user_id', 'item_id', 'pred_date','label'], axis=1).values
# y_check = df_check['label'].values

In [5]:
# Feature selection (SelectKBest with f_classif) -- currently disabled
# selector = SelectKBest(score_func = f_classif, k = 10)
# X = selector.fit_transform(X, y)

# Preprocessing: fit the standardiser on X, then standardise X with it.
scaler = StandardScaler().fit(X)
X = scaler.transform(X)

In [7]:
# X has already been standardised by scaler.fit_transform(X) in the preprocessing cell.
# Calling scaler.transform(X) a second time here would double-scale the training data,
# whereas the prediction data is passed through scaler.transform only once, so the
# model would be trained and applied on differently scaled features.
X.shape

In [8]:

# Split into training and test sets (test_size=0.2, fixed random_state).
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [6]:
def f1_score(preds, dtrain):
    """Compute the binary F1 score at a 0.5 decision threshold.

    Args:
        preds: Predicted scores/probabilities (array-like); values above 0.5
            count as positive predictions.
        dtrain: Ground-truth labels, given either as an array-like or as an
            object exposing ``get_label()`` (for example ``xgboost.DMatrix``).

    Returns:
        The F1 score as a float; 0.0 when there are no true positives.
    """
    # A DMatrix-style object cannot be compared with ``>`` directly, so pull the
    # label array out of it first; plain arrays/lists are used as they are.
    raw_labels = dtrain.get_label() if hasattr(dtrain, 'get_label') else dtrain
    labels = np.asarray(raw_labels) > 0.5
    preds = np.asarray(preds) > 0.5  # threshold the scores into predicted classes
    tp = np.sum(preds & labels)
    fp = np.sum(preds & ~labels)
    fn = np.sum(~preds & labels)
    # The small epsilon keeps every division defined when a denominator is zero.
    precision = tp / (tp + fp + 1e-10)
    recall = tp / (tp + fn + 1e-10)
    f1 = 2 * precision * recall / (precision + recall + 1e-10)
    return f1

In [9]:
# Build the XGBoost classifier; tree_method="hist" with device="cuda" requests GPU training.
base_params = {
    "tree_method": "hist",
    "device": "cuda",
    "max_depth": 3,
    "learning_rate": 0.1,
    "n_estimators": 100,
    "objective": 'binary:logistic',
    "random_state": 42,
}
model = xgb.XGBClassifier(**base_params)

In [None]:
# Train the model, evaluating on the test split while fitting.
eval_pairs = [(X_test, y_test)]
model.fit(X_train, y_train, eval_set=eval_pairs)

In [None]:
from matplotlib import pyplot as plt
from xgboost import plot_importance

# Draw the fitted model's feature-importance chart and render it.
importance_ax = plot_importance(model)
plt.show()

In [22]:
# Number of top features to keep; the cut-off is the importance of the top_k-th best feature.
top_k = 5
# Importances sorted in ascending order, so the largest values sit at the end.
thresholds = np.sort(model.feature_importances_)
# Clamp to the number of available features so a model with fewer than top_k
# features falls back to the smallest importance instead of raising IndexError.
threshold = thresholds[-min(top_k, len(thresholds))]

In [None]:
# Display the importance cut-off chosen in the previous cell.
threshold

In [None]:
from sklearn.feature_selection import SelectFromModel

# Wrap the already-fitted model (prefit=True) in a selector driven by the importance threshold.
selection = SelectFromModel(estimator=model, threshold=threshold, prefit=True)
select_X_train = selection.transform(X_train)
# Show how many feature columns remain after selection.
n_selected = select_X_train.shape[1]
n_selected

In [None]:
# Train a second classifier, with the same hyper-parameters, on the selected features only.
sel_params = {
    "tree_method": "hist",
    "device": "cuda",
    "max_depth": 3,
    "learning_rate": 0.1,
    "n_estimators": 100,
    "objective": 'binary:logistic',
    "random_state": 42,
}
model_sel = xgb.XGBClassifier(**sel_params)
model_sel.fit(select_X_train, y_train)

In [None]:
from datetime import datetime
import pickle

# Take the timestamp once so the three artefacts of one run share the same suffix;
# separate datetime.now() calls could straddle a minute boundary and produce
# file names that no longer match each other.
run_stamp = datetime.now().strftime('%m%d%H%M')

# Save the model trained on the selected features.
model_sel.save_model('model/XGBoost/model_sel_{:}.model'.format(run_stamp))
# Save the feature selector; the with-block guarantees the file handle is closed.
with open('model/XGBoost/selection_{:}.pkl'.format(run_stamp), 'wb') as selection_file:
    pickle.dump(selection, selection_file)
# Save the fitted scaler.
with open('model/XGBoost/scaler_{:}.pkl'.format(run_stamp), 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)

In [None]:
import pickle

# Restore a previously saved model and its scaler from disk.
model = xgb.XGBClassifier()
model.load_model('model/XGBoost/model_rank10_01171856')
# NOTE: pickle.load can execute arbitrary code contained in the file -- only load
# artefacts produced by a trusted run. The with-block closes the file handle.
with open('model/XGBoost/scaler_01171856.pkl', 'rb') as scaler_file:
    scaler = pickle.load(scaler_file)

In [36]:
# Load the item table and discard its geohash and category columns.
item_data = feather.read_dataframe('dataset/item_data.feather').drop(
    columns=['item_geohash', 'item_category']
)

In [None]:
from datetime import datetime

# Save the model under a month-day-hour-minute timestamped file name.
model_stamp = datetime.now().strftime('%m%d%H%M')
model.save_model('model/XGBoost/model_{:}'.format(model_stamp))
# # Save the feature selector
# import pickle
# with open('model/XGBoost/selector_{:}.pkl'.format(datetime.now().strftime('%m%d%H%M')), 'wb') as f:
#     pickle.dump(selector, f)

In [29]:
import feather

# Load the prediction feature table.
df = feather.read_dataframe('feature/pred_feature.feather')
df_X = df.drop(columns=['user_id', 'item_id'])
if 'index_x' in df.columns:
    df_X = df_X.drop(columns=['index_x', 'index_y'])
df_X = df_X.values
# Drop rows that contain an infinite feature value. The SAME mask is applied to the
# id frame so df_id stays row-aligned with df_X (and with predictions made from it);
# filtering only df_X would leave df_id longer than the prediction array.
finite_rows = ~np.isinf(df_X).any(axis=1)
df_X = df_X[finite_rows]
# .copy() makes df_id an independent frame rather than a slice of df, so columns
# can be added to it later without SettingWithCopyWarning.
df_id = df.loc[finite_rows, ['user_id', 'item_id']].copy()

In [30]:
# Make the model trained on the selected features the active `model`.
# NOTE(review): model_sel was fitted on select_X_train, so it presumably expects inputs
# that have already gone through selection.transform -- confirm before calling
# model.predict_proba on unselected features.
model = model_sel

In [None]:
# Apply feature selection (disabled) and preprocessing to the prediction data.
# df_X = selector.transform(df_X)
# NOTE(review): df_X is overwritten with its scaled version, so re-running this cell
# would scale the data a second time -- rebuild df_X before re-running.
df_X = scaler.transform(df_X)

# Predict class probabilities with the currently active model.
df_y = model.predict_proba(df_X)

In [None]:
# 特征选择
df_X = selection.transform(df_X)

# 进行预测
df_y = model_sel.predict_proba(df_X)

In [None]:
# Attach the second predict_proba column (the probability of the class at index 1)
# to the id frame. assign() returns a new frame, so this never writes into a slice
# of another DataFrame, which would raise SettingWithCopyWarning.
df_id = df_id.assign(prob=df_y[:, 1])

In [None]:
# Order the pairs from highest to lowest predicted probability. Rebinding the name
# instead of sorting in place avoids mutating a frame that may be a slice of df.
df_id = df_id.sort_values(by='prob', ascending=False)

In [None]:
# Inner-join on item_id: only the pairs whose item_id also appears in item_data are kept.
# NOTE(review): if item_data holds several rows per item_id, the join will repeat
# the matching pairs -- confirm item_id is unique in item_data.
result = df_id.merge(item_data, on='item_id', how='inner')

In [38]:
from datetime import datetime

# Write the first 300000 rows (user_id, item_id only) as a tab-separated file
# without index or header.
top_pairs = result.head(300000)[['user_id', 'item_id']]
top_pairs_path = 'result/XGBoost/result_300000_{:}.txt'.format(datetime.now().strftime('%m%d%H%M'))
top_pairs.to_csv(top_pairs_path, index=False, header=False, sep='\t')

In [None]:

# Save the rows of df_id that are predicted positive to a file.
if 'prob' in df_id.columns:
    # Preferred source: the probability stored on each row stays aligned with its
    # ids even after df_id has been sorted, which a positional df_y would not.
    positive_mask = (df_id['prob'] > 0.5).to_numpy()
else:
    # df_y may hold hard 0/1 labels (1-D) or predict_proba output (2-D). A 2-D array
    # cannot be assigned to a single column, so take its second column in that case.
    scores = np.asarray(df_y)
    if scores.ndim == 2:
        scores = scores[:, 1]
    # For hard 0/1 labels this is the same selection as `== 1`.
    positive_mask = scores > 0.5
df_id = df_id[positive_mask]
df_id.to_csv('result/XGBoost/result_{:}.txt'.format(datetime.now().strftime('%m%d%H%M')), index=False, header=False, sep='\t')

In [13]:
# Write df_id as a tab-separated file without index or header, under a fresh timestamp.
result_path = 'result/XGBoost/result_{:}.txt'.format(datetime.now().strftime('%m%d%H%M'))
df_id.to_csv(result_path, index=False, header=False, sep='\t')

In [1]:
import pandas as pd

# Re-read an earlier 300000-row result file and keep only its first 30000 rows.
result = pd.read_csv(
    'result/XGBoost/result_300000_01171928.txt', sep='\t', header=None
).head(30000)

In [2]:
from datetime import datetime

# Write the truncated result as a tab-separated file without index or header.
top_30000_path = 'result/XGBoost/result_30000_{:}.txt'.format(datetime.now().strftime('%m%d%H%M'))
result.to_csv(top_30000_path, index=False, header=False, sep='\t')