In [None]:
# Python libraries
# Classic,data manipulation and linear algebra
import pandas as pd
import numpy as np

# Plots
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
# import plotly.offline as py
# import plotly.graph_objs as go
# from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
# import plotly.tools as tls
# import plotly.figure_factory as ff
# py.init_notebook_mode(connected=True)

# Data processing, metrics and modeling
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import precision_score, recall_score, confusion_matrix,  roc_curve, precision_recall_curve, accuracy_score, roc_auc_score
import lightgbm as lgbm
from lightgbm.sklearn import LGBMRanker

# Stats
import scipy.stats as ss
from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uniform

# Time
from contextlib import contextmanager
@contextmanager
def timer(title):
    """Context manager that reports how long the wrapped block took.

    Args:
        title: Label printed in front of the elapsed time.

    Prints "<title> - done in <N>s" (rounded to whole seconds) after the
    block finishes normally. Nothing is printed if the block raises,
    because the exception propagates out of the ``yield``.
    """
    # Imported locally: the notebook's import cell never imports `time`,
    # so the original body failed with NameError on first use.
    import time

    # perf_counter() is monotonic, so the measured duration cannot become
    # negative if the wall clock is adjusted while the block runs.
    started = time.perf_counter()
    yield
    print("{} - done in {:.0f}s".format(title, time.perf_counter() - started))

# Silence every warning for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation warnings raised by
# the libraries used below — consider narrowing it to specific categories.
import warnings
warnings.filterwarnings('ignore') 

# Seed NumPy's global random generator (numpy is already imported as `np` earlier in this cell).
import numpy as np; np.random.seed(0)
# Apply seaborn's default plot theme (seaborn is already imported as `sns` earlier in this cell).
import seaborn as sns; sns.set()

# Read Data

In [None]:
# Reading dataset
# train = pd.read_csv("../input/2nd-assignment-dmt2022/training_set_VU_DM.csv")
# test = pd.read_csv("../input/2nd-assignment-dmt2022/test_set_VU_DM.csv")

# fast read: load the train / test frames from pickle files
# (presumably pre-converted from the CSVs above — TODO confirm).
# NOTE(review): unpickling can execute arbitrary code; only load .pkl files from a trusted source.
train_df = pd.read_pickle("../input/project/train.pkl")
test_df = pd.read_pickle("../input/project/test.pkl")
# Show the (rows, columns) shape of both frames as a quick sanity check.
display(f"train_df.shape: {train_df.shape}", f"test_df.shape: {test_df.shape}")

# Datasets statistics

In [None]:
# Preview the training frame (the notebook renders a truncated view of large frames).
train_df

In [None]:
# Summary statistics (count, mean, std, quantiles, ...) of the training columns.
train_df.describe()

In [None]:
# Summary statistics of the test columns, for comparison with the training frame.
test_df.describe()

以上图表显示train data 有53列，其中click_bool，booking_bool为标签; test 有49列

In [None]:
# Columns that exist in the training frame but are absent from the test frame.
train_only_columns = set(train_df.columns) - set(test_df.columns)
train_test_cols_diff = list(train_only_columns)
train_test_cols_diff

以上代码表明：train data 比test data 多4列，都是target或者target衍生列

In [None]:
def get_target(row):
    """Map one search-result row to its relevance label.

    Args:
        row: Object exposing ``booking_bool`` and ``click_bool`` attributes
            (e.g. a DataFrame row passed by ``apply(axis=1)``).

    Returns:
        5.0 if the row was booked, 1.0 if it was clicked but not booked,
        0.0 if it was neither clicked nor booked.
    """
    # Checked in priority order; the first positive flag decides the label,
    # so a booking wins over a click.
    graded_flags = (("booking_bool", 5.0), ("click_bool", 1.0))
    for flag_name, relevance in graded_flags:
        if getattr(row, flag_name) > 0:
            return relevance
    return 0.0

def featurize_df(df):
    """Add calendar/time features and a per-row missing-value count.

    The frame is modified in place and also returned.

    Args:
        df: DataFrame with a datetime64 ``date_time`` column.

    Returns:
        The same DataFrame with these columns added:
        ``weekday``, ``week_of_year`` (ISO week), ``hour``, ``minute``,
        ``time_epoch`` (whole seconds since 1970-01-01, as float),
        ``early_night`` (hour > 19 or hour < 3) and ``nans_count``.

    Note:
        ``nans_count`` counts NaNs over *every* column present when the
        function is called, so call it on frames that hold the same set of
        source columns if the feature must be comparable between them.
    """
    timestamps = df["date_time"]

    df["weekday"] = timestamps.dt.weekday
    # `Series.dt.week` is deprecated and was removed in pandas 2.0;
    # isocalendar().week is the supported replacement. It yields a nullable
    # UInt32 column, so cast back to a plain NumPy dtype (float only when
    # missing timestamps force NaN) to keep downstream consumers happy.
    iso_week = timestamps.dt.isocalendar().week
    df["week_of_year"] = iso_week.astype("float64" if iso_week.isna().any() else "int64")

    df["hour"] = timestamps.dt.hour
    df["minute"] = timestamps.dt.minute
    ## total time elapsed - allows model to learn continuous trend over time to a degree
    # Computed via timedelta arithmetic instead of astype('int64') // 1e9 so
    # the result does not depend on the column's datetime resolution and
    # missing timestamps become NaN rather than a huge negative number.
    elapsed = timestamps - pd.Timestamp("1970-01-01")
    df["time_epoch"] = (elapsed // pd.Timedelta(seconds=1)).astype("float64")
    ## if we were looking at fraud: df["seconds"] = df.timestamp.dt.second
    df["early_night"] = ((df["hour"]>19) | (df["hour"]<3)) # no added value from feature

    # Number of missing values per row, over all columns currently in the frame.
    df["nans_count"] = df.isna().sum(axis=1)

    ## we won't make any time series features for now
    ## We could add time series features per property/hotel. We'd need to check for unaries, and to add a shift/offset dependent on forecast horizon

    return df

# Parse the timestamp column so the .dt accessors in featurize_df work
train_df["date_time"] = pd.to_datetime(train_df["date_time"],infer_datetime_format=True)
test_df["date_time"] = pd.to_datetime(test_df["date_time"],infer_datetime_format=True)

# Build the relevance label: 5 = booked, 1 = clicked but not booked, 0 = neither.
# Vectorised equivalent of train_df.apply(get_target, axis=1); the row-wise
# apply is very slow on large frames.
train_df["target"] = np.select(
    [train_df["booking_bool"] > 0, train_df["click_bool"] > 0],
    [5.0, 1.0],
    default=0.0,
)

# Drop the train-only (target / target-derived) columns BEFORE featurizing.
# featurize_df counts NaNs across every column it sees, so leaving these
# columns in would make `nans_count` depend on columns the test frame does
# not have (train/test skew and potential label leakage).
train_df = train_df.drop(train_test_cols_diff, axis=1, errors="ignore")

# Generate the time-related features on identically shaped inputs
# (the extra `target` column in train never contains NaN).
train_df = featurize_df(train_df)
test_df = featurize_df(test_df)

In [None]:
# 检查null值
train_null_rate = (train_df.isnull().sum()/len(train_df)*100)
# train_null_rate.sort_values(ascending=False)

以上代码表明：很多列的 null 占比都非常大

# Dataset pre-processing

In [None]:
# (['comp3_rate',
#        'comp3_inv', 'comp3_rate_percent_diff', 'comp4_inv', 'comp5_rate',
#        'comp5_inv', 'comp5_rate_percent_diff', 'comp8_rate', 'comp8_inv',
#        'comp8_rate_percent_diff'],axis=1)

In [None]:
print("before len used_columns:", len(train_null_rate))

# Names that must never be model inputs: the label, the raw timestamp and a
# hand-picked list of competitor (comp*) columns.
excluded_columns = {
    "target", "date_time",
    'comp3_rate', 'comp3_inv', 'comp3_rate_percent_diff', 'comp4_inv',
    'comp5_rate', 'comp5_inv', 'comp5_rate_percent_diff', 'comp8_rate',
    'comp8_inv', 'comp8_rate_percent_diff',
}

# Keep only columns whose null rate is below 90 %, minus the excluded names.
low_null_columns = train_null_rate[train_null_rate < 90].index
used_columns = [name for name in low_null_columns if name not in excluded_columns]

target_col = ["target"]  # label column
categorical_cols = ['prop_id', "srch_destination_id"]  # features treated as categorical
print("after len used_columns:", len(used_columns))

In [None]:
# Restrict both frames to the selected features; only the training frame keeps the label.
train_df = train_df[[*used_columns, *target_col]]
test_df = test_df[[*used_columns]]
display(f"train_df.shape: {train_df.shape}", f"test_df.shape: {test_df.shape}")

# independent variables analysis

In [None]:
# Pairwise correlation matrix over all columns of the training frame, drawn as a heatmap.
train_df_corr = train_df.corr() # correlation coefficients between the features and the target
sns.heatmap(train_df_corr)

# Kfold

In [None]:
# split feature and target
train_X = train_df.drop('target', 1) 
train_y = train_df['target']
display(f"train_X.shape: {train_X.shape}", f"train_y.shape: {train_y.shape}")
display("target value_counts:", train_y.value_counts()) # 查看target的value分布

In [None]:
from sklearn.model_selection import GroupKFold

# Split by search id so that all rows of one srch_id land on the same side
# of the train/validation boundary.
train_id = train_X["srch_id"]
group_kfold = GroupKFold(n_splits=5)

# Only the first of the five folds is used as the hold-out set.
train_index, valid_index = next(group_kfold.split(train_X, train_y, train_id))
X_train, y_train = train_X.iloc[train_index], train_y.iloc[train_index]
X_valid, y_valid = train_X.iloc[valid_index], train_y.iloc[valid_index]

display(f"X_train.shape: {X_train.shape}", f"y_train.shape: {y_train.shape}")
display(f"X_valid.shape: {X_valid.shape}", f"y_valid.shape: {y_valid.shape}")
# display("y_train value_counts:", y_train.value_counts())
# display("y_valid value_counts:", y_valid.value_counts())

In [None]:
def _query_group_sizes(frame, id_col="srch_id"):
    """Return the number of rows per query, in the order the queries appear.

    LightGBM's ranker consumes ``group`` positionally: the first
    ``group[0]`` rows form the first query, the next ``group[1]`` rows the
    second, and so on. The sizes therefore have to follow the row order of
    ``frame`` (a key-sorted groupby only matches when the rows happen to be
    sorted by id), and every query's rows have to be contiguous.

    Args:
        frame: DataFrame holding one row per (query, item) pair.
        id_col: Name of the query-id column.

    Returns:
        1-D NumPy array of group sizes in row order.

    Raises:
        ValueError: if the rows of some query id are not contiguous.
    """
    ids = frame[id_col]
    # A new "run" starts wherever the id differs from the previous row; with
    # contiguous queries the number of runs equals the number of distinct ids.
    run_count = int((ids != ids.shift()).sum())
    if run_count != ids.nunique():
        raise ValueError(
            f"rows of the same {id_col} are not contiguous; "
            f"sort the frame (and its labels) by {id_col} before building groups"
        )
    # sort=False keeps first-appearance order, i.e. the row order.
    return ids.groupby(ids, sort=False).size().to_numpy()


train_id_group = _query_group_sizes(X_train)
valid_id_group = _query_group_sizes(X_valid)
# test_id_group = test_df.groupby(["srch_id"])["srch_id"].count().values

# Modeling

本次任务属于推荐系统领域：预测用户在酒店搜索结果中最可能点击的酒店，更准确地说，是对用户可能点击/预订的酒店进行排序，所以使用 LGBMRanker。
LGBMRanker 模型为每条（搜索，酒店）样本输出一个相关性得分；在同一次搜索内按得分从高到低排序，即可得到酒店的相对顺序（排序后的列表）。当然，该排序算法也应用于在线广告、协同过滤、多媒体检索等领域。

In [None]:
# import optuna
# def objective(trial):
#     #     param = {
#     #         ""
#     #     }
    
#     lgbm_ranker = LGBMRanker(
#         objective="lambdarank",
#         metric="ndcg",
#         boosting_type="dart",
#         n_estimators=trial.suggest_int("n_estimators",50,100),
#         importance_type="gain",
#         verbose=50,
        
#         reg_alpha=trial.suggest_float("reg_alpha",1e-8,10.0,log=True),
#         reg_lambda=trial.suggest_float("reg_lambda",1e-8,10.0,log=True),
#         num_leaves=trial.suggest_int("num_leaves",2,128),
#         min_child_weight=trial.suggest_float("min_child_weight",0.001,0.002),
#         min_child_samples=trial.suggest_int("min_child_samples",15,25),
#         max_depth=trial.suggest_int("max_depth",6,10),
#         # subsample_freq=trial.suggest_int("subsample_freq",2,10),
#     )

#     # Fit model
#     lgbm_ranker.fit(X_train,
#                     y_train,
#                     group=train_id_group,
#                     eval_set = [(X_valid, y_valid)],
#                     eval_group = [list(valid_id_group)],
#                     eval_at=[5],
#                     categorical_feature = categorical_cols
#                    )
    
#     best_score = lgbm_ranker.best_score_["valid_0"]["ndcg@5"]
#     return best_score


# study = optuna.create_study(direction="maximize")
# study.optimize(objective, n_trials=100)
# print(study.best_params)

In [None]:
from lightgbm.sklearn import LGBMRanker

# Estimator settings: LambdaRank objective evaluated with NDCG, DART boosting, 80 trees.
ranker_params = dict(
    objective="lambdarank",
    metric="ndcg",
    boosting_type="dart",
    n_estimators=80,
    importance_type="gain",
    verbose=10,
    # Candidate hyper-parameters, currently disabled:
    # reg_alpha=0.000124684,
    # reg_lambda=1.4166555,
    # num_leaves=77,
    # min_child_weight=0.00134829,
    # min_child_samples=23,
    # max_depth=6,
)
lgbm_ranker = LGBMRanker(**ranker_params)

# Training setup: per-query group sizes for train and validation, NDCG@5 on
# the hold-out fold, and the id-like columns handled as categorical features.
fit_options = dict(
    group=train_id_group,
    eval_set=[(X_valid, y_valid)],
    eval_group=[list(valid_id_group)],
    eval_at=[5],
    categorical_feature=categorical_cols,
)
lgbm_ranker.fit(X_train, y_train, **fit_options)

参数上，除 boosting_type="dart" 和 n_estimators=80 外，其余均使用模型的默认参数；模型在验证集上获得 NDCG@5 约 0.37 的成绩

In [None]:
# Score the validation rows and show the scores next to the features,
# leaving X_valid itself untouched.
X_valid_copy = X_valid.assign(predictions=lgbm_ranker.predict(X_valid))
X_valid_copy

In [None]:
# Score only the training feature columns. Passing the whole frame breaks on
# re-run, because the "predictions" column added below would be fed back to
# the model as an extra feature.
test_df["predictions"] = lgbm_ranker.predict(test_df[used_columns])
# Within each search, order properties from highest to lowest predicted score.
test_df_result = test_df.sort_values(['srch_id', 'predictions'], ascending=[True,False])[["srch_id","prop_id","predictions"]]
# The submission file keeps only the ranked (srch_id, prop_id) pairs.
test_df_result[["srch_id","prop_id"]].to_csv("submission.csv",index=False)
test_df_result

# Archive

In [None]:
# valid_pred_list = X_valid.sort_values(['srch_id', 'predictions'], ascending=False).groupby('srch_id')['prop_id'].apply(list).values.tolist()

In [None]:
# lgbm_rg = lgbm.LGBMRegressor(n_estimators = 1000,
#                               random_state=42,
#                               # silent=True,
#                               n_jobs=4)
# lgbm_rg.fit(X_train, y_train)
# lgbm_rg.score(X_train, y_train)

In [None]:
# plt.hist(pred_valid ,bins=20, color='red',histtype='stepfilled',alpha=0.75)

In [None]:
# np.percentile(pred_valid, 90)