In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import f1_score, classification_report

%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

In [None]:
# !pip install category_encoders

In [None]:
from category_encoders import *

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# !pip install lightgbm

In [None]:
import lightgbm as lgb


In [None]:
# Model and performance evaluation
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
lbl = preprocessing.LabelEncoder()
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import precision_recall_fscore_support as score
# Hyperparameter tuning
from sklearn.model_selection import StratifiedKFold, cross_val_score, GridSearchCV, RandomizedSearchCV
from hyperopt import tpe, STATUS_OK, Trials, hp, fmin, STATUS_OK, space_eval

In [None]:
# Show every row and every column when a frame is displayed (no truncation).
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)

In [None]:
df = pd.read_csv("/content/drive/MyDrive/Crossroads/engg_data/crossroads_full_feature_set.csv")

In [None]:
df.isnull().sum()

acct_id                           0
acct_type_desc                91418
event_name                        0
event_date                        0
plan_event_name              279172
comp_name                     80644
section_name                      0
row_name                          0
SeatNum                           0
price_code                        0
PC1                               0
Price                             0
paid                         154145
add_datetime                  80644
class_name                        0
status                            0
Sales_Source                1280704
isHost                            0
SeatType                          0
TicketClass                       0
Start Year                  1146325
LastYear                    1146325
Term                        1146445
TicketType                        0
SeatUniqueID                      0
Season                            0
ClubExpYear                 1146325
Tenure                      

In [None]:
def pivot(train_data, group_cols, agg_col):
    """Count rows per (group, label) pair and summarise attendance per group.

    Parameters
    ----------
    train_data : pandas.DataFrame
        Source rows.
    group_cols : sequence of two column names
        ``group_cols[0]`` becomes the row key of the result and
        ``group_cols[1]`` is the label column whose distinct values
        (e.g. "Yes" / "No" / "???") become columns.
    agg_col : str
        Column whose non-null values are counted.

    Returns
    -------
    pandas.DataFrame
        One row per ``group_cols[0]`` value with one count column per label,
        plus ``count`` (= Yes + No) and ``%Attended`` (= 100 * Yes / count).
        ``%Attended`` is NaN for groups that have neither "Yes" nor "No" rows.
    """
    index_col, label_col = group_cols[0], group_cols[1]

    pair_counts = (
        train_data.groupby(group_cols)
        .agg({agg_col: "count"})
        .reset_index()
    )
    # The string name "sum" is equivalent to np.sum here and avoids the
    # deprecation warning newer pandas raises for numpy callables.
    data = (
        pd.pivot_table(pair_counts, values=agg_col, index=[index_col],
                       columns=[label_col], aggfunc="sum")
        .reset_index()
        .rename_axis(None, axis=1)
    )
    data = data.fillna(0)

    # A label that never occurs in the data produces no column at all; add it
    # as zeros so the arithmetic below cannot fail with a KeyError.
    for label in ("Yes", "No"):
        if label not in data.columns:
            data[label] = 0.0

    data["count"] = data["Yes"] + data["No"]
    data["%Attended"] = 100 * data["Yes"] / data["count"]
    return data

In [None]:
# Per-account label counts over the whole frame: one column per isAttended
# value ("Yes" / "No" / "???"), plus the derived "count" and "%Attended".
# NOTE(review): `df` is not filtered here, so labelled rows that are later held
# out for validation (events CLT22PIT / CLT22LAC) contribute their own labels
# to "%Attended". The heuristic scored on the validation split may therefore
# look better than it would on unseen games - confirm whether this is intended.
accnt_type_df = pivot(df, ["acct_id", "isAttended"], "UniqueID")
accnt_type_df.head()

Unnamed: 0,acct_id,???,No,Yes,count,%Attended
0,190,6467.0,68700.0,369.0,69069.0,0.534248
1,14999990,0.0,8.0,0.0,8.0,0.0
2,16999990,0.0,1.0,0.0,1.0,0.0
3,110002390,14.0,29.0,97.0,126.0,76.984127
4,110003490,20.0,25.0,135.0,160.0,84.375


In [None]:
# Account ids of the unlabelled ("???") rows, in frame order.
test_acct_ids = df.loc[df["isAttended"] == "???", "acct_id"].values

In [None]:
# Number of distinct events per account, counted over labelled rows only.
temp = df.loc[df["isAttended"] != "???"]
acct_count_df = (
    temp.groupby("acct_id")["event_name"]
        .nunique()
        .rename("event_name_count")
        .reset_index()
)
acct_count_df.head()

Unnamed: 0,acct_id,event_name_count
0,190,18
1,14999990,2
2,16999990,1
3,110002390,18
4,110003490,16


In [None]:
"count", "%Attended", "event_name_count"

('count', '%Attended', 'event_name_count')

In [None]:
# Attach the per-account aggregates to every ticket row; left joins keep all rows of `df`.
df = (
    df.merge(accnt_type_df[["acct_id", "count", "%Attended"]], how="left", on="acct_id")
      .merge(acct_count_df[["acct_id", "event_name_count"]], how="left", on="acct_id")
)

In [None]:
df.head()

Unnamed: 0,acct_id,acct_type_desc,event_name,event_date,plan_event_name,comp_name,section_name,row_name,SeatNum,price_code,PC1,Price,paid,add_datetime,class_name,status,Sales_Source,isHost,SeatType,TicketClass,Start Year,LastYear,Term,TicketType,SeatUniqueID,Season,ClubExpYear,Tenure,UniqueID,isAttended,Resold,ResalePrice,ResaleDate,isSTM,acct_type_group,plan_event_name_group,row_name_group,seat_nums_group,section_names_group,Zone,comp_names_group,price_group,PC1_group,class_name_grp,ticket_class_grp,TicketType_group,no_days_prebooked,gameday_minus_start_year,lastyear_minus_gameday,section_group,section_group_segment,section_name_segment,row_segment,seat_segment,Game_Type,Week,week_day,Game Time,Home Points,Away Points,unemployment_rate,covid_cases,Avg_Day_Temp,Precipitation,Avg_Dew_Point,weekend_flag,count,%Attended,event_name_count
0,117948790,Season,CLT21HOU,2021-10-17,21FS,Not Comp,101,10,1,FR,F,111.0,Y,2021-03-31 16:08:52,OPEN,SOLD,,0,GA,Manifest,,,,Renewal,101-10-1,2021,,25.0,CLT21HOU-101-10-1,Yes,,,,,G1,G1,G1,G1,G1,N,G1,price_group_3,PC1_group_2,class_name_2,ticket_class_1,group_0,199.4,,,1.0,Low,Low,Low,Low,Reg,6,Sunday,13:00,108,89,2.5,184.4,52.12,0.0,39.62,1,54.0,94.444444,18.0
1,126665090,Season,CLT21HOU,2021-10-17,21FS,Not Comp,101,10,10,FR,F,111.0,Y,2021-03-31 16:08:19,OPEN,SOLD,,0,GA,Manifest,,,,Renewal,101-10-10,2021,,20.0,CLT21HOU-101-10-10,No,,,,,G1,G1,G1,G1,G1,N,G1,price_group_3,PC1_group_2,class_name_2,ticket_class_1,group_0,199.4,,,1.0,Low,Low,Low,Low,Reg,6,Sunday,13:00,108,89,2.5,184.4,52.12,0.0,39.62,1,72.0,70.833333,18.0
2,126665090,Season,CLT21HOU,2021-10-17,21FS,Not Comp,101,10,11,FR,F,111.0,Y,2021-03-31 16:08:19,OPEN,SOLD,,0,GA,Manifest,,,,Renewal,101-10-11,2021,,20.0,CLT21HOU-101-10-11,No,,,,,G1,G1,G1,G1,G1,N,G1,price_group_3,PC1_group_2,class_name_2,ticket_class_1,group_0,199.4,,,1.0,Low,Low,Low,Low,Reg,6,Sunday,13:00,108,89,2.5,184.4,52.12,0.0,39.62,1,72.0,70.833333,18.0
3,1489599590,Season,CLT21HOU,2021-10-17,21FS,Not Comp,101,10,12,FR,F,111.0,Y,2021-03-31 16:10:20,OPEN,SOLD,,0,GA,Manifest,,,,Renewal,101-10-12,2021,,4.0,CLT21HOU-101-10-12,Yes,,,,,G1,G1,G1,G1,G1,N,G1,price_group_3,PC1_group_2,class_name_2,ticket_class_1,group_0,199.4,,,1.0,Low,Low,Low,Low,Reg,6,Sunday,13:00,108,89,2.5,184.4,52.12,0.0,39.62,1,18.0,83.333333,18.0
4,115101990,Season,CLT21HOU,2021-10-17,21FS,Not Comp,101,10,13,FR,F,111.0,Y,2021-03-31 16:08:21,OPEN,SOLD,,0,GA,Manifest,,,,Renewal,101-10-13,2021,,32.0,CLT21HOU-101-10-13,No,,,,,G1,G1,G1,G1,G1,N,G1,price_group_3,PC1_group_2,class_name_2,ticket_class_1,group_0,199.4,,,1.0,Low,Low,Low,Low,Reg,6,Sunday,13:00,108,89,2.5,184.4,52.12,0.0,39.62,1,144.0,72.222222,18.0


In [None]:
df.isnull().sum()["%Attended"]

13443

In [None]:
t0 = df[(df["isAttended"]=="???") & (df["%Attended"]<21) & (df["count"]>=10) & (df["event_name_count"]>2)]
t0.shape, t0.acct_id.nunique()

((7128, 69), 96)

In [None]:
t = df[df["isAttended"]=="???"]
t.shape, t.acct_id.nunique()

((128688, 69), 22102)

In [None]:
100.0*7128/128688

5.538977993286087

In [None]:
# Move the target to the last column: the later X/y split slices by position
# (`iloc[:, :-1]` / `iloc[:, -1]`), so "isAttended" has to come last.
att = df["isAttended"].values
# `columns=` replaces the positional axis argument (`drop(label, 1)`), which
# pandas >= 2.0 no longer accepts.
df = df.drop(columns="isAttended")
df["isAttended"] = att

In [None]:
df.head()

Unnamed: 0,acct_id,acct_type_desc,event_name,event_date,plan_event_name,comp_name,section_name,row_name,SeatNum,price_code,PC1,Price,paid,add_datetime,class_name,status,Sales_Source,isHost,SeatType,TicketClass,Start Year,LastYear,Term,TicketType,SeatUniqueID,Season,ClubExpYear,Tenure,UniqueID,Resold,ResalePrice,ResaleDate,isSTM,acct_type_group,plan_event_name_group,row_name_group,seat_nums_group,section_names_group,Zone,comp_names_group,price_group,PC1_group,class_name_grp,ticket_class_grp,TicketType_group,no_days_prebooked,gameday_minus_start_year,lastyear_minus_gameday,section_group,section_group_segment,section_name_segment,row_segment,seat_segment,Game_Type,Week,week_day,Game Time,Home Points,Away Points,unemployment_rate,covid_cases,Avg_Day_Temp,Precipitation,Avg_Dew_Point,weekend_flag,count,%Attended,event_name_count,isAttended
0,117948790,Season,CLT21HOU,2021-10-17,21FS,Not Comp,101,10,1,FR,F,111.0,Y,2021-03-31 16:08:52,OPEN,SOLD,,0,GA,Manifest,,,,Renewal,101-10-1,2021,,25.0,CLT21HOU-101-10-1,,,,,G1,G1,G1,G1,G1,N,G1,price_group_3,PC1_group_2,class_name_2,ticket_class_1,group_0,199.4,,,1.0,Low,Low,Low,Low,Reg,6,Sunday,13:00,108,89,2.5,184.4,52.12,0.0,39.62,1,54.0,94.444444,18.0,Yes
1,126665090,Season,CLT21HOU,2021-10-17,21FS,Not Comp,101,10,10,FR,F,111.0,Y,2021-03-31 16:08:19,OPEN,SOLD,,0,GA,Manifest,,,,Renewal,101-10-10,2021,,20.0,CLT21HOU-101-10-10,,,,,G1,G1,G1,G1,G1,N,G1,price_group_3,PC1_group_2,class_name_2,ticket_class_1,group_0,199.4,,,1.0,Low,Low,Low,Low,Reg,6,Sunday,13:00,108,89,2.5,184.4,52.12,0.0,39.62,1,72.0,70.833333,18.0,No
2,126665090,Season,CLT21HOU,2021-10-17,21FS,Not Comp,101,10,11,FR,F,111.0,Y,2021-03-31 16:08:19,OPEN,SOLD,,0,GA,Manifest,,,,Renewal,101-10-11,2021,,20.0,CLT21HOU-101-10-11,,,,,G1,G1,G1,G1,G1,N,G1,price_group_3,PC1_group_2,class_name_2,ticket_class_1,group_0,199.4,,,1.0,Low,Low,Low,Low,Reg,6,Sunday,13:00,108,89,2.5,184.4,52.12,0.0,39.62,1,72.0,70.833333,18.0,No
3,1489599590,Season,CLT21HOU,2021-10-17,21FS,Not Comp,101,10,12,FR,F,111.0,Y,2021-03-31 16:10:20,OPEN,SOLD,,0,GA,Manifest,,,,Renewal,101-10-12,2021,,4.0,CLT21HOU-101-10-12,,,,,G1,G1,G1,G1,G1,N,G1,price_group_3,PC1_group_2,class_name_2,ticket_class_1,group_0,199.4,,,1.0,Low,Low,Low,Low,Reg,6,Sunday,13:00,108,89,2.5,184.4,52.12,0.0,39.62,1,18.0,83.333333,18.0,Yes
4,115101990,Season,CLT21HOU,2021-10-17,21FS,Not Comp,101,10,13,FR,F,111.0,Y,2021-03-31 16:08:21,OPEN,SOLD,,0,GA,Manifest,,,,Renewal,101-10-13,2021,,32.0,CLT21HOU-101-10-13,,,,,G1,G1,G1,G1,G1,N,G1,price_group_3,PC1_group_2,class_name_2,ticket_class_1,group_0,199.4,,,1.0,Low,Low,Low,Low,Reg,6,Sunday,13:00,108,89,2.5,184.4,52.12,0.0,39.62,1,144.0,72.222222,18.0,No


In [None]:
# Drop identifier, raw date/time and raw seat-location columns.
# `columns=` replaces the positional axis argument (`drop(labels, 1)`), which
# pandas >= 2.0 no longer accepts.
df = df.drop(columns=["acct_id", "event_date", "section_name", "row_name", "SeatNum", "Game Time",
                      "add_datetime", "Start Year", "LastYear", "SeatUniqueID", "ResaleDate",
                      "ClubExpYear"])


In [None]:
df["Season"] = df["Season"].astype(str)

In [None]:
# Object-dtype columns are treated as categorical features, except the event
# key, the row id and the target. The list keeps the frame's column order; the
# previous set difference produced an arbitrary order that could change between runs.
_non_feature_cols = ("event_name", "UniqueID", "isAttended")
cat_cols = [
    col for col, dtype in df.dtypes.items()
    if dtype == "object" and col not in _non_feature_cols
]

In [None]:
# cat_cols

In [None]:
from pandas.core.indexes import category
for col in cat_cols:
  df[col] = df[col].astype("category")

In [None]:
def label_encode(row, col):
  """Map the "Yes"/"No" label stored in ``row[col]`` to 1/0.

  Any other value (for example the "???" placeholder) is returned unchanged.
  """
  value = row[col]
  if value == "Yes":
    return 1
  if value == "No":
    return 0
  return value

# Encode the target column-wise. A row-wise DataFrame.apply builds a Series for
# every row of the frame just to read one field; mapping over the single column
# gives the same values. `label_encode` only indexes `row[col]`, so a one-key
# dict stands in for the row.
df["isAttended"] = df["isAttended"].map(lambda value: label_encode({"isAttended": value}, "isAttended"))

In [None]:
df.shape

(1286916, 57)

In [None]:
df.head()

Unnamed: 0,acct_type_desc,event_name,plan_event_name,comp_name,price_code,PC1,Price,paid,class_name,status,Sales_Source,isHost,SeatType,TicketClass,Term,TicketType,Season,Tenure,UniqueID,Resold,ResalePrice,isSTM,acct_type_group,plan_event_name_group,row_name_group,seat_nums_group,section_names_group,Zone,comp_names_group,price_group,PC1_group,class_name_grp,ticket_class_grp,TicketType_group,no_days_prebooked,gameday_minus_start_year,lastyear_minus_gameday,section_group,section_group_segment,section_name_segment,row_segment,seat_segment,Game_Type,Week,week_day,Home Points,Away Points,unemployment_rate,covid_cases,Avg_Day_Temp,Precipitation,Avg_Dew_Point,weekend_flag,count,%Attended,event_name_count,isAttended
0,Season,CLT21HOU,21FS,Not Comp,FR,F,111.0,Y,OPEN,SOLD,,0,GA,Manifest,,Renewal,2021,25.0,CLT21HOU-101-10-1,,,,G1,G1,G1,G1,G1,N,G1,price_group_3,PC1_group_2,class_name_2,ticket_class_1,group_0,199.4,,,1.0,Low,Low,Low,Low,Reg,6,Sunday,108,89,2.5,184.4,52.12,0.0,39.62,1,54.0,94.444444,18.0,1
1,Season,CLT21HOU,21FS,Not Comp,FR,F,111.0,Y,OPEN,SOLD,,0,GA,Manifest,,Renewal,2021,20.0,CLT21HOU-101-10-10,,,,G1,G1,G1,G1,G1,N,G1,price_group_3,PC1_group_2,class_name_2,ticket_class_1,group_0,199.4,,,1.0,Low,Low,Low,Low,Reg,6,Sunday,108,89,2.5,184.4,52.12,0.0,39.62,1,72.0,70.833333,18.0,0
2,Season,CLT21HOU,21FS,Not Comp,FR,F,111.0,Y,OPEN,SOLD,,0,GA,Manifest,,Renewal,2021,20.0,CLT21HOU-101-10-11,,,,G1,G1,G1,G1,G1,N,G1,price_group_3,PC1_group_2,class_name_2,ticket_class_1,group_0,199.4,,,1.0,Low,Low,Low,Low,Reg,6,Sunday,108,89,2.5,184.4,52.12,0.0,39.62,1,72.0,70.833333,18.0,0
3,Season,CLT21HOU,21FS,Not Comp,FR,F,111.0,Y,OPEN,SOLD,,0,GA,Manifest,,Renewal,2021,4.0,CLT21HOU-101-10-12,,,,G1,G1,G1,G1,G1,N,G1,price_group_3,PC1_group_2,class_name_2,ticket_class_1,group_0,199.4,,,1.0,Low,Low,Low,Low,Reg,6,Sunday,108,89,2.5,184.4,52.12,0.0,39.62,1,18.0,83.333333,18.0,1
4,Season,CLT21HOU,21FS,Not Comp,FR,F,111.0,Y,OPEN,SOLD,,0,GA,Manifest,,Renewal,2021,32.0,CLT21HOU-101-10-13,,,,G1,G1,G1,G1,G1,N,G1,price_group_3,PC1_group_2,class_name_2,ticket_class_1,group_0,199.4,,,1.0,Low,Low,Low,Low,Reg,6,Sunday,108,89,2.5,184.4,52.12,0.0,39.62,1,144.0,72.222222,18.0,0


In [None]:
# Labelled rows are split by event: CLT22PIT and CLT22LAC form the validation
# set, all other labelled events form the training set. Rows whose label is
# still "???" form the test set.
_is_labelled = df["isAttended"] != "???"
_is_val_event = df["event_name"].isin(["CLT22PIT", "CLT22LAC"])

train_df = df[_is_labelled & ~_is_val_event]
val_df = df[_is_labelled & _is_val_event]
test_df = df[~_is_labelled]

In [None]:
# Keep the per-account aggregates as plain arrays (one triple per split); they
# are removed from the feature frames in the next cell and reused by the heuristic.
_account_stat_cols = ["count", "%Attended", "event_name_count"]

train_cnt, train_att, train_event_cnt = (train_df[col].values for col in _account_stat_cols)
val_cnt, val_att, val_event_cnt = (val_df[col].values for col in _account_stat_cols)
test_cnt, test_att, test_event_cnt = (test_df[col].values for col in _account_stat_cols)

In [None]:
# Remove the per-account aggregates from the model inputs.
# `columns=` replaces the positional axis argument (`drop(labels, 1)`), which
# pandas >= 2.0 no longer accepts.
train_df = train_df.drop(columns=["count", "%Attended", "event_name_count"])
val_df = val_df.drop(columns=["count", "%Attended", "event_name_count"])
test_df = test_df.drop(columns=["count", "%Attended", "event_name_count"])

In [None]:
train_df.head()

Unnamed: 0,acct_type_desc,event_name,plan_event_name,comp_name,price_code,PC1,Price,paid,class_name,status,Sales_Source,isHost,SeatType,TicketClass,Term,TicketType,Season,Tenure,UniqueID,Resold,ResalePrice,isSTM,acct_type_group,plan_event_name_group,row_name_group,seat_nums_group,section_names_group,Zone,comp_names_group,price_group,PC1_group,class_name_grp,ticket_class_grp,TicketType_group,no_days_prebooked,gameday_minus_start_year,lastyear_minus_gameday,section_group,section_group_segment,section_name_segment,row_segment,seat_segment,Game_Type,Week,week_day,Home Points,Away Points,unemployment_rate,covid_cases,Avg_Day_Temp,Precipitation,Avg_Dew_Point,weekend_flag,isAttended
0,Season,CLT21HOU,21FS,Not Comp,FR,F,111.0,Y,OPEN,SOLD,,0,GA,Manifest,,Renewal,2021,25.0,CLT21HOU-101-10-1,,,,G1,G1,G1,G1,G1,N,G1,price_group_3,PC1_group_2,class_name_2,ticket_class_1,group_0,199.4,,,1.0,Low,Low,Low,Low,Reg,6,Sunday,108,89,2.5,184.4,52.12,0.0,39.62,1,1
1,Season,CLT21HOU,21FS,Not Comp,FR,F,111.0,Y,OPEN,SOLD,,0,GA,Manifest,,Renewal,2021,20.0,CLT21HOU-101-10-10,,,,G1,G1,G1,G1,G1,N,G1,price_group_3,PC1_group_2,class_name_2,ticket_class_1,group_0,199.4,,,1.0,Low,Low,Low,Low,Reg,6,Sunday,108,89,2.5,184.4,52.12,0.0,39.62,1,0
2,Season,CLT21HOU,21FS,Not Comp,FR,F,111.0,Y,OPEN,SOLD,,0,GA,Manifest,,Renewal,2021,20.0,CLT21HOU-101-10-11,,,,G1,G1,G1,G1,G1,N,G1,price_group_3,PC1_group_2,class_name_2,ticket_class_1,group_0,199.4,,,1.0,Low,Low,Low,Low,Reg,6,Sunday,108,89,2.5,184.4,52.12,0.0,39.62,1,0
3,Season,CLT21HOU,21FS,Not Comp,FR,F,111.0,Y,OPEN,SOLD,,0,GA,Manifest,,Renewal,2021,4.0,CLT21HOU-101-10-12,,,,G1,G1,G1,G1,G1,N,G1,price_group_3,PC1_group_2,class_name_2,ticket_class_1,group_0,199.4,,,1.0,Low,Low,Low,Low,Reg,6,Sunday,108,89,2.5,184.4,52.12,0.0,39.62,1,1
4,Season,CLT21HOU,21FS,Not Comp,FR,F,111.0,Y,OPEN,SOLD,,0,GA,Manifest,,Renewal,2021,32.0,CLT21HOU-101-10-13,,,,G1,G1,G1,G1,G1,N,G1,price_group_3,PC1_group_2,class_name_2,ticket_class_1,group_0,199.4,,,1.0,Low,Low,Low,Low,Reg,6,Sunday,108,89,2.5,184.4,52.12,0.0,39.62,1,0


In [None]:
test_df.head()

Unnamed: 0,acct_type_desc,event_name,plan_event_name,comp_name,price_code,PC1,Price,paid,class_name,status,Sales_Source,isHost,SeatType,TicketClass,Term,TicketType,Season,Tenure,UniqueID,Resold,ResalePrice,isSTM,acct_type_group,plan_event_name_group,row_name_group,seat_nums_group,section_names_group,Zone,comp_names_group,price_group,PC1_group,class_name_grp,ticket_class_grp,TicketType_group,no_days_prebooked,gameday_minus_start_year,lastyear_minus_gameday,section_group,section_group_segment,section_name_segment,row_segment,seat_segment,Game_Type,Week,week_day,Home Points,Away Points,unemployment_rate,covid_cases,Avg_Day_Temp,Precipitation,Avg_Dew_Point,weekend_flag,isAttended
193038,Season,CLT21LV,21FS,Not Comp,FR,F,135.0,Y,OPEN,SOLD,,0,GA,Manifest,,Renewal,2021,25.0,CLT21LV-101-10-1,,,,G1,G1,G1,G1,G1,N,G1,price_group_3,PC1_group_2,class_name_2,ticket_class_1,group_0,276.2,,,1.0,High,High,High,High,Reg,17,Sunday,420,316,2.2,1761.0,29.27,0.0,24.77,1,???
193039,Season,CLT21LV,21FS,Not Comp,FR,F,135.0,Y,OPEN,SOLD,,0,GA,Manifest,,Renewal,2021,20.0,CLT21LV-101-10-10,,,,G1,G1,G1,G1,G1,N,G1,price_group_3,PC1_group_2,class_name_2,ticket_class_1,group_0,276.2,,,1.0,High,High,High,High,Reg,17,Sunday,420,316,2.2,1761.0,29.27,0.0,24.77,1,???
193040,Season,CLT21LV,21FS,Not Comp,FR,F,135.0,Y,OPEN,SOLD,,0,GA,Manifest,,Renewal,2021,20.0,CLT21LV-101-10-11,,,,G1,G1,G1,G1,G1,N,G1,price_group_3,PC1_group_2,class_name_2,ticket_class_1,group_0,276.2,,,1.0,High,High,High,High,Reg,17,Sunday,420,316,2.2,1761.0,29.27,0.0,24.77,1,???
193041,Season,CLT21LV,21FS,Not Comp,FR,F,135.0,Y,OPEN,SOLD,,0,GA,Manifest,,Renewal,2021,4.0,CLT21LV-101-10-12,,,,G1,G1,G1,G1,G1,N,G1,price_group_3,PC1_group_2,class_name_2,ticket_class_1,group_0,276.2,,,1.0,High,High,High,High,Reg,17,Sunday,420,316,2.2,1761.0,29.27,0.0,24.77,1,???
193042,Season,CLT21LV,21FS,Not Comp,FR,F,135.0,Y,OPEN,SOLD,,0,GA,Manifest,,Renewal,2021,32.0,CLT21LV-101-10-13,,,,G1,G1,G1,G1,G1,N,G1,price_group_3,PC1_group_2,class_name_2,ticket_class_1,group_0,276.2,,,1.0,High,High,High,High,Reg,17,Sunday,420,316,2.2,1761.0,29.27,0.0,24.77,1,???


In [None]:
test_df["isAttended"].unique()

array(['???'], dtype=object)

In [None]:
train_df.shape, val_df.shape, test_df.shape

((1029536, 54), (128692, 54), (128688, 54))

In [None]:
train_df["event_name"].nunique(), val_df["event_name"].nunique(), test_df["event_name"].nunique()

(16, 2, 2)

In [None]:
# Split features and target by column name rather than by position, so the
# split stays correct even if "isAttended" is not the last column. When it is
# the last column (as arranged earlier in the notebook) the result is identical
# to the previous `iloc[:, :-1]` / `iloc[:, -1]` slicing.
X_train = train_df.drop(columns="isAttended")
y_train = train_df["isAttended"].values
X_val = val_df.drop(columns="isAttended")
y_val = val_df["isAttended"].values
X_test = test_df.drop(columns="isAttended")
y_test = test_df["isAttended"].values  # all "???" placeholders for the test rows

In [None]:
# Target-encode the columns listed in `cat_cols`. The encoder is fitted on the
# training split only and then applied unchanged to the validation and test
# splits. Columns outside `cat_cols` (e.g. UniqueID, event_name) are left as is.
# NOTE(review): `handle_missing='return_nan'` presumably keeps missing
# categories as NaN in the encoded frames - confirm against category_encoders docs.
enc = TargetEncoder(cols=cat_cols, min_samples_leaf=20, smoothing=10, handle_missing = 'return_nan').fit(X_train, y_train)
X_train_encoded = enc.transform(X_train)
X_val_encoded = enc.transform(X_val)
X_test_encoded = enc.transform(X_test)

In [None]:
X_train_encoded.head()

Unnamed: 0,acct_type_desc,event_name,plan_event_name,comp_name,price_code,PC1,Price,paid,class_name,status,Sales_Source,isHost,SeatType,TicketClass,Term,TicketType,Season,Tenure,UniqueID,Resold,ResalePrice,isSTM,acct_type_group,plan_event_name_group,row_name_group,seat_nums_group,section_names_group,Zone,comp_names_group,price_group,PC1_group,class_name_grp,ticket_class_grp,TicketType_group,no_days_prebooked,gameday_minus_start_year,lastyear_minus_gameday,section_group,section_group_segment,section_name_segment,row_segment,seat_segment,Game_Type,Week,week_day,Home Points,Away Points,unemployment_rate,covid_cases,Avg_Day_Temp,Precipitation,Avg_Dew_Point,weekend_flag
0,0.842318,CLT21HOU,0.831166,0.836616,0.850486,0.840325,111.0,0.825416,0.823359,0.836616,,0,0.763649,0.791033,,0.84441,0.752743,25.0,CLT21HOU-101-10-1,,,,0.840957,0.828715,0.793333,0.775226,0.808719,0.747116,0.834791,0.836764,0.827703,0.82742,0.78834,0.838018,199.4,,,1.0,0.671998,0.684329,0.67706,0.679989,0.812634,6,0.782914,108,89,2.5,184.4,52.12,0.0,39.62,1
1,0.842318,CLT21HOU,0.831166,0.836616,0.850486,0.840325,111.0,0.825416,0.823359,0.836616,,0,0.763649,0.791033,,0.84441,0.752743,20.0,CLT21HOU-101-10-10,,,,0.840957,0.828715,0.793333,0.775226,0.808719,0.747116,0.834791,0.836764,0.827703,0.82742,0.78834,0.838018,199.4,,,1.0,0.671998,0.684329,0.67706,0.679989,0.812634,6,0.782914,108,89,2.5,184.4,52.12,0.0,39.62,1
2,0.842318,CLT21HOU,0.831166,0.836616,0.850486,0.840325,111.0,0.825416,0.823359,0.836616,,0,0.763649,0.791033,,0.84441,0.752743,20.0,CLT21HOU-101-10-11,,,,0.840957,0.828715,0.793333,0.775226,0.808719,0.747116,0.834791,0.836764,0.827703,0.82742,0.78834,0.838018,199.4,,,1.0,0.671998,0.684329,0.67706,0.679989,0.812634,6,0.782914,108,89,2.5,184.4,52.12,0.0,39.62,1
3,0.842318,CLT21HOU,0.831166,0.836616,0.850486,0.840325,111.0,0.825416,0.823359,0.836616,,0,0.763649,0.791033,,0.84441,0.752743,4.0,CLT21HOU-101-10-12,,,,0.840957,0.828715,0.793333,0.775226,0.808719,0.747116,0.834791,0.836764,0.827703,0.82742,0.78834,0.838018,199.4,,,1.0,0.671998,0.684329,0.67706,0.679989,0.812634,6,0.782914,108,89,2.5,184.4,52.12,0.0,39.62,1
4,0.842318,CLT21HOU,0.831166,0.836616,0.850486,0.840325,111.0,0.825416,0.823359,0.836616,,0,0.763649,0.791033,,0.84441,0.752743,32.0,CLT21HOU-101-10-13,,,,0.840957,0.828715,0.793333,0.775226,0.808719,0.747116,0.834791,0.836764,0.827703,0.82742,0.78834,0.838018,199.4,,,1.0,0.671998,0.684329,0.67706,0.679989,0.812634,6,0.782914,108,89,2.5,184.4,52.12,0.0,39.62,1


In [None]:
# Keep the row ids for the prediction files, then remove the id and event key
# from the (un-encoded) feature frames.
train_ids = X_train["UniqueID"].values
val_ids = X_val["UniqueID"].values
test_ids = X_test["UniqueID"].values

# `columns=` replaces the positional axis argument (`drop(labels, 1)`), which
# pandas >= 2.0 no longer accepts.
X_train = X_train.drop(columns=["UniqueID", "event_name"])
X_val = X_val.drop(columns=["UniqueID", "event_name"])
X_test = X_test.drop(columns=["UniqueID", "event_name"])

In [None]:
# Same id extraction as for the raw frames, taken from the target-encoded
# frames; the id and event key are then removed from the encoded model inputs.
train_ids = X_train_encoded["UniqueID"].values
val_ids = X_val_encoded["UniqueID"].values
test_ids = X_test_encoded["UniqueID"].values

# `columns=` replaces the positional axis argument (`drop(labels, 1)`), which
# pandas >= 2.0 no longer accepts.
X_train_encoded = X_train_encoded.drop(columns=["UniqueID", "event_name"])
X_val_encoded = X_val_encoded.drop(columns=["UniqueID", "event_name"])
X_test_encoded = X_test_encoded.drop(columns=["UniqueID", "event_name"])

In [None]:
X_train.head()

Unnamed: 0,acct_type_desc,plan_event_name,comp_name,price_code,PC1,Price,paid,class_name,status,Sales_Source,isHost,SeatType,TicketClass,Term,TicketType,Season,Tenure,Resold,ResalePrice,isSTM,acct_type_group,plan_event_name_group,row_name_group,seat_nums_group,section_names_group,Zone,comp_names_group,price_group,PC1_group,class_name_grp,ticket_class_grp,TicketType_group,no_days_prebooked,gameday_minus_start_year,lastyear_minus_gameday,section_group,section_group_segment,section_name_segment,row_segment,seat_segment,Game_Type,Week,week_day,Home Points,Away Points,unemployment_rate,covid_cases,Avg_Day_Temp,Precipitation,Avg_Dew_Point,weekend_flag
0,Season,21FS,Not Comp,FR,F,111.0,Y,OPEN,SOLD,,0,GA,Manifest,,Renewal,2021,25.0,,,,G1,G1,G1,G1,G1,N,G1,price_group_3,PC1_group_2,class_name_2,ticket_class_1,group_0,199.4,,,1.0,Low,Low,Low,Low,Reg,6,Sunday,108,89,2.5,184.4,52.12,0.0,39.62,1
1,Season,21FS,Not Comp,FR,F,111.0,Y,OPEN,SOLD,,0,GA,Manifest,,Renewal,2021,20.0,,,,G1,G1,G1,G1,G1,N,G1,price_group_3,PC1_group_2,class_name_2,ticket_class_1,group_0,199.4,,,1.0,Low,Low,Low,Low,Reg,6,Sunday,108,89,2.5,184.4,52.12,0.0,39.62,1
2,Season,21FS,Not Comp,FR,F,111.0,Y,OPEN,SOLD,,0,GA,Manifest,,Renewal,2021,20.0,,,,G1,G1,G1,G1,G1,N,G1,price_group_3,PC1_group_2,class_name_2,ticket_class_1,group_0,199.4,,,1.0,Low,Low,Low,Low,Reg,6,Sunday,108,89,2.5,184.4,52.12,0.0,39.62,1
3,Season,21FS,Not Comp,FR,F,111.0,Y,OPEN,SOLD,,0,GA,Manifest,,Renewal,2021,4.0,,,,G1,G1,G1,G1,G1,N,G1,price_group_3,PC1_group_2,class_name_2,ticket_class_1,group_0,199.4,,,1.0,Low,Low,Low,Low,Reg,6,Sunday,108,89,2.5,184.4,52.12,0.0,39.62,1
4,Season,21FS,Not Comp,FR,F,111.0,Y,OPEN,SOLD,,0,GA,Manifest,,Renewal,2021,32.0,,,,G1,G1,G1,G1,G1,N,G1,price_group_3,PC1_group_2,class_name_2,ticket_class_1,group_0,199.4,,,1.0,Low,Low,Low,Low,Reg,6,Sunday,108,89,2.5,184.4,52.12,0.0,39.62,1


In [None]:
y_train[:5]

array([1, 0, 0, 1, 0], dtype=object)

In [None]:
df["isAttended"].value_counts()

1      900945
0      257283
???    128688
Name: isAttended, dtype: int64

In [None]:
257283/900945, 900945/257283

(0.28557015134109187, 3.5017665372372058)

In [None]:
def _exp_decay_learning_rate(current_iter, base_learning_rate, decay, floor=1e-3):
    """Return ``base_learning_rate * decay ** current_iter``, never below ``floor``."""
    lr = base_learning_rate * np.power(decay, current_iter)
    return lr if lr > floor else floor


def learning_rate_010_decay_power_099(current_iter):
    """Schedule starting at 0.1, multiplied by 0.99 per iteration, floored at 1e-3."""
    return _exp_decay_learning_rate(current_iter, 0.1, .99)


def learning_rate_010_decay_power_0995(current_iter):
    """Schedule starting at 0.1, multiplied by 0.995 per iteration, floored at 1e-3."""
    return _exp_decay_learning_rate(current_iter, 0.1, .995)


def learning_rate_005_decay_power_099(current_iter):
    """Schedule starting at 0.05, multiplied by 0.99 per iteration, floored at 1e-3."""
    return _exp_decay_learning_rate(current_iter, 0.05, .99)

In [None]:
n_HP_points_to_test = 100

In [None]:
# Keyword arguments forwarded to the LightGBM `.fit(...)` calls below: AUC on
# the held-out validation split is the evaluation metric, with early stopping
# after 30 rounds and a log line every 100 rounds.
# NOTE(review): passing `early_stopping_rounds` and `verbose` directly to fit()
# depends on the installed LightGBM version (newer releases expect callbacks
# instead) - confirm against the pinned version.
fit_params={"early_stopping_rounds":30, 
            "eval_metric" : 'auc', 
            "eval_set" : [(X_val_encoded,[int(each) for each in y_val])],
            'eval_names': ['valid'],
            #'callbacks': [lgb.reset_parameter(learning_rate=learning_rate_010_decay_power_099)],
            'verbose': 100,
            'categorical_feature': 'auto'}

In [None]:
from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uniform
# Search space for the randomized hyper-parameter search: integer and uniform
# distributions are sampled, plain lists are chosen from uniformly.
param_test ={'num_leaves': sp_randint(6, 50), 
             'min_child_samples': sp_randint(100, 500), 
             'min_child_weight': [1e-5, 1e-3, 1e-2, 1e-1, 1, 1e1, 1e2, 1e3, 1e4],
             'subsample': sp_uniform(loc=0.2, scale=0.8), 
             'colsample_bytree': sp_uniform(loc=0.4, scale=0.6),
             'reg_alpha': [0, 1e-1, 1, 2, 5, 7, 10, 50, 100],
             'reg_lambda': [0, 1e-1, 1, 5, 10, 20, 50, 100]}

In [None]:
# Base classifier and randomized search over `param_test`
# (`n_HP_points_to_test` samples, 3-fold CV, scored by ROC AUC).
# NOTE: the `gs.fit(...)` call in the following cell is commented out, so this
# search object is defined but not executed in the notebook as saved.
clf = lgb.LGBMClassifier(max_depth=-1, random_state=314, silent=True, metric='None', n_jobs=4, n_estimators=5000)
gs = RandomizedSearchCV(
    estimator=clf, param_distributions=param_test, 
    n_iter=n_HP_points_to_test,
    scoring='roc_auc',
    cv=3,
    refit=True,
    random_state=314,
    verbose=True)

In [None]:
# gs.fit(X_train_encoded, [int(each) for each in y_train], **fit_params)
# print('Best score reached: {} with params: {} '.format(gs.best_score_, gs.best_params_))

In [None]:
# Baseline LightGBM constructor arguments, written out by hand. They are used
# to build both `clf_sw` and `clf_final` below.
# NOTE(review): presumably a pasted copy of `clf.get_params()` - verify it is
# still in sync with `clf`.
get_params_ = {'boosting_type': 'gbdt',
 'class_weight': None,
 'colsample_bytree': 1.0,
 'importance_type': 'split',
 'learning_rate': 0.1,
 'max_depth': -1,
 'min_child_samples': 20,
 'min_child_weight': 0.001,
 'min_split_gain': 0.0,
 'n_estimators': 5000,
 'n_jobs': 4,
 'num_leaves': 31,
 'objective': None,
 'random_state': 314,
 'reg_alpha': 0.0,
 'reg_lambda': 0.0,
 'silent': True,
 'subsample': 1.0,
 'subsample_for_bin': 200000,
 'subsample_freq': 0,
 'metric': 'None'}

In [None]:
# Hard-coded hyper-parameter overrides applied on top of `get_params_` for `clf_sw`.
# NOTE(review): presumably the best point of an earlier random-search run
# (the search call in this notebook is commented out) - record where these came from.
opt_parameters = {'colsample_bytree': 0.7792703648870174, 'min_child_samples': 344, 'min_child_weight': 0.01, 
                  'num_leaves': 10, 'reg_alpha': 0, 'reg_lambda': 10, 'subsample': 0.8503048560728566} 


In [None]:
# Baseline classifier with the tuned overrides applied; used as the estimator
# for the scale_pos_weight grid search. The bare name displays the estimator.
clf_sw = lgb.LGBMClassifier(**get_params_).set_params(**opt_parameters)
clf_sw

LGBMClassifier(colsample_bytree=0.7792703648870174, metric='None',
               min_child_samples=344, min_child_weight=0.01, n_estimators=5000,
               n_jobs=4, num_leaves=10, random_state=314, reg_alpha=0,
               reg_lambda=10, silent=True, subsample=0.8503048560728566)

In [None]:
# 5-fold grid search over the positive-class weight only, scored by ROC AUC and
# refitted on the best candidate.
_scale_pos_weight_grid = {'scale_pos_weight': [1, 2, 6, 12]}
gs_sample_weight = GridSearchCV(
    clf_sw,
    _scale_pos_weight_grid,
    scoring='roc_auc',
    cv=5,
    refit=True,
    verbose=True,
)

In [None]:
# Run the grid search; labels are converted to plain ints because `y_train` is
# an object-dtype array.
_train_labels = list(map(int, y_train))
gs_sample_weight.fit(X_train_encoded, _train_labels, **fit_params)
print('Best score reached: {} with params: {} '.format(gs_sample_weight.best_score_, gs_sample_weight.best_params_))

Fitting 5 folds for each of 4 candidates, totalling 20 fits
[100]	valid's auc: 0.701978
Best score reached: 0.7105866222835976 with params: {'scale_pos_weight': 12} 


In [None]:
# print("Valid+-Std     Train  :   Parameters")
# for i in np.argsort(gs_sample_weight.cv_results_['mean_test_score'])[-5:]:
#     print('{1:.3f}+-{3:.3f}     :  {0}'.format(gs_sample_weight.cv_results_['params'][i], 
#                                     gs_sample_weight.cv_results_['mean_test_score'][i], 
#                                     gs_sample_weight.cv_results_['std_test_score'][i]))

In [None]:
print('Best score reached: {} with params: {} '.format(gs_sample_weight.best_score_, gs_sample_weight.best_params_))

In [None]:
# opt_sw_params = gs_sample_weight.best_params_

In [None]:
# clf_final = lgb.LGBMClassifier(**clf.get_params())
# The final model is built from the baseline `get_params_` alone.
clf_final = lgb.LGBMClassifier(**get_params_)
# NOTE(review): the set_params call below is commented out, so neither
# `opt_parameters` nor the grid-searched `scale_pos_weight` is applied to
# `clf_final` - confirm that training with the baseline settings is intended.
# clf_final.set_params(**opt_sw_params)

# Train the final model with a decaying learning rate (reset every iteration by
# `learning_rate_010_decay_power_0995`); `fit_params` supplies the validation
# set, metric and early stopping.
clf_final.fit(X_train_encoded, [int(each) for each in y_train], **fit_params, callbacks=[lgb.reset_parameter(learning_rate=learning_rate_010_decay_power_0995)])


Training until validation scores don't improve for 30 rounds.
Early stopping, best iteration is:
[28]	valid's auc: 0.698464


LGBMClassifier(metric='None', n_estimators=5000, n_jobs=4, random_state=314)

In [None]:
threshold=0.5

In [None]:
y_pred_val = clf_final.predict_proba(X_val_encoded)

In [None]:
# Turn the class-1 probability (second column of predict_proba) into a hard
# 0/1 prediction and report per-class metrics on the validation split.
y_pred_val = [int(proba_row[1] > threshold) for proba_row in y_pred_val]

_val_labels = list(map(int, y_val.tolist()))
print(classification_report(_val_labels, y_pred_val))

              precision    recall  f1-score   support

           0       1.00      0.22      0.36     19583
           1       0.88      1.00      0.93    109109

    accuracy                           0.88    128692
   macro avg       0.94      0.61      0.64    128692
weighted avg       0.90      0.88      0.85    128692



In [None]:
len(y_pred_val) - sum(y_pred_val), len(y_pred_val)

(4238, 128692)

In [None]:
# import sklearn
# sklearn.metrics.SCORERS.keys()

In [None]:
# Class-1 probability for every validation row, as a plain list.
y_pred_val = list(clf_final.predict_proba(X_val_encoded)[:, 1])

In [None]:
# Hard 0/1 predictions: 1 when the probability exceeds `threshold`.
y_pred_val = [int(prob > threshold) for prob in y_pred_val]

In [None]:
print(classification_report([int(each) for each in y_val.tolist()], y_pred_val))

              precision    recall  f1-score   support

           0       1.00      0.22      0.36     19583
           1       0.88      1.00      0.93    109109

    accuracy                           0.88    128692
   macro avg       0.94      0.61      0.64    128692
weighted avg       0.90      0.88      0.85    128692



In [None]:
from sklearn.metrics import confusion_matrix
print(confusion_matrix([int(each) for each in y_val.tolist()], y_pred_val))

[[  4235  15348]
 [     3 109106]]


In [None]:
def get_heuristic_pred(row):
  """Override the model's prediction with 0 for rows matched by a fixed rule.

  A row is forced to 0 when either
    * `%Attended` < 35 and `count` >= 10 and `event_name_count` > 2, or
    * `class_name` is one of the listed seat classes.
  Every other row keeps its `model_pred` value.
  """
  forced_zero_classes = [
      'DIST-7HOLD', 'DIST-EXCP', 'DIST-WIP', 'KILL SEATS', 'OB2', 'OWNERSHIP',
      'SINGLE GAME SRO', 'XTEST', 'UNTOUCHABLE', 'OBSTRUCTED',
  ]

  rarely_attends = row['%Attended'] < 35
  has_enough_tickets = row['count'] >= 10
  has_enough_events = row["event_name_count"] > 2
  if rarely_attends & has_enough_tickets & has_enough_events:
    return 0

  if row["class_name"] in forced_zero_classes:
    return 0

  return row['model_pred']

In [None]:
# Validation predictions alongside the per-account aggregates the heuristic
# needs; the heuristic column is derived row by row from the assembled frame.
val_pred_df = pd.DataFrame({
    "UniqueID": val_ids,
    "count": val_cnt,
    "%Attended": val_att,
    "event_name_count": val_event_cnt,
    "class_name": val_df["class_name"].values,
    "TicketClass": val_df["TicketClass"].values,
    "Actual": [int(each) for each in y_val.tolist()],
    "model_pred": y_pred_val,
})
val_pred_df["heuristic_preds"] = val_pred_df.apply(get_heuristic_pred, axis=1)

In [None]:
print(classification_report([int(each) for each in y_val.tolist()], val_pred_df["heuristic_preds"].values.tolist()))


              precision    recall  f1-score   support

           0       0.91      0.26      0.40     19583
           1       0.88      1.00      0.94    109109

    accuracy                           0.88    128692
   macro avg       0.90      0.63      0.67    128692
weighted avg       0.89      0.88      0.85    128692



In [None]:
# Class-1 probability for every test row, as a plain list.
test_preds = list(clf_final.predict_proba(X_test_encoded)[:, 1])

In [None]:
# Hard 0/1 predictions: 1 when the probability exceeds `threshold`.
test_preds = [int(prob > threshold) for prob in test_preds]

In [None]:
1 - (sum(test_preds)/len(test_preds))

0.05363359442993909

In [None]:
len(test_preds) - sum(test_preds), len(test_preds)

(6902, 128688)

In [None]:
len(test_acct_ids)

128688

In [None]:
# Test predictions alongside the account id and the per-account aggregates the
# heuristic needs; the heuristic column is derived row by row from the assembled frame.
test_pred_df = pd.DataFrame({
    "UniqueID": test_ids,
    "acct_id": test_acct_ids,
    "count": test_cnt,
    "%Attended": test_att,
    "event_name_count": test_event_cnt,
    "class_name": test_df["class_name"].values,
    "TicketClass": test_df["TicketClass"].values,
    "model_pred": test_preds,
})
test_pred_df["heuristic_preds"] = test_pred_df.apply(get_heuristic_pred, axis=1)

In [None]:
test_preds_heur = test_pred_df["heuristic_preds"].values
len(test_preds_heur) - sum(test_preds_heur), len(test_preds_heur)

(8431, 128688)

In [None]:
val_pred_df.to_csv("/content/drive/MyDrive/Crossroads/predictions/LGB_v1.0_val_sub_5.csv", index=False)

In [None]:
test_pred_df.to_csv("/content/drive/MyDrive/Crossroads/predictions/LGB_v1.0_test_sub_5.csv", index=False)