In [1]:
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings('ignore')

pd.set_option('display.max_columns', None) 
pd.set_option('display.max_rows', None) 

In [2]:
import lightgbm as lgb
from lightgbm import LGBMClassifier
from sklearn.model_selection import cross_validate
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score, GridSearchCV, KFold, RandomizedSearchCV

In [3]:
item_data = pd.read_csv("item_data.csv")
train = pd.read_csv("train.csv")
view_log = pd.read_csv("view_log.csv")
test = pd.read_csv("test.csv")

In [4]:
#use catgeory plots to get different relation with prices
train["impression_time"] = pd.to_datetime(train["impression_time"])
train["binned_hours"] = pd.cut(train["impression_time"].dt.hour, bins=3,labels=["a_time","b_time","c_time"])
train["impression_date"] = train["impression_time"].dt.date
train.drop(["impression_time"],axis=1,inplace=True)

In [5]:
test["impression_time"] = pd.to_datetime(test["impression_time"])
test["binned_hours"] = pd.cut(test["impression_time"].dt.hour, bins=3,labels=["a_time","b_time","c_time"])
test["impression_date"] = test["impression_time"].dt.date
test.drop(["impression_time"],axis=1,inplace=True)

In [6]:
df = pd.merge(view_log,item_data,on="item_id",how="left")
del(view_log)
del(item_data)

In [7]:
df["item_price"].fillna(int(df["item_price"].mean()),inplace=True)
df["category_1"].fillna(int(df["category_1"].mean()),inplace=True)
df["category_2"].fillna(int(df["category_2"].mean()),inplace=True)
df["category_3"].fillna(int(df["category_3"].mean()),inplace=True)
df["product_type"].fillna(int(df["product_type"].mean()),inplace=True)

In [8]:
df["server_time"] = pd.to_datetime(df["server_time"])
df["binned_surfing_hours"] = pd.cut(df["server_time"].dt.hour, bins=3,labels=["a_time","b_time","c_time"])
df["surfing_date"] = df["server_time"].dt.date
df.drop(["server_time"],axis=1,inplace=True)

In [9]:
#**************************************************LATENCY*****************************************#

In [10]:
df_user_surf_date = df[["user_id","surfing_date"]]
df_user_train_date = train[["user_id","impression_date"]]
df_user_test_date = test[["user_id","impression_date"]]

In [11]:
#*********************************** TRAIN LATENCY*****************************************#
df_user_min_postive_latency = pd.merge(df_user_train_date,df_user_surf_date,on="user_id",how="left")
df_user_min_postive_latency["surfing_date"] = pd.to_datetime(df_user_min_postive_latency["surfing_date"])
df_user_min_postive_latency["impression_date"] = pd.to_datetime(df_user_min_postive_latency["impression_date"])

In [12]:
df_user_min_postive_latency["latency"] = (df_user_min_postive_latency["impression_date"]-\
                                          df_user_min_postive_latency["surfing_date"]).dt.days

In [13]:
df_user_min_postive_latency = df_user_min_postive_latency.groupby(["user_id"]).agg({\
                                                     "latency":lambda x: x[x>=0].min()
                                                     })
df_user_min_postive_latency.columns = ["latest_latency"]
df_train_latency = df_user_min_postive_latency

In [14]:
#*********************************** TEST LATENCY*****************************************#
df_user_min_postive_latency = pd.merge(df_user_test_date,df_user_surf_date,on="user_id",how="left")
df_user_min_postive_latency["surfing_date"] = pd.to_datetime(df_user_min_postive_latency["surfing_date"])
df_user_min_postive_latency["impression_date"] = pd.to_datetime(df_user_min_postive_latency["impression_date"])

In [15]:
df_user_min_postive_latency["latency"] = (df_user_min_postive_latency["impression_date"]-\
                                          df_user_min_postive_latency["surfing_date"]).dt.days

In [16]:
df_user_min_postive_latency = df_user_min_postive_latency.groupby(["user_id"]).agg({\
                                                     "latency":lambda x: x[x>=0].min()
                                                     })
df_user_min_postive_latency.columns = ["latest_latency"]
df_test_latency = df_user_min_postive_latency


In [17]:
train = pd.merge(train,df_train_latency,on="user_id",how="left")
test = pd.merge(test,df_test_latency,on="user_id",how="left")
del(df_test_latency)
del(df_user_min_postive_latency)
del(df_train_latency)
del(df_user_surf_date)
del(df_user_train_date)
del(df_user_test_date)

In [18]:
df["surfing_date"] = pd.to_datetime(df["surfing_date"])
df1 = df.groupby(["user_id"]).agg({\
                                  #"device_type":lambda x: len(x.unique()),
                                   
                                   "category_1":{\
                                                "avg":'mean',
                                                "dev":"std"
                                                },
                                   
                                    "category_2":{\
                                                "avg":'mean',
                                                "dev":"std"
                                                },
                                   
                                    "category_3":{\
                                                "avg":'mean',
                                                "dev":"std"
                                                },
                                   
                                   "session_id":{
                                       "unique" : lambda x: len(x.unique()),
                                       "total_sess" : "count"
                                               },
                                   
                                   "item_id": lambda x: len(x.unique()),
                                   
                                   "product_type":{
                                       "unique_products": lambda x: len(x.unique()),
                                       "total_products_search": "count",
                                                   },
                                   
                                   "item_price":{
                                           "avg_price": lambda x: sum(x.unique())/len(x.unique()),
                                           "dev_price": lambda x: x.unique().std(),
                                               },
                                   
                                   "surfing_date":{\
                                                  "unique_dates": lambda x: len(x.unique()),
                                                   "period_of_surfing": lambda x: max(x)- min(x)\
                                                   },
                                   
                                   "binned_surfing_hours":{
                                       "a_time_tot" :lambda x: len(x[x=="a_time"]),
                                       "b_time_tot" :lambda x: len(x[x=="b_time"]),
                                       "c_time_tot" :lambda x: len(x[x=="c_time"])
                                       
                                                   }
                                   
                                       
                                  })

In [19]:
del(df)

In [20]:
df1.columns = ["cat_1_mean","cat_1_dev","cat_2_mean","cat_2_dev","cat_3_mean","cat_3_dev",\
               "unique_sess","total_sess",\
              "unique_items","unique_products_type","total_products","mean_price","std_dev_price",\
              "unique_dates","period_of_surfing","a_time_tot","b_time_tot","c_time_tot"]

In [21]:
df1["period_of_surfing"] = df1["period_of_surfing"]/np.timedelta64(1,"D")

In [22]:
df1["total_by_unique"]  = df1['total_sess']/df1["unique_sess"]
df1["products_per_sess"] = df1["total_products"]/df1["unique_sess"]
df1["total_prod_by_uniq_item"] = df1["total_products"]/df1["unique_items"]
df1["a_time_tot"] = 100*df1["a_time_tot"]/df1["total_sess"]
df1["b_time_tot"] = 100*df1["b_time_tot"]/df1["total_sess"]
df1["c_time_tot"] = 100*df1["c_time_tot"]/df1["total_sess"]
df1["tot_sess_by_unique_date"] = df1["total_sess"]/df1["unique_dates"]
df1["frequency_of_surf_days"] = df1["period_of_surfing"]/df1["unique_dates"]

In [23]:
t_final = pd.merge(train,df1,on="user_id",how="left")

t_final.drop(["impression_date"],axis=1,inplace=True)
t_final.set_index('impression_id', inplace=True)

t_final["cat_1_dev"].fillna(0,inplace=True)#asthey have only one price value
t_final["cat_2_dev"].fillna(0,inplace=True)
t_final["cat_3_dev"].fillna(0,inplace=True)

t_final.drop(["total_products"],axis=1,inplace=True)

In [24]:
test_final = pd.merge(test,df1,on="user_id",how="left")
test_final.drop(["impression_date","total_products"],axis=1,inplace=True)

test_final["cat_1_dev"].fillna(0,inplace=True)#asthey have only one price value
test_final["cat_2_dev"].fillna(0,inplace=True)
test_final["cat_3_dev"].fillna(0,inplace=True)

test_final.set_index('impression_id', inplace=True)
del(train)
del(test)

In [25]:
#********************************************************************************************************************#
#********************************************************************************************************************#
train_dump = t_final
test_dump = test_final
test_index = test_final.index
#********************************************************************************************************************#
#********************************************************************************************************************#

In [26]:
t_final = train_dump
test_final = test_dump

In [27]:
#adding click percentage of app
df = t_final.groupby("app_code").agg({\
                                     "is_click":{\
                                                 "click_per":lambda x: 100* len(x[x==1])/len(x) ,
                                                 "count":lambda x: len(x)
                                                  }
                                      
                                     })
df.columns = ["click_per_app","traffic_app"]
#df = df.sort_values(by="count",ascending=True)

In [28]:
bins_app_click_per = [0,0.5,1,1.5,2,2.5,3,3.5,4,4.5,5,5.5,6,6.5,7,7.5,8,8.5,9,9.5,10,
                      11,12,13,14,15,16,17,18,19,20,30,40,100]
labels_click_per = ["click_1","click_2","click_3","click_4",'click_5','click_6','click_7','click_8','click_9',
                    'click_10','click_11','click_12','click_13','click_14','click_15','click_16','click_17',
                    'click_18','click_19','click_20','click_21','click_22','click_23','click_24','click_25',
                    'click_26','click_27','click_28','click_29','click_30','click_31','click_32','click_33']

bins_traffic_app  = [0,10,20,50,100,500,1000,1500,2000,2500,3000,4000,5000,6000,8000,
                    10000,50000]
labels_traffic_app = ["traffic_1","traffic_2","traffic_3","traffic_4","traffic_5","traffic_6","traffic_7","traffic_8",
                      "traffic_9","traffic_10","traffic_11","traffic_12","traffic_13","traffic_14","traffic_15",
                      "traffic_16"]

In [29]:
df["bins_click_app_per"] = pd.cut(df["click_per_app"],bins=bins_app_click_per,
                                       labels=labels_click_per,
                                       include_lowest=True)
df["bins_traffic_app"] = pd.cut(df["traffic_app"],bins=bins_traffic_app,
                                     labels=labels_traffic_app,
                                     include_lowest=True)
df.drop(["click_per_app","traffic_app"],axis=1,inplace=True)

In [30]:
t_final = pd.merge(t_final,df,on="app_code",how="left")

In [31]:
test_final = pd.merge(test_final,df,on="app_code",how="left")
test_final["bins_click_app_per"].fillna("click_1",inplace=True)
test_final["bins_traffic_app"].fillna("traffic_1",inplace=True)
test_final.index = test_index
del(df)

In [32]:
# df_os = pd.get_dummies(t_final[['os_version','binned_hours']])
# t_final = pd.concat([t_final,df_os],axis=1)
# t_final.drop(["os_version","binned_hours"],axis=1,inplace=True)

# df_os = pd.get_dummies(test_final[['os_version','binned_hours']])
# test_final = pd.concat([test_final,df_os],axis=1)
# test_final.drop(["os_version","binned_hours"],axis=1,inplace=True)


# df_os = pd.get_dummies(t_final[['bins_click_app_per','bins_traffic_app']])
# t_final = pd.concat([t_final,df_os],axis=1)
# t_final.drop(["bins_click_app_per","bins_traffic_app"],axis=1,inplace=True)

# df_os = pd.get_dummies(test_final[['bins_click_app_per','bins_traffic_app']])
# test_final = pd.concat([test_final,df_os],axis=1)
# test_final.drop(["bins_click_app_per","bins_traffic_app"],axis=1,inplace=True)

In [33]:
test_final.drop(["user_id","app_code"],axis=1,inplace=True)
t_final.drop(["user_id","app_code"],axis=1,inplace=True)

In [95]:
t_final.head()

Unnamed: 0,os_version,is_4G,is_click,binned_hours,cat_1_mean,cat_1_dev,cat_2_mean,cat_2_dev,cat_3_mean,cat_3_dev,unique_sess,total_sess,unique_items,unique_products_type,mean_price,std_dev_price,unique_dates,period_of_surfing,a_time_tot,b_time_tot,c_time_tot,total_by_unique,total_prod_by_uniq_item,tot_sess_by_unique_date,frequency_of_surf_days,bins_click_app_per,bins_traffic_app
0,2,0,0,0,11.0,0.0,35.0,0.0,20.0,0.0,1,1,1,1.0,2350.0,0.0,1,0.0,0.0,0.0,100.0,1.0,1.0,1.0,0.0,16,11
1,1,1,1,0,7.333333,3.576014,67.0,10.548589,257.5,72.470057,7,12,7,7.0,4774.285714,4142.30111,6,50.0,41.666667,16.666667,41.666667,1.714286,1.714286,2.0,8.333333,4,11
2,0,1,0,0,10.5,3.535534,44.0,32.526912,154.5,21.92031,2,2,2,2.0,1598.5,625.5,2,31.0,0.0,100.0,0.0,1.0,1.0,1.0,15.5,1,11
3,1,1,0,0,9.277778,5.244667,44.333333,12.461706,159.444444,70.529278,7,18,12,12.0,13663.0,25086.23912,7,12.0,5.555556,22.222222,72.222222,2.571429,1.5,2.571429,1.714286,15,3
4,1,0,0,0,9.978261,4.828108,32.652174,22.100998,168.391304,95.230592,24,46,34,34.0,10845.235294,20963.746847,19,54.0,21.73913,15.217391,63.043478,1.916667,1.352941,2.421053,2.842105,2,13


In [35]:
#***********************************  data prep ends  ******************************#
#***********************************  data prep ends  ******************************#
#***********************************  data prep ends  ******************************#
#***********************************  data prep ends  ******************************#
#***********************************  data prep ends  ******************************#
#***********************************  data prep ends  ******************************#
#***********************************  data prep ends  ******************************#
#***********************************  data prep ends  ******************************#
#***********************************  data prep ends  ******************************#
#***********************************  data prep ends  ******************************#

In [54]:
t_final.drop(["latest_latency","products_per_sess"],axis=1,inplace=True)#not good for this model but can be used
test_final.drop(["latest_latency","products_per_sess"],axis=1,inplace=True)#Prodcuts__>>> coz not imp feature

In [55]:
#t_final.head()
categorical_features = ["os_version","is_4G","binned_hours","bins_click_app_per","bins_traffic_app"]
from sklearn.preprocessing import LabelEncoder
t_final[categorical_features] = t_final[categorical_features].apply(LabelEncoder().fit_transform)
test_final[categorical_features] = test_final[categorical_features].apply(LabelEncoder().fit_transform)

In [56]:
X = t_final.drop(["is_click"],axis=1)
y = t_final[["is_click"]]

In [124]:
categorical_features = [0,1,2,27,28]

model = LGBMClassifier(boosting_type='goss', 
                      objective='binary',
                      num_iteration=15000,
                      num_leaves=80,
                      #min_data_in_leaf=5,
                      max_depth=7,
                      learning_rate=0.001,
                      #seed=0,
                      categorical_feature = categorical_features
                     )

In [58]:
from sklearn.model_selection import cross_val_score, GridSearchCV, KFold, RandomizedSearchCV, train_test_split
scores = cross_val_score(model, X, y,scoring='roc_auc', cv=5)

In [59]:
#100--> 37

array([0.7505507 , 0.75118953, 0.74861677, 0.74624838, 0.75004485])

In [125]:
model = model.fit(X,y)

In [126]:
pred = model.predict_proba(test_final)
pred = pred[:,1]
pred = pd.DataFrame(pred)
index = test_final.index
pred.set_index(index,inplace=True)
pred.columns = ["is_click"]

In [129]:
pred.to_csv("finalday14.csv")

In [128]:
#pred[pred["is_click"]>=0.5].shape

(37, 1)

In [64]:
feature_imp = pd.DataFrame(sorted(zip(model.feature_importances_,X.columns)), columns=['Value','Feature'])

In [65]:
feature_imp

Unnamed: 0,Value,Feature
0,3555,is_4G
1,6135,unique_products_type
2,6559,os_version
3,8271,unique_dates
4,10417,unique_items
5,13743,total_sess
6,14178,bins_traffic_app
7,15241,binned_hours
8,16630,unique_sess
9,16941,a_time_tot
