In [1]:
# importing the libraries
import os
import pickle
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
from scipy import stats

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score

import lightgbm as lgb

In [2]:
# notebook-wide settings
warnings.filterwarnings("ignore")                             # silence every warning category
root_dir = os.path.dirname(os.path.abspath(os.getcwd()))      # parent folder of the current working directory
sns.set_style("darkgrid")                                     # seaborn theme used by all plots
plt.rcParams.update({"figure.figsize": (12, 8)})              # default figure size

In [29]:
# every raw CSV lives in <root_dir>/data
data_dir = os.path.join(root_dir, "data")
trainpath = os.path.join(data_dir, "train.csv")
item_data_path = os.path.join(data_dir, "item_data.csv")
view_log_path = os.path.join(data_dir, "view_log.csv")
testpath = os.path.join(data_dir, "test.csv")

# read each file into its own DataFrame (same order as the paths above)
train_df, item_df, view_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (trainpath, item_data_path, view_log_path, testpath)
)

In [30]:
# preview the first five rows of the impressions (train) table
train_df.head(n=5)

Unnamed: 0,impression_id,impression_time,user_id,app_code,os_version,is_4G,is_click
0,c4ca4238a0b923820dcc509a6f75849b,2018-11-15 00:00:00,87862,422,old,0,0
1,45c48cce2e2d7fbdea1afc51c7c6ad26,2018-11-15 00:01:00,63410,467,latest,1,1
2,70efdf2ec9b086079795c442636b55fb,2018-11-15 00:02:00,71748,259,intermediate,1,0
3,8e296a067a37563370ded05f5a3bf3ec,2018-11-15 00:02:00,69209,244,latest,1,0
4,182be0c5cdcd5072bb1864cdee4d3d6e,2018-11-15 00:02:00,62873,473,latest,0,0


### Column descriptions - 
- impression_id = Ad impression ID (unique). Shouldn't be fed to the model
- impression_time = time of impression at the partner site
- user_id = user identification
- app_code = Application code for the partner website where the ad was shown
- os_version = version of operating system
- is_4G = 1 (Using 4G), 0 (No 4G)
- is_click = **target**; 0 (No Click), 1 (Click)

In [31]:
# preview the first five rows of the item table
item_df.head(n=5)

Unnamed: 0,item_id,item_price,category_1,category_2,category_3,product_type
0,26880,4602,11,35,20,3040
1,54939,3513,12,57,85,6822
2,40383,825,17,8,279,1619
3,8777,2355,13,58,189,5264
4,113705,1267,17,39,151,10239


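The variables are quite self-explanatory. However, this table cannot be joined directly to the main (train) table because the two share no column; the link goes through ``view_df``, which has both ``user_id`` and ``item_id``.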
- item_id is unique in this dataset

In [32]:
# preview the first five rows of the view log
view_df.head(n=5)

Unnamed: 0,server_time,device_type,session_id,user_id,item_id
0,2018-10-15 08:58:00,android,112333,4557,32970
1,2018-10-15 08:58:00,android,503590,74788,7640
2,2018-10-15 08:58:00,android,573960,23628,128855
3,2018-10-15 08:58:00,android,121691,2430,12774
4,2018-10-15 08:58:00,android,218564,19227,28296


### Column Description -
- server_time = Timestamp of the log
- device_type = Device type of the user
- session_id = Browser session id
- user_id = can be used to map this table to train_df
- item_id = can be used to map this table to item_df

We have two attributes that capture time: ``server_time`` in ``view_df`` and ``impression_time`` in ``train_df``. These attributes can help us check whether the dataset has a time component that influences the label.

For this, we will first convert them into datetime format and then sort them.

In [33]:
# 01. converting date columns into datetime format

# For each table: parse its timestamp column, order the rows chronologically,
# and replace the index with a fresh 0..n-1 range.
view_df = (
    view_df
    .assign(server_time=lambda frame: pd.to_datetime(frame["server_time"]))
    .sort_values(by="server_time")
    .reset_index(drop=True)
)
train_df = (
    train_df
    .assign(impression_time=lambda frame: pd.to_datetime(frame["impression_time"]))
    .sort_values(by="impression_time")
    .reset_index(drop=True)
)

In [36]:
%%time
# 02. Datetime features
# the following snippet gives the mean impression time difference for a particular user.
# np.timedelta helps us to perform arithmentic on two timedeltas. The first argument defines a number and second defines the unit of time. 
# Here, that unit is seconds

train_df["diff_time_mean"] = train_df["user_id"].map(
    train_df.groupby("user_id")["impression_time"].apply(lambda x: np.nanmean(x.diff() / np.timedelta64(1, "s"))).to_dict()
)

# similarly performing some other functions
train_df["diff_time_max"] = train_df["user_id"].map(
    train_df.groupby("user_id")["impression_time"].apply(lambda x: np.nanmax(x.diff() / np.timedelta64(1, "s"))).to_dict()
)
train_df["diff_time_min"] = train_df["user_id"].map(
    train_df.groupby("user_id")["impression_time"].apply(lambda x: np.nanmin(x.diff() / np.timedelta64(1, "s"))).to_dict()
)

In [37]:
# check the first five rows now that the diff_time_* columns have been added
train_df.head(n=5)

Unnamed: 0,impression_id,impression_time,user_id,app_code,os_version,is_4G,is_click,diff_time_mean,diff_time_max,diff_time_min
0,c4ca4238a0b923820dcc509a6f75849b,2018-11-15 00:00:00,87862,422,old,0,0,74670.0,148200.0,1140.0
1,c81e728d9d4c2f636f067f89cc14862c,2018-11-15 00:00:00,89464,129,intermediate,0,0,101901.818182,347520.0,180.0
2,eccbc87e4b5ce2fe28308fd9f2a7baf3,2018-11-15 00:00:00,58442,127,latest,0,0,68812.727273,165720.0,8400.0
3,a87ff679a2f3e71d9181a67b7542122c,2018-11-15 00:00:00,4238,371,latest,0,0,540.0,540.0,540.0
4,45c48cce2e2d7fbdea1afc51c7c6ad26,2018-11-15 00:01:00,63410,467,latest,1,1,45745.882353,167340.0,360.0


In [46]:
# 03. encoding os_version

# Guard against re-running this cell: once os_version holds integer codes, fitting
# again would overwrite the pickled encoder with one that only knows those integers
# and can no longer encode the raw labels of the test set.
if not pd.api.types.is_numeric_dtype(train_df["os_version"]):
    encoder_1 = LabelEncoder()
    os_encoder = encoder_1.fit(train_df["os_version"])  # fit() returns the same encoder object

    # serializing and dumping the encoder for future use (for encoding the testset)
    models_dir = os.path.join(root_dir, "models")
    os.makedirs(models_dir, exist_ok=True)  # opening the file for writing fails if the folder is missing
    with open(os.path.join(models_dir, "os_encoder"), "wb") as f:
        pickle.dump(os_encoder, f)

    # transforming
    train_df["os_version"] = os_encoder.transform(train_df["os_version"])

##########################################################################
# To reuse the saved encoder later, run the snippet below.
# NOTE: the triple-quoted block is a bare string expression, not a comment; as the
# last expression of the cell it is echoed back as the cell's output.
# Only unpickle files you created yourself: pickle.load can execute arbitrary code.
"""
with open(os.path.join(root_dir, "models", "os_encoder"), 'rb') as handle:
    b = pickle.load(handle)
"""
##########################################################################

'\nwith open(os.path.join(root_dir, "models", "os_encoder"), \'rb\') as handle:\n    b = pickle.load(handle)\n'

In [None]:
# number of distinct app_code values seen for each user, broadcast onto every row of that user
train_df["cnt_unique_app"] = train_df["user_id"].map(
    train_df.groupby("user_id")["app_code"].nunique(dropna=False)
)

In [None]:
# 04. count unique apps used by the user
# one count per user, then mapped back so every row carries its user's count
unique_apps_per_user = train_df.groupby("user_id")["app_code"].nunique(dropna=False)
train_df["cnt_unique_app"] = train_df["user_id"].map(unique_apps_per_user)

In [51]:
# per-row count of distinct app_code values for the row's user (displayed only, not stored)
train_df["user_id"].map(
    train_df.groupby("user_id")["app_code"].nunique(dropna=False)
)

0         1
1         1
2         1
3         1
4         2
         ..
237604    1
237605    1
237606    1
237607    1
237608    1
Name: user_id, Length: 237609, dtype: int64