In [1]:
import gc
import os
import logging
import datetime
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import lightgbm as lgb
from tqdm import tqdm_notebook
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
from sklearn.metrics import roc_auc_score, confusion_matrix

from sklearn.calibration import CalibratedClassifierCV


from sklearn.metrics import f1_score,precision_recall_curve,roc_curve, recall_score,precision_score

from sklearn.model_selection import StratifiedKFold
warnings.filterwarnings('ignore')

In [2]:
from imblearn.pipeline import make_pipeline as imbalanced_make_pipeline
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import NearMiss
from imblearn.metrics import classification_report_imbalanced

Using TensorFlow backend.


In [3]:
# Allow up to 200 columns and 200 rows in displayed frames before pandas truncates them.
pd.options.display.max_columns = 200
pd.options.display.max_rows = 200

In [4]:
# `os` is already imported in the notebook's first cell, so it is not imported again here.
# List the entries of the competition input directory.
os.listdir('../input/wns-analyticswizard')

['test_aq1fgdb', 'train_na17sgz', 'sample_submission_ipsblct']

In [5]:
%%time
Path ='../input/wns-analyticswizard/'


train_df = pd.read_csv(Path+'train_na17sgz/train.csv')
test_df = pd.read_csv(Path+'test_aq1fgdb/test.csv')
viewlog_df = pd.read_csv(Path+'train_na17sgz/view_log.csv')
itemdata_df = pd.read_csv(Path+'train_na17sgz/item_data.csv')
subm_df = pd.read_csv(Path+'sample_submission_ipsblct/sample_submission.csv')

targetcol = 'is_click'
target = train_df[targetcol]

CPU times: user 2.24 s, sys: 444 ms, total: 2.69 s
Wall time: 2.7 s


In [6]:
# subm_df.head()

In [7]:
# Shapes of both frames, then the number of test rows per training row.
print(f'{train_df.shape} {test_df.shape}')
print(len(test_df) / len(train_df))

(237609, 7) (90675, 6)
0.3816143327904246


In [8]:
%%time
features = train_df.columns.values[1:40]
exclude_cols =['loan_id','financial_institution','origination_date',
               'first_payment_date','loan_purpose','source',targetcol]
features = [col for col in train_df.columns if col not in exclude_cols]

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 155 µs


In [9]:
#generate date features
def gen_date_feats(data, orig_date_format):
    """Parse ``impression_time`` and add its calendar parts as new columns.

    The frame is modified in place: ``impression_time`` is replaced by the
    result of ``pd.to_datetime`` using ``orig_date_format``, and the columns
    ``impression_year``, ``impression_month``, ``impression_day`` and
    ``impression_hour`` are written from it.

    Parameters
    ----------
    data : DataFrame holding an ``impression_time`` column.
    orig_date_format : strftime-style format string passed to ``pd.to_datetime``.

    Returns
    -------
    The same ``data`` object that was passed in.
    """
    data['impression_time'] = pd.to_datetime(
        data['impression_time'], format=orig_date_format
    )
    stamp = data['impression_time'].dt

    # Column name -> datetime component; insertion order fixes the column order.
    derived = {
        'impression_year': stamp.year,
        'impression_month': stamp.month,
        'impression_day': stamp.day,
        'impression_hour': stamp.hour,
    }
    for col_name, values in derived.items():
        data[col_name] = values

    return data

# Timestamp layout handed to gen_date_feats for parsing `impression_time`.
timeformat = '%Y-%m-%d %H:%M:%S'
train_df, test_df = (
    gen_date_feats(frame, timeformat) for frame in (train_df, test_df)
)

# Preview the parsed timestamp next to the columns derived from it.
cols = [
    'impression_time',
    'impression_year',
    'impression_month',
    'impression_day',
    'impression_hour',
]
print(train_df[cols].head(), test_df[cols].head(), sep='\n')

      impression_time  impression_year  impression_month  impression_day  \
0 2018-11-15 00:00:00             2018                11              15   
1 2018-11-15 00:01:00             2018                11              15   
2 2018-11-15 00:02:00             2018                11              15   
3 2018-11-15 00:02:00             2018                11              15   
4 2018-11-15 00:02:00             2018                11              15   

   impression_hour  
0                0  
1                0  
2                0  
3                0  
4                0  
      impression_time  impression_year  impression_month  impression_day  \
0 2018-12-13 07:44:00             2018                12              13   
1 2018-12-13 07:45:00             2018                12              13   
2 2018-12-13 07:46:00             2018                12              13   
3 2018-12-13 07:47:00             2018                12              13   
4 2018-12-13 07:48:00             201

In [10]:
# Write both preprocessed frames to CSV in the working directory, without the index.
train_df.to_csv(path_or_buf='train_preproc.csv', index=False)
test_df.to_csv(path_or_buf='test_preproc.csv', index=False)

In [11]:
# Reload the preprocessed frames from CSV. `impression_time` is parsed back
# into datetimes here because a plain read_csv would return it as strings,
# discarding the conversion done by gen_date_feats.
train_df = pd.read_csv('train_preproc.csv', parse_dates=['impression_time'])
test_df = pd.read_csv('test_preproc.csv', parse_dates=['impression_time'])

In [12]:
# Show the first five rows of the reloaded training frame.
train_df.head(n=5)

Unnamed: 0,impression_id,impression_time,user_id,app_code,os_version,is_4G,is_click,impression_year,impression_month,impression_day,impression_hour
0,c4ca4238a0b923820dcc509a6f75849b,2018-11-15 00:00:00,87862,422,old,0,0,2018,11,15,0
1,45c48cce2e2d7fbdea1afc51c7c6ad26,2018-11-15 00:01:00,63410,467,latest,1,1,2018,11,15,0
2,70efdf2ec9b086079795c442636b55fb,2018-11-15 00:02:00,71748,259,intermediate,1,0,2018,11,15,0
3,8e296a067a37563370ded05f5a3bf3ec,2018-11-15 00:02:00,69209,244,latest,1,0,2018,11,15,0
4,182be0c5cdcd5072bb1864cdee4d3d6e,2018-11-15 00:02:00,62873,473,latest,0,0,2018,11,15,0


In [13]:
# Show the first five rows of the reloaded test frame.
test_df.head(n=5)

Unnamed: 0,impression_id,impression_time,user_id,app_code,os_version,is_4G,impression_year,impression_month,impression_day,impression_hour
0,a9e7126a585a69a32bc7414e9d0c0ada,2018-12-13 07:44:00,44754,127,latest,1,2018,12,13,7
1,caac14a5bf2ba283db7708bb34855760,2018-12-13 07:45:00,29656,44,latest,0,2018,12,13,7
2,13f10ba306a19ce7bec2f3cae507b698,2018-12-13 07:46:00,25234,296,latest,1,2018,12,13,7
3,39c4b4dc0e9701b55a0a4f072008fb3f,2018-12-13 07:47:00,22988,207,latest,1,2018,12,13,7
4,bf5a572cca75f5fc67f4b14e58b11d70,2018-12-13 07:48:00,35431,242,latest,1,2018,12,13,7
