In [25]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv). Collection of functions for data processing and analysis modeled after R dataframes with SQL like features
import matplotlib #collection of functions for scientific and publication-ready visualization
import scipy as sp #collection of functions for scientific computing and advanced mathematics
import IPython
from IPython import display #pretty printing of dataframes in Jupyter notebook
import sklearn #collection of machine learning algorithms
import matplotlib.pyplot as plt
import seaborn as sns
import gc
import datetime
from sklearn import svm, tree, linear_model, neighbors, naive_bayes, ensemble, discriminant_analysis, gaussian_process
from xgboost import XGBClassifier
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn import feature_selection, model_selection, metrics
# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
# Show which files are present in ../input before anything is loaded.
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

['test_supplement.csv', 'train.csv', 'test.csv', 'sample_submission.csv', 'train_sample.csv']


In [26]:
# Explicit column dtypes handed to pd.read_csv; the id-like columns use
# small unsigned integers and the target flag is a boolean. Columns not
# listed here are left to pandas' own type inference.
data_types = dict(
    ip='uint32',
    app='uint16',
    device='uint16',
    os='uint16',
    channel='uint16',
    is_attributed='bool',
)

In [27]:
def CleanData(dataset):
    """Parse the timestamp columns of ``dataset`` into datetimes, in place.

    ``click_time`` is always converted. ``attributed_time`` is converted
    only when the column exists, so a frame that does not carry it can be
    cleaned with the same function instead of failing with ``KeyError``.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame with a ``click_time`` column and, optionally, an
        ``attributed_time`` column. It is modified in place.

    Returns
    -------
    None
        The result is written back into ``dataset``.
    """
    dataset['click_time'] = pd.to_datetime(dataset['click_time'])
    # Optional column: skip it when absent rather than raising.
    if 'attributed_time' in dataset.columns:
        dataset['attributed_time'] = pd.to_datetime(dataset['attributed_time'])

In [28]:
def FeatureEngineering(dataset):
    """Add time-of-day bucket columns derived from ``click_time``, in place.

    Three columns are written, each holding ``datetime.time`` objects (the
    date part is discarded):

    * ``click_time_hour`` - ``click_time`` rounded to the nearest hour
    * ``click_time_half`` - rounded to the nearest 30 minutes
    * ``click_time_qtr``  - rounded to the nearest 15 minutes

    The values are rounded, not floored: 03:57:46 becomes 04:00:00.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame whose ``click_time`` column is already a datetime column
        (see ``CleanData``). It is modified in place.

    Returns
    -------
    None
        The new columns are written into ``dataset``.
    """
    click_time = dataset['click_time']
    # 'h' and '30min' are the current spellings of the 'H' / '0.5H' aliases,
    # which newer pandas releases deprecate.
    buckets = (
        ('click_time_hour', 'h'),
        ('click_time_half', '30min'),
        ('click_time_qtr', '15min'),
    )
    for column, freq in buckets:
        # .dt.time is the vectorised equivalent of mapping x.time() per row.
        dataset[column] = click_time.dt.round(freq).dt.time

In [29]:
def TargetEncoder(dataset, columns=('ip', 'app', 'device', 'os', 'channel',
                                    'click_time_hour', 'click_time_half',
                                    'click_time_qtr')):
    """Compute the attribution rate of every distinct value in each column.

    For each column, the rate of a value is the number of rows with that
    value where ``is_attributed`` is true, divided by the total number of
    rows with that value. Values that were never attributed get 0.

    The encodings are returned to the caller; ``dataset`` itself is not
    modified.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame containing ``is_attributed`` and every column in ``columns``.
    columns : iterable of str, optional
        Columns to encode. Defaults to the five id columns plus the three
        ``click_time_*`` bucket columns.

    Returns
    -------
    dict
        Maps each column name to a ``pandas.Series`` indexed by that
        column's distinct values and holding the attribution rate.

    Raises
    ------
    KeyError
        If ``is_attributed`` or one of ``columns`` is missing.
    """
    # Filter the attributed rows once instead of once per column.
    attributed_rows = dataset[dataset['is_attributed'] == True]  # noqa: E712
    encodings = {}
    for col in columns:
        rate = attributed_rows[col].value_counts() / dataset[col].value_counts()
        # Values absent from the attributed rows come out as NaN after the
        # index-aligned division; they were never attributed, so the rate is 0.
        encodings[col] = rate.fillna(0)
    return encodings

In [30]:
# Load the sample training file with the explicit dtypes from data_types.
# The timestamp columns are not covered by data_types, so they are read as
# plain objects here and parsed later.
train_df = pd.read_csv('../input/train_sample.csv', dtype=data_types)
# info() prints the column dtypes, non-null counts and memory usage.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 8 columns):
ip                 100000 non-null uint32
app                100000 non-null uint16
device             100000 non-null uint16
os                 100000 non-null uint16
channel            100000 non-null uint16
click_time         100000 non-null object
attributed_time    251 non-null object
is_attributed      100000 non-null bool
dtypes: bool(1), object(2), uint16(4), uint32(1)
memory usage: 2.8+ MB


In [31]:
# Both helpers modify train_df in place. CleanData must run first because
# FeatureEngineering uses the .dt accessor on click_time.
CleanData(train_df)
FeatureEngineering(train_df)

In [32]:
# Re-check the schema after cleaning and feature engineering, then show
# the first rows (the last expression is rendered as the cell output).
train_df.info()
train_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 11 columns):
ip                 100000 non-null uint32
app                100000 non-null uint16
device             100000 non-null uint16
os                 100000 non-null uint16
channel            100000 non-null uint16
click_time         100000 non-null datetime64[ns]
attributed_time    251 non-null datetime64[ns]
is_attributed      100000 non-null bool
click_time_hour    100000 non-null object
click_time_half    100000 non-null object
click_time_qtr     100000 non-null object
dtypes: bool(1), datetime64[ns](2), object(3), uint16(4), uint32(1)
memory usage: 5.1+ MB


Unnamed: 0,ip,app,device,os,channel,click_time,attributed_time,is_attributed,click_time_hour,click_time_half,click_time_qtr
0,29540,3,1,42,489,2017-11-08 03:57:46,NaT,False,04:00:00,04:00:00,04:00:00
1,26777,11,1,25,319,2017-11-09 11:02:14,NaT,False,11:00:00,11:00:00,11:00:00
2,140926,12,1,13,140,2017-11-07 04:36:14,NaT,False,05:00:00,04:30:00,04:30:00
3,69375,2,1,19,377,2017-11-09 13:17:20,NaT,False,13:00:00,13:30:00,13:15:00
4,119166,9,2,15,445,2017-11-07 12:11:37,NaT,False,12:00:00,12:00:00,12:15:00


In [33]:
# One empty dict per feature column. Nothing in the visible cells fills
# them yet - presumably per-value counts of attributed clicks; confirm
# where they are populated.
(
    ip_attributed,
    app_attributed,
    device_attributed,
    os_attributed,
    channel_attributed,
    click_time_hour_attributed,
    click_time_half_attributed,
    click_time_qtr_attributed,
) = ({} for _ in range(8))

In [34]:
# One empty dict per feature column. Nothing in the visible cells fills
# them yet - presumably per-value counts of all clicks; confirm where they
# are populated.
(
    ip_total,
    app_total,
    device_total,
    os_total,
    channel_total,
    click_time_hour_total,
    click_time_half_total,
    click_time_qtr_total,
) = ({} for _ in range(8))

In [35]:
# Number of rows per chunk when a CSV is read with pd.read_csv(chunksize=...).
chunk_size = 10_000

In [36]:
# Print, column by column, the value stored under index label 0.
for col, values in train_df.items():
    print(values[0])

29540
3
1
42
489
2017-11-08 03:57:46
NaT
False
04:00:00
04:00:00
04:00:00


In [37]:
# Stream the sample file in pieces of chunk_size rows and report the schema
# of each piece.
for chunk in pd.read_csv('../input/train_sample.csv', dtype=data_types, chunksize=chunk_size):
    # info() prints its report itself and returns None, so it must not be
    # wrapped in print() - doing so appended a stray "None" to every report.
    chunk.info()
    print('\n')  # blank lines between consecutive reports

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 8 columns):
ip                 10000 non-null uint32
app                10000 non-null uint16
device             10000 non-null uint16
os                 10000 non-null uint16
channel            10000 non-null uint16
click_time         10000 non-null object
attributed_time    31 non-null object
is_attributed      10000 non-null bool
dtypes: bool(1), object(2), uint16(4), uint32(1)
memory usage: 283.3+ KB
None 


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 10000 to 19999
Data columns (total 8 columns):
ip                 10000 non-null uint32
app                10000 non-null uint16
device             10000 non-null uint16
os                 10000 non-null uint16
channel            10000 non-null uint16
click_time         10000 non-null object
attributed_time    22 non-null object
is_attributed      10000 non-null bool
dtypes: bool(1), object(2), uint16(4), uint32(1)
memor