In [1]:
import ast
import json
from collections import Counter

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Render matplotlib figures inline, directly below the cell that creates them.
%matplotlib inline
# Default figure size (width, height) used by every subsequent matplotlib plot.
plt.rcParams['figure.figsize'] = (16,8)

In [2]:
# Column -> dtype mapping handed to pd.read_csv.
# Every column listed in the first group is read as plain text; the three
# visit* columns after it are read as fixed-width integers.
dtypes = dict.fromkeys(
    (
        'channelGrouping',
        'date',
        'device',
        'fullVisitorId',
        'geoNetwork',
        'sessionId',
        'socialEngagementType',
        'totals',
        'trafficSource',
    ),
    str,
)
dtypes.update(
    visitId=np.int64,
    visitNumber=np.int16,
    visitStartTime=np.int64,
)

In [3]:
# Load the training sessions; `date` is additionally parsed into datetimes.
train = pd.read_csv(
    '../input/train.csv',
    parse_dates=['date'],
    dtype=dtypes,
)

In [4]:
def data_engineering(df):
    """Flatten the JSON-encoded columns of ``df`` in place and derive features.

    Steps, all applied to ``df`` itself:

    1. ``visitStartTime`` (epoch seconds) is converted to datetimes.
    2. Each of ``geoNetwork``, ``totals``, ``trafficSource`` and ``device`` is
       parsed as JSON and expanded into one column per key, prefixed with
       ``g_``, ``t_``, ``ts_`` and ``d_`` respectively. The ``totals`` key
       ``transactionRevenue`` becomes the column ``target``.
    3. The four raw JSON columns and a fixed list of uninformative expanded
       columns are dropped.
    4. Placeholder values (``'(none)'``, ``False``,
       ``'not available in demo dataset'``, ``'(not provided)'``) are replaced
       by NaN in every column.
    5. ``target_bool`` (1 where ``target > 0``, else 0), ``month`` and
       ``weekday`` (both taken from ``date``) are added.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``visitStartTime``, the four JSON string columns listed
        above, and a datetime ``date`` column.

    Returns
    -------
    None
        ``df`` is modified in place.

    Raises
    ------
    KeyError
        If one of the required input columns is missing.
    ValueError
        (``json.JSONDecodeError``) if a JSON column holds invalid JSON.
    """
    json_columns = (
        ('geoNetwork', 'g_'),
        ('totals', 't_'),
        ('trafficSource', 'ts_'),
        ('device', 'd_'),
    )
    # all levels are equal
    constant_columns = [
        'g_cityId', 'g_networkLocation', 'g_latitude', 'g_longitude', 't_visits',
        'ts_adwordsClickInfo', 'd_mobileDeviceInfo', 'd_mobileDeviceModel', 'd_language',
        'd_flashVersion', 'd_browserVersion', 'd_mobileDeviceBranding', 'd_mobileInputSelector',
        'd_mobileDeviceMarketingName', 'd_operatingSystemVersion', 'd_screenResolution', 'd_screenColors',
        'd_browserSize',
    ]
    missing_markers = ('(none)', False, 'not available in demo dataset', '(not provided)')

    def parse_scalar(value):
        """Turn a JSON leaf into a Python literal when it spells one."""
        if not isinstance(value, str):
            # Already decoded by json.loads (bool, number, nested dict, ...).
            return value
        if value == '(not set)':
            # Placeholder for an unknown value; treated as missing.
            return np.nan
        try:
            # literal_eval only accepts literals, so text such as "max" or
            # "print" stays a string instead of resolving to an object, and
            # no code from the data file is ever executed.
            return ast.literal_eval(value)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            return value

    def expand_json_column(column, prefix):
        """Add one ``prefix + key`` column per key found in ``df[column]``."""
        # json.loads understands true/false/null natively, so no global
        # `true`/`false` aliases are required for parsing.
        records = df[column].map(json.loads)

        keys = set()
        for record in records:
            keys.update(record)

        # Sorted so that the resulting column order is the same on every run.
        for key in sorted(keys):
            if (column, key) == ('totals', 'transactionRevenue'):
                name = 'target'
            else:
                name = prefix + key
            df[name] = records.map(
                lambda record, key=key: parse_scalar(record[key]) if key in record else np.nan
            )

    df['visitStartTime'] = pd.to_datetime(df['visitStartTime'], unit='s')

    for column, prefix in json_columns:
        expand_json_column(column, prefix)

    df.drop([column for column, _ in json_columns], axis=1, inplace=True)
    # errors='ignore': not every extract necessarily contains every key.
    df.drop(constant_columns, axis=1, inplace=True, errors='ignore')

    if 'target' not in df.columns:
        # No session carried transactionRevenue (e.g. unlabeled data).
        df['target'] = np.nan

    for column in df.columns:
        cleaned = df[column]
        for marker in missing_markers:
            cleaned = cleaned.replace(marker, np.nan)
        df[column] = cleaned

    # NaN > 0 is False, so sessions without revenue get 0.
    df['target_bool'] = (df['target'] > 0).astype(np.int64)

    df['month'] = df['date'].dt.month
    df['weekday'] = df['date'].dt.dayofweek

In [5]:
# Lowercase aliases for the JSON boolean literals. They only matter if the
# JSON-like columns are parsed with eval(); otherwise they are harmless.
# NOTE(review): confirm whether these are still required.
true = True
false = False

# Expands the JSON columns of `train` in place (the call returns nothing).
data_engineering(train)

In [67]:
# Sessions for which no continent is recorded.
train.loc[train['g_continent'].isna()]

Unnamed: 0,channelGrouping,date,fullVisitorId,sessionId,socialEngagementType,visitId,visitNumber,visitStartTime,month,weekday,...,ts_keyword,ts_isTrueDirect,ts_campaignCode,ts_referralPath,ts_source,d_deviceCategory,d_operatingSystem,d_browser,d_isMobile,target_bool
310,Referral,2016-09-02,149633083723377298,149633083723377298_1472819179,Not Socially Engaged,1472819179,1,2016-09-02 12:26:19,9,4,...,,,,/analytics/web/,analytics.google.com,desktop,Windows,Chrome,,0
1312,Social,2016-09-02,0983912688516501789,0983912688516501789_1472833208,Not Socially Engaged,1472833208,1,2016-09-02 16:20:08,9,4,...,,,,/yt/about/en-GB/,youtube.com,mobile,,Opera Mini,1.0,0
1784,Organic Search,2016-09-02,9660655671713860097,9660655671713860097_1472829789,Not Socially Engaged,1472829789,1,2016-09-02 15:23:09,9,4,...,,,,,google,desktop,Windows,Firefox,,0
1788,Organic Search,2016-09-02,4213667415448778925,4213667415448778925_1472844331,Not Socially Engaged,1472844331,1,2016-09-02 19:25:31,9,4,...,,,,,google,desktop,Windows,Chrome,,0
2125,Social,2016-09-02,4696756425353528990,4696756425353528990_1472835529,Not Socially Engaged,1472835529,1,2016-09-02 16:58:49,9,4,...,,,,/yt/about/,youtube.com,desktop,Windows,Opera,,0
2138,Social,2016-09-02,8682450913674241936,8682450913674241936_1472876158,Not Socially Engaged,1472876158,1,2016-09-03 04:15:58,9,4,...,,,,/yt/about/,youtube.com,desktop,Windows,Chrome,,0
3203,Affiliates,2017-01-26,000696790665307308,000696790665307308_1485439370,Not Socially Engaged,1485439370,2,2017-01-26 14:02:50,1,3,...,,True,,,Partners,desktop,Linux,Chrome,,0
3584,Social,2017-01-26,5076747184606670106,5076747184606670106_1485429657,Not Socially Engaged,1485429657,1,2017-01-26 11:20:57,1,3,...,,,,/yt/about/zh-CN/,youtube.com,desktop,Macintosh,Safari,,0
4270,Affiliates,2017-01-26,000696790665307308,000696790665307308_1485431224,Not Socially Engaged,1485431224,1,2017-01-26 11:47:04,1,3,...,,,,,Partners,desktop,Linux,Chrome,,0
8608,Social,2017-03-12,9768709211110843370,9768709211110843370_1489355312,Not Socially Engaged,1489355312,1,2017-03-12 21:48:32,3,6,...,,,,/yt/about/,youtube.com,desktop,Windows,Chrome,,0


In [66]:
# Number of missing values in each column.
train.isna().sum(axis=0)

channelGrouping              0
date                         0
fullVisitorId                0
sessionId                    0
socialEngagementType         0
visitId                      0
visitNumber                  0
visitStartTime               0
month                        0
weekday                      0
g_networkDomain         244881
g_city                  542491
g_country                 1468
g_continent               1468
g_region                536056
g_metro                 709995
g_subContinent            1468
t_newVisits             200593
target                  892138
t_pageviews                100
t_hits                       0
t_bounces               453023
ts_adContent            892707
ts_medium               143146
ts_campaign             865347
ts_keyword              869292
ts_isTrueDirect         629648
ts_campaignCode         903652
ts_referralPath         572712
ts_source                   69
d_deviceCategory             0
d_operatingSystem         4695
d_browse

In [57]:
def aggregate_df(df):
    """Collapse session-level rows into one row per visitor.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``fullVisitorId``, ``sessionId``, ``target``,
        ``target_bool`` and ``g_country``.

    Returns
    -------
    pandas.DataFrame
        One row per distinct ``fullVisitorId``, in order of first appearance,
        with a fresh 0..n-1 index and the columns:

        * ``fullVisitorId``
        * ``target``    - ``log1p`` of the visitor's summed ``target``
        * ``num_sess``  - number of non-null ``sessionId`` values
        * ``num_deals`` - sum of ``target_bool``
        * ``country``   - most frequent ``g_country`` value (NaN included);
          on a tie the value seen first wins
    """

    def most_common(values):
        # Counter counts in a single pass and breaks ties by first occurrence.
        return Counter(values).most_common(1)[0][0]

    # sort=False keeps visitors in the order they first appear in `df`.
    per_visitor = df.groupby('fullVisitorId', sort=False)

    agg = pd.DataFrame({
        'target': np.log1p(per_visitor['target'].sum()),
        'num_sess': per_visitor['sessionId'].count(),
        'num_deals': per_visitor['target_bool'].sum(),
        'country': per_visitor['g_country'].apply(
            lambda countries: most_common(countries.values.tolist())
        ),
    })

    agg = agg.reset_index()

    return agg[['fullVisitorId', 'target', 'num_sess', 'num_deals', 'country']]