In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb

from sklearn.model_selection import train_test_split
from sklearn import preprocessing

%matplotlib inline
plt.rcParams['figure.figsize'] = (16,8)

In [2]:
dtypes = {
    'channelGrouping' : str,
    'date' : str,
    'device' : str,
    'fullVisitorId' : str,
    'geoNetwork' : str,
    'sessionId' : str,
    'socialEngagementType' : str,
    'totals' : str,
    'trafficSource' : str,
    'visitId' : np.int64,
    'visitNumber' : np.int16,
    'visitStartTime' : np.int64
}

In [3]:
train = pd.read_csv('../input/train.csv', dtype=dtypes, parse_dates=['date'])

In [4]:
def data_engineering(df):
    
    def get_keys(series):
        keys = set()
        for element in series:
            keys = keys.union(set(element.keys()))
            
        return keys
    
    def get_value(d, key):
        if key in d:
            try:
                return eval(d[key])
            except:
                return d[key]
        else:
            return np.nan
    
    df['device'] = df['device'].transform(lambda x: eval(x))
    df['totals'] = df['totals'].transform(lambda x: eval(x))
    df['geoNetwork'] = df['geoNetwork'].transform(lambda x: eval(x))
    df['trafficSource'] = df['trafficSource'].transform(lambda x: eval(x))
    df['visitStartTime'] = pd.to_datetime(df['visitStartTime'], unit='s')
    
    keys = get_keys(df['geoNetwork'])
    for key in keys:
        df['g_'+key] = df['geoNetwork'].transform(lambda x: get_value(x, key))
        
    keys = get_keys(df['totals'])
    for key in keys:
        if key == 'transactionRevenue':
            df['target'] = df['totals'].transform(lambda x: get_value(x, key))
        else:
            df['t_'+key] = df['totals'].transform(lambda x: get_value(x, key))
        
    keys = get_keys(df['trafficSource'])
    for key in keys:
        df['ts_'+key] = df['trafficSource'].transform(lambda x: get_value(x, key))
        
    keys = get_keys(df['device'])
    for key in keys:
        df['d_'+key] = df['device'].transform(lambda x: get_value(x, key))
    
    df.drop(['totals', 'trafficSource', 'device', 'geoNetwork'], axis=1, inplace=True)
    
    # all levels are equal
    df.drop([
             'g_cityId', 'g_networkLocation', 'g_latitude', 'g_longitude', 't_visits',
             'ts_adwordsClickInfo', 'd_mobileDeviceInfo', 'd_mobileDeviceModel', 'd_language',
             'd_flashVersion', 'd_browserVersion', 'd_mobileDeviceBranding', 'd_mobileInputSelector',
             'd_mobileDeviceMarketingName', 'd_operatingSystemVersion', 'd_screenResolution', 'd_screenColors',
             'd_browserSize', 'ts_campaignCode'
            ], axis=1, inplace=True)
    
    for column in df.columns:
        df[column] = df[column].replace('(none)', np.nan)
        df[column] = df[column].replace('not available in demo dataset', np.nan)
        df[column] = df[column].replace('(not provided)', np.nan)
        
    df.loc[:,'target_bool'] = 0
    df.loc[df['target'] > 0,'target_bool'] = 1
    
    df['month'] = df['date'].dt.month
    df['weekday'] = df['date'].dt.dayofweek
    
    df['socialEngagementType'].fillna(value=False, inplace=True)
    df['ts_isTrueDirect'].fillna(value=False, inplace=True)
    df['target'].fillna(0, inplace=True)
    df['ts_medium'].fillna('NA', inplace=True)
    df['ts_adContent'].fillna('NA', inplace=True)

In [5]:
true, false = True, False
data_engineering(train)

In [None]:
infer = pd.read_csv('../input/nput/')

In [68]:
def split(df, test_size=0.3):
    X = df.drop([
        'target', 'target_bool', 'visitStartTime', 'date', 'fullVisitorId', 'sessionId',
        'g_networkDomain', 'ts_referralPath', 'ts_keyword', 'ts_source', 'g_region', 'g_city', 'g_metro',
        'd_browser', 'g_region', 'socialEngagementType'
    
    ], axis=1)
    
    def encode_variables(df):
        variables = ['channelGrouping', 'g_continent', 'g_subContinent',
                     'g_country', 'ts_isTrueDirect', 'ts_medium', 'ts_campaign', 
                     'ts_adContent', 'd_operatingSystem', 'd_deviceCategory']

        for var in variables:
            df[var] = df[var].replace(False, 'False')
            df[var] = df[var].replace(True, 'True')
            encoder = preprocessing.LabelEncoder()
            encoder.fit(df[var])
            df[var] = encoder.transform(df[var])
    
    encode_variables(X)
    
    y = np.log1p(df['target'])
    
    return train_test_split(X, y, test_size=test_size, random_state=42)

In [69]:
X_train, X_test, y_train, y_test = split(train, 0.3)

In [73]:
model = xgb.XGBRegressor(max_depth=5, n_estimators=200, n_jobs=-1, nthread=8, random_state=42)

In [74]:
model.fit(X_train, y_train, eval_set=[(X_train, y_train)], eval_metric='rmse', early_stopping_rounds=10)

[0]	validation_0-rmse:1.96377
Will train until validation_0-rmse hasn't improved in 10 rounds.
[1]	validation_0-rmse:1.91608
[2]	validation_0-rmse:1.87611
[3]	validation_0-rmse:1.84176
[4]	validation_0-rmse:1.81338
[5]	validation_0-rmse:1.78973
[6]	validation_0-rmse:1.76922
[7]	validation_0-rmse:1.75196
[8]	validation_0-rmse:1.73784
[9]	validation_0-rmse:1.72555
[10]	validation_0-rmse:1.71523
[11]	validation_0-rmse:1.70637
[12]	validation_0-rmse:1.69852
[13]	validation_0-rmse:1.69209
[14]	validation_0-rmse:1.68626
[15]	validation_0-rmse:1.68102
[16]	validation_0-rmse:1.67667
[17]	validation_0-rmse:1.67259
[18]	validation_0-rmse:1.66882
[19]	validation_0-rmse:1.66529
[20]	validation_0-rmse:1.66227
[21]	validation_0-rmse:1.65973
[22]	validation_0-rmse:1.65729
[23]	validation_0-rmse:1.65508
[24]	validation_0-rmse:1.65295
[25]	validation_0-rmse:1.65078
[26]	validation_0-rmse:1.64867
[27]	validation_0-rmse:1.64676
[28]	validation_0-rmse:1.64487
[29]	validation_0-rmse:1.64355
[30]	validation

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=5, min_child_weight=1, missing=None, n_estimators=200,
       n_jobs=-1, nthread=8, objective='reg:linear', random_state=42,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [None]:
def agg(X,y):
    df = pd.merge(X,y, left_on=[''])

In [None]:
def aggregate_df(df):
    
    def most_common(lst):
        return max(set(lst), key=lst.count)
    
    agg = pd.DataFrame(data=df['fullVisitorId'].unique(), columns=['fullVisitorId'])
    
    agg = pd.merge(
            agg, 
            np.log1p(df.groupby('fullVisitorId')['target'].sum()).to_frame(), 
            left_on='fullVisitorId', 
            right_on='fullVisitorId')
    
    agg = pd.merge(
            agg, 
            df.groupby('fullVisitorId')['sessionId'].count().to_frame(), 
            left_on='fullVisitorId', 
            right_on='fullVisitorId')
    
    agg = pd.merge(
            agg, 
            df.groupby('fullVisitorId')['target_bool'].sum().to_frame(), 
            left_on='fullVisitorId', 
            right_on='fullVisitorId')
    
    agg = pd.merge(
            agg,
            df.groupby('fullVisitorId')['g_country'].apply(lambda x: most_common(x.values.tolist())).to_frame(),
            left_on='fullVisitorId', 
            right_on='fullVisitorId')
    
    train.groupby('fullVisitorId')['g_country'].apply(lambda x: most_common(x.values.tolist()))
    
    agg.columns = ['fullVisitorId', 'target', 'num_sess', 'num_deals', 'country']
    
    return agg