In [1]:
## importing libraries
import numpy as np
import pandas as pd
import datetime 

import sys

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.grid_search import GridSearchCV
from sklearn import metrics
from sklearn.model_selection import StratifiedKFold
from sklearn import preprocessing, cross_validation

from sklearn.model_selection import cross_validate as cv

import xgboost as xgb

import os
os.environ['KMP_DUPLICATE_LIB_OK']='True'



In [8]:
def encode_categories(df):
        """Encodes categorical variables into one-hot or label.
        """
        # extracting categorical variables
        categorical_variables = []

        for colname in df.columns:
            if df[colname].dtype == "object":
                categorical_variables.append(colname)
                print("Categorical Variable: %s, No. Categories: %d" % (colname, len(np.unique(df[colname].values.astype("str")))))

        if len(categorical_variables) > 0:
            print("1: Label encode categorical variables\n2: Onehot encode categorical variables\n3: Remove categorical variables\n4: Do nothing")
            
            while True:
                encoding = "1"
                if encoding.lower() not in ["1", "2", "3", "4"]:
                    print("Please choose one of the above: ")
                else:
                    print("")
                    break
                    break

            if encoding == "1":
                label = LabelEncoder()
                for colname in categorical_variables:
                    label.fit(list(df[colname].values.astype("str")))
                    df[colname] = label.transform(list(df[colname].values.astype("str")))
                print("Label encoded the categorical variables")
            elif encoding == "2":
                df = pd.get_dummies(df, columns=categorical_variables)
                df = df[df.columns]
                print("Onehot encoded the categorical variables")
            elif encoding == "3":
                df.drop(categorical_variables, axis=1, inplace=True)
                print("Categorical variables removed from data")

        return df

In [2]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

In [3]:
final_feature = pd.read_csv("features_stage1.csv")

In [6]:
def date_features(df):
    df['DateTime'] = pd.to_datetime(df['DateTime'])
    # Extracting Day, weekday and hours from timestamp
    df['date'] = df['DateTime'].dt.date
    df['day'] = df['DateTime'].dt.day
    df['weekday'] = df['DateTime'].dt.weekday
    df['hour'] = df['DateTime'].dt.hour
    
    return df

In [7]:
def train_test_cleaning(df):
    df['product_category_2'].fillna(99999, inplace=True)
    df['user_group_id'].fillna(99, inplace=True)
    df['gender'].fillna('Unknown', inplace=True)
    df['age_level'].fillna(9, inplace=True)
    df['user_depth'].fillna(9, inplace=True)
    df['city_development_index'].fillna(9, inplace=True)
    df = df.fillna(0)
    return df

In [10]:
train['train'] = 1
test['train'] = 0
feature_data = train.drop(columns='is_click').append(test)
feature_data = date_features(feature_data)
feature_data = train_test_cleaning(feature_data)
feature_data = encode_categories(feature_data)
feature_data['date'] = feature_data['DateTime'].dt.date

Categorical Variable: product, No. Categories: 10
Categorical Variable: gender, No. Categories: 3
Categorical Variable: date, No. Categories: 8
1: Label encode categorical variables
2: Onehot encode categorical variables
3: Remove categorical variables
4: Do nothing

Label encoded the categorical variables


In [11]:
train_df = feature_data[feature_data['train'] == 1]
test_df = feature_data[feature_data['train'] == 0]

In [12]:
y_train = train['is_click']

In [14]:
x_train = train_df.merge(final_feature, on = ['user_id','date'], how = 'left').fillna(0)
x_test = test_df.merge(final_feature, on = ['user_id','date'], how = 'left').fillna(0)

In [18]:
X_train, X_val, Y_train, Y_val = train_test_split(x_train.drop(columns = ['session_id','DateTime','train', 'Unnamed: 0','date']), y_train, test_size=0.2, random_state=69)

In [88]:
parameters = {'booster':'gbtree',
                  'objective':'binary:logistic',
                  'learning_rate': 0.05,
                  'max_depth': 6,
                  'min_child_weight': 8,
                  'subsample': 0.8,
                  'colsample_bytree': 0.6,
                  #'n_estimators': 100,
                  'reg_aplpha': 0.01,
                  'reg_lambda': 20,
                  'eval_metric':'auc',
                  'seed': 79}

In [89]:
xg_train = xgb.DMatrix(X_train.drop(columns=['user_id']), label=Y_train)
xg_val = xgb.DMatrix(X_val.drop(columns=['user_id']), label=Y_val)

In [90]:
xgb_reg = xgb.train(parameters, xg_train, 300, verbose_eval=1)

In [91]:
fpr_train, tpr_train, thresholds_train = metrics.roc_curve(Y_train, xgb_reg.predict(xg_train))
metrics.auc(fpr_train, tpr_train)

0.6326872817879272

In [92]:
fpr_test, tpr_test, thresholds_test = metrics.roc_curve(Y_val, xgb_reg.predict(xg_val))
metrics.auc(fpr_test, tpr_test)

0.5924669098722324

In [93]:
xg_test = xgb.DMatrix(x_test.drop(columns=['user_id','session_id','DateTime','train', 'Unnamed: 0','date']).fillna(0))

In [94]:
y_xgb_pred = xgb_reg.predict(xg_test)

In [95]:
solution = pd.concat([x_test['session_id'], pd.DataFrame(y_xgb_pred)],axis=1)

In [96]:
solution.to_csv("solution7.csv", index = False)