In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import seaborn as sns
from pprint import pprint
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt

from scipy import spatial
import math
from collections import Counter 

import datetime, time
import pickle
import re
import pytz

# SKLEARN
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
# TENSORFLOW
import tensorflow as tf
from tensorflow.python.keras.optimizers import TFOptimizer

# KERAS
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers.embeddings import Embedding

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
import xgboost as xgb
from sklearn.model_selection import ShuffleSplit

In [None]:
def printRuntime():
    """Print the current Asia/Jakarta wall-clock time followed by a separator line.

    Used throughout the notebook as a lightweight "cell finished at" marker.
    """
    # "%H:%M:%S" is the portable spelling of "%T", which is a platform-specific
    # strftime extension and is not available everywhere.
    now = datetime.datetime.now(pytz.timezone('Asia/Jakarta'))
    print(now.strftime("%Y-%m-%d %H:%M:%S"))
    print("-"*19)
    
printRuntime()

In [None]:
def gini(actual, pred, cmpcol = 0, sortcol = 1):
    """Compute the (un-normalised) Gini coefficient of `pred` against `actual`.

    Rows are ordered by descending prediction (ties broken by original position),
    and the cumulative share of `actual` is compared with the diagonal.

    Args:
        actual: sequence of true target values.
        pred: sequence of predicted scores, same length as `actual`.
        cmpcol, sortcol: unused; kept so existing callers keep working.
    Returns:
        The Gini coefficient as a float.
    """
    assert( len(actual) == len(pred) )
    # Columns: 0 = actual, 1 = prediction, 2 = original row position.
    # The builtin `float` replaces the removed `np.float` alias.
    table = np.asarray(np.c_[ actual, pred, np.arange(len(actual)) ], dtype=float)
    # lexsort uses the LAST key as primary: descending prediction, then position.
    table = table[ np.lexsort((table[:,2], -1*table[:,1])) ]
    totalLosses = table[:,0].sum()
    giniSum = table[:,0].cumsum().sum() / totalLosses

    giniSum -= (len(actual) + 1) / 2.
    return giniSum / len(actual)

def gini_normalized(a, p):
    """Return the Gini of predictions `p` divided by the Gini of a perfect ranking of `a`."""
    achieved = gini(a, p)
    best_possible = gini(a, a)
    return achieved / best_possible

def gini_xgb(preds, dtrain):
    """Custom evaluation callback: normalised Gini of `preds` against the labels of `dtrain`.

    Returns a one-element list holding the ('gini', score) pair.
    """
    score = gini_normalized(dtrain.get_label(), preds)
    return [('gini', score)]

In [None]:
# Shared XGBoost training parameters used by every xgb.train call in this notebook.
# Binary logistic objective, built-in AUC metric, fixed booster seed.
# NOTE(review): "silent" may not be recognised by newer xgboost releases — confirm
# against the installed version before relying on it.
xgb_params = {"eta": 0.02, "max_depth": 4, "subsample": 0.9, 
              # "tree_method": "gpu_hist",
              "colsample_bytree": 0.9, "objective": "binary:logistic", 
              "eval_metric": "auc", "seed": 99, "silent": True}

In [None]:
# Load the bz2-compressed, pickled train and test frames.
# NOTE(review): unpickling executes arbitrary code — only load files from a trusted source.
dfITrain = pd.read_pickle("./kalapa/itrain_20200217.pickle", compression="bz2")
dfITest = pd.read_pickle("./kalapa/itest_20200217.pickle", compression="bz2")

In [None]:
# Columns that get a learned embedding: FIELD_7, FIELD_9, FIELD_13, FIELD_39, maCv, jobCat.
lsEmbedCol = ["FIELD_%d"%d for d in [7, 9, 13, 39]] + ["maCv", "jobCat"]

In [None]:
# Plain feature columns: every column whose name contains "FIELD",
# except the ones reserved for embedding.
lsFieldFt = [c for c in dfITrain.columns 
             if "FIELD" in c 
             and c not in lsEmbedCol]

In [None]:
len(lsFieldFt)

In [None]:
def formatJob(iStr):
    """Normalise a raw job-title string.

    Steps: map missing markers to "none", drop digits, expand the abbreviations
    "cn"/"cnhân" (at the start), "nv"/"nv."/"- nv" and a leading "p.", then replace
    "-", ".", "(" and ")" with spaces and expand "cty".

    Args:
        iStr: raw job title; None or a float NaN is treated as missing.
    Returns:
        The cleaned string, or "none" for missing input. Whitespace is not collapsed.
    """
    # Real missing values (None / float NaN) would otherwise crash in re.sub.
    if iStr is None or (isinstance(iStr, float) and iStr != iStr):
        return "none"
    if "None" == iStr or "none" == iStr or "nan" == iStr:
        return "none"
    # Remove all digits (raw string: "\d" is an invalid escape in a normal literal)
    iStr = re.sub(r"\d", "", iStr)
    # Expand leading "cn"/"cnhân" to "công nhân"
    iStr = re.sub(r"^(cnhân|cn)", "công nhân ", iStr)
    # Expand "nv"/"nv." to "nhân viên". The dot is escaped so that it only matches a
    # literal "." instead of swallowing whatever character follows "nv".
    iStr = re.sub(r"(- nv|nv\.|nv)", "nhân viên ", iStr)
    # Expand leading "p." to "phó"
    iStr = re.sub(r"(^p\.)", "phó ", iStr)
    iStr = (iStr.replace("-", " ")
            .replace("cty", "công ty")
            .replace(".", " ")
            .replace("(", " ")
            .replace(")", " ")
           )
    return iStr
print(formatJob("phó giám đốc xí nghiệp"))
print(formatJob("p. trưởng phòng"))
printRuntime()

def splitJobType(iStr):
    """Split a job title into a (category, description) pair.

    The title is cleaned with formatJob and tokenised on whitespace. The category is
    the first 2 words by default, 4 words for "phó chánh ..." / "phó trưởng ..."
    titles (except "phó trưởng phòng" / "phó trưởng ban"), and 3 words for other
    titles starting with "phó" or "trưởng" (unless they contain "phó phòng").
    The description is the remaining words, or "none" when nothing is left.
    """
    cleaned = formatJob(iStr)
    words = cleaned.split()
    if not words:
        return "none", "none"

    long_deputy_title = "phó chánh" in cleaned or (
        "phó trưởng" in cleaned
        and "phó trưởng phòng" not in cleaned
        and "phó trưởng ban" not in cleaned
    )
    if long_deputy_title:
        cut = 4
    elif "phó phòng" not in cleaned and words[0] in ("phó", "trưởng"):
        cut = 3
    else:
        cut = 2

    head, tail = words[:cut], words[cut:]
    jobCat = " ".join(head)
    jobDesc = " ".join(tail) if tail else "none"
    return jobCat, jobDesc

# Test
print(splitJobType("nhân viên phòng thí nghiệm"))
print(splitJobType("nhân viên bảo trì"))
print(splitJobType("cn ủi"))
print(splitJobType("9782cấp dưỡng"))
print(splitJobType("trưởng dây chuyền phòng sản xuất"))
print(splitJobType("p. trưởng phòng"))
printRuntime()

### Format maCv

In [None]:
# Work on copies so the loaded frames stay untouched by the formatting below.
pdfTrain = dfITrain.copy()
pdfTest = dfITest.copy()

In [None]:
%time
pdfTrain["maCv"] = pdfTrain["maCv"].apply(formatJob)
pdfTrain["maCv_VECTOR"] = pdfTrain["maCv"].apply(lambda x: x.split())
pdfTrain["jobCat"], pdfTrain["jobDesc"] = zip(*pdfTrain["maCv"].apply(splitJobType))
printRuntime()

In [None]:
%time
pdfTest["maCv"] = pdfTest["maCv"].apply(formatJob)
pdfTest["maCv_VECTOR"] = pdfTest["maCv"].apply(lambda x: x.split())
pdfTest["jobCat"], pdfTest["jobDesc"] = zip(*pdfTest["maCv"].apply(splitJobType))
printRuntime()

# Embedding functions: encode F7, F9, F13, F39, maCv and jobCat using pre-computed weights

In [None]:
def splitF7(f7):
    """Parse the string form of a list (e.g. "['A', 'B']") into a list of tokens.

    Args:
        f7: the raw cell value; "[]", None, or any float NaN counts as empty.
    Returns:
        The list of tokens with quotes, brackets and spaces removed,
        or ["na"] for an empty/missing value.
    """
    # `f7 != f7` is true for every NaN, not only the np.nan singleton object
    # that an identity check (`is np.nan`) would recognise.
    is_missing = f7 is None or (isinstance(f7, float) and f7 != f7)
    if is_missing or f7 == "[]":
        return ["na"]
    s = f7.replace("'", "").replace("[", "").replace("]", "").replace(" ", "").split(",")
    return s

def encodeOneHot(row, vocabSize, word2int, verbose=False):
    """One-hot encode a single value.

    Returns a zero vector of length `vocabSize` with a 1 at `word2int[row]`;
    values missing from `word2int` yield the all-zero vector.
    """
    if verbose:
        print(row)
    onehot = np.zeros(vocabSize)
    slot = word2int.get(row)
    if slot is not None:
        onehot[slot] = 1
    return onehot

def encodeOneHotVector(row, vocabSize, word2int, verbose=False):
    """Encode a sequence of tokens as a count vector (sum of one-hot vectors).

    Tokens missing from `word2int` are skipped. `verbose` is accepted but unused.
    """
    counts = np.zeros(vocabSize)
    for token in row:
        slot = word2int.get(token)
        if slot is not None:
            counts[slot] += 1
    return counts

def extractVocabulary(pdf, iCol, isVector=False, vectorFunc=splitF7):
    """Build the vocabulary and index lookup tables for one column.

    Side effects on `pdf`:
        * isVector=True: adds the column `iCol + "_VECTOR"` holding `vectorFunc(value)`.
        * isVector=False: replaces null values of `iCol` with the string "None"
          so that a missing value gets its own vocabulary entry.

    Args:
        pdf: training frame the vocabulary is extracted from (modified in place).
        iCol: name of the source column (e.g. FIELD_7, maCv).
        isVector: True when each cell holds several tokens.
        vectorFunc: tokeniser applied to each cell when isVector is True.
    Returns:
        vocab: list of distinct tokens/values.
        word2int: dict token -> position in `vocab`.
        int2word: dict position -> token.
    """
    if isVector:
        vectorCol = iCol + "_VECTOR"
        pdf[vectorCol] = pdf[iCol].apply(vectorFunc)
        tokens = set()
        for r in pdf[vectorCol]:
            tokens.update(r)
        # Sort so the token -> index mapping is identical on every run; plain set
        # iteration order for strings changes between interpreter sessions.
        vocab = sorted(tokens, key=str)
    else:
        pdf.loc[pdf[iCol].isnull(), iCol] = "None" # Prevent nan value for key
        vocab = list(pdf[iCol].unique())
    # Build the lookup tables
    vocabSize = len(vocab) # total number of unique tokens
    print(vocab[:5])
    print(vocabSize)
    word2int = {word: i for i, word in enumerate(vocab)}
    int2word = {i: word for i, word in enumerate(vocab)}
    return vocab, word2int, int2word

def encodeCol(pdf, iCol, vocab, word2int, isVector=False):
    """One-hot encode every cell of a column.

    Args:
        pdf: frame holding the column.
        iCol: name of the column to encode (a token-list column when isVector is True).
        vocab: vocabulary; only its length is used, as the vector size.
        word2int: lookup table token -> index.
        isVector: True -> count vector per cell, False -> one-hot vector per cell.
    Returns:
        A Series of numpy vectors, one per row of `pdf`.
    """
    vocabSize = len(vocab)
    encoder = encodeOneHotVector if isVector else encodeOneHot
    return pdf[iCol].apply(lambda cell: encoder(cell, vocabSize, word2int))

printRuntime()

In [None]:
def getIdx(w, word2int):
    """Return the 1-based index of `w` (`word2int[w] + 1`), or -1 if `w` is unknown."""
    return word2int[w] + 1 if w in word2int else -1

def concatEmbedVector(r, word2int, weights, maxLength):
    """Concatenate the embedding vectors of one cell into a single flat vector.

    When maxLength > 1, `r` is iterated as a sequence of tokens and the result is
    padded with `weights[0]` up to `maxLength` tokens (longer sequences are not
    truncated). Otherwise `r` is treated as a single token.
    Unknown tokens are represented by a vector filled with -1.
    """
    embeddingDim = len(weights[0])
    unknown = np.array([-1]*embeddingDim, dtype="float32")

    def lookup(token):
        # getIdx returns the 1-based row in `weights`, or -1 for unknown tokens
        pos = getIdx(token, word2int)
        return unknown if pos == -1 else weights[pos]

    if maxLength > 1:
        pieces = [lookup(token) for token in r]
        missing = maxLength - len(pieces)
        pieces.extend(weights[0] for _ in range(missing))
    else:
        pieces = [lookup(r)]
    return np.concatenate(pieces)

# Embed 1 new column into embedded_col
# Input parameters
def embedCol(pdf, vectorCol, onehotCol, vocab, word2int, weights, 
             maxLength=1, *args):
    """Apply trained embedding weights to one column of `pdf`.

    Args:
        pdf: frame to embed; must already contain `vectorCol` and `onehotCol`.
        vectorCol: column with the raw value (or token list when maxLength > 1).
        onehotCol: column with the one-hot / count vector of each row.
        vocab: vocabulary (unused here, kept for signature compatibility).
        word2int: lookup table token -> index.
        weights: embedding matrix; row 0 is the padding row, row i+1 belongs to
            vocabulary index i.
        maxLength: padded sequence length; 1 means the column holds a single value.
        *args: ignored.
    Returns:
        (docSeries, concatSeries):
            docSeries: per row, the one-hot/count vector times `weights[1:]`
                (i.e. the sum of the token embeddings).
            concatSeries: per row, the concatenated (and padded) token embeddings.
    """
    # Concatenated embedding vectors
    newConcatSeries = pdf[vectorCol].apply(
        lambda r: concatEmbedVector(r, word2int, weights, maxLength))
    # Sum of embedding vectors; weights[1:] drops the padding row
    newDocSeries = pdf[onehotCol].apply(lambda x: np.matmul(x, weights[1:]))
    return newDocSeries, newConcatSeries

def trainEmbedCol(pdf, vectorCol, lblCol, onehotCol, vocab, word2int, int2word, embeddingDim, 
                  maxLength=1, *args):
    """Train a small embedding model for one column and embed that column.

    The model is Embedding -> Flatten -> Dense(1, sigmoid), fitted against `lblCol`.

    Args:
        pdf: training frame.
        vectorCol: column with the raw value (or token list when maxLength > 1).
        lblCol: binary label column.
        onehotCol: column with the one-hot / count vector of each row.
        vocab: vocabulary extracted from the training frame.
        word2int: lookup table token -> index.
        int2word: lookup table index -> token (unused here).
        embeddingDim: size of each embedding vector.
        maxLength: padded sequence length for token lists; 1 for single values.
        *args: ignored.
    Returns:
        model: the fitted Keras model.
        weights: embedding matrix of shape (len(vocab) + 1, embeddingDim).
        embeddedDocSeries, embeddedConcatSeries: result of embedCol on `pdf`.
    """
    docs = pdf[vectorCol].values
    label = pdf[lblCol].values
    vocabSize = len(vocab) + 1  # index 0 is reserved for padding
    if maxLength > 1:
        encodedDocs = [[(word2int[c]+1) for c in d] for d in docs]
        print(docs[:5])
        print(encodedDocs[:5])
        paddedDocs = pad_sequences(encodedDocs, maxlen=maxLength, padding='post')
    else:
        encodedDocs = [word2int[d]+1 for d in docs]
        print(docs[:5])
        print(encodedDocs[:5])
        # Hand Keras an explicit (n_rows, 1) array instead of a bare Python list,
        # so the input shape does not depend on how a list gets interpreted.
        paddedDocs = np.asarray(encodedDocs).reshape(-1, 1)
    print(paddedDocs[:5])

    # define the model
    model = Sequential()
    model.add(Embedding(vocabSize, embeddingDim, input_length=maxLength))
    model.add(Flatten())
    model.add(Dense(1, activation='sigmoid'))

    # compile the model
    model.compile(optimizer="adam", loss='binary_crossentropy', metrics=['accuracy'])

    # summarize the model
    print(model.summary())

    # fit the model
    model.fit(paddedDocs, label, epochs=100, verbose=0)

    # review auc on the training data; with a single sigmoid output `predict`
    # already yields probabilities (`predict_proba` no longer exists in recent Keras)
    yPredTrain = model.predict(paddedDocs)
    rocTrain = roc_auc_score(label, yPredTrain)
    print("-"*50)
    print("ROC:", rocTrain)
    print("-"*50)

    # output weight
    e = model.layers[0]
    weights = e.get_weights()[0]
    print(weights.shape) # shape: (vocabSize, embeddingDim)

    embeddedDocSeries, embeddedConcatSeries = embedCol(
        pdf, vectorCol, onehotCol, vocab, word2int, weights, maxLength)

    return model, weights, embeddedDocSeries, embeddedConcatSeries

printRuntime()

# Submission Summary

In [None]:
# submissionType = ["max", "mean"]
def exportSubmission(idTest, lsResult, giniThreshold, submissionType,
                     outputDir="/kaggle/working"):
    """Build a submission from the trained models and write it to a CSV file.

    Args:
        idTest: array of test ids.
        lsResult: list of (model, prediction) tuples; `model.best_score` is the
            validation score used for selection.
        giniThreshold: only models with best_score strictly above this are used.
        submissionType: "max" -> prediction of the best-scoring model,
            "mean" -> element-wise mean over the selected predictions.
        outputDir: directory the CSV is written to.
    Returns:
        The submission DataFrame with columns "id" and "label".
    Raises:
        ValueError: unknown `submissionType`, or no model passes the threshold.
    """
    import os  # local import: only needed to build the output path

    if submissionType not in ("max", "mean"):
        raise ValueError("submissionType must be 'max' or 'mean', got {!r}".format(submissionType))

    # Honour the caller's threshold (it used to be ignored in favour of a fixed 0.35)
    lsChosen = [c for c in lsResult if c[0].best_score > giniThreshold]
    print(len(lsChosen))
    if not lsChosen:
        raise ValueError("No model has best_score > {}".format(giniThreshold))

    submitId = idTest
    if "max" == submissionType:
        best = max(lsChosen, key=lambda c: c[0].best_score)
        print(best[0].best_score)
        submitPred = best[1]
    else:
        submitPred = np.mean([c[1] for c in lsChosen], axis=0)
    print("Shape of submitId: {}; submitPred: {}".format(submitId.shape, submitPred.shape))
    print("Similarity of submission with other chosen pred")
    for c in lsChosen:
        print("Gini:", c[0].best_score)
        result = 1 - spatial.distance.cosine(c[1], submitPred)
        print("Similarity:", result)

    # Write id and prediction into the csv file
    dictSubmit = {"id": submitId, "label": submitPred}
    pdfSubmit = pd.DataFrame.from_dict(dictSubmit)
    print(pdfSubmit.shape)
    ymd = datetime.datetime.now().strftime("%Y%m%d")
    oPath = os.path.join(outputDir, "submission_{}_{}.csv".format(submissionType, ymd))
    print(oPath)
    pdfSubmit.to_csv(oPath, header=True, index=False)
    return pdfSubmit

printRuntime()

In [None]:
def summaryOutput(pdfSubmit):
    """Display count / sum / mean / std of the predicted label, split at 0.5.

    Adds a boolean "bool_label" column to `pdfSubmit` in place, then displays the
    grouped summary and prints the current time. Returns None.
    """
    pdfSubmit["bool_label"] = pdfSubmit["label"].gt(0.5)
    stats = {"id": "count", "label": ["sum", "mean", "std"]}
    grouped = pdfSubmit.groupby("bool_label", as_index=False)
    display(grouped.agg(stats))
    printRuntime()
    return
printRuntime()

## ---------

# Run 0: k-fold

In [None]:
# K-fold setup for "Run 0": a shuffled 5-fold splitter with a fixed seed.
# df_train = pd.read_csv("../input/train.csv") # train
# df_test = pd.read_csv("../input/test.csv") # test
K = 5
kf = KFold(n_splits = K, random_state = 42, shuffle = True)

# Labels of the training frame and ids of the test frame
yTrain = dfITrain["label"].values
idTest = dfITest["id"].values

xTrain = np.array(dfITrain[lsFieldFt]) # only select field_ft column
# test = np.array(df_test.drop(["id"], axis = 1))

# Collected test predictions (not filled in this cell)
xgb_preds = []
printRuntime()

# TOP 17: GINI = 0.21642

In [None]:
idx = 0
trainVal = []
for trainIdx, testIdx in kf.split(xTrain):
    pdfTrain, pdfTest = dfITrain.iloc[trainIdx].copy(), dfITrain.iloc[testIdx].copy()
    print(pdfTrain.shape, pdfTest.shape)
    # Embedding field 7, 9, 13, 39 (plus maCv, jobCat), fitted on the training fold only
    print("Doing embedding ...")
    lsEmbeddedCol = []
    for col in lsEmbedCol:
        print("#", col)
        t = time.time()
        onehotCol = col + "_ONEHOT"
        embeddedCol = col + "_EMBEDDED"
        embeddingDim = 2
        if "FIELD_7" == col:
            # FIELD_7 holds a list of tokens: tokenise and pad to 16 tokens
            iCol = "FIELD_7_VECTOR"
            maxLength = 16
            vectorFunc = splitF7
        else:
            iCol = col
            maxLength = 1
            vectorFunc = None
        isVector = (maxLength > 1)

        # Vocabulary and one-hot vectors come from the training fold; the
        # validation fold is encoded with the same lookup table.
        vocab, word2int, int2word = extractVocabulary(pdfTrain, col, isVector, vectorFunc)
        if isVector:
            pdfTest[iCol] = pdfTest[col].apply(vectorFunc)
        pdfTrain[onehotCol] = encodeCol(pdfTrain, iCol, vocab, word2int, isVector)
        pdfTest[onehotCol] = encodeCol(pdfTest, iCol, vocab, word2int, isVector)

        # trainEmbedCol / embedCol return (doc vector, concatenated vector);
        # only the doc vector (embeddingDim values per row) is used here.
        model, weights, pdfTrain[embeddedCol], trainConcatSeries = trainEmbedCol(
            pdfTrain, iCol, "label", onehotCol,
            vocab, word2int, int2word, embeddingDim, maxLength)
        pdfTest[embeddedCol], testConcatSeries = embedCol(
            pdfTest, iCol, onehotCol, vocab, word2int, weights, maxLength)

        # pickleVal[col] = [vocab, word2int, int2word, embeddingDim, maxLength, weights]

        # Stretch the embedded vector out into one numeric column per dimension
        embeddingDim = weights.shape[1]
        lsCol = [(embeddedCol + "_%d"%d) for d in range(embeddingDim)]
        lsEmbeddedCol.extend(lsCol)
        pdfTrain[lsCol] = pd.DataFrame(pdfTrain[embeddedCol].values.tolist(), index=pdfTrain.index)
        pdfTest[lsCol] = pd.DataFrame(pdfTest[embeddedCol].values.tolist(), index=pdfTest.index)
        print("Elapsed time: %d(s)" % int(time.time() - t))
        printRuntime()

    # params configuration also from the1owl's kernel
    # https://www.kaggle.com/the1owl/forza-baseline
    train_X, valid_X = pdfTrain[lsFieldFt+lsEmbeddedCol].values, pdfTest[lsFieldFt+lsEmbeddedCol].values
    train_y, valid_y = yTrain[trainIdx], yTrain[testIdx]

    d_train = xgb.DMatrix(train_X, train_y)
    d_valid = xgb.DMatrix(valid_X, valid_y)
    trainVal.append((d_train, d_valid))
printRuntime()

In [None]:
# Train one XGBoost model per fold, monitoring train and validation sets with the
# custom Gini metric (maximised) and stopping after 200 rounds without improvement.
for (d_train, d_valid) in trainVal:
    # d_test = xgb.DMatrix(test)
    watchlist = [(d_train, 'train'), (d_valid, 'valid')]
    model = xgb.train(xgb_params, d_train, 5000,  watchlist, feval=gini_xgb, maximize=True, 
                      verbose_eval=50, early_stopping_rounds=200)
                        
    # xgb_pred = model.predict(d_test)
    # xgb_preds.append(list(xgb_pred))
    

In [None]:
# Rebuild pdfTrain / pdfTest from pre-computed embedding parameters.
# The pickle is read into `b`; a later cell unpacks each entry as
# [vocab, word2int, int2word, embeddingDim, maxLength, weights].
# NOTE(review): pickle.load executes arbitrary code — only load trusted files.
print("Load pre-trained parameters to pickle file")
path = "../input/kalapa/weight.pickle"
print(path)
with open(path, 'rb') as handle:
    b = pickle.load(handle)

In [None]:
for k,v in b.items():
    print(k)

In [None]:
lsEmbeddedCol

In [None]:
lsEmbeddedCol = []
print("Reload embedding value...")
# Work on copies so the derived columns do not leak into the loaded frames
pdfTrain = dfITrain.copy()
pdfTest = dfITest.copy()
# Tokenisers for the sequence-valued columns
dictVectorFunc = {"FIELD_7": splitF7, "maCv": lambda x: x.split()}
for col in lsEmbedCol:
    print("#", col)
    t = time.time()
    onehotCol = col + "_ONEHOT"
    embeddedCol = col + "_EMBEDDED"
    embeddedDocsCol = col + "_DOCS"
    embeddedConcatCol = col + "_CONCAT"

    [vocab, word2int, int2word, embeddingDim, maxLength, weights] = b[col]
    isVector = (maxLength > 1)
    # Sequence columns are encoded from their tokenised "_VECTOR" companion,
    # scalar columns directly from the raw column.
    vectorCol = (col + "_VECTOR") if isVector else col

    if isVector:
        print("Vectorize column", col)
        vectorFunc = dictVectorFunc[col]
        pdfTrain[vectorCol] = pdfTrain[col].apply(vectorFunc)
        pdfTest[vectorCol] = pdfTest[col].apply(vectorFunc)
        print("-"*20)
    print("One-hot encoding column", col)
    pdfTrain[onehotCol] = encodeCol(pdfTrain, vectorCol, vocab, word2int, isVector)
    pdfTest[onehotCol] = encodeCol(pdfTest, vectorCol, vocab, word2int, isVector)

    print("Embed column {} into docs&concat".format(col))
    pdfTrain[embeddedDocsCol], pdfTrain[embeddedConcatCol] = embedCol(
        pdfTrain, vectorCol, onehotCol, vocab, word2int, weights, maxLength)
    pdfTest[embeddedDocsCol], pdfTest[embeddedConcatCol] = embedCol(
        pdfTest, vectorCol, onehotCol, vocab, word2int, weights, maxLength)
    print("-"*20)

    print("Stretch out column {}".format(embeddedDocsCol))
    lsDocsCol = [(embeddedDocsCol + "_%d"%d) for d in range(embeddingDim)]
    pdfTrain[lsDocsCol] = pd.DataFrame(pdfTrain[embeddedDocsCol].values.tolist(), index=pdfTrain.index)
    pdfTest[lsDocsCol] = pd.DataFrame(pdfTest[embeddedDocsCol].values.tolist(), index=pdfTest.index)
    # Register the new numeric columns so the training cells actually use them
    lsEmbeddedCol.extend(lsDocsCol)
    print("-"*20)
    # Stretch out concat col
    if isVector:
        print("Stretch out column {}".format(embeddedConcatCol))
        lsConcatCol = [(embeddedConcatCol + "_%d"%d) for d in range(embeddingDim*maxLength)]
        pdfTrain[lsConcatCol] = pd.DataFrame(pdfTrain[embeddedConcatCol].values.tolist(), index=pdfTrain.index)
        pdfTest[lsConcatCol] = pd.DataFrame(pdfTest[embeddedConcatCol].values.tolist(), index=pdfTest.index)
        lsEmbeddedCol.extend(lsConcatCol)
        print("-"*20)

    print("Elapsed time: %d(s)" % int(time.time() - t))
    printRuntime()

In [None]:
lsFieldFt

# LEADER 0.34

## TRAIN 01: GINI 0.20

In [None]:
# TRAIN 01: repeat NUM_RUN times — random 80/20 split of the training data,
# fit XGBoost with early stopping on the custom Gini metric, predict the test set.
# Every (model, test prediction) pair is collected in lsResult.
# The split passes no random_state, so each run draws a different split.
# xTrain = np.array(dfITrain[lsFieldFt]) # only select field_ft column
NUM_RUN = 50
xTrain = pdfTrain[lsFieldFt+lsEmbeddedCol].values
xTest = pdfTest[lsFieldFt+lsEmbeddedCol].values

yTrain = dfITrain["label"].values
lsResult = []
for i in range(NUM_RUN):
    t = time.time()
    train_X, valid_X, train_y, valid_y = train_test_split(xTrain, yTrain, test_size=0.2)
    print(train_X.shape)
    print(valid_X.shape)
    print(train_y.shape)
    print(valid_y.shape)

    # train_y, valid_y = yTrain[trainIdx], yTrain[testIdx]
    d_train = xgb.DMatrix(train_X, train_y)
    d_valid = xgb.DMatrix(valid_X, valid_y)
    # d_test = xgb.DMatrix(test)

    watchlist = [(d_train, 'train'), (d_valid, 'valid')]
    model = xgb.train(xgb_params, d_train, 5000,  watchlist, feval=gini_xgb, maximize=True, 
                          verbose_eval=50, early_stopping_rounds=100)

    d_test = xgb.DMatrix(xTest)
    xgb_pred = model.predict(d_test)
    print(xgb_pred[:5])
    lsResult.append((model, xgb_pred))
    print("Elapsed time: %d(s)" % int(time.time() - t))
    printRuntime()

In [None]:
# Same procedure with a 90/10 train/validation split.
# lsResult is NOT reset here, so these runs are appended to the existing results.
NUM_RUN = 50
xTrain = pdfTrain[lsFieldFt+lsEmbeddedCol].values
xTest = pdfTest[lsFieldFt+lsEmbeddedCol].values

yTrain = dfITrain["label"].values
# lsResult = []
for i in range(NUM_RUN):
    t = time.time()
    train_X, valid_X, train_y, valid_y = train_test_split(xTrain, yTrain, test_size=0.1)
    print(train_X.shape)
    print(valid_X.shape)
    print(train_y.shape)
    print(valid_y.shape)

    # train_y, valid_y = yTrain[trainIdx], yTrain[testIdx]
    d_train = xgb.DMatrix(train_X, train_y)
    d_valid = xgb.DMatrix(valid_X, valid_y)
    # d_test = xgb.DMatrix(test)

    watchlist = [(d_train, 'train'), (d_valid, 'valid')]
    model = xgb.train(xgb_params, d_train, 5000,  watchlist, feval=gini_xgb, maximize=True, 
                          verbose_eval=50, early_stopping_rounds=100)

    d_test = xgb.DMatrix(xTest)
    xgb_pred = model.predict(d_test)
    print(xgb_pred[:5])
    lsResult.append((model, xgb_pred))
    print("Elapsed time: %d(s)" % int(time.time() - t))
    printRuntime()

In [None]:
# Find the run with the highest best_score and keep its test prediction.
# (The combination of several models happens in the following cells.)
maxGini = 0
maxXgbPred = None
idTest = pdfTest["id"].values
for c in lsResult:
    if c[0].best_score > maxGini:
        maxGini = c[0].best_score
        maxXgbPred = c[1]
print(maxGini)

In [None]:
# Keep only the runs whose best_score exceeds 0.35, and show how many remain.
lsChosen = [c for c in lsResult if c[0].best_score > 0.35]
len(lsChosen)


In [None]:
submitId = idTest
# Average the chosen predictions element-wise:
# axis=0 => one value per test row (e.g. (20000,))
# axis=1 => one value per model (e.g. (12, )) => mean of each vector
submitPred = np.mean([c[1] for c in lsChosen], axis=0)

In [None]:
submitPred.shape

In [None]:
# For each chosen run, print its best_score and the cosine similarity
# (1 - cosine distance) between its prediction and the mean prediction.
for c in lsChosen:
    print("Gini:", c[0].best_score)
    result = 1 - spatial.distance.cosine(c[1], submitPred)
    print("Similarity:", result)

In [None]:
# Assemble the submission frame from the test ids and the mean prediction (submitPred).
dictSubmit = {"id": submitId, "label": submitPred}
pdfSubmit = pd.DataFrame.from_dict(dictSubmit)
print(pdfSubmit.shape)
pdfSubmit.head()

In [None]:
# Write the submission to a date-stamped CSV (YYYYMMDD) in the Kaggle working directory.
ymd = datetime.datetime.now().strftime("%Y%m%d")
pdfSubmit.to_csv("/kaggle/working/submission_{}.csv".format(ymd), header=True, index=False)

# RUN 2: Not using embedded F7 F9 F13 F39, F_mean, F_std
# RUN 3: Not using embedded F7 F9 F13 F39

submission_max: 0.406523, result: 0.16533
submission_mean: result: 0.17629
=> BAD: try mean with embedded fields

In [None]:
# Start the next runs from fresh copies of the loaded frames.
pdfTrain = dfITrain.copy()
pdfTest = dfITest.copy()

In [None]:
# Feature selection for this run. Earlier variants (kept for reference, with the
# score noted next to them) are commented out; the plain FIELD features are used.
# lsSelectedFt = ([c for c in lsFieldFt if "_mean" not in c and "_std" not in c] 
#                 + [c for c in lsFieldFt if "_stdized" in c]) # LOW: 0.17629
# lsSelectedFt = ([c for c in lsFieldFt if "_mean" not in c and "_std" not in c] 
#                 + [c for c in lsFieldFt if "_stdized" in c]) + lsEmbeddedCol # LOW: 0.16358
lsSelectedFt = lsFieldFt
# print(lsSelectedFt)
printRuntime()

In [None]:
# Repeat NUM_RUN times: random 90/10 split of the training frame, fit XGBoost on
# the selected features with early stopping on the custom Gini metric, predict the
# test set. lsResult is reset here and collects every (model, test prediction) pair.
# The label distribution of both parts is displayed for each run.
# xTrain = np.array(dfITrain[lsSelectedFt]) # only select field_ft column
NUM_RUN = 50
xTrain = pdfTrain[lsSelectedFt].values
xTest = pdfTest[lsSelectedFt].values

yTrain = pdfTrain["label"].values
lsResult = []
for i in range(NUM_RUN):
    t = time.time()
    pdfTrainX, pdfValidateX = train_test_split(pdfTrain, test_size=0.1)
    train_X, train_y = pdfTrainX[lsSelectedFt].values, pdfTrainX["label"].values
    valid_X, valid_y = pdfValidateX[lsSelectedFt].values, pdfValidateX["label"].values
    print(train_X.shape)
    print(valid_X.shape)
    print(train_y.shape)
    print(valid_y.shape)
    display(pdfTrainX.groupby("label").agg({"id": "count"}))
    display(pdfValidateX.groupby("label").agg({"id": "count"}))

    # train_y, valid_y = yTrain[trainIdx], yTrain[testIdx]
    d_train = xgb.DMatrix(train_X, train_y)
    d_valid = xgb.DMatrix(valid_X, valid_y)
    # d_test = xgb.DMatrix(test)

    watchlist = [(d_train, 'train'), (d_valid, 'valid')]
    model = xgb.train(xgb_params, d_train, 5000,  watchlist, feval=gini_xgb, maximize=True, 
                          verbose_eval=50, early_stopping_rounds=100)

    d_test = xgb.DMatrix(xTest)
    xgb_pred = model.predict(d_test)
    print(xgb_pred[:5])
    lsResult.append((model, xgb_pred))
    print("Elapsed time: %d(s)" % int(time.time() - t))
    printRuntime()

In [None]:
# Another NUM_RUN runs with a 90/10 split; lsResult is NOT reset, so the results
# are appended to those of the previous cell.
# Note: target encoding and embedding must be done inside one loop (per split).
NUM_RUN = 50
xTrain = pdfTrain[lsSelectedFt].values
xTest = pdfTest[lsSelectedFt].values

yTrain = pdfTrain["label"].values
# lsResult = []
for i in range(NUM_RUN):
    t = time.time()
    pdfTrainX, pdfValidateX = train_test_split(pdfTrain, test_size=0.1)
    train_X, train_y = pdfTrainX[lsSelectedFt].values, pdfTrainX["label"].values
    valid_X, valid_y = pdfValidateX[lsSelectedFt].values, pdfValidateX["label"].values
    print(train_X.shape)
    print(valid_X.shape)
    print(train_y.shape)
    print(valid_y.shape)
    display(pdfTrainX.groupby("label").agg({"id": "count"}))
    display(pdfValidateX.groupby("label").agg({"id": "count"}))

    # train_y, valid_y = yTrain[trainIdx], yTrain[testIdx]
    d_train = xgb.DMatrix(train_X, train_y)
    d_valid = xgb.DMatrix(valid_X, valid_y)
    # d_test = xgb.DMatrix(test)

    watchlist = [(d_train, 'train'), (d_valid, 'valid')]
    model = xgb.train(xgb_params, d_train, 5000,  watchlist, feval=gini_xgb, maximize=True, 
                          verbose_eval=50, early_stopping_rounds=100)

    d_test = xgb.DMatrix(xTest)
    xgb_pred = model.predict(d_test)
    print(xgb_pred[:5])
    lsResult.append((model, xgb_pred))
    print("Elapsed time: %d(s)" % int(time.time() - t))
    printRuntime()

In [None]:
# Check how many runs have a best_score above the candidate threshold.
giniThreshold = 0.372
idTest = pdfTest["id"].values
lsChosen = [c for c in lsResult if c[0].best_score > giniThreshold]
len(lsChosen)

## Check the distribution of pdfSubmit

In [None]:
pdfSubmitSum = exportSubmission(idTest, lsResult, giniThreshold=0.35, submissionType="max")

In [None]:
summaryOutput(pdfSubmitSum)

In [None]:
pdfSubmitMean = exportSubmission(idTest, lsResult, giniThreshold=0.35, submissionType="mean")

In [None]:
summaryOutput(pdfSubmitMean)

# RUN 4: Embedding inside 1 fold

In [None]:
# Columns that are target-encoded: the listed FIELD_n columns plus "group_age".
lsCol = [8, 10, 12, 17, 24, 40, 43] # For target encoding
lsMeanEncodCol = ["FIELD_%d"%d for d in lsCol] + ["group_age"]

In [None]:
print(lsEmbedCol)

In [None]:
# Plain feature columns for RUN 4: every "FIELD" column except already-encoded
# ("_ecd") columns and the columns reserved for target encoding or embedding.
lsFieldFt = [c for c in pdfTrain.columns 
             if "FIELD" in c 
             and "_ecd" not in c
             and c not in lsMeanEncodCol 
             and c not in lsEmbedCol]
print(len(lsFieldFt))
printRuntime()

In [None]:
lsFieldFt

In [None]:
# Keep backup copies of the current train/test frames.
pdfTrainBk = pdfTrain.copy()
pdfTestBk = pdfTest.copy()

In [None]:
def runEmbedding(pdfTrain, pdfVal, pdfTest, lsEmbedCol):
    """Fit an embedding per column on the training part and apply it to all parts.

    For each column the vocabulary is extracted from `pdfTrain`, an embedding model
    is trained against the "label" column, and train / validation / test are
    embedded with the resulting weights. FIELD_7 and maCv are treated as token
    sequences (padded to 16 and 42 tokens); all other columns as single values.

    Args:
        pdfTrain, pdfVal, pdfTest: frames to process; new columns are added in place.
        lsEmbedCol: names of the columns to embed.
    Returns:
        pdfTrain, pdfVal, pdfTest: the same frames with the new columns.
        lsEmbeddedCol: names of the numeric columns that were created
            ("<col>_DOCS_<i>" and, for sequence columns, "<col>_CONCAT_<i>").
    """
    print("# --- Doing embedding --- #")
    lsEmbeddedCol = []
    for col in lsEmbedCol:
        print("#", col)
        t = time.time()
        embeddingDim = 2
        # Per-column settings; vectorFunc is reset for every column so a scalar
        # column never sees an unbound or stale tokeniser.
        if "FIELD_7" == col:
            vectorCol, maxLength, vectorFunc = "FIELD_7_VECTOR", 16, splitF7
        elif "maCv" == col:
            vectorCol, maxLength, vectorFunc = "maCv_VECTOR", 42, (lambda x: x.split())
        else:
            vectorCol, maxLength, vectorFunc = col, 1, None
        isVector = (maxLength > 1)

        onehotCol = col + "_ONEHOT"
        embeddedDocsCol = col + "_DOCS"
        embeddedConcatCol = col + "_CONCAT"
        print("# extract vocabulary")
        # For sequence columns this also creates vectorCol on pdfTrain
        vocab, word2int, int2word = extractVocabulary(pdfTrain, col, isVector, vectorFunc)
        print("# one-hot encoding")
        pdfTrain[onehotCol] = encodeCol(pdfTrain, vectorCol, vocab, word2int, isVector)
        if isVector:
            pdfVal[vectorCol] = pdfVal[col].apply(vectorFunc)
            pdfTest[vectorCol] = pdfTest[col].apply(vectorFunc)
        pdfVal[onehotCol] = encodeCol(pdfVal, vectorCol, vocab, word2int, isVector)
        pdfTest[onehotCol] = encodeCol(pdfTest, vectorCol, vocab, word2int, isVector)

        # Train the embedding on the training part only
        print("# Train the embedding")
        model, weights, pdfTrain[embeddedDocsCol], pdfTrain[embeddedConcatCol] = trainEmbedCol(
            pdfTrain, vectorCol, "label", onehotCol,
            vocab, word2int, int2word, embeddingDim, maxLength)
        # Embed validation and test with the trained weights
        pdfVal[embeddedDocsCol], pdfVal[embeddedConcatCol] = embedCol(
            pdfVal, vectorCol, onehotCol, vocab, word2int, weights, maxLength)
        pdfTest[embeddedDocsCol], pdfTest[embeddedConcatCol] = embedCol(
            pdfTest, vectorCol, onehotCol, vocab, word2int, weights, maxLength)

        # Save the weights and vocab to file
        # pickleVal[i][col] = [vocab, word2int, int2word, embeddingDim, maxLength, weights]

        # Stretch the vector-valued columns out into one numeric column per element
        stretchPlan = [(embeddedDocsCol, embeddingDim)]
        if isVector:
            stretchPlan.append((embeddedConcatCol, embeddingDim*maxLength))
        for srcCol, width in stretchPlan:
            lsNewCol = [(srcCol + "_%d"%d) for d in range(width)]
            for frame in (pdfTrain, pdfVal, pdfTest):
                frame[lsNewCol] = pd.DataFrame(frame[srcCol].values.tolist(),
                                               index=frame.index)
            lsEmbeddedCol.extend(lsNewCol)

        print("Elapsed time: %d(s)" % int(time.time() - t))
    return pdfTrain, pdfVal, pdfTest, lsEmbeddedCol
printRuntime()

In [None]:
def runTargetEncoding(pdfTrain, pdfVal, pdfTest, lsMeanEncodCol):
    """Target-encode columns using statistics from the training part only.

    For each column, the count of "id" and the mean / std of "label" per distinct
    value are computed on `pdfTrain` and left-merged onto all three frames as
    "<col>_ecdcount", "<col>_ecdmean" and "<col>_ecdstd". Missing values are
    treated as their own category "none". Values not seen in training get NaN.

    Args:
        pdfTrain: training frame; needs "id", "label" and the listed columns.
        pdfVal, pdfTest: frames to encode; need the listed columns.
        lsMeanEncodCol: names of the columns to encode.
    Returns:
        pdfTrain, pdfVal, pdfTest: new frames (the inputs are not modified; the
            merges reset the index) with the encoding columns added.
        lsEmbeddedCol: names of the added columns.
    """
    def _withFilledKey(pdf, cName):
        # Cast to object first so the "none" marker can also be stored in numeric
        # or categorical columns; returns a new frame, leaving `pdf` untouched.
        key = pdf[cName]
        return pdf.assign(**{cName: key.astype(object).where(key.notna(), "none")})

    print("Target encoding for train: {}".format(lsMeanEncodCol))
    meanEcd = {}
    lsEmbeddedCol = []
    for cName in lsMeanEncodCol:
        pdfTrain = _withFilledKey(pdfTrain, cName)
        # sort=False: group order is irrelevant for the merge, and it avoids having
        # to order keys of mixed types (numbers and the "none" marker).
        stats = (pdfTrain.groupby([cName], as_index=False, sort=False)
                 .agg({"id":"count", "label":["mean", "std"]}))
        # Flatten the two-level column index: (cName, "") -> cName + "_", etc.
        stats.columns = ["_".join(x) for x in stats.columns]
        stats = stats.rename(columns={
            cName+"_": cName, "id_count": cName+"_ecdcount",
            "label_mean": cName+"_ecdmean", "label_std": cName+"_ecdstd"})
        meanEcd[cName] = stats
        pdfTrain = pd.merge(pdfTrain, stats, on=cName, how="left")
        lsEmbeddedCol.extend([cName+"_ecdcount", cName+"_ecdmean", cName+"_ecdstd"])
#         pickleVal[i][cName] = meanEcd[cName].to_dict()
    # Apply the training statistics to the validation and test frames
    print("Target encoding for Val/Test using train data: {}".format(lsMeanEncodCol))
    for cName in lsMeanEncodCol:
        pdfVal = pd.merge(_withFilledKey(pdfVal, cName), meanEcd[cName], on=cName, how="left")
        pdfTest = pd.merge(_withFilledKey(pdfTest, cName), meanEcd[cName], on=cName, how="left")
    return pdfTrain, pdfVal, pdfTest, lsEmbeddedCol
printRuntime()

In [None]:
# RUN 4 training loop.
# Read back-up the data
# pdfTrain = dfITrain.copy()
# pdfTestX = dfITest.copy()

# Repeat NUM_RUN times: random 80/20 split, target encoding fitted on the training
# part of that split only (the embedding step is currently disabled), then XGBoost.
# Target encoding and embedding must be done inside one loop (per split).
NUM_RUN = 30
pickleVal = {}
lsResult = []

for i in range(NUM_RUN):
    t = time.time()
    train, val = train_test_split(pdfTrain, test_size=0.2)
    # Copies, so the per-run columns never touch the source frames
    pdfTrainX = train.copy()
    pdfValidateX = val.copy()
    pdfTestX = pdfTest.copy()
    lsEmbeddedCol = []
#     pdfTrainX, pdfValidateX, pdfTestX, lsNewCol = runEmbedding(
#         pdfTrainX, pdfValidateX, pdfTestX, lsEmbedCol)
#     lsEmbeddedCol.extend(lsNewCol)
#     print("Shape after embedding {}".format(lsEmbedCol))
#     print(pdfTrainX.shape, pdfValidateX.shape, pdfTestX.shape)
#     print("="*50)
    # ----- # 
    pdfTrainX, pdfValidateX, pdfTestX, lsNewCol = runTargetEncoding(
        pdfTrainX, pdfValidateX, pdfTestX, lsMeanEncodCol)
    lsEmbeddedCol.extend(lsNewCol)
    print("Shape after target-encoding {}".format(lsMeanEncodCol))
    print(pdfTrainX.shape, pdfValidateX.shape, pdfTestX.shape)
    printRuntime()
    print("="*50)
    # ----- #

    # Features: plain FIELD columns plus the columns created in this run
    lsSelectedFt = lsFieldFt + lsEmbeddedCol
    print("Train XGBoost model")
    train_X, train_y = pdfTrainX[lsSelectedFt].values, pdfTrainX["label"].values
    valid_X, valid_y = pdfValidateX[lsSelectedFt].values, pdfValidateX["label"].values
    print(train_X.shape)
    print(valid_X.shape)
    print(train_y.shape)
    print(valid_y.shape)
    display(pdfTrainX.groupby("label").agg({"id": "count"}))
    display(pdfValidateX.groupby("label").agg({"id": "count"}))

    # train_y, valid_y = yTrain[trainIdx], yTrain[testIdx]
    d_train = xgb.DMatrix(train_X, train_y)
    d_valid = xgb.DMatrix(valid_X, valid_y)
    # d_test = xgb.DMatrix(test)

    watchlist = [(d_train, 'train'), (d_valid, 'valid')]
    model = xgb.train(xgb_params, d_train, 5000,  watchlist, feval=gini_xgb, maximize=True, 
                          verbose_eval=50, early_stopping_rounds=100)

    xTest = pdfTestX[lsSelectedFt].values
    d_test = xgb.DMatrix(xTest)
    xgb_pred = model.predict(d_test)
    print(xgb_pred[:5])
    lsResult.append((model, xgb_pred))
    print("Elapsed time: %d(s)" % int(time.time() - t))
    printRuntime()

In [None]:
# Check how many RUN 4 models have a best_score above the candidate threshold.
giniThreshold = 0.372
idTest = pdfTest["id"].values
lsChosen = [c for c in lsResult if c[0].best_score > giniThreshold]
len(lsChosen)

## Check the distribution of pdfSubmit

In [None]:
pdfSubmitSum = exportSubmission(idTest, lsResult, giniThreshold, submissionType="max")

In [None]:
summaryOutput(pdfSubmitSum)

In [None]:
pdfSubmitMean = exportSubmission(idTest, lsResult, giniThreshold, submissionType="mean")

In [None]:
summaryOutput(pdfSubmitMean)

In [None]:
train.shape

# TODO: Try filtering iteratively