In [1]:
import pandas as pd, numpy as np, seaborn as sns
from pprint import pprint
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt

import math
from collections import Counter

import datetime, time

from pprint import pprint
import re

In [2]:
def printRuntime():
    print(datetime.datetime.now().strftime("%Y-%m-%d %T"))
    print("-"*19)
printRuntime()

2020-02-15 21:41:36
-------------------


# 1. Load data

In [3]:
basePath = "./kalapa/"
trainPath = basePath + "train.csv"
testPath = basePath + "test.csv"
colDesc = basePath + "column_description.csv"
sampleSubmit = basePath + "sample_submission.csv"

cleanedTrain = basePath + "cleanedTrain.pickle"
cleanedTest = basePath + "cleanedTest.pickle"
nb01 = basePath + "nb01.pickle"
nb02 = basePath + "nb02.pickle"
nb03 = basePath + "nb03.pickle"
printRuntime()

2020-02-15 21:41:36
-------------------


In [5]:
def avgAge(row):
    a = row["age_source1"]
    b = row["age_source2"]
    if np.isnan(a) and np.isnan(b):
        return np.nan
    elif np.isnan(a):
        return b
    elif np.isnan(b):
        return a
    else:
        return (a+b)/2.0
    
def diffAge(row):
    a = row["age_source1"]
    b = row["age_source2"]
    if np.isnan(a) and np.isnan(b):
        return np.nan
    elif np.isnan(a):
        return -b
    elif np.isnan(b):
        return a
    else:
        return (a+b)/2.0
    
def groupAge(age):
    # 18-23; 23-35; 35-50; 50+
    result = ""
    if np.isnan(age):
        result = "AGE_NONE"
    elif age <= 18:
        result = "AGE_0"
    elif age <=23:
        result = "AGE_1"
    elif age <=35:
        result = "AGE_2"
    elif age <= 50:
        result = "AGE_3"
    else:
        result = "AGE_4"
    return result
printRuntime()

2020-02-15 21:42:01
-------------------


# Train

In [6]:
pdfTrainRaw = pd.read_csv(trainPath)
printRuntime()

2020-02-15 21:42:04
-------------------


  interactivity=interactivity, compiler=compiler, result=result)


In [7]:
cond = (
    ~((pdfTrainRaw["age_source1"]<18) | (pdfTrainRaw["age_source2"]<18)) # Filter out age < 18, keep NaN
)
# pdfTrain = pdfTrainRaw[cond].copy()
pdfTrain = pdfTrainRaw.copy()
pdfTrain["maCv"] = pdfTrain["maCv"].apply(lambda x: str(x).lower())
pdfTrain["province"] = pdfTrain["province"].apply(lambda x: str(x).lower())
pdfTrain["district"] = pdfTrain["district"].apply(lambda x: str(x).lower())
print(pdfTrain.shape)
printRuntime()

(30000, 64)
2020-02-15 21:42:04
-------------------


In [8]:
lsMetaCol = ["id", "label", "province", "district", "age_source1", "age_source2", "maCv"]

# Test

In [9]:
pdfTestRaw = pd.read_csv(testPath)
printRuntime()

2020-02-15 21:42:08
-------------------


  interactivity=interactivity, compiler=compiler, result=result)


In [10]:
cond = (
    ~((pdfTestRaw["age_source1"]<18) | (pdfTestRaw["age_source2"]<18)) # Filter out age < 18, keep NaN
)
# pdfTest = pdfTestRaw[cond].copy()
pdfTest = pdfTestRaw.copy()
pdfTest["maCv"] = pdfTest["maCv"].apply(lambda x: str(x).lower())
pdfTest["province"] = pdfTest["province"].apply(lambda x: str(x).lower())
pdfTest["district"] = pdfTest["district"].apply(lambda x: str(x).lower())
print(pdfTest.shape)
printRuntime()

(20000, 63)
2020-02-15 21:42:08
-------------------


### Clean data

In [None]:
25, 29, 30, 31, 36, 37, : 'TRUE' -> True, 'FALSE' -> False !DONE
35: => "Zero"-"Four" -> 1-5 scale (with NULL) !DONE
41: "I"-"V" -> 1-5 scale (with NULL) !DONE
42, 44: "Zezo", "One", "Two" !DONE
45: "1" 1.0 -> 1; "2" 2.0 ->2 !DONE

### Break down maCv to (maCv[:2] + maCv[2:])

In [11]:
def formatJob(iStr):
    if "none" == iStr or "nan" == iStr:
        return "none"
    # Replace all digit
    iStr = re.sub("\d", "", iStr)
    # Replace cn to cong nhan
    iStr = re.sub(r"^(cnhân|cn|coõng nhaõn)", "công nhân ", iStr)
    # Replace nv/nv. to nhan vien
    iStr = re.sub(r"(- nv|nv.|nv)", "nhân viên ", iStr)
    # Replace p. to phó
    iStr = re.sub(r"(^p\.)", "phó ", iStr)
    
    iStr = (iStr.replace("-", " ")
            .replace("cty", "công ty")
            .replace(".", " ")
            .replace("(", " ").
            replace(")", " ")
           )
    return iStr

def splitJobType(iStr):
    if "none" == iStr or "nan" == iStr:
        return "none", "none"
    iStr = formatJob(iStr)
    
    lsWord = iStr.split()
    if len(lsWord) == 0:
        return "none", "none"
    splitIdx = 2
    if "phó chánh" in iStr or ("phó trưởng" in iStr 
                               and "phó trưởng phòng" not in iStr
                               and "phó trưởng ban" not in iStr):
        splitIdx = 4
    elif ("phó phòng" not in iStr) and (lsWord[0] == "phó" or lsWord[0] == "trưởng"):
        splitIdx = 3
    
    jobCat = " ".join(lsWord[:splitIdx])
    jobDesc = "none"
    if len(lsWord) > splitIdx:
        jobDesc = " ".join(lsWord[splitIdx:])
    return jobCat, jobDesc

# Test
print(splitJobType("nhân viên phòng thí nghiệm"))
print(splitJobType("nhân viên bảo trì"))
print(splitJobType("cn ủi"))
print(splitJobType("9782cấp dưỡng"))
print(splitJobType("trưởng dây chuyền phòng sản xuất"))
print(splitJobType("p. trưởng phòng"))
printRuntime()

('nhân viên', 'phòng thí nghiệm')
('nhân viên', 'bảo trì')
('công nhân', 'ủi')
('cấp dưỡng', 'none')
('trưởng dây chuyền', 'phòng sản xuất')
('phó trưởng phòng', 'none')
2020-02-15 21:43:16
-------------------


In [12]:
# pdfTrain["jobCat"], pdfTrain["jobDesc"] = pdfTrain["maCv"].apply(lambda x: splitJobType(x))
%time
pdfTrain["maCv"] = pdfTrain["maCv"].apply(formatJob)
pdfTrain["maCv_vector"] = pdfTrain["maCv"].apply(lambda x: x.split())
pdfTrain["jobCat"], pdfTrain["jobDesc"] = zip(*pdfTrain["maCv"].apply(splitJobType))
printRuntime()

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 5.25 µs
2020-02-15 21:43:19
-------------------


In [112]:
# pdfTest["jobCat"], pdfTest["jobDesc"] = pdfTest["maCv"].apply(lambda x: splitJobType(x))
%time
pdfTest["maCv"] = pdfTest["maCv"].apply(formatJob)
pdfTest["maCv_vector"] = pdfTest["maCv"].apply(lambda x: x.split())
pdfTest["jobCat"], pdfTest["jobDesc"] = zip(*pdfTest["maCv"].apply(splitJobType))
printRuntime()

CPU times: user 2 µs, sys: 1 µs, total: 3 µs
Wall time: 5.96 µs
2020-02-15 22:06:39
-------------------


In [13]:
pdfTrain = (pdfTrain
      .replace(np.nan, "None") # 'None' -> nan
      .replace("TRUE", True) # 'TRUE' -> True
      .replace("FALSE", False) # 'FALSE' -> False
     )

In [14]:
pdfTest = (pdfTest
      .replace(np.nan, "None") # 'None' -> nan
      .replace("TRUE", True) # 'TRUE' -> True
      .replace("FALSE", False) # 'FALSE' -> False
     )

In [15]:
pdfTrainBk = pdfTrain.copy()

pdfTestBk = pdfTest.copy()

#### Replace ilformed values by columns to safely remove

In [16]:
for dfTmp in [pdfTrain, pdfTest]:
    # 6
    dfTmp.loc[dfTmp["FIELD_6"]=="None", "FIELD_6"] = np.nan
    # dfTmp["FIELD_6"] = dfTmp["FIELD_6"] + 1
    
    # 12
    dfTmp.loc[~dfTmp["FIELD_12"].isin(["None", "0", "1"]), "FIELD_12"] = "None"

    # 35 'Four' 'One' 'Three' 'Two'
    dfTmp.loc[dfTmp["FIELD_35"]=="Zero", "FIELD_35"] = 1
    dfTmp.loc[dfTmp["FIELD_35"]=="One", "FIELD_35"] = 2
    dfTmp.loc[dfTmp["FIELD_35"]=="Two", "FIELD_35"] = 3
    dfTmp.loc[dfTmp["FIELD_35"]=="Three", "FIELD_35"] = 4
    dfTmp.loc[dfTmp["FIELD_35"]=="Four", "FIELD_35"] = 5
    
    # 40 
    dfTmp.loc[dfTmp["FIELD_40"].isin(["05 08 11 02", "08 02"]), "FIELD_40"] = "02 05 08 11"
    
    # 41 I II III IV V
    dfTmp.loc[dfTmp["FIELD_41"]=="I", "FIELD_41"] = 1
    dfTmp.loc[dfTmp["FIELD_41"]=="II", "FIELD_41"] = 2
    dfTmp.loc[dfTmp["FIELD_41"]=="III", "FIELD_41"] = 3
    dfTmp.loc[dfTmp["FIELD_41"]=="IV", "FIELD_41"] = 4
    dfTmp.loc[dfTmp["FIELD_41"]=="V", "FIELD_41"] = 5

    # 42
    dfTmp.loc[dfTmp["FIELD_42"]=="Zezo", "FIELD_42"] = 0
    dfTmp.loc[dfTmp["FIELD_42"]=="One", "FIELD_42"] = 1

    # 44
    dfTmp.loc[dfTmp["FIELD_44"]=="One", "FIELD_44"] = 1
    dfTmp.loc[dfTmp["FIELD_44"]=="Two", "FIELD_44"] = 2

    # 45
    dfTmp.loc[(dfTmp["FIELD_45"]=="1")|(dfTmp["FIELD_45"]==1.0), "FIELD_45"] = 1
    dfTmp.loc[(dfTmp["FIELD_45"]=="2")|(dfTmp["FIELD_45"]==2.0), "FIELD_45"] = 2

### One-hot 8 (gender), 10, 12, 17, 24, 40, 43

In [17]:
dfTrainBk1 = pdfTrain.copy()
dfTestBk1 = pdfTest.copy()

In [18]:
# Get one hot encoding of columns for pdfTrain pdfTest
lsCol = [8, 10, 12, 17, 24, 40, 43]
for d in lsCol:
    cName = "FIELD_%d" % d
    one_hot = pd.get_dummies(pdfTrain[cName].replace(np.nan, "None"))
    renameCol = {}
    for c in one_hot.columns:
        renameCol[c] = "%s_%s" % (cName, c)
    pprint(renameCol)
    one_hot = one_hot.rename(columns=renameCol)
    pdfTrain = pdfTrain.drop(cName, axis = 1)
    # Join the encoded pdfTrain
    pdfTrain = pdfTrain.join(one_hot)

{'FEMALE': 'FIELD_8_FEMALE', 'MALE': 'FIELD_8_MALE', 'None': 'FIELD_8_None'}
{'GH': 'FIELD_10_GH', 'None': 'FIELD_10_None', 'T1': 'FIELD_10_T1'}
{'0': 'FIELD_12_0', '1': 'FIELD_12_1', 'None': 'FIELD_12_None'}
{'G2': 'FIELD_17_G2',
 'G3': 'FIELD_17_G3',
 'G4': 'FIELD_17_G4',
 'G7': 'FIELD_17_G7',
 'G8': 'FIELD_17_G8',
 'G9': 'FIELD_17_G9',
 'GX': 'FIELD_17_GX',
 'None': 'FIELD_17_None'}
{'K1': 'FIELD_24_K1',
 'K2': 'FIELD_24_K2',
 'K3': 'FIELD_24_K3',
 'None': 'FIELD_24_None'}
{'02 05 08 11': 'FIELD_40_02 05 08 11',
 '1': 'FIELD_40_1',
 '2': 'FIELD_40_2',
 '3': 'FIELD_40_3',
 '4': 'FIELD_40_4',
 '6': 'FIELD_40_6',
 'None': 'FIELD_40_None'}
{'0': 'FIELD_43_0',
 '5': 'FIELD_43_5',
 'A': 'FIELD_43_A',
 'B': 'FIELD_43_B',
 'C': 'FIELD_43_C',
 'D': 'FIELD_43_D',
 'None': 'FIELD_43_None'}


In [19]:
for d in lsCol:
    cName = "FIELD_%d" % d
    one_hot = pd.get_dummies(pdfTest[cName].replace(np.nan, "None"))
    renameCol = {}
    for c in one_hot.columns:
        renameCol[c] = "%s_%s" % (cName, c)
    pprint(renameCol)
    one_hot = one_hot.rename(columns=renameCol)
    pdfTest = pdfTest.drop(cName, axis = 1)
    # Join the encoded pdfTest
    pdfTest = pdfTest.join(one_hot)

{'FEMALE': 'FIELD_8_FEMALE', 'MALE': 'FIELD_8_MALE', 'None': 'FIELD_8_None'}
{'GH': 'FIELD_10_GH', 'None': 'FIELD_10_None', 'T1': 'FIELD_10_T1'}
{'0': 'FIELD_12_0', '1': 'FIELD_12_1', 'None': 'FIELD_12_None'}
{'G3': 'FIELD_17_G3',
 'G4': 'FIELD_17_G4',
 'G7': 'FIELD_17_G7',
 'G8': 'FIELD_17_G8',
 'G9': 'FIELD_17_G9',
 'GX': 'FIELD_17_GX',
 'None': 'FIELD_17_None'}
{'K1': 'FIELD_24_K1',
 'K2': 'FIELD_24_K2',
 'K3': 'FIELD_24_K3',
 'None': 'FIELD_24_None'}
{'02 05 08 11': 'FIELD_40_02 05 08 11',
 '1': 'FIELD_40_1',
 '2': 'FIELD_40_2',
 '3': 'FIELD_40_3',
 '6': 'FIELD_40_6',
 'None': 'FIELD_40_None'}
{'0': 'FIELD_43_0',
 '5': 'FIELD_43_5',
 'A': 'FIELD_43_A',
 'B': 'FIELD_43_B',
 'C': 'FIELD_43_C',
 'D': 'FIELD_43_D',
 'None': 'FIELD_43_None'}


In [20]:
for d in lsCol:
    cName = "FIELD_%d" % d
    lsMissCol = [c for c in pdfTrain.columns if cName in c and c not in pdfTest.columns]
    for c in lsMissCol:
        print(cName)
        print("Add missing columns for test")        
        print(c)
        pdfTest[c] = 0

    lsRedCol = [c for c in pdfTest.columns if cName in c and c not in pdfTrain.columns]
    for c in lsRedCol:
        print("Remove redundant columns for test")
        print(c)
        pdfTest = pdfTest.drop(c, axis=1)

FIELD_17
Add missing columns for test
FIELD_17_G2
FIELD_40
Add missing columns for test
FIELD_40_4


In [58]:
pdfTrain.columns

Index(['id', 'label', 'province', 'district', 'age_source1', 'age_source2',
       'maCv', 'FIELD_1', 'FIELD_2', 'FIELD_3', 'FIELD_4', 'FIELD_5',
       'FIELD_6', 'FIELD_7', 'FIELD_9', 'FIELD_11', 'FIELD_13', 'FIELD_14',
       'FIELD_15', 'FIELD_16', 'FIELD_18', 'FIELD_19', 'FIELD_20', 'FIELD_21',
       'FIELD_22', 'FIELD_23', 'FIELD_25', 'FIELD_26', 'FIELD_27', 'FIELD_28',
       'FIELD_29', 'FIELD_30', 'FIELD_31', 'FIELD_32', 'FIELD_33', 'FIELD_34',
       'FIELD_35', 'FIELD_36', 'FIELD_37', 'FIELD_38', 'FIELD_39', 'FIELD_41',
       'FIELD_42', 'FIELD_44', 'FIELD_45', 'FIELD_46', 'FIELD_47', 'FIELD_48',
       'FIELD_49', 'FIELD_50', 'FIELD_51', 'FIELD_52', 'FIELD_53', 'FIELD_54',
       'FIELD_55', 'FIELD_56', 'FIELD_57', 'jobCat', 'jobDesc', 'FIELD_8_0',
       'FIELD_8_1', 'FIELD_8_None', 'FIELD_10_GH', 'FIELD_10_None',
       'FIELD_10_T1', 'FIELD_12_0', 'FIELD_12_1', 'FIELD_12_None',
       'FIELD_17_G2', 'FIELD_17_G3', 'FIELD_17_G4', 'FIELD_17_G7',
       'FIELD_17_G8', 'FI

### Add FIELD_7_MOST, FIELD_7_LEN, FIELD_9_APPEAR, FIELD_9_INDEX

In [21]:
def mostFrequent(List): 
    occurenceCount = Counter(List) 
    lsCount = occurenceCount.most_common()
    maxAppear = lsCount[0][1]
    # print(lsCount)
    # print(maxAppear)
    lsVal = [v[0] for v in lsCount if v[1] == maxAppear]
    # Return list of most common items (if equally appeared)
    return lsVal

def mostAppear(iStr):
    if iStr == "[]" or iStr is np.nan:
        return "na"
    lsChar = iStr.replace("'", "").replace("[", "").replace("]", "").replace(" ", "").split(",")
    return mostFrequent(lsChar)

# Test
print(mostFrequent([2, 2, 3, 3, 3, 4, 5, 6])) # Expect [3]
print(mostAppear("['GD', 'GD', 'TE']")) # Expect ["GD"]

def index_7_9(row):
    """
    Return: (is_appear, index)
    """
    lsMostAppear7 = row["FIELD_7_MOST"]
    val9 = row["FIELD_9"]
    if val9 == "[]" or val9 == "na":
        return -2, -1
    elif lsMostAppear7 == "na":
        return -1, -1
    elif val9 in lsMostAppear7:
        return 1, lsMostAppear7.index(val9)
    else:
        # lsMostAppear7 is not null but 9 doesn't appear
        return 0, -1
    
printRuntime()

[3]
['GD']
2020-02-15 21:44:16
-------------------


In [22]:
# Add col as most appear FIELD_7
pdfTrain["FIELD_7_MOST"] = pdfTrain["FIELD_7"].apply(lambda x: mostAppear(x))
pdfTrain["FIELD_7_LEN"] = pdfTrain["FIELD_7_MOST"].apply(len)
# Add col as 9 appear in 7_MOST
pdfTrain["FIELD_9_APPEAR"], pdfTrain["FIELD_9_INDEX"] = zip(*pdfTrain.apply(lambda row: index_7_9(row), axis=1))

In [23]:
# Add col as most appear FIELD_7
pdfTest["FIELD_7_MOST"] = pdfTest["FIELD_7"].apply(lambda x: mostAppear(x))
pdfTest["FIELD_7_LEN"] = pdfTest["FIELD_7_MOST"].apply(len)
# Add col as 9 appear in 7_MOST
pdfTest["FIELD_9_APPEAR"], pdfTest["FIELD_9_INDEX"] = zip(*pdfTest.apply(lambda row: index_7_9(row), axis=1))

In [24]:
pdfTrain = pdfTrain.drop("FIELD_7_MOST", axis = 1)

pdfTest = pdfTest.drop("FIELD_7_MOST", axis = 1)

### Add avg_age, diff_age, group_age

In [25]:
pdfTrain.loc[pdfTrain["age_source1"]=="None", "age_source1"] = np.nan
pdfTrain.loc[pdfTrain["age_source2"]=="None", "age_source2"] = np.nan

In [48]:
pdfTrain["avg_age"] = pdfTrain.apply(lambda row: avgAge(row), axis=1)
pdfTrain["diff_age"] = pdfTrain.apply(lambda row: diffAge(row), axis=1)
pdfTrain["group_age"] = pdfTrain["avg_age"].apply(lambda x: groupAge(x))
printRuntime()

2020-02-15 21:51:15
-------------------


In [27]:
pdfTest.loc[pdfTest["age_source1"]=="None", "age_source1"] = np.nan
pdfTest.loc[pdfTest["age_source2"]=="None", "age_source2"] = np.nan

In [49]:
pdfTest["avg_age"] = pdfTest.apply(lambda row: avgAge(row), axis=1)
pdfTest["diff_age"] = pdfTest.apply(lambda row: diffAge(row), axis=1)
pdfTest["group_age"] = pdfTest["avg_age"].apply(lambda x: groupAge(x))
printRuntime()

2020-02-15 21:51:17
-------------------


### One hot group_age

In [62]:
cName = "group_age"
one_hot = pd.get_dummies(pdfTrain[cName].replace(np.nan, "None"))
renameCol = {}
for c in one_hot.columns:
    renameCol[c] = "FIELD_%s" % (c)
pprint(renameCol)
one_hot = one_hot.rename(columns=renameCol)

# pdfTrain = pdfTrain.drop(cName, axis = 1)
# Join the encoded pdfTrain
pdfTrain = pdfTrain.join(one_hot)
printRuntime()

{'AGE_0': 'FIELD_AGE_0',
 'AGE_1': 'FIELD_AGE_1',
 'AGE_2': 'FIELD_AGE_2',
 'AGE_3': 'FIELD_AGE_3',
 'AGE_4': 'FIELD_AGE_4',
 'AGE_NONE': 'FIELD_AGE_NONE'}
2020-02-15 21:53:52
-------------------


In [63]:
cName = "group_age"
one_hot = pd.get_dummies(pdfTest[cName].replace(np.nan, "None"))
renameCol = {}
for c in one_hot.columns:
    renameCol[c] = "FIELD_%s" % (c)
pprint(renameCol)
one_hot = one_hot.rename(columns=renameCol)

# pdfTest = pdfTest.drop(cName, axis = 1)
# Join the encoded pdfTest
pdfTest = pdfTest.join(one_hot)
printRuntime()

{'AGE_0': 'FIELD_AGE_0',
 'AGE_1': 'FIELD_AGE_1',
 'AGE_2': 'FIELD_AGE_2',
 'AGE_3': 'FIELD_AGE_3',
 'AGE_4': 'FIELD_AGE_4',
 'AGE_NONE': 'FIELD_AGE_NONE'}
2020-02-15 21:53:56
-------------------


### Add lv3_loc

In [73]:
pdfTrain["lv3_loc"] = "[" + pdfTrain["province"] + "]_[" + pdfTrain["district"] + "]"
lsProvince = pdfTrain["province"].unique()
lsLoc = pdfTrain["lv3_loc"].unique()
print(len(lsProvince), len(lsLoc))
printRuntime()

65 766
2020-02-15 21:56:26
-------------------


In [74]:
pdfTest["lv3_loc"] = "[" + pdfTest["province"] + "]_[" + pdfTest["district"] + "]"
lsProvince = pdfTest["province"].unique()
lsLoc = pdfTest["lv3_loc"].unique()
print(len(lsProvince), len(lsLoc))
printRuntime()

65 724
2020-02-15 21:56:28
-------------------


### Convert data types

In [91]:
lsExtraFt = (["FIELD_7_LEN", "FIELD_9_APPEAR", "FIELD_9_INDEX"]  + ["avg_age", "diff_age"]
             )

In [92]:
lsFieldFt = ([c for c in pdfTrain.columns 
              if "FIELD" in c 
              and c not in ["FIELD_%d"%d for d in [7, 9, 13, 39]]] 
             + lsExtraFt)
# pprint(lsFieldFt)
printRuntime()

2020-02-15 22:04:30
-------------------


In [93]:
lsFieldFtTest = ([c for c in pdfTest.columns 
                  if "FIELD" in c 
                  and c not in ["FIELD_%d"%d for d in [7, 9, 13, 39]]]
                 + lsExtraFt)
print(sorted(lsFieldFtTest) == sorted(lsFieldFt))
printRuntime()

True
2020-02-15 22:04:31
-------------------


In [94]:
for c in lsFieldFt:
    try:
        pdfTrain.loc[pdfTrain[c].isin(["None", "na"]), c] = np.nan
        pdfTrain[c] = pdfTrain[c].astype(np.float64)
        pdfTest.loc[pdfTest[c].isin(["None", "na"]), c] = np.nan
        pdfTest[c] = pdfTest[c].astype(np.float64)
    except:
        print(c)
printRuntime()

2020-02-15 22:04:35
-------------------


### Add neighbor features
Neighbor features: 
    + avg+std of groupby (province, age_group, maCv)
    + avg+std of groupby (lv3_loc, age_group, maCv)
    + avg+std of groupby (province, maCv)
    

In [95]:
aggDict = {}
for f in lsFieldFt:
    aggDict[f] = ["mean", "std"]

In [96]:
# 1
lsMetaCol1 = ["province", "group_age", "jobCat"]
lsMetaCol2 = ["lv3_loc", "group_age", "jobCat"]
lsMetaCol3 = ["province", "jobCat"]

In [97]:
dfNb01 = pdfTrain.groupby(lsMetaCol1, as_index=False).agg(aggDict)
dfNb01.columns = ["_".join(x) for x in dfNb01.columns.ravel()]
dfNb01 = dfNb01.rename(columns={"province_": "province", 
                       "group_age_": "group_age", 
                       "jobCat_": "jobCat"})
printRuntime()

2020-02-15 22:04:39
-------------------


In [98]:
dfNb02 = pdfTrain.groupby(lsMetaCol2, as_index=False).agg(aggDict)
dfNb02.columns = ["_".join(x) for x in dfNb02.columns.ravel()]
dfNb02 = dfNb02.rename(columns={"province_": "province", 
                       "group_age_": "group_age", 
                       "jobCat_": "jobCat"})
printRuntime()

2020-02-15 22:04:40
-------------------


In [99]:
dfNb03 = pdfTrain.groupby(lsMetaCol3, as_index=False).agg(aggDict)
dfNb03.columns = ["_".join(x) for x in dfNb03.columns.ravel()]
dfNb03 = dfNb03.rename(columns={"province_": "province", 
                       "group_age_": "group_age", 
                       "jobCat_": "jobCat"})
printRuntime()

2020-02-15 22:04:41
-------------------


In [100]:
# Save outputs: pdfTrain, dfNb01, dfNb02, dfNb03
pdfTrain.to_pickle(cleanedTrain, compression="bz2")
pdfTest.to_pickle(cleanedTest, compression="bz2")
dfNb01.to_pickle(nb01, compression="bz2")
dfNb02.to_pickle(nb02, compression="bz2")
dfNb03.to_pickle(nb03, compression="bz2")
printRuntime()

2020-02-15 22:04:51
-------------------


In [51]:
dfNb01.head()

Unnamed: 0,province,group_age,jobCat,FIELD_1_mean,FIELD_1_std,FIELD_2_mean,FIELD_2_std,FIELD_3_mean,FIELD_3_std,FIELD_4_mean,...,FIELD_43_A_mean,FIELD_43_A_std,FIELD_43_B_mean,FIELD_43_B_std,FIELD_43_C_mean,FIELD_43_C_std,FIELD_43_D_mean,FIELD_43_D_std,FIELD_43_None_mean,FIELD_43_None_std
0,Thành phố Cần Thơ,AGE_I,công nhân,1.0,0.0,1.0,0.0,1353.75,451.900708,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,Thành phố Cần Thơ,AGE_I,none,0.529412,0.514496,0.647059,0.492592,700.411765,634.703578,0.352941,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,Thành phố Cần Thơ,AGE_II,bôi keo,1.0,,1.0,,2899.0,,2.0,...,0.0,,0.0,,0.0,,0.0,,1.0,
3,Thành phố Cần Thơ,AGE_II,bảo vệ,0.0,,1.0,,2895.0,,1.0,...,0.0,,0.0,,0.0,,0.0,,1.0,
4,Thành phố Cần Thơ,AGE_II,chủ tịch,0.0,,0.0,,-1.0,,2.0,...,0.0,,0.0,,0.0,,0.0,,1.0,


### raw ft + nb ft + standardize raw ft

In [101]:
lsMetaCol = ["id", "label", "province", "district", "lv3_loc", 
              "age_source1", "age_source2", "avg_age", "diff_age", "group_age",
              "maCv", "jobCat", "jobDesc"]
lsEmbedFt = ["FIELD_%d"%d for d in [7, 9, 13, 39]]
# raw ft
lsFieldFt = ([c for c in pdfTrain.columns 
             if "FIELD" in c 
             and c not in lsEmbedFt] + lsExtraFt)

label = "label"
printRuntime()

2020-02-15 22:04:53
-------------------


In [102]:
# nb ft
lsNbMetaCol = lsMetaCol1
dfNb = dfNb01 # TODO: dfNb02, dfNb03
printRuntime()

2020-02-15 22:04:57
-------------------


In [103]:
dfITrain = pd.merge(pdfTrain, dfNb01, on=lsNbMetaCol, how="left")

print(dfITrain.shape)
printRuntime()

(30000, 298)
2020-02-15 22:04:58
-------------------


In [104]:
lsTmp = [c for c in dfITrain.columns if c not in lsFieldFt]
# pprint(lsTmp)

In [105]:
dfITrain.head()

Unnamed: 0,id,label,province,district,age_source1,age_source2,maCv,FIELD_1,FIELD_2,FIELD_3,...,FIELD_AGE_3_mean,FIELD_AGE_3_std,FIELD_AGE_4_mean,FIELD_AGE_4_std,FIELD_AGE_NONE_mean,FIELD_AGE_NONE_std,avg_age_mean,avg_age_std,diff_age_mean,diff_age_std
0,0,0,,none,,,none,1.0,1.0,2547.0,...,0.0,0.0,0.0,0.0,1.0,0.0,,,,
1,1,0,tỉnh đồng nai,thành phố biên hòa,44.0,44.0,none,1.0,0.0,-1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,42.989305,4.428633,42.989305,4.428633
2,2,0,tỉnh đồng nai,huyện long thành,30.0,30.0,công nhân,0.0,1.0,3273.0,...,0.0,0.0,0.0,0.0,0.0,0.0,29.463542,3.565208,29.463542,3.565208
3,3,0,tỉnh tuyên quang,thành phố tuyên quang,43.0,,none,0.0,1.0,3991.0,...,1.0,0.0,0.0,0.0,0.0,0.0,40.5,4.210377,40.5,4.210377
4,4,0,thành phố hồ chí minh,quận 1,21.0,21.0,none,0.0,1.0,1450.0,...,0.0,0.0,0.0,0.0,0.0,0.0,21.727545,1.023381,21.727545,1.023381


In [106]:
for c in lsFieldFt:
    dfITrain["%s_stdized"%c] = (dfITrain[c] - dfITrain[c+"_mean"])/dfITrain[c+"_std"]
print(dfITrain.shape) # (29924, 353)
printRuntime()

(30000, 390)
2020-02-15 22:05:07
-------------------


In [107]:
ymd = datetime.datetime.now().strftime("%Y%m%d")
iTrainPath = basePath + "itrain_{}.pickle".format(ymd)
print(iTrainPath)
dfITrain.to_pickle(iTrainPath, compression="bz2")
printRuntime()

./kalapa/itrain_20200215.pickle
2020-02-15 22:05:22
-------------------


In [65]:
dfITrain.head()

Unnamed: 0,id,label,province,district,age_source1,age_source2,maCv,FIELD_1,FIELD_2,FIELD_3,...,FIELD_24_K2_stdized,FIELD_24_K3_stdized,FIELD_24_None_stdized,FIELD_43_0_stdized,FIELD_43_5_stdized,FIELD_43_A_stdized,FIELD_43_B_stdized,FIELD_43_C_stdized,FIELD_43_D_stdized,FIELD_43_None_stdized
0,0,0,,,,,,1.0,1.0,2547.0,...,,,,,,,,,,
1,1,0,Tỉnh Đồng Nai,Thành phố Biên Hòa,44.0,44.0,none,1.0,0.0,-1.0,...,,,,,,,,-0.224257,,0.224257
2,2,0,Tỉnh Đồng Nai,Huyện Long Thành,30.0,30.0,công nhân,0.0,1.0,3273.0,...,,,,,,,,-0.223014,-0.126664,0.259688
3,3,0,Tỉnh Tuyên Quang,Thành phố Tuyên Quang,43.0,,,0.0,1.0,3991.0,...,-0.288675,,0.288675,,,,,,,
4,4,0,Thành phố Hồ Chí Minh,Quận 1,21.0,21.0,none,0.0,1.0,1450.0,...,,,,,,,,,,


# Prepare itest

In [110]:
lsNbMetaCol

['province', 'group_age', 'jobCat']

In [111]:
pdfTest.columns

Index(['id', 'province', 'district', 'age_source1', 'age_source2', 'maCv',
       'FIELD_1', 'FIELD_2', 'FIELD_3', 'FIELD_4',
       ...
       'AGE_4', 'AGE_NONE', 'lv3_loc', 'group_age', 'FIELD_AGE_0',
       'FIELD_AGE_1', 'FIELD_AGE_2', 'FIELD_AGE_3', 'FIELD_AGE_4',
       'FIELD_AGE_NONE'],
      dtype='object', length=110)

In [113]:
dfITest = pd.merge(pdfTest, dfNb01, on=lsNbMetaCol, how="left")

In [114]:
dfITest.shape # label col is missing

(20000, 297)

In [115]:
lsTmp = [c for c in dfITest.columns if c not in lsFieldFt]
# pprint(lsTmp)

['id',
 'province',
 'district',
 'age_source1',
 'age_source2',
 'maCv',
 'FIELD_7',
 'FIELD_9',
 'FIELD_13',
 'FIELD_39',
 'AGE_0',
 'AGE_1',
 'AGE_2',
 'AGE_3',
 'AGE_4',
 'AGE_NONE',
 'lv3_loc',
 'group_age',
 'maCv_vector',
 'jobCat',
 'jobDesc',
 'FIELD_1_mean',
 'FIELD_1_std',
 'FIELD_2_mean',
 'FIELD_2_std',
 'FIELD_3_mean',
 'FIELD_3_std',
 'FIELD_4_mean',
 'FIELD_4_std',
 'FIELD_5_mean',
 'FIELD_5_std',
 'FIELD_6_mean',
 'FIELD_6_std',
 'FIELD_11_mean',
 'FIELD_11_std',
 'FIELD_14_mean',
 'FIELD_14_std',
 'FIELD_15_mean',
 'FIELD_15_std',
 'FIELD_16_mean',
 'FIELD_16_std',
 'FIELD_18_mean',
 'FIELD_18_std',
 'FIELD_19_mean',
 'FIELD_19_std',
 'FIELD_20_mean',
 'FIELD_20_std',
 'FIELD_21_mean',
 'FIELD_21_std',
 'FIELD_22_mean',
 'FIELD_22_std',
 'FIELD_23_mean',
 'FIELD_23_std',
 'FIELD_25_mean',
 'FIELD_25_std',
 'FIELD_26_mean',
 'FIELD_26_std',
 'FIELD_27_mean',
 'FIELD_27_std',
 'FIELD_28_mean',
 'FIELD_28_std',
 'FIELD_29_mean',
 'FIELD_29_std',
 'FIELD_30_mean',
 'FIELD

In [116]:
dfITest.head()

Unnamed: 0,id,province,district,age_source1,age_source2,maCv,FIELD_1,FIELD_2,FIELD_3,FIELD_4,...,FIELD_AGE_3_mean,FIELD_AGE_3_std,FIELD_AGE_4_mean,FIELD_AGE_4_std,FIELD_AGE_NONE_mean,FIELD_AGE_NONE_std,avg_age_mean,avg_age_std,diff_age_mean,diff_age_std
0,30000,,none,,,none,1.0,1.0,719.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,,,,
1,30001,,none,,,none,1.0,1.0,1442.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,,,,
2,30002,thành phố hà nội,huyện mỹ đức,32.0,32.0,trưởng dây chuyền phòng sản xuất,0.0,1.0,4000.0,2.0,...,,,,,,,,,,
3,30003,,none,,25.0,none,0.0,1.0,1073.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,28.907828,3.404555,-28.872475,3.692563
4,30004,,none,,,none,1.0,1.0,703.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,,,,


In [117]:
for c in lsFieldFt:
    dfITest["%s_stdized"%c] = (dfITest[c] - dfITest[c+"_mean"])/dfITest[c+"_std"]
print(dfITest.shape) # (19936, 340)
printRuntime()

(20000, 389)
2020-02-15 22:07:23
-------------------


In [118]:
iTestPath = basePath + "itest_{}.pickle".format(ymd)
dfITest.to_pickle(iTestPath, compression="bz2")
printRuntime()

2020-02-15 22:07:31
-------------------


In [65]:
dfITest.head()

Unnamed: 0,id,label,province,district,age_source1,age_source2,maCv,FIELD_1,FIELD_2,FIELD_3,...,FIELD_24_K2_stdized,FIELD_24_K3_stdized,FIELD_24_None_stdized,FIELD_43_0_stdized,FIELD_43_5_stdized,FIELD_43_A_stdized,FIELD_43_B_stdized,FIELD_43_C_stdized,FIELD_43_D_stdized,FIELD_43_None_stdized
0,0,0,,,,,,1.0,1.0,2547.0,...,,,,,,,,,,
1,1,0,Tỉnh Đồng Nai,Thành phố Biên Hòa,44.0,44.0,none,1.0,0.0,-1.0,...,,,,,,,,-0.224257,,0.224257
2,2,0,Tỉnh Đồng Nai,Huyện Long Thành,30.0,30.0,công nhân,0.0,1.0,3273.0,...,,,,,,,,-0.223014,-0.126664,0.259688
3,3,0,Tỉnh Tuyên Quang,Thành phố Tuyên Quang,43.0,,,0.0,1.0,3991.0,...,-0.288675,,0.288675,,,,,,,
4,4,0,Thành phố Hồ Chí Minh,Quận 1,21.0,21.0,none,0.0,1.0,1450.0,...,,,,,,,,,,
