In [1]:
import pandas as pd, numpy as np, seaborn as sns
from pprint import pprint
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt

import math

import datetime, time

from pprint import pprint
import re

In [2]:
def printRuntime():
    print(datetime.datetime.now().strftime("%Y-%m-%d %T"))
    print("-"*19)
printRuntime()

2020-02-05 00:47:49
-------------------


# 1. Load data

In [3]:
basePath = "./kalapa/"
trainPath = basePath + "train.csv"
testPath = basePath + "test.csv"
colDesc = basePath + "column_description.csv"
sampleSubmit = basePath + "sample_submission.csv"

cleanedTrain = basePath + "cleanedTrain.pickle"
nb01 = basePath + "nb01.pickle"
nb02 = basePath + "nb02.pickle"
nb03 = basePath + "nb03.pickle"
printRuntime()

2020-02-05 00:47:53
-------------------


In [4]:
def avgAge(row):
    a = row["age_source1"]
    b = row["age_source2"]
    if np.isnan(a) and np.isnan(b):
        return np.nan
    elif np.isnan(a):
        return b
    elif np.isnan(b):
        return a
    else:
        return (a+b)/2.0
    
def diffAge(row):
    a = row["age_source1"]
    b = row["age_source2"]
    if np.isnan(a) and np.isnan(b):
        return np.nan
    elif np.isnan(a):
        return -b
    elif np.isnan(b):
        return a
    else:
        return (a+b)/2.0
    
def groupAge(age):
    # 18-23; 23-35; 35-50; 50+
    result = ""
    if np.isnan(age):
        result = "AGE_NONE"
    elif age <=23:
        result = "AGE_I"
    elif age <=35:
        result = "AGE_II"
    elif age <= 50:
        result = "AGE_III"
    else:
        result = "AGE_IV"
    return result
printRuntime()

2020-02-05 00:47:54
-------------------


# Train

In [5]:
pdfTrainRaw = pd.read_csv(trainPath)
printRuntime()

2020-02-05 00:47:59
-------------------


  interactivity=interactivity, compiler=compiler, result=result)


In [6]:
cond = (
    ~((pdfTrainRaw["age_source1"]<18) | (pdfTrainRaw["age_source2"]<18)) # Filter out age < 18, keep NaN
)
pdfTrain = pdfTrainRaw[cond].copy()
printRuntime()

2020-02-05 00:48:00
-------------------


In [7]:
pdfTrain.shape

(29924, 64)

In [8]:
pdfTrain.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 29924 entries, 0 to 29999
Data columns (total 64 columns):
id             29924 non-null int64
label          29924 non-null int64
province       17114 non-null object
district       17083 non-null object
age_source1    17121 non-null float64
age_source2    20253 non-null float64
maCv           20252 non-null object
FIELD_1        29924 non-null int64
FIELD_2        29461 non-null float64
FIELD_3        29461 non-null float64
FIELD_4        29461 non-null float64
FIELD_5        29461 non-null float64
FIELD_6        29461 non-null float64
FIELD_7        29461 non-null object
FIELD_8        20253 non-null object
FIELD_9        29924 non-null object
FIELD_10       20253 non-null object
FIELD_11       20253 non-null object
FIELD_12       20253 non-null object
FIELD_13       20231 non-null object
FIELD_14       29924 non-null int64
FIELD_15       29924 non-null int64
FIELD_16       20253 non-null float64
FIELD_17       20253 non-null object


In [9]:
lsMetaCol = ["id", "label", "province", "district", "age_source1", "age_source2", "maCv"]

# Test

In [66]:
pdfTestRaw = pd.read_csv(testPath)
printRuntime()

2020-02-05 01:18:44
-------------------


  interactivity=interactivity, compiler=compiler, result=result)


In [67]:
cond = (
    ~((pdfTestRaw["age_source1"]<18) | (pdfTestRaw["age_source2"]<18)) # Filter out age < 18, keep NaN
)
pdfTest = pdfTestRaw[cond].copy()
printRuntime()

2020-02-05 01:18:44
-------------------


### Clean data

In [None]:
25, 29, 30, 31, 36, 37, : 'TRUE' -> True, 'FALSE' -> False !DONE
35: => "Zero"-"Four" -> 1-5 scale (with NULL) !DONE
41: "I"-"V" -> 1-5 scale (with NULL) !DONE
42, 44: "Zezo", "One", "Two" !DONE
45: "1" 1.0 -> 1; "2" 2.0 ->2 !DONE

### Break down maCv to (maCv[:2] + maCv[2:])

In [10]:
def splitJobType(iStr):
    if "none" == iStr or "nan" == iStr:
        return "none", "none"
    # Replace all digit
    iStr = re.sub("\d", "", iStr)
    # Replace cn to cong nhan
    iStr = re.sub(r"^(cnhân|cn)", "công nhân ", iStr)
    # Replace nv/nv. to nhan vien
    iStr = re.sub(r"(- nv|nv.|nv)", "nhân viên ", iStr)
    lsWord = iStr.split()
    if len(lsWord) == 0:
        return "none", "none"
    
    jobCat = " ".join(lsWord[:2])
    jobDesc = "none"
    if len(lsWord) > 2:
        jobDesc = " ".join(lsWord[2:])
    return jobCat, jobDesc

# Test
print(splitJobType("nhân viên phòng thí nghiệm"))
print(splitJobType("nhân viên bảo trì"))
print(splitJobType("cn ủi"))
print(splitJobType("9782cấp dưỡng"))
printRuntime()

('nhân viên', 'phòng thí nghiệm')
('nhân viên', 'bảo trì')
('công nhân', 'ủi')
('cấp dưỡng', 'none')
2020-02-05 00:48:16
-------------------


In [11]:
pdfTrain["maCv"] = pdfTrain["maCv"].apply(lambda x: str(x).lower())
pdfTrain["jobCat"], pdfTrain["jobDesc"] = zip(*pdfTrain["maCv"].apply(splitJobType))
printRuntime()

2020-02-05 00:48:19
-------------------


In [68]:
pdfTest["maCv"] = pdfTest["maCv"].apply(lambda x: str(x).lower())
pdfTest["jobCat"], pdfTest["jobDesc"] = zip(*pdfTest["maCv"].apply(splitJobType))
printRuntime()

2020-02-05 01:19:05
-------------------


In [12]:
df = (pdfTrain
      .replace("None", np.nan) # 'None' -> nan
      .replace("TRUE", True) # 'TRUE' -> True
      .replace("FALSE", False) # 'FALSE' -> False
      .replace("Tỉnh Vĩnh phúc", "Tỉnh Vĩnh Phúc") # "Tỉnh Vĩnh phúc" -> "Tỉnh Vĩnh Phúc"
     )

In [69]:
dfTest = (pdfTest
      .replace("None", np.nan) # 'None' -> nan
      .replace("TRUE", True) # 'TRUE' -> True
      .replace("FALSE", False) # 'FALSE' -> False
      .replace("Tỉnh Vĩnh phúc", "Tỉnh Vĩnh Phúc") # "Tỉnh Vĩnh phúc" -> "Tỉnh Vĩnh Phúc"
     )

#### Replace ilformed values by columns to safely remove

In [70]:
for dfTmp in [df, dfTest]:
    # 6
    dfTmp["FIELD_6"] = dfTmp["FIELD_6"] + 1

    # 35 'Four' 'One' 'Three' 'Two'
    dfTmp.loc[dfTmp["FIELD_35"]=="Zero", "FIELD_35"] = 1
    dfTmp.loc[dfTmp["FIELD_35"]=="One", "FIELD_35"] = 2
    dfTmp.loc[dfTmp["FIELD_35"]=="Two", "FIELD_35"] = 3
    dfTmp.loc[dfTmp["FIELD_35"]=="Three", "FIELD_35"] = 4
    dfTmp.loc[dfTmp["FIELD_35"]=="Four", "FIELD_35"] = 5

    # 41 I II III IV V
    dfTmp.loc[dfTmp["FIELD_41"]=="I", "FIELD_41"] = 1
    dfTmp.loc[dfTmp["FIELD_41"]=="II", "FIELD_41"] = 2
    dfTmp.loc[dfTmp["FIELD_41"]=="III", "FIELD_41"] = 3
    dfTmp.loc[dfTmp["FIELD_41"]=="IV", "FIELD_41"] = 4
    dfTmp.loc[dfTmp["FIELD_41"]=="V", "FIELD_41"] = 5

    # 42
    dfTmp.loc[dfTmp["FIELD_42"]=="Zezo", "FIELD_42"] = 0
    dfTmp.loc[dfTmp["FIELD_42"]=="One", "FIELD_42"] = 1

    # 44
    dfTmp.loc[dfTmp["FIELD_44"]=="One", "FIELD_44"] = 1
    dfTmp.loc[dfTmp["FIELD_44"]=="Two", "FIELD_44"] = 2

    # 45
    dfTmp.loc[(dfTmp["FIELD_45"]=="1")|(dfTmp["FIELD_45"]==1.0), "FIELD_45"] = 1
    dfTmp.loc[(dfTmp["FIELD_45"]=="2")|(dfTmp["FIELD_45"]==2.0), "FIELD_45"] = 2

### One-hot 8 (gender), 10, 12, 17, 24, 43

In [19]:
df_backup = df.copy()

In [20]:
# Get one hot encoding of columns B
lsCol = [8, 10, 12, 17, 24, 43]
for d in lsCol:
    cName = "FIELD_%d" % d
    one_hot = pd.get_dummies(df[cName].replace(np.nan, "None"))
    renameCol = {}
    for c in one_hot.columns:
        renameCol[c] = "%s_%s" % (cName, c)
    pprint(renameCol)
    one_hot = one_hot.rename(columns=renameCol)
    df = df.drop(cName, axis = 1)
    # Join the encoded df
    df = df.join(one_hot)

{'FEMALE': 'FIELD_8_FEMALE', 'MALE': 'FIELD_8_MALE', 'None': 'FIELD_8_None'}
{'GH': 'FIELD_10_GH', 'None': 'FIELD_10_None', 'T1': 'FIELD_10_T1'}
{'0': 'FIELD_12_0',
 '1': 'FIELD_12_1',
 'HT': 'FIELD_12_HT',
 'None': 'FIELD_12_None',
 'TN': 'FIELD_12_TN'}
{'G2': 'FIELD_17_G2',
 'G3': 'FIELD_17_G3',
 'G4': 'FIELD_17_G4',
 'G7': 'FIELD_17_G7',
 'G8': 'FIELD_17_G8',
 'G9': 'FIELD_17_G9',
 'GX': 'FIELD_17_GX',
 'None': 'FIELD_17_None'}
{'K1': 'FIELD_24_K1',
 'K2': 'FIELD_24_K2',
 'K3': 'FIELD_24_K3',
 'None': 'FIELD_24_None'}
{'0': 'FIELD_43_0',
 '5': 'FIELD_43_5',
 'A': 'FIELD_43_A',
 'B': 'FIELD_43_B',
 'C': 'FIELD_43_C',
 'D': 'FIELD_43_D',
 'None': 'FIELD_43_None'}


In [74]:
# Test: Get one hot encoding of columns B
lsCol = [8, 10, 12, 17, 24, 43]
for d in lsCol:
    cName = "FIELD_%d" % d
    one_hot = pd.get_dummies(dfTest[cName].replace(np.nan, "None"))
    renameCol = {}
    for c in one_hot.columns:
        renameCol[c] = "%s_%s" % (cName, c)
    pprint(renameCol)
    one_hot = one_hot.rename(columns=renameCol)
    dfTest = dfTest.drop(cName, axis = 1)
    # Join the encoded dfTest
    dfTest = dfTest.join(one_hot)

{'FEMALE': 'FIELD_8_FEMALE', 'MALE': 'FIELD_8_MALE', 'None': 'FIELD_8_None'}
{'GH': 'FIELD_10_GH', 'None': 'FIELD_10_None', 'T1': 'FIELD_10_T1'}
{'0': 'FIELD_12_0',
 '1': 'FIELD_12_1',
 'DK': 'FIELD_12_DK',
 'DN': 'FIELD_12_DN',
 'DT': 'FIELD_12_DT',
 'GD': 'FIELD_12_GD',
 'HT': 'FIELD_12_HT',
 'None': 'FIELD_12_None',
 'XK': 'FIELD_12_XK'}
{'G3': 'FIELD_17_G3',
 'G4': 'FIELD_17_G4',
 'G7': 'FIELD_17_G7',
 'G8': 'FIELD_17_G8',
 'G9': 'FIELD_17_G9',
 'GX': 'FIELD_17_GX',
 'None': 'FIELD_17_None'}
{'K1': 'FIELD_24_K1',
 'K2': 'FIELD_24_K2',
 'K3': 'FIELD_24_K3',
 'None': 'FIELD_24_None'}
{'0': 'FIELD_43_0',
 '5': 'FIELD_43_5',
 'A': 'FIELD_43_A',
 'B': 'FIELD_43_B',
 'C': 'FIELD_43_C',
 'D': 'FIELD_43_D',
 'None': 'FIELD_43_None'}


### Add avg_age, diff_age, group_age

In [21]:
df["avg_age"] = df.apply(lambda row: avgAge(row), axis=1)
df["diff_age"] = df.apply(lambda row: diffAge(row), axis=1)
df["group_age"] = df["avg_age"].apply(lambda x: groupAge(x))
printRuntime()

2020-02-05 00:48:49
-------------------


In [75]:
dfTest["avg_age"] = dfTest.apply(lambda row: avgAge(row), axis=1)
dfTest["diff_age"] = dfTest.apply(lambda row: diffAge(row), axis=1)
dfTest["group_age"] = dfTest["avg_age"].apply(lambda x: groupAge(x))
printRuntime()

2020-02-05 01:23:35
-------------------


### Add lv3_loc

In [23]:
df["lv3_loc"] = "[" + df["province"] + "]_[" + df["district"] + "]"
lsProvince = df["province"].unique()
lsLoc = df["lv3_loc"].unique()
print(len(lsProvince), len(lsLoc))
printRuntime()

65 745
2020-02-05 00:48:51
-------------------


In [76]:
dfTest["lv3_loc"] = "[" + dfTest["province"] + "]_[" + dfTest["district"] + "]"
lsProvince = dfTest["province"].unique()
lsLoc = dfTest["lv3_loc"].unique()
print(len(lsProvince), len(lsLoc))
printRuntime()

65 712
2020-02-05 01:23:47
-------------------


### Convert data types

In [32]:
for c in lsFieldFt:
    df[c] = df[c].astype(np.float64)

# TODO: One-hot problems! 12 & 17

In [86]:
lsCol = [8, 10, 12, 17, 24, 43]
for d in lsCol:
    cName = "FIELD_%d" % d
    lsF = [c for c in df.columns if cName in c]
    lsFTest = [c for c in dfTest.columns if cName in c]
    print(lsF)
    print(lsFTest)
    print("-"*20)

['FIELD_8_FEMALE', 'FIELD_8_MALE', 'FIELD_8_None']
['FIELD_8_FEMALE', 'FIELD_8_MALE', 'FIELD_8_None']
--------------------
['FIELD_10_GH', 'FIELD_10_None', 'FIELD_10_T1']
['FIELD_10_GH', 'FIELD_10_None', 'FIELD_10_T1']
--------------------
['FIELD_12_0', 'FIELD_12_1', 'FIELD_12_HT', 'FIELD_12_None', 'FIELD_12_TN']
['FIELD_12_0', 'FIELD_12_1', 'FIELD_12_DK', 'FIELD_12_DN', 'FIELD_12_DT', 'FIELD_12_GD', 'FIELD_12_HT', 'FIELD_12_None', 'FIELD_12_XK']
--------------------
['FIELD_17_G2', 'FIELD_17_G3', 'FIELD_17_G4', 'FIELD_17_G7', 'FIELD_17_G8', 'FIELD_17_G9', 'FIELD_17_GX', 'FIELD_17_None']
['FIELD_17_G3', 'FIELD_17_G4', 'FIELD_17_G7', 'FIELD_17_G8', 'FIELD_17_G9', 'FIELD_17_GX', 'FIELD_17_None']
--------------------
['FIELD_24_K1', 'FIELD_24_K2', 'FIELD_24_K3', 'FIELD_24_None']
['FIELD_24_K1', 'FIELD_24_K2', 'FIELD_24_K3', 'FIELD_24_None']
--------------------
['FIELD_43_0', 'FIELD_43_5', 'FIELD_43_A', 'FIELD_43_B', 'FIELD_43_C', 'FIELD_43_D', 'FIELD_43_None']
['FIELD_43_0', 'FIELD_43_5

In [84]:
for c in lsFieldFt:
    dfTest[c] = dfTest[c].astype(np.float64)

KeyError: 'FIELD_12_TN'

### Add neighbor features
Neighbor features: 
    + avg+std of groupby (province, age_group, maCv)
    + avg+std of groupby (lv3_loc, age_group, maCv)
    + avg+std of groupby (province, maCv)
    

In [24]:
lsFieldFt = [c for c in df.columns 
             if "FIELD" in c 
             and c not in ["FIELD_%d"%d for d in [7, 9, 13, 39, 40]]]
pprint(lsFieldFt)

['FIELD_1',
 'FIELD_2',
 'FIELD_3',
 'FIELD_4',
 'FIELD_5',
 'FIELD_6',
 'FIELD_11',
 'FIELD_14',
 'FIELD_15',
 'FIELD_16',
 'FIELD_18',
 'FIELD_19',
 'FIELD_20',
 'FIELD_21',
 'FIELD_22',
 'FIELD_23',
 'FIELD_25',
 'FIELD_26',
 'FIELD_27',
 'FIELD_28',
 'FIELD_29',
 'FIELD_30',
 'FIELD_31',
 'FIELD_32',
 'FIELD_33',
 'FIELD_34',
 'FIELD_35',
 'FIELD_36',
 'FIELD_37',
 'FIELD_38',
 'FIELD_41',
 'FIELD_42',
 'FIELD_44',
 'FIELD_45',
 'FIELD_46',
 'FIELD_47',
 'FIELD_48',
 'FIELD_49',
 'FIELD_50',
 'FIELD_51',
 'FIELD_52',
 'FIELD_53',
 'FIELD_54',
 'FIELD_55',
 'FIELD_56',
 'FIELD_57',
 'FIELD_8_FEMALE',
 'FIELD_8_MALE',
 'FIELD_8_None',
 'FIELD_10_GH',
 'FIELD_10_None',
 'FIELD_10_T1',
 'FIELD_12_0',
 'FIELD_12_1',
 'FIELD_12_HT',
 'FIELD_12_None',
 'FIELD_12_TN',
 'FIELD_17_G2',
 'FIELD_17_G3',
 'FIELD_17_G4',
 'FIELD_17_G7',
 'FIELD_17_G8',
 'FIELD_17_G9',
 'FIELD_17_GX',
 'FIELD_17_None',
 'FIELD_24_K1',
 'FIELD_24_K2',
 'FIELD_24_K3',
 'FIELD_24_None',
 'FIELD_43_0',
 'FIELD_43_5',

In [25]:
aggDict = {}
for f in lsFieldFt:
    aggDict[f] = ["mean", "std"]

In [31]:
# 1
lsMetaCol1 = ["province", "group_age", "jobCat"]
lsMetaCol2 = ["lv3_loc", "group_age", "jobCat"]
lsMetaCol3 = ["province", "jobCat"]

In [33]:
# for c in lsFieldFt:
#    print(c, df[c].dtypes)

In [48]:
dfNb01 = df.groupby(lsMetaCol1, as_index=False).agg(aggDict)
dfNb01.columns = ["_".join(x) for x in dfNb01.columns.ravel()]
dfNb01 = dfNb01.rename(columns={"province_": "province", 
                       "group_age_": "group_age", 
                       "jobCat_": "jobCat"})
printRuntime()

2020-02-05 00:57:12
-------------------


In [49]:
dfNb02 = df.groupby(lsMetaCol2, as_index=False).agg(aggDict)
dfNb02.columns = ["_".join(x) for x in dfNb02.columns.ravel()]
dfNb02 = dfNb02.rename(columns={"province_": "province", 
                       "group_age_": "group_age", 
                       "jobCat_": "jobCat"})
printRuntime()

2020-02-05 00:57:13
-------------------


In [50]:
dfNb03 = df.groupby(lsMetaCol3, as_index=False).agg(aggDict)
dfNb03.columns = ["_".join(x) for x in dfNb03.columns.ravel()]
dfNb03 = dfNb03.rename(columns={"province_": "province", 
                       "group_age_": "group_age", 
                       "jobCat_": "jobCat"})
printRuntime()

2020-02-05 00:57:13
-------------------


In [77]:
# Save outputs: df, dfNb01, dfNb02, dfNb03
df.to_pickle(cleanedTrain, compression="bz2")
printRuntime()

2020-02-05 01:24:15
-------------------


In [78]:
dfNb01.to_pickle(nb01, compression="bz2")
dfNb02.to_pickle(nb02, compression="bz2")
dfNb03.to_pickle(nb03, compression="bz2")
printRuntime()

2020-02-05 01:24:16
-------------------


In [51]:
dfNb01.head()

Unnamed: 0,province,group_age,jobCat,FIELD_1_mean,FIELD_1_std,FIELD_2_mean,FIELD_2_std,FIELD_3_mean,FIELD_3_std,FIELD_4_mean,...,FIELD_43_A_mean,FIELD_43_A_std,FIELD_43_B_mean,FIELD_43_B_std,FIELD_43_C_mean,FIELD_43_C_std,FIELD_43_D_mean,FIELD_43_D_std,FIELD_43_None_mean,FIELD_43_None_std
0,Thành phố Cần Thơ,AGE_I,công nhân,1.0,0.0,1.0,0.0,1353.75,451.900708,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,Thành phố Cần Thơ,AGE_I,none,0.529412,0.514496,0.647059,0.492592,700.411765,634.703578,0.352941,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,Thành phố Cần Thơ,AGE_II,bôi keo,1.0,,1.0,,2899.0,,2.0,...,0.0,,0.0,,0.0,,0.0,,1.0,
3,Thành phố Cần Thơ,AGE_II,bảo vệ,0.0,,1.0,,2895.0,,1.0,...,0.0,,0.0,,0.0,,0.0,,1.0,
4,Thành phố Cần Thơ,AGE_II,chủ tịch,0.0,,0.0,,-1.0,,2.0,...,0.0,,0.0,,0.0,,0.0,,1.0,


### raw ft + nb ft + standardize raw ft

In [38]:
lsMetaCol = ["id", "label", "province", "district", "lv3_loc", 
              "age_source1", "age_source2", "avg_age", "diff_age", "group_age",
              "maCv", "jobCat", "jobDesc"]
# raw ft
lsFieldFt = [c for c in df.columns 
             if "FIELD" in c 
             and c not in ["FIELD_%d"%d for d in [7, 9, 13, 39, 40]]]
label = "label"
printRuntime()

2020-02-05 00:54:53
-------------------


In [52]:
# nb ft
lsNbMetaCol = lsMetaCol1
dfNb = dfNb01 # TODO: dfNb02, dfNb03

In [53]:
dfITrain = pd.merge(df, dfNb01, on=lsNbMetaCol, how="left")

In [54]:
dfITrain.shape

(29924, 246)

In [61]:
lsTmp = [c for c in dfITrain.columns if c not in lsFieldFt]
pprint(lsTmp)

['id',
 'label',
 'province',
 'district',
 'age_source1',
 'age_source2',
 'maCv',
 'FIELD_7',
 'FIELD_9',
 'FIELD_13',
 'FIELD_39',
 'FIELD_40',
 'jobCat',
 'jobDesc',
 'avg_age',
 'diff_age',
 'group_age',
 'lv3_loc',
 'FIELD_1_mean',
 'FIELD_1_std',
 'FIELD_2_mean',
 'FIELD_2_std',
 'FIELD_3_mean',
 'FIELD_3_std',
 'FIELD_4_mean',
 'FIELD_4_std',
 'FIELD_5_mean',
 'FIELD_5_std',
 'FIELD_6_mean',
 'FIELD_6_std',
 'FIELD_11_mean',
 'FIELD_11_std',
 'FIELD_14_mean',
 'FIELD_14_std',
 'FIELD_15_mean',
 'FIELD_15_std',
 'FIELD_16_mean',
 'FIELD_16_std',
 'FIELD_18_mean',
 'FIELD_18_std',
 'FIELD_19_mean',
 'FIELD_19_std',
 'FIELD_20_mean',
 'FIELD_20_std',
 'FIELD_21_mean',
 'FIELD_21_std',
 'FIELD_22_mean',
 'FIELD_22_std',
 'FIELD_23_mean',
 'FIELD_23_std',
 'FIELD_25_mean',
 'FIELD_25_std',
 'FIELD_26_mean',
 'FIELD_26_std',
 'FIELD_27_mean',
 'FIELD_27_std',
 'FIELD_28_mean',
 'FIELD_28_std',
 'FIELD_29_mean',
 'FIELD_29_std',
 'FIELD_30_mean',
 'FIELD_30_std',
 'FIELD_31_mean',
 'F

In [55]:
dfITrain.head()

Unnamed: 0,id,label,province,district,age_source1,age_source2,maCv,FIELD_1,FIELD_2,FIELD_3,...,FIELD_43_A_mean,FIELD_43_A_std,FIELD_43_B_mean,FIELD_43_B_std,FIELD_43_C_mean,FIELD_43_C_std,FIELD_43_D_mean,FIELD_43_D_std,FIELD_43_None_mean,FIELD_43_None_std
0,0,0,,,,,,1.0,1.0,2547.0,...,,,,,,,,,,
1,1,0,Tỉnh Đồng Nai,Thành phố Biên Hòa,44.0,44.0,none,1.0,0.0,-1.0,...,0.0,0.0,0.0,0.0,0.048128,0.214612,0.0,0.0,0.951872,0.214612
2,2,0,Tỉnh Đồng Nai,Huyện Long Thành,30.0,30.0,công nhân,0.0,1.0,3273.0,...,0.0,0.0,0.0,0.0,0.047619,0.213524,0.015873,0.125316,0.936508,0.244494
3,3,0,Tỉnh Tuyên Quang,Thành phố Tuyên Quang,43.0,,,0.0,1.0,3991.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,4,0,Thành phố Hồ Chí Minh,Quận 1,21.0,21.0,none,0.0,1.0,1450.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [62]:
for c in lsFieldFt:
    dfITrain["%s_stdized"%c] = (dfITrain[c] - dfITrain[c+"_mean"])/dfITrain[c+"_std"]
printRuntime()

2020-02-05 01:00:23
-------------------


In [63]:
dfITrain.shape

(29924, 322)

In [64]:
iTrainPath = basePath + "itrain.pickle"
dfITrain.to_pickle(iTrainPath, compression="bz2")
printRuntime()

2020-02-05 01:00:48
-------------------


In [65]:
dfITrain.head()

Unnamed: 0,id,label,province,district,age_source1,age_source2,maCv,FIELD_1,FIELD_2,FIELD_3,...,FIELD_24_K2_stdized,FIELD_24_K3_stdized,FIELD_24_None_stdized,FIELD_43_0_stdized,FIELD_43_5_stdized,FIELD_43_A_stdized,FIELD_43_B_stdized,FIELD_43_C_stdized,FIELD_43_D_stdized,FIELD_43_None_stdized
0,0,0,,,,,,1.0,1.0,2547.0,...,,,,,,,,,,
1,1,0,Tỉnh Đồng Nai,Thành phố Biên Hòa,44.0,44.0,none,1.0,0.0,-1.0,...,,,,,,,,-0.224257,,0.224257
2,2,0,Tỉnh Đồng Nai,Huyện Long Thành,30.0,30.0,công nhân,0.0,1.0,3273.0,...,,,,,,,,-0.223014,-0.126664,0.259688
3,3,0,Tỉnh Tuyên Quang,Thành phố Tuyên Quang,43.0,,,0.0,1.0,3991.0,...,-0.288675,,0.288675,,,,,,,
4,4,0,Thành phố Hồ Chí Minh,Quận 1,21.0,21.0,none,0.0,1.0,1450.0,...,,,,,,,,,,


# Prepare itest

In [79]:
dfITest = pd.merge(dfTest, dfNb01, on=lsNbMetaCol, how="left")

In [80]:
dfITest.shape

(19936, 248)

In [81]:
lsTmp = [c for c in dfITest.columns if c not in lsFieldFt]
pprint(lsTmp)

['id',
 'province',
 'district',
 'age_source1',
 'age_source2',
 'maCv',
 'FIELD_7',
 'FIELD_9',
 'FIELD_13',
 'FIELD_39',
 'FIELD_40',
 'jobCat',
 'jobDesc',
 'FIELD_12_DK',
 'FIELD_12_DN',
 'FIELD_12_DT',
 'FIELD_12_GD',
 'FIELD_12_XK',
 'avg_age',
 'diff_age',
 'group_age',
 'lv3_loc',
 'FIELD_1_mean',
 'FIELD_1_std',
 'FIELD_2_mean',
 'FIELD_2_std',
 'FIELD_3_mean',
 'FIELD_3_std',
 'FIELD_4_mean',
 'FIELD_4_std',
 'FIELD_5_mean',
 'FIELD_5_std',
 'FIELD_6_mean',
 'FIELD_6_std',
 'FIELD_11_mean',
 'FIELD_11_std',
 'FIELD_14_mean',
 'FIELD_14_std',
 'FIELD_15_mean',
 'FIELD_15_std',
 'FIELD_16_mean',
 'FIELD_16_std',
 'FIELD_18_mean',
 'FIELD_18_std',
 'FIELD_19_mean',
 'FIELD_19_std',
 'FIELD_20_mean',
 'FIELD_20_std',
 'FIELD_21_mean',
 'FIELD_21_std',
 'FIELD_22_mean',
 'FIELD_22_std',
 'FIELD_23_mean',
 'FIELD_23_std',
 'FIELD_25_mean',
 'FIELD_25_std',
 'FIELD_26_mean',
 'FIELD_26_std',
 'FIELD_27_mean',
 'FIELD_27_std',
 'FIELD_28_mean',
 'FIELD_28_std',
 'FIELD_29_mean',
 'F

In [82]:
dfITest.head()

Unnamed: 0,id,province,district,age_source1,age_source2,maCv,FIELD_1,FIELD_2,FIELD_3,FIELD_4,...,FIELD_43_A_mean,FIELD_43_A_std,FIELD_43_B_mean,FIELD_43_B_std,FIELD_43_C_mean,FIELD_43_C_std,FIELD_43_D_mean,FIELD_43_D_std,FIELD_43_None_mean,FIELD_43_None_std
0,30000,,,,,,1,1.0,719.0,0.0,...,,,,,,,,,,
1,30001,,,,,,1,1.0,1442.0,0.0,...,,,,,,,,,,
2,30002,Thành phố Hà Nội,Huyện Mỹ Đức,32.0,32.0,trưởng dây chuyền phòng sản xuất,0,1.0,4000.0,2.0,...,,,,,,,,,,
3,30003,,,,25.0,none,0,1.0,1073.0,0.0,...,,,,,,,,,,
4,30004,,,,,,1,1.0,703.0,0.0,...,,,,,,,,,,


In [83]:
for c in lsFieldFt:
    dfITest["%s_stdized"%c] = (dfITest[c] - dfITest[c+"_mean"])/dfITest[c+"_std"]
printRuntime()

TypeError: ufunc 'subtract' did not contain a loop with signature matching types dtype('<U32') dtype('<U32') dtype('<U32')

In [63]:
dfITest.shape

(29924, 322)

In [64]:
iTestPath = basePath + "itest.pickle"
dfITest.to_pickle(iTestPath, compression="bz2")
printRuntime()

2020-02-05 01:00:48
-------------------


In [65]:
dfITest.head()

Unnamed: 0,id,label,province,district,age_source1,age_source2,maCv,FIELD_1,FIELD_2,FIELD_3,...,FIELD_24_K2_stdized,FIELD_24_K3_stdized,FIELD_24_None_stdized,FIELD_43_0_stdized,FIELD_43_5_stdized,FIELD_43_A_stdized,FIELD_43_B_stdized,FIELD_43_C_stdized,FIELD_43_D_stdized,FIELD_43_None_stdized
0,0,0,,,,,,1.0,1.0,2547.0,...,,,,,,,,,,
1,1,0,Tỉnh Đồng Nai,Thành phố Biên Hòa,44.0,44.0,none,1.0,0.0,-1.0,...,,,,,,,,-0.224257,,0.224257
2,2,0,Tỉnh Đồng Nai,Huyện Long Thành,30.0,30.0,công nhân,0.0,1.0,3273.0,...,,,,,,,,-0.223014,-0.126664,0.259688
3,3,0,Tỉnh Tuyên Quang,Thành phố Tuyên Quang,43.0,,,0.0,1.0,3991.0,...,-0.288675,,0.288675,,,,,,,
4,4,0,Thành phố Hồ Chí Minh,Quận 1,21.0,21.0,none,0.0,1.0,1450.0,...,,,,,,,,,,
