In [1]:
import pandas as pd, numpy as np, seaborn as sns
from pprint import pprint
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt

import math
from collections import Counter 

import datetime, time

from pprint import pprint
import re

In [2]:
def printRuntime():
    print(datetime.datetime.now().strftime("%Y-%m-%d %T"))
    print("-"*19)
    
printRuntime()

2020-02-06 22:45:36
-------------------


# 1. Load data

In [3]:
basePath = "./kalapa/"
trainPath = basePath + "train.csv"
testPath = basePath + "test.csv"
colDesc = basePath + "column_description.csv"
sampleSubmit = basePath + "sample_submission.csv"

cleanedTrain = basePath + "cleanedTrain.pickle"
nb01 = basePath + "nb01.pickle"
nb02 = basePath + "nb02.pickle"
nb03 = basePath + "nb03.pickle"
printRuntime()

2020-02-06 22:45:38
-------------------


In [4]:
def avgAge(row):
    a = row["age_source1"]
    b = row["age_source2"]
    if np.isnan(a) and np.isnan(b):
        return np.nan
    elif np.isnan(a):
        return b
    elif np.isnan(b):
        return a
    else:
        return (a+b)/2.0
    
def diffAge(row):
    a = row["age_source1"]
    b = row["age_source2"]
    if np.isnan(a) and np.isnan(b):
        return np.nan
    elif np.isnan(a):
        return -b
    elif np.isnan(b):
        return a
    else:
        return (a+b)/2.0
    
def groupAge(age):
    # 18-23; 23-35; 35-50; 50+
    result = ""
    if np.isnan(age):
        result = "AGE_NONE"
    elif age <=23:
        result = "AGE_I"
    elif age <=35:
        result = "AGE_II"
    elif age <= 50:
        result = "AGE_III"
    else:
        result = "AGE_IV"
    return result
printRuntime()

2020-02-06 22:45:40
-------------------


# Train

In [5]:
pdfTrainRaw = pd.read_csv(trainPath)
printRuntime()

2020-02-06 22:45:45
-------------------


  interactivity=interactivity, compiler=compiler, result=result)


In [6]:
cond = (
    ~((pdfTrainRaw["age_source1"]<18) | (pdfTrainRaw["age_source2"]<18)) # Filter out age < 18, keep NaN
)
pdfTrain = pdfTrainRaw[cond].copy()
printRuntime()

2020-02-06 22:45:53
-------------------


In [7]:
pdfTrain["maCv"] = pdfTrain["maCv"].apply(lambda x: str(x).lower())

In [8]:
pdfTrain["province"] = pdfTrain["province"].apply(lambda x: str(x).lower())

In [9]:
pdfTrain.shape

(29924, 64)

In [10]:
pdfTrain.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 29924 entries, 0 to 29999
Data columns (total 64 columns):
id             29924 non-null int64
label          29924 non-null int64
province       29924 non-null object
district       17083 non-null object
age_source1    17121 non-null float64
age_source2    20253 non-null float64
maCv           29924 non-null object
FIELD_1        29924 non-null int64
FIELD_2        29461 non-null float64
FIELD_3        29461 non-null float64
FIELD_4        29461 non-null float64
FIELD_5        29461 non-null float64
FIELD_6        29461 non-null float64
FIELD_7        29461 non-null object
FIELD_8        20253 non-null object
FIELD_9        29924 non-null object
FIELD_10       20253 non-null object
FIELD_11       20253 non-null object
FIELD_12       20253 non-null object
FIELD_13       20231 non-null object
FIELD_14       29924 non-null int64
FIELD_15       29924 non-null int64
FIELD_16       20253 non-null float64
FIELD_17       20253 non-null object


# Test

In [21]:
pdfTestRaw = pd.read_csv(testPath)
printRuntime()

2020-02-06 22:56:30
-------------------


  interactivity=interactivity, compiler=compiler, result=result)


In [22]:
cond = (
    ~((pdfTestRaw["age_source1"]<18) | (pdfTestRaw["age_source2"]<18)) # Filter out age < 18, keep NaN
)
pdfTest = pdfTestRaw[cond].copy()
printRuntime()

2020-02-06 22:56:34
-------------------


In [23]:
pdfTest["maCv"] = pdfTest["maCv"].apply(lambda x: str(x).lower())

In [24]:
pdfTest["province"] = pdfTest["province"].apply(lambda x: str(x).lower())

# Display basic information of FIELDs

In [26]:
lsField = ["FIELD_%d"%d for d in range(1, 50)]
for f in lsField:
    print(f)
    lsVal = pdfTrain[f].unique()
    l = len(lsVal)
    print("num of distinct values:", l)
    print("dtypes:", pdfTrain[f].dtypes)
    if l < 20:
        print(lsVal)
    print("-"*20)
printRuntime()

FIELD_1
num of distinct values: 2
dtypes: int64
[1 0]
--------------------
FIELD_2
num of distinct values: 3
dtypes: float64
[ 1.  0. nan]
--------------------
FIELD_3
num of distinct values: 514
dtypes: float64
--------------------
FIELD_4
num of distinct values: 11
dtypes: float64
[ 0.  1. nan  2.  3.  4.  5.  6.  8.  7. 12.]
--------------------
FIELD_5
num of distinct values: 16
dtypes: float64
[ 0.  1.  4.  2. nan  6.  3.  5.  7.  8. 11.  9. 12. 14. 10. 13.]
--------------------
FIELD_6
num of distinct values: 6
dtypes: float64
[ 0.  1. nan  2.  3.  4.]
--------------------
FIELD_7
num of distinct values: 7064
dtypes: object
--------------------
FIELD_8
num of distinct values: 3
dtypes: object
[nan 'MALE' 'FEMALE']
--------------------
FIELD_9
num of distinct values: 35
dtypes: object
--------------------
FIELD_10
num of distinct values: 3
dtypes: object
[nan 'T1' 'GH']
--------------------
FIELD_11
num of distinct values: 43
dtypes: object
--------------------
FIELD_12
num of dis

### Break down maCv to (maCv[:2] + maCv[2:]) DONE!

In [55]:
pdfTrain[lsMetaCol].head(20)

Unnamed: 0,id,label,province,district,age_source1,age_source2,maCv
0,0,0,,,,,
1,1,0,Tỉnh Đồng Nai,Thành phố Biên Hòa,44.0,44.0,none
2,2,0,Tỉnh Đồng Nai,Huyện Long Thành,30.0,30.0,công nhân
3,3,0,Tỉnh Tuyên Quang,Thành phố Tuyên Quang,43.0,,
4,4,0,Thành phố Hồ Chí Minh,Quận 1,21.0,21.0,none
5,5,0,,,,,
6,6,0,,,,28.0,2983
7,7,0,Tỉnh Bắc Giang,Huyện Yên Dũng,40.0,32.0,công nhân
8,8,0,,,,,
9,9,0,,,,,


In [14]:
def splitJobType(iStr):
    if "none" == iStr or "nan" == iStr:
        return "none", "none"
    # Replace all digit
    iStr = re.sub("\d", "", iStr)
    # Replace cn to cong nhan
    iStr = re.sub(r"^(cnhân|cn)", "công nhân ", iStr)
    # Replace nv/nv. to nhan vien
    iStr = re.sub(r"(- nv|nv.|nv)", "nhân viên ", iStr)
    lsWord = iStr.split()
    if len(lsWord) == 0:
        return "none", "none"
    
    jobCat = " ".join(lsWord[:2])
    jobDesc = "none"
    if len(lsWord) > 2:
        jobDesc = " ".join(lsWord[2:])
    return jobCat, jobDesc

# Test
print(splitJobType("nhân viên phòng thí nghiệm"))
print(splitJobType("nhân viên bảo trì"))
print(splitJobType("cn ủi"))
print(splitJobType("9782cấp dưỡng"))
printRuntime()

('nhân viên', 'phòng thí nghiệm')
('nhân viên', 'bảo trì')
('công nhân', 'ủi')
('cấp dưỡng', 'none')
2020-02-06 22:49:15
-------------------


In [15]:
# pdfTrain["jobCat"], pdfTrain["jobDesc"] = pdfTrain["maCv"].apply(lambda x: splitJobType(x))
%time
pdfTrain["jobCat"], pdfTrain["jobDesc"] = zip(*pdfTrain["maCv"].apply(splitJobType))
printRuntime()

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 5.96 µs
2020-02-06 22:49:18
-------------------


In [76]:
pdfTrain[lsMetaCol + ["jobCat", "jobDesc"]].head(20)

Unnamed: 0,id,label,province,district,age_source1,age_source2,maCv,jobCat,jobDesc
0,0,0,,,,,,none,none
1,1,0,Tỉnh Đồng Nai,Thành phố Biên Hòa,44.0,44.0,none,none,none
2,2,0,Tỉnh Đồng Nai,Huyện Long Thành,30.0,30.0,công nhân,công nhân,none
3,3,0,Tỉnh Tuyên Quang,Thành phố Tuyên Quang,43.0,,,none,none
4,4,0,Thành phố Hồ Chí Minh,Quận 1,21.0,21.0,none,none,none
5,5,0,,,,,,none,none
6,6,0,,,,28.0,2983,2983,none
7,7,0,Tỉnh Bắc Giang,Huyện Yên Dũng,40.0,32.0,công nhân,công nhân,none
8,8,0,,,,,,none,none
9,9,0,,,,,,none,none


In [110]:
pdfTrain[lsMetaCol + ["jobCat", "jobDesc"]].loc[pdfTrain["jobCat"]!="none"].head(20)

Unnamed: 0,id,label,province,district,age_source1,age_source2,maCv,jobCat,jobDesc
2,2,0,Tỉnh Đồng Nai,Huyện Long Thành,30.0,30.0,công nhân,công nhân,none
7,7,0,Tỉnh Bắc Giang,Huyện Yên Dũng,40.0,32.0,công nhân,công nhân,none
12,12,0,,,,36.0,cấp dưỡng,cấp dưỡng,none
14,14,0,Tỉnh Thừa Thiên Huế,Huyện Phong Điền,21.0,21.0,nhân viên bảo trì,nhân viên,bảo trì
15,15,0,Tỉnh Cà Mau,Huyện Đầm Dơi,20.0,20.0,công nhân ủi,công nhân,ủi
21,21,0,Tỉnh Tây Ninh,Thành phố Tây Ninh,29.0,29.0,nhân viên,nhân viên,none
22,22,0,Thành phố Hà Nội,Quận Thanh Xuân,38.0,38.0,nhân viên kinh doanh,nhân viên,kinh doanh
25,25,0,Tỉnh Bình Dương,Thị xã Tân Uyên,20.0,20.0,công nhân,công nhân,none
29,29,0,Tỉnh Thừa Thiên Huế,Thị xã Hương Thủy,26.0,26.0,công nhân may công nghiệp,công nhân,may công nghiệp
31,31,0,,,,43.0,công nhân,công nhân,none


In [119]:
lsJobCat = pdfTrain["jobCat"].unique()
print(len(lsJobCat))
pprint(lsJobCat[:10])

413
array(['none', 'công nhân', 'cấp dưỡng', 'nhân viên', 'điều dưỡng',
       'y sĩ', 'giáo viên', 'kế toán', 'vscn', 'đứng máy'], dtype=object)


In [None]:
pdfTmpJobCat = pdfTrain.groupby(["jobCat"], as_index=False).agg({"id": "count"})

In [122]:
pdfTmpJobCat.sort_values(["id"], ascending=False).head(30) # replace cn => cong nhan

Unnamed: 0,jobCat,id
221,none,22831
66,công nhân,3695
211,nhân viên,1592
116,giáo viên,381
60,cán bộ,64
171,kế toán,62
177,kỹ thuật,60
47,chuyên viên,58
184,lái xe,55
175,kỹ sư,42


In [121]:
pdfTmpJobCat.sort_values(["id"]).head(20)

Unnamed: 0,jobCat,id
0,-công nhân,1
237,phòng hcns,1
236,phân tích,1
235,phân cỡ,1
234,phan loai,1
233,pha chế,1
232,p.xnk,1
231,p.quản đốc,1
230,p.ql,1
229,p.gđốc,1


### Clean data

In [None]:
25, 29, 30, 31, 36, 37, : 'TRUE' -> True, 'FALSE' -> False !DONE
35: => "Zero"-"Four" -> 1-5 scale (with NULL) !DONE
41: "I"-"V" -> 1-5 scale (with NULL) !DONE
42, 44: "Zezo", "One", "Two" !DONE
45: "1" 1.0 -> 1; "2" 2.0 ->2 !DONE

### Replace "None" as nan, "TRUE"/"FALSE", dupplicated string values DONE!

In [36]:
pdfTrain = (pdfTrain
      .replace(np.nan, "None") #  nan -> 'None', convert back while finish
      .replace("TRUE", True) # 'TRUE' -> True
      .replace("FALSE", False) # 'FALSE' -> False
      # .replace("Tỉnh Vĩnh phúc", "Tỉnh Vĩnh Phúc") # "Tỉnh Vĩnh phúc" -> "Tỉnh Vĩnh Phúc": All to lower case
     )

### Add avg_age, diff_age, group_age DONE!

In [38]:
pdfTrain["avg_age"] = pdfTrain.apply(lambda row: avgAge(row), axis=1)
pdfTrain["diff_age"] = pdfTrain.apply(lambda row: diffAge(row), axis=1)
pdfTrain["group_age"] = pdfTrain["avg_age"].apply(lambda x: groupAge(x))
printRuntime()

2020-02-06 23:17:45
-------------------


### Add lv3_loc DONE!

In [41]:
pdfTrain["lv3_loc"] = "[" + pdfTrain["province"] + "]_[" + pdfTrain["district"] + "]"
lsProvince = pdfTrain["province"].unique()
lsLoc = pdfTrain["lv3_loc"].unique()
print(len(lsProvince), len(lsLoc))
printRuntime()

65 746
2020-02-06 23:18:27
-------------------


### Replace ilformed values by columns to safely remove DONE!

In [43]:
# 6
pdfTrain["FIELD_6"] = pdfTrain["FIELD_6"] + 1

In [43]:
# 8
pdfTrain.loc[pdfTrain["FIELD_8"]=="MALE", "FIELD_8"] = 0
pdfTrain.loc[pdfTrain["FIELD_8"]=="FEMALE", "FIELD_8"] = 1

In [35]:
# 12
pdfTrain.loc[~pdfTrain["FIELD_12"].isin([np.nan, "0", "1"]), "FIELD_12"] = np.nan

NameError: name 'df' is not defined

In [44]:
# 35 'Four' 'One' 'Three' 'Two'
pdfTrain.loc[pdfTrain["FIELD_35"]=="Zero", "FIELD_35"] = 1
pdfTrain.loc[pdfTrain["FIELD_35"]=="One", "FIELD_35"] = 2
pdfTrain.loc[pdfTrain["FIELD_35"]=="Two", "FIELD_35"] = 3
pdfTrain.loc[pdfTrain["FIELD_35"]=="Three", "FIELD_35"] = 4
pdfTrain.loc[pdfTrain["FIELD_35"]=="Four", "FIELD_35"] = 5

In [45]:
# 41 I II III IV V
pdfTrain.loc[pdfTrain["FIELD_41"]=="I", "FIELD_41"] = 1
pdfTrain.loc[pdfTrain["FIELD_41"]=="II", "FIELD_41"] = 2
pdfTrain.loc[pdfTrain["FIELD_41"]=="III", "FIELD_41"] = 3
pdfTrain.loc[pdfTrain["FIELD_41"]=="IV", "FIELD_41"] = 4
pdfTrain.loc[pdfTrain["FIELD_41"]=="V", "FIELD_41"] = 5

In [46]:
# 42
pdfTrain.loc[pdfTrain["FIELD_42"]=="Zezo", "FIELD_42"] = 0
pdfTrain.loc[pdfTrain["FIELD_42"]=="One", "FIELD_42"] = 1

In [57]:
# 43 0/5 => None
pdfTrain.loc[(pdfTrain["FIELD_43"]=="0")|(pdfTrain["FIELD_43"]=="5"), "FIELD_43"] = np.nan

In [47]:
# 44
pdfTrain.loc[pdfTrain["FIELD_44"]=="One", "FIELD_44"] = 1
pdfTrain.loc[pdfTrain["FIELD_44"]=="Two", "FIELD_44"] = 2

In [48]:
# 45
pdfTrain.loc[(pdfTrain["FIELD_45"]=="1")|(pdfTrain["FIELD_45"]==1.0), "FIELD_45"] = 1
pdfTrain.loc[(pdfTrain["FIELD_45"]=="2")|(pdfTrain["FIELD_45"]==2.0), "FIELD_45"] = 2

### One-hot 8 (gender), 10, 17, 24, 43 DONE!

In [49]:
df_backup = pdfTrain.copy()

In [127]:
d = 17
cName = "FIELD_%d" % d
one_hot = pd.get_dummies(pdfTrain[cName].replace(np.nan, "None"))
print(one_hot)

       G2  G3  G4  G7  G8  G9  GX  None
0       0   0   0   0   0   0   0     1
1       0   0   0   0   1   0   0     0
2       0   0   0   0   0   0   0     1
3       0   0   0   0   0   0   0     1
4       0   0   0   0   1   0   0     0
5       0   0   0   0   0   0   0     1
6       0   0   0   0   0   0   0     1
7       0   0   0   0   0   0   0     1
8       0   0   0   0   0   0   0     1
9       0   0   0   0   0   0   0     1
10      0   0   0   0   0   0   0     1
11      0   0   0   0   0   0   0     1
12      0   0   0   0   0   0   0     1
13      0   0   0   0   0   0   0     1
14      0   0   0   0   0   0   0     1
15      0   0   0   0   1   0   0     0
16      0   0   0   0   0   0   0     1
17      0   0   0   0   0   0   0     1
18      0   0   0   0   0   0   0     1
19      0   0   0   0   0   0   0     1
20      0   0   0   0   0   0   0     1
21      0   0   0   0   1   0   0     0
22      0   0   0   0   0   0   0     1
23      0   0   0   0   0   0   0     1


In [50]:
# Get one hot encoding of columns B
lsCol = [10, 17, 24, 43]
for d in lsCol:
    cName = "FIELD_%d" % d
    one_hot = pd.get_dummies(pdfTrain[cName].replace(np.nan, "None"))
    renameCol = {}
    for c in one_hot.columns:
        renameCol[c] = "%s_%s" % (cName, c)
    pprint(renameCol)
    one_hot = one_hot.rename(columns=renameCol)
    df = pdfTrain.drop(cName, axis = 1)
    # Join the encoded df
    df = pdfTrain.join(one_hot)

{'FEMALE': 'FIELD_8_FEMALE', 'MALE': 'FIELD_8_MALE', 'None': 'FIELD_8_None'}
{'GH': 'FIELD_10_GH', 'None': 'FIELD_10_None', 'T1': 'FIELD_10_T1'}
{'G2': 'FIELD_17_G2',
 'G3': 'FIELD_17_G3',
 'G4': 'FIELD_17_G4',
 'G7': 'FIELD_17_G7',
 'G8': 'FIELD_17_G8',
 'G9': 'FIELD_17_G9',
 'GX': 'FIELD_17_GX',
 'None': 'FIELD_17_None'}
{'K1': 'FIELD_24_K1',
 'K2': 'FIELD_24_K2',
 'K3': 'FIELD_24_K3',
 'None': 'FIELD_24_None'}
{'0': 'FIELD_43_0',
 '5': 'FIELD_43_5',
 'A': 'FIELD_43_A',
 'B': 'FIELD_43_B',
 'C': 'FIELD_43_C',
 'D': 'FIELD_43_D',
 'None': 'FIELD_43_None'}


In [130]:
# Add missing col FIELD_17_G2 = 0 for test
pdfTest["FIELD_17_G2"] = 0

In [129]:
# Count the value for each col 10, 17, 24, 43
pdfTmp = {}
pdfTestTmp = {}
for c in [10, 17, 24, 43]:
    cName = "FIELD_%d" % c
    print("Train")
    pdfTmp[cName] = pdfTrain.groupby([cName], as_index=False).agg({"id": "count"})
    display(pdfTmp[cName])
    print("Test")
    pdfTestTmp[cName] = pdfTest.groupby([cName], as_index=False).agg({"id": "count"})
    display(pdfTestTmp[cName])

Train


Unnamed: 0,FIELD_10,id
0,GH,9443
1,T1,10810


Test


Unnamed: 0,FIELD_10,id
0,GH,6254
1,,1
2,T1,7178


Train


Unnamed: 0,FIELD_17,id
0,G2,2
1,G3,75
2,G4,6
3,G7,20
4,G8,3521
5,G9,12
6,GX,39


Test


Unnamed: 0,FIELD_17,id
0,G3,42
1,G4,6
2,G7,12
3,G8,2346
4,G9,10
5,GX,25
6,,10992


Train


Unnamed: 0,FIELD_24,id
0,K1,169
1,K2,218
2,K3,112


Test


Unnamed: 0,FIELD_24,id
0,K1,114
1,K2,154
2,K3,60
3,,13105


Train


Unnamed: 0,FIELD_43,id
0,A,114
1,B,373
2,C,630
3,D,65


Test


Unnamed: 0,FIELD_43,id
0,0,1
1,5,5
2,A,68
3,B,248
4,C,435
5,D,30
6,,12646


# TODO:
- Embed field 7, 9, 13, 39
- Fix problem with field 40? Meaning
- Try to find the categories from maCv => too coarse to use now

In [31]:
# Data to get more information: 3, 4, 5, 11, 13, 39, 40
lsObjField = [3, 4, 5, 7, 9, 11, 13, 39, 40]
for d in lsObjField:
    col = "FIELD_%d"%d
    print(col)
    lsUniqueVal = pdfTrain[col].unique()
    print(len(lsUniqueVal))
    print(lsUniqueVal[:50])
    print("-"*20)
printRuntime()

FIELD_3
514
[ 2.547e+03 -1.000e+00  3.273e+03  3.991e+03  1.450e+03  1.812e+03
  2.906e+03        nan  2.544e+03  2.168e+03  3.262e+03  7.060e+02
  1.436e+03  7.210e+02  3.550e+02  3.260e+03  3.637e+03  3.390e+02
  3.500e+02  3.650e+02  3.267e+03  3.540e+02  1.071e+03  3.630e+02
  2.174e+03  1.826e+03  4.009e+03  1.084e+03  2.186e+03  1.444e+03
  3.992e+03  3.420e+02  3.989e+03  3.670e+02  1.824e+03  2.923e+03
  2.897e+03  7.250e+02  2.548e+03  2.924e+03  4.012e+03  3.651e+03
  2.541e+03  1.438e+03  4.007e+03  2.915e+03  3.631e+03  3.284e+03
  2.189e+03  1.440e+03]
--------------------
FIELD_4
11
[ 0.  1. nan  2.  3.  4.  5.  6.  8.  7. 12.]
--------------------
FIELD_5
16
[ 0.  1.  4.  2. nan  6.  3.  5.  7.  8. 11.  9. 12. 14. 10. 13.]
--------------------
FIELD_7
7064
['[]' "['GD', 'GD', 'TE']" "['DN', 'HN', 'DN']" "['CH', 'TQ']"
 "['SV', 'GD']" "['HS', 'GB', 'DN', 'DN']" nan "['DN', 'DN', 'GD']"
 "['HT', 'DN', 'TN', 'GD', 'GD', 'TN', 'GD']" "['DN']" "['DN', 'DK']"
 "['XD', 'XD', 'X

In [123]:
lsField = [c for c in df.columns if c not in ["id", "label"]]
# ["FIELD_%d"%d for d in range(1, 50)]
for f in lsField:
    print(f)
    lsVal = df[f].unique()
    l = len(lsVal)
    print(l, "dtypes:", df[f].dtypes)
    if l < 10:
        print(lsVal)
    print("-"*20)
printRuntime()

province
66 dtypes: object
--------------------
district
719 dtypes: object
--------------------
age_source1
51 dtypes: float64
--------------------
age_source2
61 dtypes: float64
--------------------
maCv
3064 dtypes: object
--------------------
FIELD_1
2 dtypes: int64
[1 0]
--------------------
FIELD_2
3 dtypes: float64
[ 1.  0. nan]
--------------------
FIELD_3
514 dtypes: float64
--------------------
FIELD_4
11 dtypes: float64
--------------------
FIELD_5
16 dtypes: float64
--------------------
FIELD_6
6 dtypes: float64
[ 1.  2. nan  3.  4.  5.]
--------------------
FIELD_7
7064 dtypes: object
--------------------
FIELD_8
3 dtypes: object
[nan 'MALE' 'FEMALE']
--------------------
FIELD_9
35 dtypes: object
--------------------
FIELD_11
42 dtypes: object
--------------------
FIELD_13
227 dtypes: object
--------------------
FIELD_14
2 dtypes: int64
[1 0]
--------------------
FIELD_15
2 dtypes: int64
[1 0]
--------------------
FIELD_16
6 dtypes: float64
[nan  2.  1.  3.  4.  5.]
-----

### Check FIELD_7 and FIELD_9

In [20]:
pdfTrain[lsMetaCol + ["FIELD_7", "FIELD_9"]].head(20)

Unnamed: 0,id,label,province,district,age_source1,age_source2,maCv,FIELD_7,FIELD_9
0,0,0,,,,,,[],na
1,1,0,Tỉnh Đồng Nai,Thành phố Biên Hòa,44.0,44.0,,"['GD', 'GD', 'TE']",GD
2,2,0,Tỉnh Đồng Nai,Huyện Long Thành,30.0,30.0,Công nhân,"['DN', 'HN', 'DN']",DN
3,3,0,Tỉnh Tuyên Quang,Thành phố Tuyên Quang,43.0,,,"['CH', 'TQ']",na
4,4,0,Thành phố Hồ Chí Minh,Quận 1,21.0,21.0,,"['SV', 'GD']",GD
5,5,0,,,,,,[],na
6,6,0,,,,28.0,2983,[],DN
7,7,0,Tỉnh Bắc Giang,Huyện Yên Dũng,40.0,32.0,Công nhân,"['HS', 'GB', 'DN', 'DN']",DN
8,8,0,,,,,,,na
9,9,0,,,,,,[],na


In [21]:
pdfTrain[lsMetaCol + ["FIELD_7", "FIELD_9"]].loc[pdfTrain["label"]==1].head(20)

Unnamed: 0,id,label,province,district,age_source1,age_source2,maCv,FIELD_7,FIELD_9
106,106,1,Thành phố Hồ Chí Minh,Quận 6,32.0,,,"['GD', 'TN']",na
113,113,1,,,,,,[],na
157,157,1,Tỉnh Kiên Giang,Huyện Giồng Riềng,40.0,40.0,CBKCT,"['GD', 'GD', 'XK', 'GD']",GD
163,163,1,,,,28.0,,[],DN
296,296,1,Thành phố Hà Nội,Huyện Phú Xuyên,35.0,,,"['HS', 'TE', 'HT', 'HT', 'HC']",na
377,377,1,Tỉnh Vĩnh Long,Thành phố Vĩnh Long,29.0,29.0,Giáo viên Trường Mầm non 5 phường 5,"['TE', 'HC', 'BT']",HC
384,384,1,,,,25.0,,[],DN
580,580,1,,,,25.0,,[],DN
746,746,1,Tỉnh Đồng Nai,Huyện Long Thành,48.0,48.0,Nhân viên Phụ bếp,"['DN', 'DN', 'DN']",TN
816,816,1,,,,,,[],na


In [71]:
ls7Val = pdfTrain["FIELD_7"].unique()
print(len(ls7Val))

7064


In [74]:
# Break down FIELD_7 value
tmp = []
for c in ls7Val:
    if c == "[]" or c is np.nan:
        tmp.append("na")
        continue
    s = c.replace("'", "").replace("[", "").replace("]", "").replace(" ", "").split(",")
    # print(c, s)
    tmp.extend(s)
ls7UniqueVal = list(set(tmp)) # => Vocab of FIELD_7
print(ls7UniqueVal)
print(len(ls7UniqueVal))

['HT', 'TN', 'CC', 'TS', 'na', 'MS', 'CN', 'KC', 'GD', 'HX', 'HD', 'CB', 'CK', 'HS', 'SV', 'NO', 'TK', 'PV', 'TB', 'XK', 'XV', 'LS', 'TA', 'DN', 'TQ', 'DT', 'BT', 'QN', 'TE', 'DK', 'TC', 'HC', 'CH', 'XN', 'XD', 'HN', 'NN', 'GB']
38


In [75]:
# FIELD_9 is not fully covered by FIELD_7
ls9Val = pdfTrain["FIELD_9"].unique()

print(ls9Val)
print(len(ls9Val))

['na' 'GD' 'DN' 'XD' 'HC' 'TN' 'CH' 'CN' 'HT' 'DT' 'XK' 'TK' 'GB' 'DK'
 'SV' 'HN' 'TS' 'TA' 'HD' 'NN' 'BT' 'HS' 'HX' 'NO' 'KC' 'CB' 'TC' 'XV'
 'XN' 'CC' 'MS']
31


In [76]:
lsDiff1 = [c for c in ls7UniqueVal if c not in ls9Val] 
print(lsDiff1) # in 7 not in 9
lsDiff2 = [c for c in ls9Val if c not in ls7UniqueVal]
print(lsDiff2) # in 9 not in 7

['CK', 'PV', 'TB', 'LS', 'TQ', 'QN', 'TE']
[]


In [77]:
def mostFrequent(List): 
    occurenceCount = Counter(List) 
    lsCount = occurenceCount.most_common()
    maxAppear = lsCount[0][1]
    # print(lsCount)
    # print(maxAppear)
    lsVal = [v[0] for v in lsCount if v[1] == maxAppear]
    # Return list of most common items (if equally appeared)
    return lsVal
    
# Test
# mostFrequent([2, 2, 3, 3, 4, 5, 6]) # Expect [2, 3]
printRuntime()

2020-02-06 23:52:45
-------------------


In [78]:
def mostAppear(iStr):
    if iStr == "[]" or iStr is np.nan:
        return "na"
    lsChar = iStr.replace("'", "").replace("[", "").replace("]", "").replace(" ", "").split(",")
    return mostFrequent(lsChar)

print(mostAppear("['GD', 'GD', 'TE']")) # Expect ["GD"]
printRuntime()

['GD']
2020-02-06 23:52:47
-------------------


In [79]:
# Add col as most appear FIELD_7
pdfTrain["FIELD_7_MOST"] = pdfTrain["FIELD_7"].apply(lambda x: mostAppear(x))

In [80]:
pdfTrain[lsMetaCol + ["FIELD_7", "FIELD_7_MOST", "FIELD_9"]].head(20)

Unnamed: 0,id,label,province,district,age_source1,age_source2,maCv,FIELD_7,FIELD_7_MOST,FIELD_9
0,0,0,,,,,,[],na,na
1,1,0,tỉnh đồng nai,Thành phố Biên Hòa,44.0,44.0,none,"['GD', 'GD', 'TE']",[GD],GD
2,2,0,tỉnh đồng nai,Huyện Long Thành,30.0,30.0,công nhân,"['DN', 'HN', 'DN']",[DN],DN
3,3,0,tỉnh tuyên quang,Thành phố Tuyên Quang,43.0,,,"['CH', 'TQ']","[CH, TQ]",na
4,4,0,thành phố hồ chí minh,Quận 1,21.0,21.0,none,"['SV', 'GD']","[SV, GD]",GD
5,5,0,,,,,,[],na,na
6,6,0,,,,28.0,2983,[],na,DN
7,7,0,tỉnh bắc giang,Huyện Yên Dũng,40.0,32.0,công nhân,"['HS', 'GB', 'DN', 'DN']",[DN],DN
8,8,0,,,,,,,na,na
9,9,0,,,,,,[],na,na


In [92]:
pdfTrain["F7_MOST_IDX"] = pdfTrain["FIELD_7_MOST"].astype(str)

In [94]:
pdfTrain["F7_LEN"] = pdfTrain["FIELD_7_MOST"].apply(len)

In [None]:
pdfTmp = pdfTrain.groupby(["F7_LEN"], as_index=False).agg({"id":"count"})

In [96]:
pdfTmp.head(10)

Unnamed: 0,F7_LEN,id
0,1,12115
1,2,16033
2,3,1165
3,4,482
4,5,107
5,6,21
6,7,1


In [97]:
def index_7_9(row):
    """
    Return: (is_appear, index)
    """
    lsMostAppear7 = row["FIELD_7_MOST"]
    val9 = row["FIELD_9"]
    if val9 == "[]" or val9 == "na":
        return -2, -1
    elif lsMostAppear7 == "na":
        return -1, -1
    elif val9 in lsMostAppear7:
        return 1, lsMostAppear7.index(val9)
    else:
        # lsMostAppear7 is not null but 9 doesn't appear
        return 0, -1

printRuntime()

2020-02-07 00:03:12
-------------------


In [98]:
# Add col as 9 appear in 7_MOST
pdfTrain["FIELD_9_APPEAR"], pdfTrain["FIELD_9_INDEX"] = zip(*pdfTrain.apply(lambda row: index_7_9(row), axis=1))

In [84]:
pdfTrain[lsMetaCol + ["FIELD_7", "FIELD_7_MOST", "FIELD_9", "FIELD_9_APPEAR", "FIELD_9_INDEX",]].head(20)

Unnamed: 0,id,label,province,district,age_source1,age_source2,maCv,FIELD_7,FIELD_7_MOST,FIELD_9,FIELD_9_APPEAR,FIELD_9_INDEX
0,0,0,,,,,,[],na,na,0,-1
1,1,0,tỉnh đồng nai,Thành phố Biên Hòa,44.0,44.0,none,"['GD', 'GD', 'TE']",[GD],GD,1,0
2,2,0,tỉnh đồng nai,Huyện Long Thành,30.0,30.0,công nhân,"['DN', 'HN', 'DN']",[DN],DN,1,0
3,3,0,tỉnh tuyên quang,Thành phố Tuyên Quang,43.0,,,"['CH', 'TQ']","[CH, TQ]",na,0,-1
4,4,0,thành phố hồ chí minh,Quận 1,21.0,21.0,none,"['SV', 'GD']","[SV, GD]",GD,1,1
5,5,0,,,,,,[],na,na,0,-1
6,6,0,,,,28.0,2983,[],na,DN,0,-1
7,7,0,tỉnh bắc giang,Huyện Yên Dũng,40.0,32.0,công nhân,"['HS', 'GB', 'DN', 'DN']",[DN],DN,1,0
8,8,0,,,,,,,na,na,0,-1
9,9,0,,,,,,[],na,na,0,-1


In [99]:
pdfTmp = pdfTrain.groupby(["FIELD_9_APPEAR"], as_index=False).agg({"id":"count",})
pdfTmp.head(5)

Unnamed: 0,FIELD_9_APPEAR,id
0,-2,9684
1,-1,4936
2,0,5061
3,1,10243


In [100]:
pdfTmpIdx = pdfTrain.groupby(["FIELD_9_INDEX"], as_index=False).agg({"id":"count",})
pdfTmpIdx.head(10)

Unnamed: 0,FIELD_9_INDEX,id
0,-1,19681
1,0,8172
2,1,1509
3,2,425
4,3,111
5,4,23
6,5,3


In [53]:
pdfTrain[lsMetaCol + ["FIELD_7", "FIELD_7_MOST", "FIELD_9", "FIELD_9_APPEAR",]
        ].loc[pdfTrain["FIELD_9_APPEAR"]==3].head(20)

Unnamed: 0,id,label,province,district,age_source1,age_source2,maCv,FIELD_7,FIELD_7_MOST,FIELD_9,FIELD_9_APPEAR
166,166,0,Tỉnh Nghệ An,Thị xã Thái Hoà,28.0,28.0,kế toán,"['TE', 'GD', 'DN', 'TK']","[TE, GD, DN, TK]",TK,3
302,302,0,Tỉnh Long An,Huyện Cần Đước,41.0,41.0,công nhân,"['HS', 'BT', 'GD', 'DN']","[HS, BT, GD, DN]",DN,3
390,390,0,Tỉnh Quảng Nam,Huyện Duy Xuyên,33.0,33.0,nhân viên,"['TE', 'DK', 'HS', 'DN']","[TE, DK, HS, DN]",DN,3
566,566,0,Tỉnh Vĩnh Phúc,Huyện Bình Xuyên,32.0,32.0,nhân viên phòng thí nghiệm,"['TN', 'HT', 'TE', 'DN']","[TN, HT, TE, DN]",DN,3
674,674,0,Tỉnh Khánh Hòa,Thị xã Ninh Hòa,26.0,26.0,công nhân ép keo,"['GD', 'SV', 'TE', 'CH', 'HD']","[GD, SV, TE, CH, HD]",CH,3
1006,1006,0,Tỉnh Yên Bái,Huyện Yên Bình,34.0,34.0,none,"['TE', 'HS', 'HC', 'DN']","[TE, HS, HC, DN]",DN,3
1194,1194,0,Tỉnh Bình Phước,Huyện Lộc Ninh,31.0,31.0,none,"['DK', 'CK', 'TE', 'DN']","[DK, CK, TE, DN]",DN,3
1561,1561,0,Tỉnh Bến Tre,Thành phố Bến Tre,27.0,27.0,none,"['GD', 'HS', 'TE', 'HC', 'KC']","[GD, HS, TE, HC, KC]",HC,3
1653,1653,0,Thành phố Hồ Chí Minh,Quận Thủ Đức,22.0,22.0,none,"['HS', 'SV', 'TE', 'DN']","[HS, SV, TE, DN]",DN,3
1693,1693,0,Tỉnh Bình Định,Thành phố Qui Nhơn,53.0,53.0,nhân viên hành chính kiêm lái xe,"['SV', 'DN', 'HS', 'CH']","[SV, DN, HS, CH]",CH,3


### Check Test

In [101]:
ls7TestVal = pdfTest["FIELD_7"].unique()
print(len(ls7TestVal))

4905


In [102]:
# Break down FIELD_7 value
tmp = []
for c in ls7TestVal:
    if c == "[]" or c is np.nan:
        tmp.append("na")
        continue
    s = c.replace("'", "").replace("[", "").replace("]", "").replace(" ", "").split(",")
    # print(c, s)
    tmp.extend(s)
ls7TestUniqueVal = list(set(tmp)) # => Vocab of FIELD_7
print(ls7TestUniqueVal)
print(len(ls7TestUniqueVal))

['HT', 'TN', 'TS', 'CC', 'na', 'MS', 'CN', 'HK', 'KC', 'GD', 'HX', 'HD', 'HG', 'CB', 'CK', 'HS', 'SV', 'NO', 'AT', 'TK', 'PV', 'ND', 'TB', 'XK', 'XV', 'TA', 'DN', 'TQ', 'QT', 'DT', 'BT', 'TE', 'XB', 'DK', 'TC', 'HC', 'CH', 'XN', 'XD', 'TL', 'HN', 'NN', 'GB']
43


In [103]:
# FIELD_9 is not fully covered by FIELD_7
ls9TestVal = pdfTest["FIELD_9"].unique()

print(ls9TestVal)
print(len(ls9TestVal))

['na' 'DN' 'GD' 'DK' 'DT' 'BT' 'TN' 'CN' 'CH' 'HC' 'GB' 'HN' 'HT' 'SV'
 'TA' 'XK' 'XD' 'TK' 'CB' 'NN' 'TS' 'KC' 'XV' 'HD' 'HX' 'TC' 'HS' '75'
 '74' 'TL' 'CC' 'CK']
32


In [123]:
# Add col as most appear FIELD_7
pdfTest["FIELD_7_MOST"] = pdfTest["FIELD_7"].apply(lambda x: mostAppear(x))

In [124]:
pdfTest["F7_LEN"] = pdfTest["FIELD_7_MOST"].apply(len)

In [125]:
# Add col as 9 appear in 7_MOST
pdfTest["FIELD_9_APPEAR"], pdfTest["FIELD_9_INDEX"] = zip(*pdfTest.apply(lambda row: index_7_9(row), axis=1))

In [126]:
pdfTest.loc[pdfTest["FIELD_9"].isin(["CC", "CK", "TC", "TL", "HD", "XV"])][
    ["FIELD_7", "FIELD_7_MOST", "FIELD_9", "FIELD_9_APPEAR", "FIELD_9_INDEX"]].head(30)

Unnamed: 0,FIELD_7,FIELD_7_MOST,FIELD_9,FIELD_9_APPEAR,FIELD_9_INDEX
1791,[],na,XV,-1,-1
2033,[],na,HD,-1,-1
2524,[],na,TC,-1,-1
4155,[],na,XV,-1,-1
5511,"['HS', 'HD']","[HS, HD]",HD,1,1
5978,"['XD', 'XD', 'XD', 'XD', 'XD']",[XD],XV,0,-1
7213,"['HD', 'GB', 'GB', 'BT']",[GB],HD,0,-1
10903,[],na,TL,-1,-1
11410,[],na,HD,-1,-1
11817,"['GD', 'DN', 'TE', 'GD', 'GD']",[GD],XV,0,-1


In [117]:
dfTmp = pdfTest.groupby("FIELD_9").agg({"id": "count"})
display(dfTmp)

Unnamed: 0_level_0,id
FIELD_9,Unnamed: 1_level_1
74,3
75,3
BT,14
CB,14
CC,1
CH,383
CK,1
CN,94
DK,177
DN,7344


In [116]:
dfTmp = pdfTest.loc[pdfTest["FIELD_9"].isin(["74", "75"])].groupby("FIELD_9").agg({"id": "count"})
display(dfTmp)

Unnamed: 0_level_0,id
FIELD_9,Unnamed: 1_level_1
74,3
75,3


In [104]:
lsDiff1 = [c for c in ls7TestUniqueVal if c not in ls9TestVal] 
print(lsDiff1) # in 7 not in 9
lsDiff2 = [c for c in ls9TestVal if c not in ls7TestUniqueVal]
print(lsDiff2) # in 9 not in 7

['MS', 'HK', 'HG', 'NO', 'AT', 'PV', 'ND', 'TB', 'TQ', 'QT', 'TE', 'XB', 'XN']
['75', '74']


In [105]:
print("in 7train but not in 7test")
ls7TrainOnly = [c for c in ls7UniqueVal if c not in ls7TestUniqueVal] 
print(ls7TrainOnly)

in 7train but not in 7test
['LS', 'QN']


In [106]:
print("in 7test but not in 7train")
ls7TestOnly = [c for c in ls7TestUniqueVal if c not in ls7UniqueVal] 
print(ls7TestOnly)

in 7test but not in 7train
['HK', 'HG', 'AT', 'ND', 'QT', 'XB', 'TL']


In [105]:
print("in 7train but not in 7test")
ls7TrainOnly = [c for c in ls7UniqueVal if c not in ls7TestUniqueVal] 
print(ls7TrainOnly)

in 7train but not in 7test
['LS', 'QN']


In [106]:
print("in 7test but not in 7train")
ls7TestOnly = [c for c in ls7TestUniqueVal if c not in ls7UniqueVal] 
print(ls7TestOnly)

in 7test but not in 7train
['HK', 'HG', 'AT', 'ND', 'QT', 'XB', 'TL']


# TODO: With FIELD_7 and FIELD_9
    Combine vocabulary of (7, 9) * (train, test)
    One-hot and sum the vector for 7, add normalized for 7
    One-hot for 9
    Also keep FIELD_7_MOST -> one-hot, FIELD_7_LEN, FIELD_9_APPEAR, FIELD_9_INDEX as features

In [113]:
ls7UniqueVal

['HT',
 'TN',
 'CC',
 'TS',
 'na',
 'MS',
 'CN',
 'KC',
 'GD',
 'HX',
 'HD',
 'CB',
 'CK',
 'HS',
 'SV',
 'NO',
 'TK',
 'PV',
 'TB',
 'XK',
 'XV',
 'LS',
 'TA',
 'DN',
 'TQ',
 'DT',
 'BT',
 'QN',
 'TE',
 'DK',
 'TC',
 'HC',
 'CH',
 'XN',
 'XD',
 'HN',
 'NN',
 'GB',
 'HT',
 'TN',
 'TS',
 'CC',
 'na',
 'MS',
 'CN',
 'HK',
 'KC',
 'GD',
 'HX',
 'HD',
 'HG',
 'CB',
 'CK',
 'HS',
 'SV',
 'NO',
 'AT',
 'TK',
 'PV',
 'ND',
 'TB',
 'XK',
 'XV',
 'TA',
 'DN',
 'TQ',
 'QT',
 'DT',
 'BT',
 'TE',
 'XB',
 'DK',
 'TC',
 'HC',
 'CH',
 'XN',
 'XD',
 'TL',
 'HN',
 'NN',
 'GB']

In [111]:
ls9TestVal

array(['na', 'DN', 'GD', 'DK', 'DT', 'BT', 'TN', 'CN', 'CH', 'HC', 'GB',
       'HN', 'HT', 'SV', 'TA', 'XK', 'XD', 'TK', 'CB', 'NN', 'TS', 'KC',
       'XV', 'HD', 'HX', 'TC', 'HS', '75', '74', 'TL', 'CC', 'CK'],
      dtype=object)

In [110]:
ls9Val

array(['na', 'GD', 'DN', 'XD', 'HC', 'TN', 'CH', 'CN', 'HT', 'DT', 'XK',
       'TK', 'GB', 'DK', 'SV', 'HN', 'TS', 'TA', 'HD', 'NN', 'BT', 'HS',
       'HX', 'NO', 'KC', 'CB', 'TC', 'XV', 'XN', 'CC', 'MS'], dtype=object)

In [114]:
set(ls7UniqueVal + ls7TestUniqueVal + list(ls9Val) + list(ls9TestVal))

{'74',
 '75',
 'AT',
 'BT',
 'CB',
 'CC',
 'CH',
 'CK',
 'CN',
 'DK',
 'DN',
 'DT',
 'GB',
 'GD',
 'HC',
 'HD',
 'HG',
 'HK',
 'HN',
 'HS',
 'HT',
 'HX',
 'KC',
 'LS',
 'MS',
 'ND',
 'NN',
 'NO',
 'PV',
 'QN',
 'QT',
 'SV',
 'TA',
 'TB',
 'TC',
 'TE',
 'TK',
 'TL',
 'TN',
 'TQ',
 'TS',
 'XB',
 'XD',
 'XK',
 'XN',
 'XV',
 'na'}

In [115]:
lsVocab = list(set(ls7UniqueVal + ls7TestUniqueVal + list(ls9Val) + list(ls9TestVal)))
print(len(lsVocab))

47


# TODO:
    - Try target count encoding

# Check FIELD_13

In [146]:
dfTmp = pdfTrain.groupby("FIELD_13", as_index=False).agg({"id": ["count"], "label": ["sum", "mean", "std"]})
dfTmp.columns = ["_".join(x) for x in dfTmp.columns.ravel()]
dfTmp.sort_values(["id_count"], ascending=False).head(10)

Unnamed: 0,FIELD_13_,id_count,label_sum,label_mean,label_std
224,YN,4961,77,0.015521,0.123625
24,BI,4836,73,0.015095,0.121944
196,TA,1193,17,0.01425,0.118569
28,BO,1067,20,0.018744,0.135684
221,TZ,610,11,0.018033,0.133179
164,QW,336,3,0.008929,0.094209
201,TF,315,3,0.009524,0.097279
197,TB,306,3,0.009804,0.09869
200,TE,290,5,0.017241,0.130395
198,TC,282,3,0.010638,0.102774


In [136]:
dfTmp.sort_values(["id"]).head(10)

Unnamed: 0_level_0,id
FIELD_13,Unnamed: 1_level_1
SS,1
EH,1
BU,1
QU,1
CB,1
CC,1
NF,1
CE,1
CH,1
CK,1


In [137]:
dfTmp = pdfTest.groupby("FIELD_13").agg({"id": "count"})
dfTmp.sort_values(["id"], ascending=False).head(10)

Unnamed: 0_level_0,id
FIELD_13,Unnamed: 1_level_1
YN,3293
BI,3100
TA,799
BO,769
TZ,441
TF,237
QW,197
TG,195
TC,191
TB,189


In [138]:
dfTmp.sort_values(["id"]).head(10)

Unnamed: 0_level_0,id
FIELD_13,Unnamed: 1_level_1
ZA,1
EU,1
EP,1
EJ,1
EF,1
ED,1
NZ,1
F4,1
HO,1
DL,1


# CHECK FIELD_39

In [None]:
# Maybe: Phone source

In [139]:
dfTmp = pdfTrain.groupby("FIELD_39").agg({"id": "count"})
dfTmp.sort_values(["id"], ascending=False).head(10)

Unnamed: 0_level_0,id
FIELD_39,Unnamed: 1_level_1
VN,10529
TW,157
KR,97
JP,95
CN,85
TQ,46
CZ,37
1,28
HQ,27
UK,23


In [140]:
dfTmp.sort_values(["id"]).head(10)

Unnamed: 0_level_0,id
FIELD_39,Unnamed: 1_level_1
VU,1
SE,1
SC,1
PH,1
TR,1
IT,1
IL,1
TK,1
ES,1
DM,1


In [141]:
dfTmp = pdfTest.groupby("FIELD_39").agg({"id": "count"})
dfTmp.sort_values(["id"], ascending=False).head(10)

Unnamed: 0_level_0,id
FIELD_39,Unnamed: 1_level_1
VN,6984
,5942
TW,92
KR,81
JP,63
CN,48
CZ,38
TQ,30
DL,19
HQ,17


In [142]:
dfTmp.sort_values(["id"]).head(10)

Unnamed: 0_level_0,id
FIELD_39,Unnamed: 1_level_1
IL,1
NU,1
VU,1
ES,1
DM,1
DK,1
WS,1
AO,1
AN,1
AT,1


# Convert data types

In [None]:
for c in lsFieldFt:
    df[c] = df[c].astype(np.float64)

### Add neighbor features
Neighbor features: 
    + avg+std of groupby (province, age_group, maCv)
    + avg+std of groupby (lv3_loc, age_group, maCv)
    + avg+std of groupby (province, maCv)
    

In [157]:
lsFieldFt = [c for c in df.columns 
             if "FIELD" in c 
             and c not in ["FIELD_%d"%d for d in [7, 9, 13, 39, 40]]]
pprint(lsFieldFt)

['FIELD_1',
 'FIELD_2',
 'FIELD_3',
 'FIELD_4',
 'FIELD_5',
 'FIELD_6',
 'FIELD_11',
 'FIELD_14',
 'FIELD_15',
 'FIELD_16',
 'FIELD_18',
 'FIELD_19',
 'FIELD_20',
 'FIELD_21',
 'FIELD_22',
 'FIELD_23',
 'FIELD_25',
 'FIELD_26',
 'FIELD_27',
 'FIELD_28',
 'FIELD_29',
 'FIELD_30',
 'FIELD_31',
 'FIELD_32',
 'FIELD_33',
 'FIELD_34',
 'FIELD_35',
 'FIELD_36',
 'FIELD_37',
 'FIELD_38',
 'FIELD_41',
 'FIELD_42',
 'FIELD_44',
 'FIELD_45',
 'FIELD_46',
 'FIELD_47',
 'FIELD_48',
 'FIELD_49',
 'FIELD_50',
 'FIELD_51',
 'FIELD_52',
 'FIELD_53',
 'FIELD_54',
 'FIELD_55',
 'FIELD_56',
 'FIELD_57',
 'FIELD_10_GH',
 'FIELD_10_None',
 'FIELD_10_T1',
 'FIELD_12_0',
 'FIELD_12_1',
 'FIELD_12_HT',
 'FIELD_12_None',
 'FIELD_12_TN',
 'FIELD_17_G2',
 'FIELD_17_G3',
 'FIELD_17_G4',
 'FIELD_17_G7',
 'FIELD_17_G8',
 'FIELD_17_G9',
 'FIELD_17_GX',
 'FIELD_17_None',
 'FIELD_24_K1',
 'FIELD_24_K2',
 'FIELD_24_K3',
 'FIELD_24_None',
 'FIELD_43_0',
 'FIELD_43_5',
 'FIELD_43_A',
 'FIELD_43_B',
 'FIELD_43_C',
 'FIELD

In [159]:
aggDict = {}
for f in lsFieldFt:
    aggDict[f] = ["mean", "std"]

In [147]:
# 1
lsMetaCol1 = ["province", "group_age", "maCv"]
lsMetaCol2 = ["lv3_loc", "group_age", "maCv"]
lsMetaCol3 = ["province", "maCv"]

In [161]:
dfNb01 = df.groupby(lsMetaCol1, as_index=False).agg(aggDict)
printRuntime()

2020-02-03 23:12:12
-------------------


In [162]:
dfNb02 = df.groupby(lsMetaCol2, as_index=False).agg(aggDict)
printRuntime()
dfNb03 = df.groupby(lsMetaCol3, as_index=False).agg(aggDict)
printRuntime()

2020-02-03 23:12:15
-------------------
2020-02-03 23:12:15
-------------------


In [164]:
# Save outputs: df, dfNb01, dfNb02, dfNb03
df.to_pickle(cleanedTrain, compression="bz2")
printRuntime()

2020-02-03 23:15:05
-------------------


In [165]:
dfNb01.to_pickle(nb01, compression="bz2")
dfNb02.to_pickle(nb02, compression="bz2")
dfNb03.to_pickle(nb03, compression="bz2")
printRuntime()

2020-02-03 23:15:41
-------------------


In [None]:
# Nummeric data: 
# Replace province/lv3_loc by (mean, std, p10, p25, p50, p75, p90) value of 3, 22
# Replace maCv by (mean, std, p10, p25, p50, p75, p90) value of

### Distribution of categorical data:
- province
- province+district
- age_source1
- age_source2
- maCv

#### province + district

In [18]:
pdfTrain["lv3_loc"] = "[" + pdfTrain["province"] + "]_[" + pdfTrain["district"] + "]"
lsProvince = pdfTrain["province"].unique()
lsLoc = pdfTrain["lv3_loc"].unique()
print(len(lsProvince), len(lsLoc))
printRuntime()

66 748
2020-01-28 21:41:43
-------------------


In [19]:
lsProvince

array([nan, 'Tỉnh Đồng Nai', 'Tỉnh Tuyên Quang', 'Thành phố Hồ Chí Minh',
       'Tỉnh Bắc Giang', 'Tỉnh Thanh Hóa', 'Tỉnh Thừa Thiên Huế',
       'Tỉnh Cà Mau', 'Tỉnh Kiên Giang', 'Tỉnh Long An', 'Tỉnh Tây Ninh',
       'Thành phố Hà Nội', 'Tỉnh Bình Dương', 'Tỉnh Hòa Bình',
       'Tỉnh Bến Tre', 'Tỉnh Tiền Giang', 'Tỉnh Hà Tĩnh', 'Tỉnh Hoà Bình',
       'Tỉnh Hậu Giang', 'Tỉnh Quảng Nam', 'Tỉnh Khánh Hòa',
       'Tỉnh Phú Yên', 'Tỉnh Phú Thọ', 'Tỉnh Vĩnh Phúc', 'Tỉnh Hưng Yên',
       'Tỉnh Bình Phước', 'Thành phố Đà Nẵng', 'Tỉnh Đồng Tháp',
       'Tỉnh Bình Định', 'Tỉnh Bà Rịa - Vũng Tàu', 'Tỉnh Quảng Ninh',
       'Tỉnh Trà Vinh', 'Tỉnh Nghệ An', 'Tỉnh Bắc Kạn', 'Tỉnh Bình Thuận',
       'Tỉnh An Giang', 'Tỉnh Thái Bình', 'Tỉnh Bạc Liêu',
       'Tỉnh Ninh Bình', 'Thành phố Hải Phòng', 'Tỉnh Quảng Ngãi',
       'Tỉnh Lâm Đồng', 'Tỉnh Gia Lai', 'Tỉnh Nam Định', 'Tỉnh Lạng Sơn',
       'Tỉnh Sơn La', 'Tỉnh Vĩnh Long', 'Tỉnh Thái Nguyên',
       'Thành phố Cần Thơ', 'Tỉnh Quảng Bìn

In [39]:
# Distribution over lv3_loc
pdfTrain.groupby(["lv3_loc", "label"]).agg({"id":("count")})
# pdfTrain[["lv3_loc", "province", "id", "label"]].hist(bins=10)

Unnamed: 0_level_0,Unnamed: 1_level_0,id
lv3_loc,label,Unnamed: 2_level_1
[Thành phố Cần Thơ]_[Huyện Cờ Đỏ],0,22
[Thành phố Cần Thơ]_[Huyện Phong Điền],0,11
[Thành phố Cần Thơ]_[Huyện Phong Điền],1,1
[Thành phố Cần Thơ]_[Huyện Thới Lai],0,21
[Thành phố Cần Thơ]_[Huyện Vĩnh Thạnh],0,19
[Thành phố Cần Thơ]_[Quận Bình Thuỷ],0,14
[Thành phố Cần Thơ]_[Quận Bình Thủy],0,8
[Thành phố Cần Thơ]_[Quận Cái Răng],0,17
[Thành phố Cần Thơ]_[Quận Ninh Kiều],0,34
[Thành phố Cần Thơ]_[Quận Ninh Kiều],1,1


In [41]:
# Distribution over province
pdfTrain.groupby(["province", "label"], as_index=False).agg({"id":("count")})
# pdfTrain[["lv3_loc", "province", "id", "label"]].hist(bins=10)

Unnamed: 0_level_0,Unnamed: 1_level_0,id
province,label,Unnamed: 2_level_1
Thành phố Cần Thơ,0,190
Thành phố Cần Thơ,1,2
Thành phố Hà Nội,0,657
Thành phố Hà Nội,1,10
Thành phố Hải Phòng,0,252
Thành phố Hồ Chí Minh,0,2353
Thành phố Hồ Chí Minh,1,36
Thành phố Đà Nẵng,0,226
Thành phố Đà Nẵng,1,2
Tỉnh An Giang,0,597


In [133]:
dfProvince = df.groupby(["province", "label"], as_index=False).agg({"id":("count")}).sort_values(["province"])
# dfProvince.columns = ["_".join(x) for x in dfProvince.columns.ravel()]
print(dfProvince.shape)
display(dfProvince)

(112, 3)


Unnamed: 0,province,label,id
0,Thành phố Cần Thơ,0,189
1,Thành phố Cần Thơ,1,2
2,Thành phố Hà Nội,0,657
3,Thành phố Hà Nội,1,10
4,Thành phố Hải Phòng,0,252
5,Thành phố Hồ Chí Minh,0,2339
6,Thành phố Hồ Chí Minh,1,35
7,Thành phố Đà Nẵng,0,226
8,Thành phố Đà Nẵng,1,2
9,Tỉnh An Giang,0,597


In [138]:
dfProvince.pivot(index="province", columns="label", values="id")[34:]

label,0,1
province,Unnamed: 1_level_1,Unnamed: 2_level_1
Tỉnh Lạng Sơn,72.0,1.0
Tỉnh Nam Định,145.0,3.0
Tỉnh Nghệ An,411.0,3.0
Tỉnh Ninh Bình,125.0,3.0
Tỉnh Ninh Thuận,108.0,3.0
Tỉnh Phú Thọ,153.0,5.0
Tỉnh Phú Yên,156.0,4.0
Tỉnh Quảng Bình,101.0,1.0
Tỉnh Quảng Nam,372.0,3.0
Tỉnh Quảng Ngãi,183.0,1.0


#### Age

In [21]:
# Age
pdfTrain[lsCatCol].describe()

Unnamed: 0,age_source1,age_source2
count,17189.0,20322.0
mean,33.041015,32.872011
std,9.187672,9.096176
min,0.0,-1.0
25%,26.0,26.0
50%,31.0,31.0
75%,39.0,38.0
max,71.0,89.0


In [22]:
# Age source < 18: 
pdfTrainLt18 = pdfTrain[(pdfTrain["age_source1"] < 18) | (pdfTrain["age_source2"] < 18)]
print(pdfTrainLt18.shape)
pdfTrainLt18.describe()
# => Filter out <18 value for both age_source: only 76 value would not affect

(76, 65)


Unnamed: 0,id,label,age_source1,age_source2,FIELD_1,FIELD_2,FIELD_3,FIELD_4,FIELD_5,FIELD_6,...,FIELD_34,FIELD_46,FIELD_50,FIELD_51,FIELD_52,FIELD_53,FIELD_54,FIELD_55,FIELD_56,FIELD_57
count,76.0,76.0,68.0,69.0,76.0,76.0,76.0,76.0,76.0,76.0,...,76.0,76.0,69.0,69.0,69.0,69.0,69.0,69.0,69.0,69.0
mean,15467.657895,0.026316,9.073529,27.304348,0.763158,0.723684,1960.052632,1.644737,2.368421,0.184211,...,0.592105,0.671053,33.052261,31.674101,32.816464,33.035159,0.01971,0.097681,0.003043,0.003333
std,8855.343534,0.161136,12.272182,10.871312,0.42797,0.450146,1567.825321,1.251175,1.742352,0.481955,...,0.494709,0.472953,9.858006,21.009283,7.559901,7.570787,0.070083,0.096622,0.012403,0.013897
min,213.0,0.0,0.0,-1.0,0.0,0.0,-1.0,0.0,0.0,0.0,...,0.0,0.0,29.77,4.413,30.955,31.171,0.0,0.0,0.0,0.0
25%,8847.25,0.0,2.75,17.0,1.0,0.0,-1.0,1.0,1.0,0.0,...,0.0,0.0,29.77,16.799,30.955,31.171,0.0,0.04,0.0,0.0
50%,14407.5,0.0,4.0,30.0,1.0,1.0,1820.0,1.0,2.0,0.0,...,1.0,1.0,29.77,21.478,30.955,31.171,0.0,0.05,0.0,0.0
75%,22138.0,0.0,9.0,34.0,1.0,1.0,3365.5,2.0,3.0,0.0,...,1.0,1.0,29.77,54.937,30.955,31.171,0.0,0.16,0.0,0.0
max,29965.0,1.0,48.0,45.0,1.0,1.0,4375.0,5.0,7.0,2.0,...,1.0,1.0,65.068,67.366,63.922,64.214,0.38,0.55,0.06,0.08


In [24]:
pdfTrain[pdfTrain["age_source1"].isnull()].head()

Unnamed: 0,id,label,province,district,age_source1,age_source2,maCv,FIELD_1,FIELD_2,FIELD_3,...,FIELD_49,FIELD_50,FIELD_51,FIELD_52,FIELD_53,FIELD_54,FIELD_55,FIELD_56,FIELD_57,lv3_loc
0,0,0,,,,,,1,1.0,2547.0,...,True,,,,,,,,,
5,5,0,,,,,,1,1.0,1812.0,...,True,,,,,,,,,
6,6,0,,,,28.0,2983.0,1,0.0,-1.0,...,True,29.77,4.413,30.955,31.171,0.0,0.0,0.0,0.0,
8,8,0,,,,,,0,,,...,True,,,,,,,,,
9,9,0,,,,,,1,1.0,2544.0,...,True,,,,,,,,,


In [26]:
pdfTrain["avg_age"] = pdfTrain.apply(lambda row: avgAge(row), axis=1)
pdfTrain["diff_age"] = pdfTrain.apply(lambda row: diffAge(row), axis=1)
printRuntime()

In [27]:
display(pdfTrain[["age_source1", "age_source2", "avg_age", "diff_age"]].head())
printRuntime()

Unnamed: 0,age_source1,age_source2,avg_age,diff_age
0,,,,
1,44.0,44.0,44.0,44.0
2,30.0,30.0,30.0,30.0
3,43.0,,43.0,43.0
4,21.0,21.0,21.0,21.0


In [29]:
display(pdfTrain[["age_source1", "age_source2", "avg_age", "diff_age"]][pdfTrain["diff_age"]<0].head())
printRuntime()

Unnamed: 0,age_source1,age_source2,avg_age,diff_age
6,,28.0,28.0,-28.0
12,,36.0,36.0,-36.0
18,,30.0,30.0,-30.0
31,,43.0,43.0,-43.0
38,,23.0,23.0,-23.0


2020-01-28 22:16:53
-------------------


#### maCv

In [31]:
lsJob = pdfTrain["maCv"].unique()
print(len(lsJob))
pprint(lsJob)
printRuntime()

3067
array([nan, 'None', 'Công nhân', ..., 'NV. MR', 'Thợ ép đế', 'CN  May CN'],
      dtype=object)
2020-01-28 22:19:22
-------------------


In [34]:
lsJob = pdfTrain["maCv"].unique()

In [53]:
pdfTrain[pdfTrain["maCv"]!="None"].shape

(7118, 67)

In [52]:
pdfTrain[pdfTrain["maCv"]!="None"].head(10)

Unnamed: 0,id,label,province,district,age_source1,age_source2,maCv,FIELD_1,FIELD_2,FIELD_3,...,FIELD_51,FIELD_52,FIELD_53,FIELD_54,FIELD_55,FIELD_56,FIELD_57,lv3_loc,avg_age,diff_age
2,2,0,Tỉnh Đồng Nai,Huyện Long Thành,30.0,30.0,Công nhân,0,1.0,3273.0,...,56.512,30.955,31.171,0.0,0.16,0.0,0.0,[Tỉnh Đồng Nai]_[Huyện Long Thành],30.0,30.0
6,6,0,,,,28.0,2983,1,0.0,-1.0,...,4.413,30.955,31.171,0.0,0.0,0.0,0.0,,28.0,-28.0
7,7,0,Tỉnh Bắc Giang,Huyện Yên Dũng,40.0,32.0,Công nhân,1,1.0,2906.0,...,40.683,30.955,31.171,0.0,0.13,0.0,0.0,[Tỉnh Bắc Giang]_[Huyện Yên Dũng],36.0,36.0
12,12,0,,,,36.0,Cấp dưỡng,1,0.0,-1.0,...,65.072,30.955,31.171,0.0,0.25,0.0,0.0,,36.0,-36.0
14,14,0,Tỉnh Thừa Thiên Huế,Huyện Phong Điền,21.0,21.0,Nhân viên bảo trì,1,1.0,1436.0,...,63.351,30.955,31.171,0.0,0.21,0.0,0.0,[Tỉnh Thừa Thiên Huế]_[Huyện Phong Điền],21.0,21.0
15,15,0,Tỉnh Cà Mau,Huyện Đầm Dơi,20.0,20.0,Công nhân ủi,1,1.0,721.0,...,44.561,30.955,31.171,0.0,0.14,0.0,0.0,[Tỉnh Cà Mau]_[Huyện Đầm Dơi],20.0,20.0
21,21,0,Tỉnh Tây Ninh,Thành phố Tây Ninh,29.0,29.0,Nhân viên,0,0.0,-1.0,...,45.303,30.955,31.171,0.0,0.14,0.0,0.0,[Tỉnh Tây Ninh]_[Thành phố Tây Ninh],29.0,29.0
22,22,0,Thành phố Hà Nội,Quận Thanh Xuân,38.0,38.0,Nhân viên kinh doanh,1,1.0,339.0,...,45.93,30.955,31.171,0.0,0.14,0.0,0.0,[Thành phố Hà Nội]_[Quận Thanh Xuân],38.0,38.0
25,25,0,Tỉnh Bình Dương,Thị xã Tân Uyên,20.0,20.0,Công nhân,1,1.0,1450.0,...,46.564,30.955,31.171,0.0,0.14,0.0,0.0,[Tỉnh Bình Dương]_[Thị xã Tân Uyên],20.0,20.0
29,29,0,Tỉnh Thừa Thiên Huế,Thị xã Hương Thủy,26.0,26.0,Công nhân may công nghiệp,1,1.0,3267.0,...,54.38,30.955,31.171,0.0,0.16,0.0,0.0,[Tỉnh Thừa Thiên Huế]_[Thị xã Hương Thủy],26.0,26.0


In [36]:
lsJob[:10]

array(['None', 'Công nhân', '2983', 'Cấp dưỡng', 'Nhân viên bảo trì',
       'Công nhân ủi', 'Nhân viên', 'Nhân viên kinh doanh',
       'Công nhân may công nghiệp', 'Công nhân se sợi'], dtype=object)

In [49]:
pdfTrainJob = pdfTrain.groupby(["maCv"]).agg({"id":("count")})
# pdfTrainJob.columns = ["_".join(x) for x in pdfTrainJob.columns.ravel()]

In [None]:
Calculate bad-rate percentage by location (province + district; province)

# Test

In [6]:
pdfTest = pd.read_csv(testPath)

  interactivity=interactivity, compiler=compiler, result=result)


# 2. EDA

In [None]:
# Compare the distribution between train & test

# 3. Simple model

### 3.1 Build features

In [166]:
df.columns

Index(['id', 'label', 'province', 'district', 'age_source1', 'age_source2',
       'maCv', 'FIELD_1', 'FIELD_2', 'FIELD_3', 'FIELD_4', 'FIELD_5',
       'FIELD_6', 'FIELD_7', 'FIELD_9', 'FIELD_11', 'FIELD_13', 'FIELD_14',
       'FIELD_15', 'FIELD_16', 'FIELD_18', 'FIELD_19', 'FIELD_20', 'FIELD_21',
       'FIELD_22', 'FIELD_23', 'FIELD_25', 'FIELD_26', 'FIELD_27', 'FIELD_28',
       'FIELD_29', 'FIELD_30', 'FIELD_31', 'FIELD_32', 'FIELD_33', 'FIELD_34',
       'FIELD_35', 'FIELD_36', 'FIELD_37', 'FIELD_38', 'FIELD_39', 'FIELD_40',
       'FIELD_41', 'FIELD_42', 'FIELD_44', 'FIELD_45', 'FIELD_46', 'FIELD_47',
       'FIELD_48', 'FIELD_49', 'FIELD_50', 'FIELD_51', 'FIELD_52', 'FIELD_53',
       'FIELD_54', 'FIELD_55', 'FIELD_56', 'FIELD_57', 'FIELD_10_GH',
       'FIELD_10_None', 'FIELD_10_T1', 'FIELD_12_0', 'FIELD_12_1',
       'FIELD_12_HT', 'FIELD_12_None', 'FIELD_12_TN', 'FIELD_17_G2',
       'FIELD_17_G3', 'FIELD_17_G4', 'FIELD_17_G7', 'FIELD_17_G8',
       'FIELD_17_G9', 'FIELD_17

In [169]:
dfBk01 = dfNb01.copy()

In [None]:
dfNb01.columns = ["_".join(x) for x in dfNb01.columns.ravel()]

In [175]:
dfNb01.columns = ["_".join(x) for x in dfNb01.columns.ravel()]
dfNb01 = dfNb01.rename(columns={"province_": "province", 
                       "group_age_": "group_age", 
                       "maCv_": "maCv"})

In [174]:
df.head()

Unnamed: 0,id,label,province,district,age_source1,age_source2,maCv,FIELD_1,FIELD_2,FIELD_3,...,FIELD_43_C,FIELD_43_D,FIELD_43_None,avg_age,diff_age,lv3_loc,FIELD_8_FEMALE,FIELD_8_MALE,FIELD_8_None,group_age
0,0,0,,,,,,1.0,1.0,2547.0,...,0.0,0.0,1.0,,,,0.0,0.0,1.0,AGE_NONE
1,1,0,Tỉnh Đồng Nai,Thành phố Biên Hòa,44.0,44.0,,1.0,0.0,-1.0,...,0.0,0.0,1.0,44.0,44.0,[Tỉnh Đồng Nai]_[Thành phố Biên Hòa],0.0,1.0,0.0,AGE_III
2,2,0,Tỉnh Đồng Nai,Huyện Long Thành,30.0,30.0,Công nhân,0.0,1.0,3273.0,...,0.0,0.0,1.0,30.0,30.0,[Tỉnh Đồng Nai]_[Huyện Long Thành],1.0,0.0,0.0,AGE_II
3,3,0,Tỉnh Tuyên Quang,Thành phố Tuyên Quang,43.0,,,0.0,1.0,3991.0,...,0.0,0.0,1.0,43.0,43.0,[Tỉnh Tuyên Quang]_[Thành phố Tuyên Quang],0.0,0.0,1.0,AGE_III
4,4,0,Thành phố Hồ Chí Minh,Quận 1,21.0,21.0,,0.0,1.0,1450.0,...,0.0,0.0,1.0,21.0,21.0,[Thành phố Hồ Chí Minh]_[Quận 1],0.0,1.0,0.0,AGE_I


In [176]:
dfNb01.head()

Unnamed: 0,province,group_age,maCv,FIELD_1_mean,FIELD_1_std,FIELD_2_mean,FIELD_2_std,FIELD_3_mean,FIELD_3_std,FIELD_4_mean,...,FIELD_43_D_mean,FIELD_43_D_std,FIELD_43_None_mean,FIELD_43_None_std,FIELD_8_FEMALE_mean,FIELD_8_FEMALE_std,FIELD_8_MALE_mean,FIELD_8_MALE_std,FIELD_8_None_mean,FIELD_8_None_std
0,Thành phố Cần Thơ,AGE_I,CN-DÁN HỢP,1.0,,1.0,,1451.0,,0.0,...,0.0,,1.0,,1.0,,0.0,,0.0,
1,Thành phố Cần Thơ,AGE_I,Công nhân,1.0,0.0,1.0,0.0,1263.5,761.554003,0.0,...,0.0,0.0,1.0,0.0,0.5,0.707107,0.5,0.707107,0.0,0.0
2,Thành phố Cần Thơ,AGE_I,Công nhân dập đầu vis - Xưởng Inox,1.0,,1.0,,1437.0,,0.0,...,0.0,,1.0,,0.0,,1.0,,0.0,
3,Thành phố Cần Thơ,AGE_II,Bôi keo,1.0,,1.0,,2899.0,,2.0,...,0.0,,1.0,,1.0,,0.0,,0.0,
4,Thành phố Cần Thơ,AGE_II,Bảo Vệ,0.0,,1.0,,2895.0,,1.0,...,0.0,,1.0,,0.0,,1.0,,0.0,


#### raw ft + nb ft + standardize raw ft

In [181]:
lsMetaCol = ["id", "label", "province", "district", "lv3_loc", 
              "age_source1", "age_source2", "avg_age", "diff_age", "group_age",
              "maCv",]
# raw ft
lsFieldFt = [c for c in df.columns 
             if "FIELD" in c 
             and c not in ["FIELD_%d"%d for d in [7, 9, 13, 39, 40]]]
label = "label"
printRuntime()

2020-02-03 23:33:00
-------------------


In [178]:
# nb ft
lsNbMetaCol = lsMetaCol1
dfNb = dfNb01 # TODO: dfNb02, dfNb03

In [186]:
# pdfDistRaw = pd.merge(pdfArea, pdfArea, on="area_level_2", how="inner")
dfITrain = pd.merge(df, dfNb01, on=lsNbMetaCol, how="left")

In [187]:
dfITrain.shape

(29924, 244)

In [188]:
dfITrain.head()

Unnamed: 0,id,label,province,district,age_source1,age_source2,maCv,FIELD_1,FIELD_2,FIELD_3,...,FIELD_43_D_mean,FIELD_43_D_std,FIELD_43_None_mean,FIELD_43_None_std,FIELD_8_FEMALE_mean,FIELD_8_FEMALE_std,FIELD_8_MALE_mean,FIELD_8_MALE_std,FIELD_8_None_mean,FIELD_8_None_std
0,0,0,,,,,,1.0,1.0,2547.0,...,,,,,,,,,,
1,1,0,Tỉnh Đồng Nai,Thành phố Biên Hòa,44.0,44.0,,1.0,0.0,-1.0,...,,,,,,,,,,
2,2,0,Tỉnh Đồng Nai,Huyện Long Thành,30.0,30.0,Công nhân,0.0,1.0,3273.0,...,0.046154,0.211451,0.938462,0.242186,0.6,0.49371,0.4,0.49371,0.0,0.0
3,3,0,Tỉnh Tuyên Quang,Thành phố Tuyên Quang,43.0,,,0.0,1.0,3991.0,...,,,,,,,,,,
4,4,0,Thành phố Hồ Chí Minh,Quận 1,21.0,21.0,,0.0,1.0,1450.0,...,,,,,,,,,,


In [189]:
# standardize ft
for c in lsFieldFt:
    dfITrain["%s_stdized"%c] = (dfITrain[c] - dfITrain[c+"_mean"])/dfITrain[c+"_std"]
printRuntime()

2020-02-03 23:40:06
-------------------


In [190]:
dfITrain.shape

(29924, 320)

In [191]:
iTrainPath = basePath + "itrain.pickle"
dfITrain.to_pickle(iTrainPath, compression="bz2")
printRuntime()

2020-02-03 23:40:36
-------------------


In [None]:
# 5-fold XGBoost