In [4]:
import pandas as pd
import numpy as np
from hyperparameters import VAL_DATA, BATCH_SIZE
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import TensorDataset, DataLoader #, Dataset
from utils.utils import normalization, standardization
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
import warnings
warnings.filterwarnings('ignore')

In [5]:
########## read row data ##########
# full
data_path = r'data/raw_utf-8.csv' 
df = pd.read_csv(data_path, encoding='utf-8', low_memory=False) # for ignore DtypeWarning

print('csv table shape: {}'.format(df.shape))
print('====================')
df.info()
print('====================')
print(df.describe())
print('====================')
print(df.head())

csv table shape: (89383, 316)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 89383 entries, 0 to 89382
Columns: 316 entries, 会社ID to D-574-574(総合ラフールネス_職場ラフールネス)
dtypes: float64(278), int64(12), object(26)
memory usage: 215.5+ MB
               会社ID       パートナーID         業種大分類         業種中分類          社員ID  \
count  89383.000000  8.938300e+04  89383.000000  89383.000000   89383.00000   
mean     368.506383  4.207739e+07     30.582035     32.936431   56100.43035   
std      236.442892  2.007670e+08     25.537480     26.160689   26651.60940   
min        2.000000  1.000000e+00      1.000000      2.000000       2.00000   
25%      173.000000  1.000000e+00      5.000000      6.000000   37537.50000   
50%      333.000000  1.000000e+00     26.000000     31.000000   55703.00000   
75%      556.000000  1.000000e+00     52.000000     53.000000   75901.50000   
max      882.000000  1.000001e+09     93.000000     98.000000  110120.00000   

                 性別           誕生年           入社年        

In [6]:
# list column names
column_names = df.columns.values.tolist()
for i, column_name in enumerate(column_names):
    print(i,column_name)

0 会社ID
1 パートナーID
2 業種大分類
3 業種大分類名称
4 業種中分類
5 業種中分類名称
6 社員ID
7 性別
8 血液型
9 誕生年
10 誕生日
11 入社年
12 学歴
13 学歴名称
14 雇用形態
15 雇用形態名称
16 勤務形態
17 勤務形態名称
18 職種
19 職種名称
20 個人年収
21 個人年収名称
22 世帯年収
23 世帯年収名称
24 結婚
25 結婚名称
26 家族と同居
27 家族と同居名称
28 子供
29 子供名称
30 介護
31 介護名称
32 怪我／病気
33 怪我／病気名称
34 精神科受診歴
35 精神科受診歴名称
36 入社形態
37 入社形態名称
38 勤務場所
39 勤務場所名称
40 回答時_退職・休職区分
41 回答時_退職日
42 回答時_退職理由
43 回答時_退職理由名称
44 回答時_休職日
45 回答時_休職理由
46 回答時_休職理由名称
47 現在_退職・休職区分
48 現在_退職日
49 現在_退職理由
50 現在_退職理由名称
51 現在_休職日
52 現在_休職理由
53 現在_休職理由名称
54 受診ID
55 回答日
56 実施期間ID
57 部署ID
58 B-1-2(非常にたくさんの仕事をしなければならない)
59 B-2-7(時間内に仕事が処理しきれない)
60 B-3-9(一生懸命働かなければならない)
61 B-4-14(かなり注意を集中する必要がある)
62 B-5-19(高度の知識や技術が必要なむずかしい仕事だ)
63 B-6-22(勤務時間中はいつも仕事のことを考えていなければならない)
64 B-7-27(からだを大変よく使う仕事だ)
65 B-8-29(自分のペースで仕事ができる)
66 B-9-33(自分で仕事の順番・やり方を決めることができる)
67 B-10-37(職場の仕事の方針に自分の意見を反映できる)
68 B-11-44(自分の技能や知識を仕事で使うことが少ない)
69 B-12-48(私の部署内で意見のくい違いがある)
70 B-13-52(私の部署と他の部署とはうまが合わない)
71 B-14-53(私の職場の雰囲気は友好的である)
72 B-15-60(私の職場の作業環境（騒音、照明、温度、換気など）はよくない)
73 B-1

In [7]:
########## encoding & missing data ##########
features = []
for i in range(len(column_names)):
#     print('feature {}:{}'.format(i, column_names[i]))
#     features.append(df[column_names[i]])
    features.append(np.array(df[column_names[i]]))
#     print('shape: {}, dtype: {}, null: {}'.format(df[column_names[i]].shape, df[column_names[i]].dtype, sum(df[column_names[i]].isnull())))

# feature1 = df[column_names[0]]
# print('type: {}'.format(type(feature1)))
# print('shape: {}'.format(feature1.shape))
# print('dtype: {}'.format(feature1.dtype))
# print('set: {}'.format(set(feature1)))
# print('null: {}'.format(sum(feature1.isnull())))

In [8]:
# feature 0: '会社ID'　不要
print('=====feature 0=====')
print('before')
print('set: ', list(set(features[0]))[:10])
print('num: ', len(list(set(features[0]))))
features[0] = features[0].astype(np.float32)
print('====================')


# feature 1: パートナーID　不要
print('=====feature 1=====')
print('before')
print('set: ', list(set(features[1])))
print('num: ', len(list(set(features[1]))))

features[1][features[1]==1000000900.0] = 0
features[1][features[1]==1.0] = 1
features[1][features[1]==1000001100.0] = 2

features[1] = features[1].astype(np.float32)
print('after')
print('set: ', list(set(features[1])))
print('num: ', len(list(set(features[1]))))
print('====================')


# feature 2: 業種大分類 ##### the same as feature 3 #####
print('=====feature 2=====')
print('before')
print('set: ', list(set(features[2])))
print('num: ', len(list(set(features[2]))))
features[2] = features[2].astype(np.float32)
print('====================')


# feature 3: 業種大分類名称
print('=====feature 3=====')
print('before')
print('set: ', list(set(features[3])))
print('num: ', len(list(set(features[3]))))

features[3][features[3]=='教育・研修'] = 0
features[3][features[3]=='介護・福祉・医療'] = 1
features[3][features[3]=='メーカー（電気・電子・機械）'] = 2
features[3][features[3]=='小売・流通'] = 3
features[3][features[3]=='IT・インターネット'] = 4
features[3][features[3]=='コンサルティング'] = 5 
features[3][features[3]=='広告・マスコミ・エンターテイメント'] = 6
features[3][features[3]=='建設・不動産'] = 7
features[3][features[3]=='その他'] = 8
features[3][features[3]=='人材サービス'] = 9
features[3][features[3]=='士業'] = 10
features[3][features[3]=='メーカー（素材・食品・医薬品他）'] = 11
features[3][features[3]=='運輸・物流'] = 12
features[3][features[3]=='サービス'] = 13
features[3][features[3]=='インフラ'] = 14
features[3][features[3]=='金融・保険'] = 15
features[3][features[3]=='商社'] = 16
features[3] = features[3].astype(np.float32)

print('after')
print('set: ', list(set(features[3])))
print('num: ', len(list(set(features[3]))))
print('====================')


# feature 4: 業種中分類 ##### the same as feature 5 #####
print('=====feature 4=====')
print('before')
print('set: ', list(set(features[4])))
print('num: ', len(list(set(features[4]))))
features[4] = features[4].astype(np.float32)
print('====================')


# feature 5: 業種中分類名称
print('=====feature 5=====')
print('before')
print('set: ', list(set(features[5])))
print('num: ', len(list(set(features[5]))))

features[5][features[5]=='その他メーカー'] = 0
features[5][features[5]=='自治体'] = 1
features[5][features[5]=='コールセンター'] = 2
features[5][features[5]=='社会保険労務士事務所'] = 3
features[5][features[5]=='通信・キャリア'] = 4
features[5][features[5]=='化粧品・医薬品・ドラッグストア（一部調剤も扱う）'] = 5
features[5][features[5]=='教育'] = 6
features[5][features[5]=='その他人材サービス'] = 7
features[5][features[5]=='その他コンサルティング'] = 8
features[5][features[5]=='専門コンサルティング'] = 9
features[5][features[5]=='映像・音楽・ゲーム'] = 10
features[5][features[5]=='精密・計測機械'] = 11
features[5][features[5]=='建設・建築・土木'] = 12
features[5][features[5]=='リース・レンタル'] = 13
features[5][features[5]=='重電・産業用電気機器'] = 14
features[5][features[5]=='ソフトウェア・SIer'] = 15
features[5][features[5]=='紙・パルプ'] = 16
features[5][features[5]=='海運・鉄道・陸運・空輸'] = 17
features[5][features[5]=='調剤専門薬局'] = 18
features[5][features[5]=='冠婚葬祭'] = 19
features[5][features[5]=='研修'] = 20
features[5][features[5]=='その他広告・マスコミ・エンターテイメント'] = 21
features[5][features[5]=='人材派遣'] = 22
features[5][features[5]=='食品・飲料'] = 23
features[5][features[5]=='その他運輸・物流'] = 24
features[5][features[5]=='その他サービス'] = 25
features[5][features[5]=='その他IT・インターネット'] = 26
features[5][features[5]=='化学・石油・ガラス・セラミック・セメント'] = 27
features[5][features[5]=='介護・福祉'] = 28
features[5][features[5]=='銀行・信金・組合'] = 29
features[5][features[5]=='電力'] = 30
features[5][features[5]=='その他建設・不動産'] = 31
features[5][features[5]=='専門商社'] = 32
features[5][features[5]=='監査・税理士法人'] = 33
features[5][features[5]=='百貨店・総合スーパー'] = 34
features[5][features[5]=='その他小売・流通'] = 35
features[5][features[5]=='広告・PR'] = 36
features[5][features[5]=='警備・メンテナンス・清掃'] = 37
features[5][features[5]=='飲食'] = 38
features[5][features[5]=='総合商社'] = 39
features[5][features[5]=='美容・リラクゼーション・エステ'] = 40
features[5][features[5]=='旅行'] = 41
features[5][features[5]=='住宅設備'] = 42
features[5][features[5]=='法律事務所'] = 43
features[5][features[5]=='医療機器'] = 44
features[5][features[5]=='生命保険・損害保険'] = 45
features[5][features[5]=='インターネットサービス'] = 46
features[5][features[5]=='病院・クリニック'] = 47
features[5][features[5]=='シンクタンク・調査'] = 48
features[5][features[5]=='医薬品・化粧品・バイオ'] = 49
features[5][features[5]=='その他（電気・電子・機械）'] = 50
features[5][features[5]=='専門店'] = 51
features[5][features[5]=='不動産'] = 52
features[5][features[5]=='コンビニエンスストア'] = 53
features[5][features[5]=='コンピュータ・通信機器・OA機器関連'] = 54
features[5][features[5]=='コンサルティング'] = 55
features[5][features[5]=='人材紹介'] = 56
features[5][features[5]=='レジャー・アミューズメント・スポーツ施設'] = 57
features[5][features[5]=='ファッション・アパレル・アクセサリー'] = 58
features[5][features[5]=='放送・新聞・出版'] = 59
features[5][features[5]=='クレジット・信販'] = 60
features[5][features[5]=='その他業界'] = 61
features[5][features[5]=='自動車・自動車部品・輸送機器'] = 62
features[5][features[5]=='証券'] = 63
features[5][features[5]=='物流・倉庫'] = 64
features[5][features[5]=='ホテル'] = 65
features[5] = features[5].astype(np.float32)

print('after')
print('set: ', list(set(features[5])))
print('num: ', len(list(set(features[5]))))
print('====================')


# feature 6: 社員ID　不要
print('=====feature 6=====')
print('before')
print('set: ', list(set(features[6]))[:10])
print('num: ', len(list(set(features[6]))))
features[6] = features[6].astype(np.float32)
print('====================')

=====feature 0=====
before
set:  [2, 4, 5, 6, 9, 10, 11, 14, 17, 19]
num:  614
=====feature 1=====
before
set:  [1, 1000000915, 1000001119]
num:  3
after
set:  [1000000900.0, 1.0, 1000001100.0]
num:  3
=====feature 2=====
before
set:  [64, 1, 33, 26, 5, 71, 10, 43, 15, 48, 81, 52, 21, 93, 56, 90, 61]
num:  17
=====feature 3=====
before
set:  ['教育・研修', '運輸・物流', 'サービス', '介護・福祉・医療', '人材サービス', 'インフラ', 'その他', '士業', '商社', 'メーカー（素材・食品・医薬品他）', 'コンサルティング', '建設・不動産', 'IT・インターネット', '小売・流通', '金融・保険', '広告・マスコミ・エンターテイメント', 'メーカー（電気・電子・機械）']
num:  17
after
set:  [0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0]
num:  17
=====feature 4=====
before
set:  [2, 3, 4, 6, 7, 8, 9, 11, 12, 13, 14, 16, 17, 18, 22, 23, 24, 25, 27, 28, 29, 30, 31, 32, 34, 35, 36, 37, 38, 39, 40, 41, 42, 44, 45, 46, 47, 49, 50, 51, 53, 54, 55, 57, 62, 63, 65, 66, 67, 68, 69, 74, 75, 76, 78, 79, 80, 83, 84, 85, 86, 87, 89, 91, 92, 95, 98]
num:  67
=====feature 5=====
before
set:  ['専門店',

In [9]:
# feature 7: 性別
print('=====feature 7=====')
print('before')
print('set: ', list(set(features[7])))
print('num: ', len(list(set(features[7]))))
features[7] = features[7].astype(np.float32)
print('====================')


# feature 8: 血液型
print('=====feature 8=====')
print('before')
print('set: ', list(set(features[8])))
print('num: ', len(list(set(features[8]))))

features[8][features[8]=='a'] = 0
features[8][features[8]=='o'] = 1
features[8][features[8]=='ab'] = 2
features[8][features[8]=='b'] = 3
features[8][features[8]=='z'] = np.random.randint(0, 4, len(features[8][features[8]=='z'])) # unknow

features[8] = features[8].astype(np.float32)
features[8][np.isnan(features[8])]  = np.random.randint(0, 4, sum(np.isnan(features[8]))) # nan

print('after')
print('set: ', list(set(features[8])))
print('num: ', len(list(set(features[8]))))
print('====================')


# feature 9: 誕生年
print('=====feature 9=====')
print('before')
print('set: ', list(set(features[9]))[:20])
print('num: ', len(list(set(features[9]))))

features[9] = pd.DataFrame(features[9]) # ndarray --> pandas
mean = features[9][features[9].notnull()].mean()
features[9] = features[9].fillna(mean) # nan
features[9] = np.array(features[9]).reshape(-1) # pandas --> ndarray
features[9] = features[9].astype(np.float32)

print('after')
print('set: ', list(set(features[9]))[:20])
print('num: ', len(list(set(features[9]))))
print('====================')


# feature 10: 誕生日 ##### the same as feature 9 #####
print('=====feature 10=====')
print('before')
print('set: ', list(set(features[10]))[:20])
print('num: ', len(list(set(features[10]))))
# features[10] = features[10].astype(np.float32) # can not convert to float, because of '1993-03-03' character
print('====================')


# feature 11: 入社年
print('=====feature 11=====')
print('before')
print('set: ', list(set(features[11]))[:20])
print('num: ', len(list(set(features[11]))))
features[11] = features[11].astype(np.float32)
print('====================')


# feature 12: 学歴 ##### the same as feature 13 #####
print('=====feature 12=====')
print('before')
print('set: ', list(set(features[12]))[:100])
print('num: ', len(list(set(features[12]))))
features[12] = features[12].astype(np.float32)
print('====================')


# feature 13: 学歴名称
print('=====feature 13=====')
print('before')
print('set: ', list(set(features[13]))[:100])
print('num: ', len(list(set(features[13]))))

features[13][features[13]=='その他'] = 0 # not educated
features[13][features[13]=='中学校'] = 1
features[13][features[13]=='高等学校'] = 2
features[13][features[13]=='専門学校'] = 3
features[13][features[13]=='高等専門学校・高等専修学校'] = 4
features[13][features[13]=='短期大学'] = 5
features[13][features[13]=='大学'] = 6
features[13][features[13]=='大学院'] = 7

features[13] = features[13].astype(np.float32)
features[13][np.isnan(features[13])]  = np.random.randint(0, 8, sum(np.isnan(features[13]))) # nan

print('after')
print('set: ', list(set(features[13])))
print('num: ', len(list(set(features[13]))))
print('====================')


# feature 14: 雇用形態 ##### the same as feature 15 #####
print('=====feature 14=====')
print('before')
print('set: ', list(set(features[14]))[:200])
print('num: ', len(list(set(features[14]))))
features[14] = features[14].astype(np.float32)
print('====================')


# feature 15: 雇用形態名称
print('=====feature 15=====')
print('before')
print('set: ', list(set(features[15]))[:100])
print('num: ', len(list(set(features[15]))))

features[15][features[15]=='正社員（専門職・特別職）'] = 0
features[15][features[15]=='派遣社員'] = 1
features[15][features[15]=='嘱託社員（再雇用）'] = 2
features[15][features[15]=='契約社員'] = 3
features[15][features[15]=='経営者・役員'] = 4
features[15][features[15]=='パート／アルバイト'] = 5
features[15][features[15]=='出向／業務委託'] = 6
features[15][features[15]=='正社員（係長・主任・課長補佐クラス）'] = 7
features[15][features[15]=='正社員（部長クラス）'] = 8
features[15][features[15]=='正社員（課長クラス）'] = 9
features[15][features[15]=='正社員（一般）'] = 10
features[15][features[15]=='その他'] = 11
features[15][features[15]=='未入力'] = np.random.randint(0, 12, len(features[15][features[15]=='未入力'])) # unknow

features[15] = features[15].astype(np.float32)
features[15][np.isnan(features[15])]  = np.random.randint(0, 12, sum(np.isnan(features[15]))) # nan

print('after')
print('set: ', list(set(features[15])))
print('num: ', len(list(set(features[15]))))
print('====================')


# feature 16: 勤務形態 ##### the same as feature 17 #####
print('=====feature 16=====')
print('before')
print('set: ', list(set(features[16]))[:10])
print('num: ', len(list(set(features[16]))))
features[16] = features[16].astype(np.float32)
print('====================')


# feature 17: 勤務形態名称
print('=====feature 17=====')
print('before')
print('set: ', list(set(features[17]))[:100])
print('num: ', len(list(set(features[17]))))

features[17][features[17]=='在宅勤務'] = 0
features[17][features[17]=='フレックス制'] = 1
features[17][features[17]=='シフト制（夜勤あり）'] = 2
features[17][features[17]=='シフト制（夜勤なし）'] = 3
features[17][features[17]=='裁量労働制'] = 4
features[17][features[17]=='定時出勤'] = 5
features[17][features[17]=='育児など時短'] = 6
features[17][features[17]=='その他'] = 7

features[17] = features[17].astype(np.float32)
features[17][np.isnan(features[17])]  = np.random.randint(0, 8, sum(np.isnan(features[17]))) # nan

print('after')
print('set: ', list(set(features[17])))
print('num: ', len(list(set(features[17]))))
print('====================')


# feature 18: 職種 ##### the same as feature 19 #####
print('=====feature 18=====')
print('before')
print('set: ', list(set(features[18]))[:1000])
print('num: ', len(list(set(features[18]))))
features[18] = features[18].astype(np.float32)
print('====================')


# feature 19: 職種名称
print('=====feature 19=====')
print('before')
print('set: ', list(set(features[19]))[:200])
print('num: ', len(list(set(features[19]))))

features[19][features[19]=='クリエイティブ'] = 0
features[19][features[19]=='ITエンジニア'] = 1
features[19][features[19]=='医療専門職'] = 2
features[19][features[19]=='事務'] = 3
features[19][features[19]=='建設・土木技術職'] = 4
features[19][features[19]=='技術職(機械・電子・電気・半導体等)'] = 5
features[19][features[19]=='専門職(コンサルタント・士業等)'] = 6
features[19][features[19]=='軽作業'] = 7
features[19][features[19]=='介護・福祉サービス'] = 8
features[19][features[19]=='企画'] = 9
features[19][features[19]=='警備・設備管理等'] = 10
features[19][features[19]=='輸送・機械運転'] = 11
features[19][features[19]=='管理部門'] = 12
features[19][features[19]=='サービス(販売以外)'] = 13
features[19][features[19]=='教育・保育・公的職種'] = 14
features[19][features[19]=='技術職(素材・化学・食品・医療品等)'] = 15
features[19][features[19]=='販売'] = 16
features[19][features[19]=='営業'] = 17
features[19][features[19]=='マーケティング'] = 18
features[19][features[19]=='技能工(整備・工場生産・製造)'] = 19
features[19][features[19]=='その他'] = 20
features[19][features[19]=='未入力'] = np.random.randint(0, 21, len(features[19][features[19]=='未入力'])) # unknow

features[19] = features[19].astype(np.float32)
features[19][np.isnan(features[19])]  = np.random.randint(0, 21, sum(np.isnan(features[19]))) # nan

print('after')
print('set: ', list(set(features[19])))
print('num: ', len(list(set(features[19]))))
print('====================')


# feature 20: 個人年収 ##### the same as feature 21 #####
print('=====feature 20=====')
print('before')
print('set: ', list(set(features[20]))[:100])
print('num: ', len(list(set(features[20]))))
features[20] = features[20].astype(np.float32)
print('====================')


# feature 21: 個人年収名称
print('=====feature 21=====')
print('before')
print('set: ', list(set(features[21]))[:100])
print('num: ', len(list(set(features[21]))))

features[21][features[21]=='100万円未満'] = 0
features[21][features[21]=='100〜200万円未満'] = 1
features[21][features[21]=='200〜400万円未満'] = 2
features[21][features[21]=='400〜700万円未満'] = 3
features[21][features[21]=='700〜1000万円未満'] = 4
features[21][features[21]=='1000〜1500万円未満'] = 5
features[21][features[21]=='1500万円以上'] = 6
features[21][features[21]=='分からない・答えたくない'] = np.random.randint(0, 7, len(features[21][features[21]=='分からない・答えたくない'])) # unknow

features[21] = features[21].astype(np.float32)
features[21][np.isnan(features[21])] = np.random.randint(0, 7, sum(np.isnan(features[21]))) # nan

print('after')
print('set: ', list(set(features[21])))
print('num: ', len(list(set(features[21]))))
print('====================')


# feature 22: 世帯年収 ##### the same as feature 23 #####
print('=====feature 22=====')
print('before')
print('set: ', list(set(features[22]))[:100])
print('num: ', len(list(set(features[22]))))
features[22] = features[22].astype(np.float32)
print('====================')


# feature 23: 世帯年収名称
print('=====feature 23=====')
print('before')
print('set: ', list(set(features[23]))[:100])
print('num: ', len(list(set(features[23]))))

features[23][features[23]=='100万円未満'] = 0
features[23][features[23]=='100〜200万円未満'] = 1
features[23][features[23]=='200〜400万円未満'] = 2
features[23][features[23]=='400〜700万円未満'] = 3
features[23][features[23]=='700〜1000万円未満'] = 4
features[23][features[23]=='1000〜1500万円未満'] = 5
features[23][features[23]=='1500万円以上'] = 6
features[23][features[23]=='分からない・答えたくない'] = np.random.randint(0, 7, len(features[23][features[23]=='分からない・答えたくない'])) # unknow

features[23] = features[23].astype(np.float32)
features[23][np.isnan(features[23])] = np.random.randint(0, 7, sum(np.isnan(features[23]))) # nan

print('after')
print('set: ', list(set(features[23])))
print('num: ', len(list(set(features[23]))))
print('====================')


# feature 24: 結婚 ##### the same as feature 25 #####
print('=====feature 24=====')
print('before')
print('set: ', list(set(features[24]))[:300])
print('num: ', len(list(set(features[24]))))
features[24] = features[24].astype(np.float32)
print('====================')


# feature 25: 結婚名称
print('=====feature 25=====')
print('before')
print('set: ', list(set(features[25])))
print('num: ', len(list(set(features[25]))))

features[25][features[25]=='未婚'] = 0
features[25][features[25]=='既婚'] = 1
features[25][features[25]=='答えたくない'] = np.random.randint(0, 2, len(features[25][features[25]=='答えたくない'])) # unknow

features[25] = features[25].astype(np.float32)
features[25][np.isnan(features[25])] = np.random.randint(0, 2, sum(np.isnan(features[25]))) # nan

print('after')
print('set: ', list(set(features[25])))
print('num: ', len(list(set(features[25]))))
print('====================')

=====feature 7=====
before
set:  [0, 1]
num:  2
=====feature 8=====
before
set:  [nan, 'z', 'a', 'b', 'o', 'ab']
num:  6
after
set:  [0.0, 1.0, 2.0, 3.0]
num:  4
=====feature 9=====
before
set:  [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, 1930.0, 1931.0, 1935.0, 1936.0, 1937.0, 1938.0, 1939.0, 1940.0]
num:  84
after
set:  [1930.0, 1931.0, 1935.0, 1936.0, 1937.0, 1938.0, 1939.0, 1940.0, 1941.0, 1942.0, 1943.0, 1944.0, 1945.0, 1946.0, 1947.0, 1948.0, 1949.0, 1950.0, 1951.0, 1952.0]
num:  73
=====feature 10=====
before
set:  [nan, '1991-08-20', '1964-05-27', '1997-07-10', '1953-01-12', '1977-08-12', '1965-07-14', '1994-05-10', '1996-04-16', '1995-09-03', '1960-06-15', '1981-12-24', '1969-07-26', '1993-08-12', '1957-03-19', '1985-03-12', '1974-07-15', '1963-04-27', '1975-05-10', '1990-07-20']
num:  17097
=====feature 11=====
before
set:  [1950, 1954, 1957, 1959, 1960, 1961, 1962, 1963, 1964, 1965, 1966, 1967, 1968, 1969, 1970, 1971, 1972, 1973, 1974, 1975]
num:  68
=====fe

ValueError: could not convert string to float: '700～1000万円未満'

In [10]:
# feature 26: 家族と同居 ##### the same as feature 27 #####
print('=====feature 26=====')
print('before')
print('set: ', list(set(features[26]))[:300])
print('num: ', len(list(set(features[26]))))
features[26] = features[26].astype(np.float32)
print('====================')


# feature 27: 家族と同居名称
print('=====feature 27=====')
print('before')
print('set: ', list(set(features[27])))
print('num: ', len(list(set(features[27]))))

features[27][features[27]=='一人暮らし'] = 0
features[27][features[27]=='家族と同居'] = 1
features[27][features[27]=='その他'] = 2
features[27][features[27]=='答えたくない'] = np.random.randint(0, 3, len(features[27][features[27]=='答えたくない'])) # unknow

features[27] = features[27].astype(np.float32)
features[27][np.isnan(features[27])]  = np.random.randint(0, 3, sum(np.isnan(features[27]))) # nan

print('after')
print('set: ', list(set(features[27])))
print('num: ', len(list(set(features[27]))))
print('====================')


# feature 28: 子供
print('=====feature 28=====')
print('before')
print('set: ', list(set(features[28])))
print('num: ', len(list(set(features[28]))))

for i in range(1, 120): # [0] is nan
    features[28][features[28] == list(set(features[28]))[i]] = i # 1-119
    
features[28] = features[28].astype(np.float32)
features[28][np.isnan(features[28])]  = np.random.randint(1, 120, sum(np.isnan(features[28]))) # nan

print('after')
print('set: ', list(set(features[28])))
print('num: ', len(list(set(features[28]))))
print('====================')


# feature 29: 子供名称  ##### the same as feature 28 #####
print('=====feature 29=====')
print('before')
print('all the nan:', sum(np.isnan(features[29])))
print('====================')


# feature 30: 介護  ##### the same as feature 31 #####
print('=====feature 30=====')
print('before')
print('set: ', list(set(features[30]))[:300])
print('num: ', len(list(set(features[30]))))
features[30] = features[30].astype(np.float32)
print('====================')


# feature 31: 介護名称
print('=====feature 31=====')
print('before')
print('set: ', list(set(features[31])))
print('num: ', len(list(set(features[31]))))

features[31][features[31]=='いる（同居している）'] = 0
features[31][features[31]=='いる（同居していない）'] = 1
features[31][features[31]=='いない'] = 2
features[31][features[31]=='答えたくない'] = np.random.randint(0, 3, len(features[31][features[31]=='答えたくない'])) # unknow
    
features[31] = features[31].astype(np.float32)
features[31][np.isnan(features[31])]  = np.random.randint(1, 3, sum(np.isnan(features[31]))) # nan

print('after')
print('set: ', list(set(features[31])))
print('num: ', len(list(set(features[31]))))
print('====================')


# feature 32: 怪我／病気  ##### the same as feature 33 #####
print('=====feature 32=====')
print('before')
print('set: ', list(set(features[32]))[:100])
print('num: ', len(list(set(features[32]))))
features[32] = features[32].astype(np.float32)
print('====================')


# feature 33: 怪我／病気名称
print('=====feature 33=====')
print('before')
print('set: ', list(set(features[33])))
print('num: ', len(list(set(features[33]))))

features[33][features[33]=='ない'] = 0
features[33][features[33]=='3年以上前で経験あり'] = 1
features[33][features[33]=='1年以上~3年未満で経験あり'] = 2
features[33][features[33]=='1年未満で経験あり'] = 3
features[33][features[33]=='現在治療中'] = 4
features[33][features[33]=='答えたくない'] = np.random.randint(0, 5, len(features[33][features[33]=='答えたくない'])) # unknow
    
features[33] = features[33].astype(np.float32)
features[33][np.isnan(features[33])]  = np.random.randint(0, 5, sum(np.isnan(features[33]))) # nan

print('after')
print('set: ', list(set(features[33])))
print('num: ', len(list(set(features[33]))))
print('====================')


# feature 34: 精神科受診歴  ##### the same as feature 35 #####
print('=====feature 34=====')
print('before')
print('set: ', list(set(features[34]))[:100])
print('num: ', len(list(set(features[34]))))
features[34] = features[34].astype(np.float32)
print('====================')


# feature 35: 精神科受診歴名称
print('=====feature 35=====')
print('before')
print('set: ', list(set(features[35])))
print('num: ', len(list(set(features[35]))))

features[35][features[35]=='ない'] = 0
features[35][features[35]=='3年以上前で経験あり'] = 1
features[35][features[35]=='1年以上~3年未満で経験あり'] = 2
features[35][features[35]=='1年未満で経験あり'] = 3
features[35][features[35]=='現在治療中'] = 4
features[35][features[35]=='答えたくない'] = np.random.randint(0, 5, len(features[35][features[35]=='答えたくない'])) # unknow
    
features[35] = features[35].astype(np.float32)
features[35][np.isnan(features[35])]  = np.random.randint(0, 5, sum(np.isnan(features[35]))) # nan

print('after')
print('set: ', list(set(features[35])))
print('num: ', len(list(set(features[35]))))
print('====================')


# feature 36: 入社形態  ##### the same as feature 37 #####
print('=====feature 36=====')
print('before')
print('set: ', list(set(features[36]))[:700])
print('num: ', len(list(set(features[36]))))
features[36] = features[36].astype(np.float32)
print('====================')


# feature 37: 入社形態名称
print('=====feature 37=====')
print('before')
print('set: ', list(set(features[37])))
print('num: ', len(list(set(features[37]))))

features[37][features[37]=='新卒採用入社'] = 0
features[37][features[37]=='中途採用入社(第二新卒含む）'] = 1
features[37][features[37]=='その他'] = 2
features[37][features[37]=='未入力'] = np.random.randint(0, 3, len(features[37][features[37]=='未入力'])) # unknow
    
features[37] = features[37].astype(np.float32)
features[37][np.isnan(features[37])]  = np.random.randint(0, 3, sum(np.isnan(features[37]))) # nan

print('after')
print('set: ', list(set(features[37])))
print('num: ', len(list(set(features[37]))))
print('====================')


# feature 38: 勤務場所  ##### the same as feature 39 #####
print('=====feature 38=====')
print('before')
print('set: ', list(set(features[38]))[:300])
print('num: ', len(list(set(features[38]))))
features[38] = features[38].astype(np.float32)
print('====================')


# feature 39: 勤務場所名称
print('=====feature 39=====')
print('before')
print('set: ', list(set(features[39])))
print('num: ', len(list(set(features[39]))))

features[39][features[39]=='職場勤務'] = 0
features[39][features[39]=='テレワーク'] = 1
    
features[39] = features[39].astype(np.float32)
features[39][np.isnan(features[39])]  = np.random.randint(0, 2, sum(np.isnan(features[39]))) # nan

print('after')
print('set: ', list(set(features[39])))
print('num: ', len(list(set(features[39]))))
print('====================')

=====feature 26=====
before
set:  [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, 

In [11]:
#####(回答時: 40-46)#####
# feature 40: 回答時_退職・休職区分 ##### the same as feature 47 #####
print('=====feature 40=====')
print('before')
print('set: ', list(set(features[40])))
print('num: ', len(list(set(features[40]))))

features[40][features[40]==0] = 0 # working
features[40][features[40]==1] = 1 # retired
features[40][features[40]==2] = 1 # retired
features[40] = features[40].astype(np.float32)
print('working:', sum(features[40]==0), 'retired:', sum(features[40]==1))

print('after')
print('set: ', list(set(features[40])))
print('num: ', len(list(set(features[40]))))
print('====================')


# feature 41: 回答時_退職日 ##### the same as feature 48 #####
print('=====feature 41=====')
print('before')
print('set: ', list(set(features[41])))
print('num: ', len(list(set(features[41]))))
print('====================')


# feature 42: 回答時_退職理由 ##### the same as feature 49 #####
print('=====feature 42=====')
print('before')
print('====================')


# feature 43: 回答時_退職理由名称 ##### the same as feature 50 #####
print('=====feature 43=====')
print('before')
print('set: ', list(set(features[43])))
print('num: ', len(list(set(features[43]))))
print('====================')


# feature 44: 回答時_休職日 ##### the same as feature 51 #####
print('=====feature 44=====')
print('before')
print('set: ', list(set(features[44])))
print('num: ', len(list(set(features[44]))))
print('====================')


# feature 45: 回答時_休職日 ##### the same as feature 52 #####
print('=====feature 45=====')
print('before')
print('set: ', list(set(df['回答時_休職日'])))
print('num: ', len(list(set(df['回答時_休職日']))))
print('====================')


# feature 46: 回答時_休職理由名称 ##### the same as feature 53 #####
print('=====feature 46=====')
print('before')
print('====================')


#####(現在: 47-53)#####
# feature 47: 現在_退職・休職区分
print('=====feature 47=====')
print('before')
print('set: ', list(set(features[47])))
print('num: ', len(list(set(features[47]))))

features[47][features[47]==0] = 0 # working
features[47][features[47]==1] = 1 # retired
features[47][features[47]==2] = 1 # retired
features[47] = features[47].astype(np.float32)
print('working:', sum(features[47]==0), 'retired:', sum(features[47]==1))

print('after')
print('set: ', list(set(features[47])))
print('num: ', len(list(set(features[47]))))
print('====================')


# feature 48: 現在_退職日　不要
print('=====feature 48=====')
print('before')
print('set: ', list(set(features[48])))
print('num: ', len(list(set(features[48]))))
print('====================')


# feature 49: 現在_退職理由　不要
print('=====feature 49=====')
print('before')
print('====================')


# feature 50: 現在_退職理由名称　不要
print('=====feature 50=====')
print('before')
print('set: ', list(set(features[50])))
print('num: ', len(list(set(features[50]))))
print('====================')


# feature 51: 現在_休職日　不要
print('=====feature 51=====')
print('before')
print('set: ', list(set(features[51])))
print('num: ', len(list(set(features[51]))))
print('====================')


# feature 52: 現在_休職理由　不要
print('=====feature 52=====')
print('before')
print('====================')


# feature 53: 現在_休職理由名称　不要
print('=====feature 53=====')
print('before')
print('set: ', list(set(features[53])))
print('num: ', len(list(set(features[53]))))
print('====================')

=====feature 40=====
before
set:  [0, 1, 2]
num:  3
working: 89356 retired: 27
after
set:  [0.0, 1.0]
num:  2
=====feature 41=====
before
set:  [nan, '2020-07-01', '2019-12-01', '2021-03-01', '2021-12-01', '2020-11-01', '2020-06-01', '2020-05-01', '2020-04-01', '2020-01-01', '2021-05-01', '2020-09-01', '2020-12-01', '2019-09-01', '2021-01-01', '2021-02-01', '2020-10-01', '2020-08-01']
num:  18
=====feature 42=====
before
=====feature 43=====
before
set:  [nan, '理由不明']
num:  2
=====feature 44=====
before
set:  [nan, '2020-07-01', '2009-01-01', '2021-06-01', '2019-07-01', '2020-11-01', '2019-10-01', '2010-01-01', '2020-04-01', '2021-01-01', '2021-07-01', '2020-03-01', '2019-06-01', '2019-03-01', '2020-01-01', '2019-04-01', '2020-08-01', '2020-06-01', '2021-05-01', '2020-12-01', '2019-09-01', '2021-02-01', '2020-10-01', '2019-12-01', '2019-08-01', '2018-12-01', '2020-02-01', '2018-04-01', '2019-02-01', '2019-11-01']
num:  30
=====feature 45=====
before
set:  [nan, '2020-07-01', '2009-01-0

In [12]:
# feature 54: 受診ID　不要
print('=====feature 54=====')
print('before')
print('set: ', list(set(features[54]))[:10])
print('num: ', len(list(set(features[54]))))
print('====================')


# feature 55: 回答日　不要
print('=====feature 55=====')
print('before')
print('set: ', list(set(features[55]))[:10])
print('num: ', len(list(set(features[55]))))
print('====================')


# feature 56: 実施期間ID　不要
print('=====feature 56=====')
print('before')
print('set: ', list(set(features[56]))[:10])
print('num: ', len(list(set(features[56]))))
print('====================')


# feature 57: 部署ID　不要
print('=====feature 57=====')
print('before')
print('set: ', list(set(features[57]))[:10])
print('num: ', len(list(set(features[57]))))
print('====================')

=====feature 54=====
before
set:  [29820, 29826, 29831, 29832, 29840, 29845, 29846, 29847, 29849, 29850]
num:  89383
=====feature 55=====
before
set:  ['2020-06-08', '2020-09-21', '2021-03-23', '2021-07-12', '2020-03-05', '2021-05-08', '2020-02-27', '2019-11-13', '2020-01-27', '2020-05-04']
num:  614
=====feature 56=====
before
set:  [2048, 2049, 2051, 2052, 2054, 2055, 2056, 2057, 2058, 2059]
num:  1016
=====feature 57=====
before
set:  [43, 44, 45, 46, 49, 50, 51, 52, 53, 55]
num:  7993


In [13]:
##### feature 58-209: B Questions #####
##(Questions: 58-209, Answers: 1.そうだ, 2.まあそうだ, 3.ややちがう, 4.ちがう)
## OR (Answers: 1.ほとんどいつもあった, 2.ときどきあった, 3.あまりなかった, 4.ほとんどなかった)
## OR ...
## OR (Questions: 181-195, Answers: 0.True, 1.False)

## e.g.) feature 58: B-1-2(非常にたくさんの仕事をしなければならない)
## e.g.) feature 59: B-2-7(時間内に仕事が処理しきれない)
## ...
## e.g.) feature 209: B-228-1046(日中眠くなる)

for i in range(58, 210): # 58-209 set(0, 1, 2, 3, 4)
    print('=====feature ' + str(i) + '=====')
    print('before')
    print('set: ', list(set(features[i])))
    print('num: ', len(list(set(features[i]))))

    if i not in range(181, 196): # 181-195 set(0, 1)
        features[i][features[i]==0] = np.random.randint(1, 5, len(features[i][features[i]==0])) # unknow
        features[i] = features[58].astype(np.float32)

    print('after')
    print('set: ', list(set(features[i])))
    print('num: ', len(list(set(features[i]))))
    print('====================')

=====feature 58=====
before
set:  [0.0, 1.0, 2.0, 3.0, 4.0]
num:  5
after
set:  [1.0, 2.0, 3.0, 4.0]
num:  4
=====feature 59=====
before
set:  [1.0, 2.0, 3.0, 4.0]
num:  4
after
set:  [1.0, 2.0, 3.0, 4.0]
num:  4
=====feature 60=====
before
set:  [1.0, 2.0, 3.0, 4.0]
num:  4
after
set:  [1.0, 2.0, 3.0, 4.0]
num:  4
=====feature 61=====
before
set:  [1.0, 2.0, 3.0, 4.0]
num:  4
after
set:  [1.0, 2.0, 3.0, 4.0]
num:  4
=====feature 62=====
before
set:  [1.0, 2.0, 3.0, 4.0]
num:  4
after
set:  [1.0, 2.0, 3.0, 4.0]
num:  4
=====feature 63=====
before
set:  [1.0, 2.0, 3.0, 4.0]
num:  4
after
set:  [1.0, 2.0, 3.0, 4.0]
num:  4
=====feature 64=====
before
set:  [1.0, 2.0, 3.0, 4.0]
num:  4
after
set:  [1.0, 2.0, 3.0, 4.0]
num:  4
=====feature 65=====
before
set:  [1.0, 2.0, 3.0, 4.0]
num:  4
after
set:  [1.0, 2.0, 3.0, 4.0]
num:  4
=====feature 66=====
before
set:  [1.0, 2.0, 3.0, 4.0]
num:  4
after
set:  [1.0, 2.0, 3.0, 4.0]
num:  4
=====feature 67=====
before
set:  [1.0, 2.0, 3.0, 4.0]
num:

In [14]:
##### feature 210-212: B Questions #####
# feature 210: B-229-1048(仕事に対して最も重視することは次のうちどれですか？)
print('=====feature 210=====')
print('before')
print('set: ', list(set(features[210])))
print('num: ', len(list(set(features[210]))))

features[210][features[210]==1048.0] = 0
features[210][features[210]==1049.0] = 1
features[210][features[210]==1050.0] = 2
features[210][features[210]==1051.0] = 3
features[210][features[210]==1052.0] = 4
features[210][features[210]==1053.0] = 5
features[210][features[210]==1054.0] = 6
features[210] = features[210].astype(np.float32)

print('after')
print('set: ', list(set(features[210])))
print('num: ', len(list(set(features[210]))))
print('====================')


# feature 211: B-230-1056(人生において最も大事にしたいと思うものは次のうちどれですか？)
print('=====feature 211=====')
print('before')
print('set: ', list(set(features[211])))
print('num: ', len(list(set(features[211]))))

features[211][features[211]==1056.0] = 0
features[211][features[211]==1057.0] = 1
features[211][features[211]==1058.0] = 2
features[211][features[211]==1059.0] = 3
features[211][features[211]==1060.0] = 4
features[211][features[211]==1055.0] = 5
features[211] = features[211].astype(np.float32)

print('after')
print('set: ', list(set(features[211])))
print('num: ', len(list(set(features[211]))))
print('====================')


# feature 212: B-231-1063(人生でどのような時間を最も大切にしていますか？)
print('=====feature 212=====')
print('before')
print('set: ', list(set(features[212])))
print('num: ', len(list(set(features[212]))))

features[212][features[212]==1061.0] = 0
features[212][features[212]==1062.0] = 1
features[212][features[212]==1063.0] = 2
features[212][features[212]==1064.0] = 3
features[212][features[212]==1065.0] = 4
features[212][features[212]==1066.0] = 5
features[212][features[212]==1067.0] = 6
features[212] = features[212].astype(np.float32)

print('after')
print('set: ', list(set(features[212])))
print('num: ', len(list(set(features[212]))))
print('====================')

=====feature 210=====
before
set:  [1048.0, 1049.0, 1050.0, 1051.0, 1052.0, 1053.0, 1054.0]
num:  7


after
set:  [0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0]
num:  7
=====feature 211=====
before
set:  [1056.0, 1057.0, 1058.0, 1059.0, 1060.0, 1055.0]
num:  6
after
set:  [0.0, 1.0, 2.0, 3.0, 4.0, 5.0]
num:  6
=====feature 212=====
before
set:  [1061.0, 1062.0, 1063.0, 1064.0, 1065.0, 1066.0, 1067.0]
num:  7
after
set:  [0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0]
num:  7


In [15]:
##### feature 213-287: C Questions #####
##### feature 288-316: D Questions #####

for i in range(213, 316):
    print('=====feature ' + str(i) + '=====')
    print('before')
    print('set: ', list(set(features[i])))
    print('num: ', len(list(set(features[i]))))

    # not encoding, not unknown, not nan 
    features[i] = features[i].astype(np.float32)
    
#     print('after')
#     print('set: ', list(set(features[i])))
#     print('num: ', len(list(set(features[i]))))
#     print('====================')

=====feature 213=====
before
set:  [1.0, 2.0, 3.0, 4.0, 5.0]
num:  5
=====feature 214=====
before
set:  [1.0, 2.0, 3.0, 4.0, 5.0]
num:  5
=====feature 215=====
before
set:  [1.0, 2.0, 3.0, 4.0]
num:  4
=====feature 216=====
before
set:  [1.0, 2.0, 3.0, 4.0, 5.0]
num:  5
=====feature 217=====
before
set:  [1.0, 2.0, 3.0, 4.0, 5.0]
num:  5
=====feature 218=====
before
set:  [1.0, 2.0, 3.0, 4.0, 5.0]
num:  5
=====feature 219=====
before
set:  [1.0, 2.0, 3.0, 4.0]
num:  4
=====feature 220=====
before
set:  [1.0, 2.0, 3.0, 5.0]
num:  4
=====feature 221=====
before
set:  [1.0, 2.0, 3.0, 5.0]
num:  4
=====feature 222=====
before
set:  [1.0, 2.0, 3.0, 4.0, 5.0]
num:  5
=====feature 223=====
before
set:  [1.0, 2.0, 3.0, 4.0, 5.0]
num:  5
=====feature 224=====
before
set:  [1.0, 2.0, 3.0, 4.0, 5.0]
num:  5
=====feature 225=====
before
set:  [1.0, 2.0, 3.0, 4.0, 5.0]
num:  5
=====feature 226=====
before
set:  [1.0, 2.0, 3.0, 4.0, 5.0]
num:  5
=====feature 227=====
before
set:  [1.0, 2.0, 3.0, 4.0

In [16]:
########## feature & label selection ##########
features_id = [1, 3, 5, 7, 8, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 28, 31, 33, 35, 37, 39]
for i in range(58, 316):
    features_id.append(i)
label_id = 47

print('features id:', features_id)
print('features numbers:', len(features_id))
print('label id:', label_id)

features id: [1, 3, 5, 7, 8, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 28, 31, 33, 35, 37, 39, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 

In [31]:
########## normalization: [-1, 1] & standardization(mean & standard deviation) ##########
for i in features_id:
    # before normalization
    print('before feature {}: mean={:.4f}, Standard Deviation={:.4f}, max={:.4f}, min={:.4f}'.format(i, features[i].mean(), features[i].std(), features[i].max(), features[i].min()))
    
    features[i] = normalization(features[i]).reshape(-1, 1) # normalization
    # features[i] = standardization(features[i]) # standardization
    
    # after normalization
    print('after feature {}: mean={:.4f}, Standard Deviation={:.4f}, max={:.4f}, min={:.4f}'.format(i, features[i].mean(), features[i].std(), features[i].max(), features[i].min()))

In [27]:
########## write pre-processed data ##########
new_feature_names = []
new_features_list = []

# features
for i in features_id:
    new_feature_names.append(column_names[i])
    new_features_list.append(features[i])
    
# label
new_feature_names.append(column_names[label_id])
new_features_list.append(features[label_id].reshape(-1, 1))

# features + label
new_data = np.hstack(new_features_list).astype(np.float32)

new_df = pd.DataFrame(data=new_data, columns=new_feature_names)
new_df.to_csv('data/data_utf-8.csv')