# 中文数据预处理

### 导入所需的包

In [2]:
# coding=utf-8
import sys
sys.path.append('../')
import datasets
from preprocessing import build_vocab
from preprocessing import vocab_index_descriptions
from constants import DATA_DIR

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import jieba
from sklearn.model_selection import train_test_split

from collections import Counter, defaultdict
import csv
import math
import operator
import importlib

### 首先将三个文件导入pandas.dataframe并合并、提取所需的列

In [3]:
data_path1 = "../data/2014首页.csv"
data_path2 = "../data/2015首页.csv"
data_path3 = "../data/2016首页.csv"
df1 = pd.read_csv(data_path1, engine='python', encoding='gb18030')
df2 = pd.read_csv(data_path2, engine='python', encoding='gb18030')
df3 = pd.read_csv(data_path3, engine='python', encoding='gb18030')
df = pd.concat([df1, df2, df3], ignore_index=True)
df_main = df[['MAIN_DIAG','MAIN_ICD_CODE']]
print(len(df_main))

242877


### 筛选出code频数大于threashold对应的记录，去除部分code后面的+符号

In [10]:
threshold = 100
value_counts = df_main['MAIN_ICD_CODE'].value_counts()
# to_remove = value_counts[value_counts <= threshold].index
to_remove = value_counts[50:].index
df_main = df_main.loc[~df_main['MAIN_ICD_CODE'].isin(to_remove),:]
df_main = df_main.dropna(axis=0, how='any')
df_main = df_main.applymap(lambda x: x.strip('+') if type(x) is str else x)
df_main = df_main.reset_index(drop=True)
value_counts_after = df_main['MAIN_ICD_CODE'].value_counts()
# df_main['MAIN_ICD_CODE'].value_counts()
print(len(df_main))
print(value_counts_after.index.values)
df_main.head()

101131
['Z51.102' 'Z51.100' 'J18.000' 'B08.401' 'O80.000' 'D25.900' 'O42.900'
 'P23.900' 'O34.201' 'Z36.001' 'K40.901' 'J18.900' 'O04.900' 'I63.900'
 'H25.900' 'P39.900' 'P59.901' 'Z51.901' 'C34.900' 'Z51.1' 'E04.902'
 'C50.900' 'O70.000' 'G45.004' 'N60.201' 'N40.x00' 'I63.905' 'J06.900'
 'J98.414' 'D06.900' 'C20.x00' 'K80.101' 'D27.x00' 'I84.201' 'M81.901'
 'O24.900' 'J38.102' 'G40.901' 'Z47.001' 'C73.x00' 'C53.900' 'I25.901'
 'C22.000' 'O82.000' 'P22.001' 'K80.000' 'J44.100' 'Z51.801' 'J03.901'
 'I20.000']


Unnamed: 0,MAIN_DIAG,MAIN_ICD_CODE
0,新生儿肺炎,P23.900
1,急性右额叶、胼胝体右侧部梗塞,I63.900
2,非霍奇金淋巴瘤Ⅳ期B组（T淋巴母细胞性）并淋巴瘤白血病,Z51.901
3,胎膜早破,O42.900
4,支气管肺炎,J18.000


### 对诊断描述进行分词并统计

In [11]:
jieba.load_userdict('../preprocessing/dict.txt')
df_main['DIAG_SPLIT'] = ""
for row in df_main.itertuples():
    cut = jieba.cut(row[1])
    diag_split = ' '.join(cut)
    df_main.loc[row[0]]['DIAG_SPLIT'] = diag_split
df_main.head()

Building prefix dict from the default dictionary ...
Dumping model to file cache /var/folders/lg/qxn9gw8j7x75pddgvfpgbfg40000gn/T/jieba.cache
Loading model cost 1.209 seconds.
Prefix dict has been built succesfully.


Unnamed: 0,MAIN_DIAG,MAIN_ICD_CODE,DIAG_SPLIT
0,新生儿肺炎,P23.900,新生儿肺炎
1,急性右额叶、胼胝体右侧部梗塞,I63.900,急性 右 额叶 、 胼胝体 右侧 部 梗塞
2,非霍奇金淋巴瘤Ⅳ期B组（T淋巴母细胞性）并淋巴瘤白血病,Z51.901,非霍奇金淋巴瘤 Ⅳ 期 B 组 （ T 淋巴 母细胞 性 ） 并 淋巴瘤 白血病
3,胎膜早破,O42.900,胎膜早破
4,支气管肺炎,J18.000,支气管肺炎


In [12]:
#Tokens and types
types = set()
num_tok = 0
for row in df_main.itertuples():
    for w in row[3].split():
        types.add(w)
        num_tok += 1

In [13]:
print("Num types", len(types))
print("Num tokens", str(num_tok))

Num types 5927
Num tokens 722792


### 划分训练集和测试集、建立词汇表

In [14]:
train, test = train_test_split(df_main, test_size=0.2)
train.to_csv('../data/train_raw.csv', index=False, encoding='gb18030')
test.to_csv('../data/test_raw.csv', index=False, encoding='gb18030')

In [15]:
importlib.reload(build_vocab)
vocab_min = 3
train_file = '../data/train_raw.csv'
vname = '../data/vocab.csv'
build_vocab.build_vocab(vocab_min, train_file, vname)

reading in data...
removing rare terms
3269 terms qualify out of 5545 total
writing output


In [16]:
with open('../data/code_list.csv', 'w', encoding='gb18030') as of:
    w = csv.writer(of)
    for code in code_list:
        w.writerow([code])

NameError: name 'code_list' is not defined

### 导入ICD Description并对训练集和测试集再做筛选

In [221]:
importlib.reload(vocab_index_descriptions)
importlib.reload(datasets)
vocab_index_descriptions.vocab_index_descriptions('../data/vocab.csv',
                                                  '../data/description_vectors.vocab')

100%|██████████| 23067/23067 [00:01<00:00, 14176.60it/s]


In [226]:
desc_code_list = []
with open("../data/description_vectors.vocab", 'r', encoding='gb18030') as vfile:
    r = csv.reader(vfile, delimiter=" ")
    next(r)
    for row in r:
        code = row[0]
        desc_code_list.append(code)

In [17]:
for splt in ['train', 'test']:
    filename = '../data/{}_raw.csv'.format(splt)
    df_temp = pd.read_csv(filename, encoding='gb18030')
#     df_temp = df_temp[df_temp['MAIN_ICD_CODE'].isin(desc_code_list)]
    df_temp['length'] = df_temp.apply(lambda row: len(str(row['DIAG_SPLIT']).split()), axis=1)
    df_temp = df_temp.sort_values(['length'])
    df_temp.to_csv('../data/{}.csv'.format(splt), index=False, encoding='gb18030')

In [18]:
df_train = pd.read_csv("../data/train.csv", engine='python', encoding='gb18030')
df_test = pd.read_csv("../data/test.csv", engine='python', encoding='gb18030')
code_list = df_train['MAIN_ICD_CODE'].value_counts().index.values
with open('../data/code_list.csv', 'w', encoding='gb18030') as of:
    w = csv.writer(of)
    for code in code_list:
        w.writerow([code])

# 测试区域

In [163]:
importlib.reload(datasets)
dicts = datasets.load_lookups('../data/train.csv', '../data/vocab.csv')
gen = datasets.data_generator('../data/train.csv', dicts, 8, num_labels=len(code_list), desc_embed=False)

In [166]:
for batch_idx, tup in enumerate(gen):
    data, target, _, code_set, descs = tup


In [19]:
len(code_list)

50

In [230]:
desc_dict = defaultdict(str)
desc_code_list2 = []
with open('../data/ICD_Descriptions.csv', 'r', encoding='gb18030') as labelfile:
    f = csv.reader(labelfile)
    next(f)
    for row in f:
        code = row[0] if row[0] != '' else row[1]
        code = code.strip('_')
        #code = reformat(row[0])
        if code not in desc_dict.keys():
            desc_dict[code] = ' '.join(row[2:])
for code, desc in desc_dict.items():
#     print(code)
    desc_code_list2.append(code)
print(len(desc_code_list2))

data_code_list = []
with open('../data/code_list.csv', 'r', encoding='gb18030') as codefile:
    f = csv.reader(codefile)
    for row in f:
        data_code_list.append(row[0])

for c in data_code_list:
    if c not in desc_code_list:
        print(c)

23067
Z51.1
M81.901
G40.901
P22.001
QTZD1
J44.101
Z92.8
O28.001
D69.404
O14.102
z51.102
J45.904
N64.901
N05.901
M71.201
A86.X00
z51.100


In [173]:
ind2w = defaultdict(str)
with open('../data/vocab.csv', 'r', encoding='gb18030') as vocabfile:
    for i,line in enumerate(vocabfile):
        line = line.rstrip()
        if line != '':
            ind2w[i+1] = line.rstrip()
w2ind = {w:i for i,w in ind2w.items()}
print(ind2w)
print(w2ind)

defaultdict(<class 'str'>, {1: '乙状结肠', 2: '腺癌', 3: '累及', 4: '膀胱', 5: '顶壁', 6: '化疗', 7: '后', 8: 'P', 9: '-', 10: 'T4bN0M0', 11: 'IIC', 12: '期', 13: '食管', 14: '上', 15: '段', 16: '鳞癌', 17: '左乳', 18: '浸润性', 19: '导管', 20: '癌', 21: 'T4bN3M0', 22: 'IIIC', 23: '右', 24: '下叶', 25: '中央', 26: '型', 27: '肺', 28: '（', 29: 'T1aN1M0IIA', 30: '）', 31: '术后', 32: '急性', 33: 'B', 34: '淋巴细胞', 35: '白血病', 36: '左', 37: '乳腺', 38: '腺病', 39: '并', 40: '感染', 41: '、', 42: '脓肿', 43: '形成', 44: 'G2P2', 45: '宫内', 46: '妊娠', 47: '41', 48: '+', 49: '2', 50: '周', 51: 'LOA', 52: '，', 53: '顺娩', 54: '；', 55: '胃窦', 56: '局部', 57: '切除术', 58: '子宫', 59: '肌瘤', 60: '右上', 61: '双', 62: '肺门', 63: '纵隔', 64: '锁骨', 65: '淋巴结', 66: '转移', 67: 'c', 68: 'T1N3M0', 69: 'Ⅲ', 70: 'b', 71: '卵巢', 72: '浆液', 73: '性囊', 74: '腺瘤', 75: '左侧', 76: '基底节', 77: '区', 78: '脑出血', 79: '破入', 80: '脑室', 81: '上呼吸道', 82: '39', 83: '3', 84: '腘', 85: '窝', 86: '囊肿', 87: '右侧', 88: '精索', 89: '鞘', 90: '膜', 91: '积液', 92: '甲状腺', 93: '可能', 94: 'T2N3M0', 95: '右乳', 96: 'pT2N0M0', 97

In [238]:
jieba.load_userdict('../preprocessing/dict.txt')
df_main['DIAG_SPLIT2'] = ""
for row in df_main.itertuples():
    cut = jieba.cut(row[1])
    diag_split = ' '.join(cut)
    df_main.loc[row[0]]['DIAG_SPLIT2'] = diag_split
df_main.head()

Unnamed: 0,MAIN_DIAG,MAIN_ICD_CODE,DIAG_SPLIT,DIAG_SPLIT2
0,新生儿肺炎,P23.900,新生儿 肺炎,新生儿肺炎
1,急性右额叶、胼胝体右侧部梗塞,I63.900,急性 右 额叶 、 胼胝 体 右侧 部 梗塞,急性 右 额叶 、 胼胝体 右侧 部 梗塞
2,非霍奇金淋巴瘤Ⅳ期B组（T淋巴母细胞性）并淋巴瘤白血病,Z51.901,非 霍奇金 淋巴瘤 Ⅳ 期 B 组 （ T 淋巴 母细胞 性 ） 并 淋巴瘤 白血病,非霍奇金淋巴瘤 Ⅳ 期 B 组 （ T 淋巴 母细胞 性 ） 并 淋巴瘤 白血病
3,左肾上腺髓脂肪瘤,D35.000,左 肾上腺 髓 脂肪瘤,左 肾上腺 髓 脂肪瘤
4,急性腹泻病,K52.916,急性 腹泻 病,急性腹泻 病


In [241]:
df_main.head(50)

Unnamed: 0,MAIN_DIAG,MAIN_ICD_CODE,DIAG_SPLIT,DIAG_SPLIT2
0,新生儿肺炎,P23.900,新生儿 肺炎,新生儿肺炎
1,急性右额叶、胼胝体右侧部梗塞,I63.900,急性 右 额叶 、 胼胝 体 右侧 部 梗塞,急性 右 额叶 、 胼胝体 右侧 部 梗塞
2,非霍奇金淋巴瘤Ⅳ期B组（T淋巴母细胞性）并淋巴瘤白血病,Z51.901,非 霍奇金 淋巴瘤 Ⅳ 期 B 组 （ T 淋巴 母细胞 性 ） 并 淋巴瘤 白血病,非霍奇金淋巴瘤 Ⅳ 期 B 组 （ T 淋巴 母细胞 性 ） 并 淋巴瘤 白血病
3,左肾上腺髓脂肪瘤,D35.000,左 肾上腺 髓 脂肪瘤,左 肾上腺 髓 脂肪瘤
4,急性腹泻病,K52.916,急性 腹泻 病,急性腹泻 病
5,胎膜早破,O42.900,胎膜 早破,胎膜早破
6,左下肢静脉曲张,I83.900,左 下肢 静脉曲张,左 下肢静脉曲张
7,支气管肺炎,J18.000,支气管 肺炎,支气管肺炎
8,椎基动脉供血不足（左侧椎动脉硬化）,G45.002,椎基 动脉 供血 不足 （ 左侧 椎 动脉硬化 ）,椎基 动脉 供血 不足 （ 左侧椎动脉 硬化 ）
9,传染性单核细胞增多症(EB病毒感染+巨细胞病毒感染),B27.900,传染性 单核细胞 增多 症 ( EB 病毒感染 + 巨细胞 病毒感染 ),传染性单核细胞增多症 ( EB病毒感染 + 巨细胞病毒感染 )


In [20]:
df.head()

Unnamed: 0,PATIENT_ID,EVENT_NO,CASE_NO,HOS_NAME,HOS_ID,HEALTH_CARD,ID_CARD,NAME,SEX,BIRTHDAY,...,FEE_CNZCY,FEE_20,FEE_21,FEE_22,FEE_23,FEE_24,FEE_25,FEE_26,FEE_27,FEE_OTHER
0,1980034,298842001,298842,zzsyy,,,未发,曾瑞荣之女,女,2014-02-13,...,,,,,,,27.63,212.91,,76.5
1,1935493,291066001,291066,zzsyy,,,510902195607179313,余江龙,男,1954-12-19,...,,,,,,,352.39,109.78,,
2,1936683,291222001,291222,zzsyy,,,350622193411033021,李银丝,女,1934-11-03,...,,,,,,,360.88,697.14,,
3,1796991,270569006,270569,zzsyy,,,/,林志联,男,1985-07-29,...,,,,,,,8.28,108.21,,
4,1928057,291400001,291400,zzsyy,,,350600196712133526,吴晔,女,1967-12-13,...,,,,,,,71.01,416.97,3115.32,
