# 中文数据预处理

### 导入所需的包

In [1]:
# coding=utf-8
import sys
sys.path.append('../')
import datasets
from preprocessing import build_vocab
from preprocessing import vocab_index_descriptions
from constants import DATA_DIR

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import jieba
from sklearn.model_selection import train_test_split

from collections import Counter, defaultdict
import csv
import math
import operator
import importlib

### 首先将三个文件导入pandas.dataframe并合并、提取所需的列

In [7]:
# Per-year CSV exports; all three are parsed with identical reader settings.
data_path1, data_path2, data_path3 = (
    "../data/2014首页.csv",
    "../data/2015首页.csv",
    "../data/2016首页.csv",
)
df1, df2, df3 = (
    pd.read_csv(path, engine='python', encoding='gb18030')
    for path in (data_path1, data_path2, data_path3)
)
# Stack the yearly frames under a fresh 0..n-1 index, then keep only the
# diagnosis text column and its ICD code column.
df = pd.concat([df1, df2, df3], ignore_index=True)
df_main = df.loc[:, ['MAIN_DIAG', 'MAIN_ICD_CODE']]
print(len(df_main))

242877


### 筛选出频数最高的前threshold个code对应的记录，去除部分code后面的+符号

In [6]:
# Number of most frequent ICD codes to keep.
threshold = 100
value_counts = df_main['MAIN_ICD_CODE'].value_counts()
print(len(value_counts))
# value_counts() is ordered by descending frequency, so every code after the
# first `threshold` positions is dropped.
to_remove = value_counts.iloc[threshold:].index
df_main = df_main.loc[~df_main['MAIN_ICD_CODE'].isin(to_remove), :]
# Drop records where either the diagnosis text or the code is missing.
df_main = df_main.dropna(axis=0, how='any')
# Strip '+' from both ends of every string cell; non-string cells pass through
# unchanged. Column-wise Series.map is used instead of DataFrame.applymap,
# which is deprecated in pandas >= 2.1.
df_main = df_main.apply(
    lambda column: column.map(
        lambda cell: cell.strip('+') if isinstance(cell, str) else cell
    )
)
df_main = df_main.reset_index(drop=True)
value_counts_after = df_main['MAIN_ICD_CODE'].value_counts()
print(len(df_main))
print(value_counts_after.index.values)
df_main.head()

100
129132
['Z51.102' 'Z51.100' 'J18.000' 'B08.401' 'O80.000' 'D25.900' 'O42.900'
 'P23.900' 'O34.201' 'Z36.001' 'K40.901' 'J18.900' 'O04.900' 'I63.900'
 'H25.900' 'P39.900' 'P59.901' 'Z51.901' 'C34.900' 'Z51.1' 'E04.902'
 'C50.900' 'O70.000' 'G45.004' 'N40.x00' 'N60.201' 'I63.905' 'J06.900'
 'J98.414' 'D06.900' 'C20.x00' 'K80.101' 'D27.x00' 'I84.201' 'M81.901'
 'O24.900' 'J38.102' 'G40.901' 'Z47.001' 'C73.x00' 'C53.900' 'I25.901'
 'C22.000' 'O82.000' 'P22.001' 'J44.100' 'K80.000' 'Z51.801' 'J03.901'
 'I20.000' 'D24.x00' 'N80.100' 'C34.101' 'Z35.401' 'QTZD1' 'Z51.002'
 'I25.103' 'O60.100' 'N84.001' 'O03.901' 'P07.300' 'Z29.101' 'J21.900'
 'C15.400' 'A16.200' 'N04.900' 'J44.000' 'J03.900' 'A41.901' 'P21.900'
 'E11.700' 'K56.100' 'E05.003' 'J20.900' 'O00.104' 'Z51.001' 'P07.101'
 'D34.x00' 'R10.400' 'Z29.100' 'J44.101' 'K52.916' 'E11.900' 'J32.901'
 'N87.001' 'N13.202' 'D61.900' 'C11.900' 'Z92.8' 'N87.101' 'I83.900'
 'J35.000' 'Z33.x00' 'C18.700' 'I61.004' 'A86.x00' 'C22.900' 'B99.x01'
 

Unnamed: 0,MAIN_DIAG,MAIN_ICD_CODE
0,新生儿肺炎,P23.900
1,急性右额叶、胼胝体右侧部梗塞,I63.900
2,非霍奇金淋巴瘤Ⅳ期B组（T淋巴母细胞性）并淋巴瘤白血病,Z51.901
3,急性腹泻病,K52.916
4,胎膜早破,O42.900


### 对诊断描述进行分词并统计

In [5]:
# Extend jieba's default dictionary with the project's custom term list.
jieba.load_userdict('../preprocessing/dict.txt')
# Segment each diagnosis and store the tokens as one space-separated string.
# The column is assigned in a single step: the previous per-row
# `df_main.loc[i]['DIAG_SPLIT'] = ...` was chained assignment, which writes to
# a temporary row object and is not guaranteed to update df_main.
df_main['DIAG_SPLIT'] = [
    ' '.join(jieba.cut(diag)) for diag in df_main['MAIN_DIAG']
]
df_main.head()

Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/lg/qxn9gw8j7x75pddgvfpgbfg40000gn/T/jieba.cache
Loading model cost 1.023 seconds.
Prefix dict has been built succesfully.


Unnamed: 0,MAIN_DIAG,MAIN_ICD_CODE,DIAG_SPLIT
0,新生儿肺炎,P23.900,新生儿肺炎
1,急性右额叶、胼胝体右侧部梗塞,I63.900,急性 右 额叶 、 胼胝体 右侧 部 梗塞
2,非霍奇金淋巴瘤Ⅳ期B组（T淋巴母细胞性）并淋巴瘤白血病,Z51.901,非霍奇金淋巴瘤 Ⅳ 期 B 组 （ T 淋巴 母细胞 性 ） 并 淋巴瘤 白血病
3,急性腹泻病,K52.916,急性腹泻 病
4,胎膜早破,O42.900,胎膜早破


In [6]:
# Corpus statistics: `types` collects the distinct tokens, `num_tok` counts
# every token occurrence.
types = set()
num_tok = 0
# The third column holds the space-separated segmentation (DIAG_SPLIT).
for segmented in df_main.iloc[:, 2]:
    tokens = segmented.split()
    types.update(tokens)
    num_tok += len(tokens)

In [7]:
# Report vocabulary size (distinct tokens) and total token count.
for label, value in (("Num types", len(types)), ("Num tokens", num_tok)):
    print(label, value)

Num types 7539
Num tokens 902873


### 划分训练集和测试集、建立词汇表

In [8]:
# Fixed seed so that the 80/20 split, and everything derived from it
# (vocabulary, train/test files), is reproducible across kernel restarts.
SPLIT_SEED = 42
train, test = train_test_split(df_main, test_size=0.2, random_state=SPLIT_SEED)
# Persist both partitions; the later cells read these files back from disk.
train.to_csv('../data/train_raw.csv', index=False, encoding='gb18030')
test.to_csv('../data/test_raw.csv', index=False, encoding='gb18030')

In [9]:
# Inputs for vocabulary construction.
train_file = '../data/train_raw.csv'
vname = '../data/vocab.csv'
# Presumably the minimum occurrence count for a term to be kept; confirm
# against build_vocab.build_vocab.
vocab_min = 3

# Reload first so that edits to the build_vocab module are picked up without
# restarting the kernel.
importlib.reload(build_vocab)
build_vocab.build_vocab(vocab_min, train_file, vname)

reading in data...
removing rare terms
3959 terms qualify out of 7001 total
writing output


### Pre-train word embedding

In [None]:
# Calls word_embeddings.word_embeddings on a `disch_full.csv` file under
# MIMIC_3_DIR and keeps the returned value in `w2v_file`.
# NOTE(review): this cell has no execution count and cannot run as written
# from the visible cells:
#   - `reload` is called bare, whereas the other cells use `importlib.reload`;
#   - `word_embeddings` and `MIMIC_3_DIR` are not imported in the visible
#     import cell (only `DATA_DIR` is imported from `constants`).
# Presumably carried over from another pipeline; confirm whether it is still
# needed.
reload(word_embeddings)
w2v_file = word_embeddings.word_embeddings('full', '%s/disch_full.csv' % MIMIC_3_DIR, 100, 0, 5)

### Write pre-trained word embeddings with new vocab

In [None]:
# Calls extract_wvs.gensim_to_embeddings with a `processed_full.w2v` file and a
# `vocab.csv` file, both located under MIMIC_3_DIR.
# NOTE(review): this cell has no execution count and cannot run as written
# from the visible cells:
#   - `reload` is called bare, whereas the other cells use `importlib.reload`;
#   - `extract_wvs` and `MIMIC_3_DIR` are not imported in the visible import
#     cell (only `DATA_DIR` is imported from `constants`).
# Presumably carried over from another pipeline; confirm whether it is still
# needed.
reload(extract_wvs)
reload(datasets)
extract_wvs.gensim_to_embeddings('%s/processed_full.w2v' % MIMIC_3_DIR, '%s/vocab.csv' % MIMIC_3_DIR)

### 导入ICD Description并对训练集和测试集再做筛选

In [10]:
# Refresh both project modules so that local edits take effect in this kernel.
for module in (vocab_index_descriptions, datasets):
    importlib.reload(module)

# Arguments: the vocabulary file, then the description-vectors file path.
vocab_index_descriptions.vocab_index_descriptions(
    '../data/vocab.csv',
    '../data/description_vectors.vocab',
)

100%|██████████| 23067/23067 [00:01<00:00, 17695.39it/s]


In [226]:
# Collect the first space-separated field of every line after the first one
# in the description-vectors file.
with open("../data/description_vectors.vocab", 'r', encoding='gb18030') as vfile:
    reader = csv.reader(vfile, delimiter=" ")
    next(reader)  # skip the first line
    desc_code_list = [fields[0] for fields in reader]

In [11]:
# For each split: add a token-count column, sort by it in ascending order and
# write the result next to the raw file.
# Filtering to the codes in desc_code_list is currently disabled.
for splt in ('train', 'test'):
    filename = '../data/{}_raw.csv'.format(splt)
    df_temp = pd.read_csv(filename, encoding='gb18030')
    # str() keeps the count defined for cells that are not strings.
    token_counts = df_temp['DIAG_SPLIT'].map(lambda text: len(str(text).split()))
    df_temp = df_temp.assign(length=token_counts).sort_values(['length'])
    df_temp.to_csv('../data/{}.csv'.format(splt), index=False, encoding='gb18030')

In [12]:
# Reload the processed splits from disk.
df_train = pd.read_csv("../data/train.csv", engine='python', encoding='gb18030')
df_test = pd.read_csv("../data/test.csv", engine='python', encoding='gb18030')
# Distinct training-set codes, ordered by descending frequency.
code_list = df_train['MAIN_ICD_CODE'].value_counts().index.values
# newline='' lets the csv module control line endings itself; without it,
# every row is followed by a blank line on Windows.
with open('../data/code_list.csv', 'w', encoding='gb18030', newline='') as of:
    csv.writer(of).writerows([code] for code in code_list)

# 测试区域