In [1]:
import pandas as pd
import chardet
import csv
import tempfile

from flair.data_fetcher import NLPTaskDataFetcher
from flair.embeddings import WordEmbeddings, FlairEmbeddings, DocumentLSTMEmbeddings
from flair.models import TextClassifier
from flair.trainers import ModelTrainer
from pathlib import Path

from tempfile import TemporaryDirectory

In [2]:
# Input files: the cleaned report texts and the manually checked class labels.
cleanDatafile = '../Data/ct_report_dmg_thoracic_only_CLEAN.csv'
classesfile = '../Data/Output/ct_report_dmg_thoracic_only_LDAtrain_CHECKED_SAFE_COPY.csv'

# Read the raw bytes via pathlib so the file handle is closed right after reading
# (a bare open(...).read() leaves closing to the garbage collector).
rawfile = Path(cleanDatafile).read_bytes()

# Guess the text encoding from the first 10000 bytes only.
encodeInfo = chardet.detect(rawfile[:10000])

# The CSV is read without a header row; the two columns are named explicitly.
df_cases = pd.read_csv(cleanDatafile, encoding=encodeInfo['encoding'], names=['id', 'normalized'])
# Replace missing values with empty strings.
df_cases = df_cases.fillna('')

# Class annotations, also read without a header row.
df_classes = pd.read_csv(classesfile, sep=',', names=['ID', 'Cancer', 'Prog'])

In [3]:
# Outer-join the report texts with the class annotations on the report id.
df_unified = df_cases.merge(df_classes, how='outer', left_on='id', right_on='ID')

# Keep only rows that carry a `Cancer` annotation, prefix both class columns with
# '__label__', and drop the unprefixed originals.
df_labeled = (
    df_unified.loc[df_unified['Cancer'].notna()]
    .assign(
        label_c=lambda frame: '__label__' + frame['Cancer'],
        label_p=lambda frame: '__label__' + frame['Prog'],
    )
    .drop(columns=['Prog', 'Cancer'])
)

In [12]:
# Show every row whose progression label is '__label__P'.
# NOTE(review): the saved output below includes a `count` column, which is only added
# by the duplicate-id filter cell further down, so this cell was run after that one.
df_labeled.loc[df_labeled['label_p'] == '__label__P']

Unnamed: 0,id,normalized,ID,label_c,label_p,count
852,8237,cect brain date DATE COLON multipl well defin ...,8237.0,__label__NC,__label__P,1
3013,5363,ct SCREEN thorax date COLON DATE CONTRAST_ENHA...,5363.0,__label__NC,__label__P,1
3344,412,ct SCREEN thorax & abdomen date DATE COLON com...,412.0,__label__C,__label__P,1
3733,4944,ct SCREEN thorax date DATE CONTRAST_ENHANCED_C...,4944.0,__label__C,__label__P,1
3809,6369,ct SCREEN thorax abdomen date DATE COLON compa...,6369.0,__label__NC,__label__P,1
3874,9260,ct SCREEN brain date DATE plain & CONTRAST_ENH...,9260.0,__label__C,__label__P,1
4193,5426,cect SCREEN thorax COMMA abdomen date DATE con...,5426.0,__label__NC,__label__P,1
4276,8298,ct SCREEN thorax date DATE LEFT_PAREN report d...,8298.0,__label__NC,__label__P,1
4822,20460,c PERIOD PERIOD SCREEN thorax abdomen date DAT...,20460.0,__label__NC,__label__P,1
4894,21697,ct SCREEN thorax & abdomen date DATE COLON CON...,21697.0,__label__C,__label__P,1


In [15]:
# Text of the '__label__P' report stored under index label 14165.
# The label is hard-coded and must exist among the selected rows.
df_labeled.loc[df_labeled['label_p'] == '__label__P', 'normalized'][14165]

'ct SCREEN thorax date DATE CONTRAST_ENHANCED_CT_SCAN thorax perform mdct scanner PERIOD case CARCINOMA LUNG chemotherapi PERIOD compar previous ct date DATE PERIOD multipl new parenchym LUNG nodul seen compar previous SCREEN PERIOD post lobectomi chang seen RIGHT_LUNG field PERIOD rest imag find remain unchang PERIOD multipl new parenchym LUNG nodul seen compar previous SCREEN PERIOD overal COMMA progress neoplast diseas process compar previous SCREEN PERIOD rest imag find remain unchang PERIOD'

In [4]:
# It is unclear whether, for cases with several reports, the useful information is in
# the first report or the last one, so every id that occurs more than once is ignored.

# Remove a `count` column left over from a previous run of this cell. Without this,
# the merge below would produce `count_x`/`count_y` and the final filter would raise
# a KeyError on 'count'. On a first run this line is a no-op.
df_labeled = df_labeled.drop(columns=['count'], errors='ignore')

# Number of rows per id, attached to every row as `count`.
df_counts = df_labeled.groupby('id')['id'].count().reset_index(name="count")
df_labeled = pd.merge(df_labeled, df_counts, how='outer', left_on='id', right_on='id')

# Keep only ids that occur exactly once.
df_labeled = df_labeled[df_labeled['count']==1].copy()


In [5]:
def trainFlairClassifier(df_labeled, columns, trainNameCsv, testNameCsv, devNameCsv, classifierFileName):
    """Train a single-label flair text classifier and save it to ``classifierFileName``.

    The distinct values of the ``id`` column are split, in order of first
    appearance (no shuffling), into train (first 70 %), test (next 20 %) and
    dev (remaining 10 %). Each split is written as a headerless, tab-separated
    file inside a temporary directory, loaded as a flair classification corpus
    and used to train the classifier for 50 epochs. The temporary directory,
    including everything the trainer writes into it, is removed afterwards;
    only the file written by ``classifier.save`` remains.

    Args:
        df_labeled: DataFrame with an ``id`` column and the columns in ``columns``.
        columns: Columns to export, in this order (the notebook passes the
            label column first, then the text column).
        trainNameCsv: File name of the train split. Leading path separators are
            ignored; the file is always created inside the temporary directory.
        testNameCsv: File name of the test split (same rules).
        devNameCsv: File name of the dev split (same rules).
        classifierFileName: Destination passed to ``classifier.save``.

    Raises:
        ValueError: If ``df_labeled`` contains no ids.
    """
    # De-duplicate the ids so each id belongs to exactly one split. With repeated
    # ids the `isin` filters below could place the same rows in several splits.
    ids = df_labeled['id'].drop_duplicates().tolist()
    if not ids:
        raise ValueError('df_labeled contains no rows to train on')

    nSamples = len(ids)
    trainEnd = int(nSamples * 0.7)  # end of the train slice (70 %)
    testEnd = int(nSamples * 0.9)   # end of the test slice (next 20 %)

    splits = (
        (trainNameCsv, ids[:trainEnd]),
        (testNameCsv, ids[trainEnd:testEnd]),
        (devNameCsv, ids[testEnd:]),
    )

    with TemporaryDirectory() as temp_dir:
        splitPaths = []
        for fileName, splitIds in splits:
            # Join with pathlib and strip leading separators so the file always ends up
            # inside temp_dir, whether or not the caller passed a leading '/'.
            splitPath = str(Path(temp_dir) / fileName.lstrip('/\\'))
            df_labeled[df_labeled['id'].isin(splitIds)].to_csv(
                splitPath, columns=columns, sep='\t', index=False, header=False
            )
            splitPaths.append(splitPath)
        trainCsv, testCsv, devCsv = splitPaths

        corpus = NLPTaskDataFetcher.load_classification_corpus(temp_dir, train_file=trainCsv, test_file=testCsv, dev_file=devCsv)

        # GloVe word vectors stacked with the fast forward/backward flair language models.
        word_embeddings = [WordEmbeddings('glove'), FlairEmbeddings('news-forward-fast'), FlairEmbeddings('news-backward-fast')]
        document_embeddings = DocumentLSTMEmbeddings(word_embeddings, hidden_size=512, reproject_words=True, reproject_words_dimension=256)
        classifier = TextClassifier(document_embeddings, label_dictionary=corpus.make_label_dictionary(), multi_label=False)
        trainer = ModelTrainer(classifier, corpus)

        # Training artefacts go to temp_dir and are discarded with it.
        trainer.train(temp_dir, max_epochs=50)

        # Persist the classifier outside the temporary directory.
        classifier.save(classifierFileName)

In [6]:
# Train on the `label_p` labels: label column first, then the report text.
columns = ['label_p', 'normalized']

# Names of the split files that trainFlairClassifier creates in its temporary directory.
trainNameCsv, testNameCsv, devNameCsv = '/train_p.csv', '/test_p.csv', '/dev_p.csv'

# Where the trained classifier is saved.
classifierFileName = './classifier_p'

trainFlairClassifier(
    df_labeled=df_labeled,
    columns=columns,
    trainNameCsv=trainNameCsv,
    testNameCsv=testNameCsv,
    devNameCsv=devNameCsv,
    classifierFileName=classifierFileName,
)

2019-03-27 10:18:54,212 Reading data from /tmp/tmpwq54ykej
2019-03-27 10:18:54,216 Train: /tmp/tmpwq54ykej/train_p.csv
2019-03-27 10:18:54,217 Dev: /tmp/tmpwq54ykej/dev_p.csv
2019-03-27 10:18:54,219 Test: /tmp/tmpwq54ykej/test_p.csv
2019-03-27 10:18:56,371 ----------------------------------------------------------------------------------------------------
2019-03-27 10:18:56,374 Evaluation method: MICRO_F1_SCORE
2019-03-27 10:18:56,376 ----------------------------------------------------------------------------------------------------
2019-03-27 10:19:04,439 epoch 1 - iter 0/4 - loss 0.02514661
2019-03-27 10:19:27,741 epoch 1 - iter 1/4 - loss 0.01948788
2019-03-27 10:19:47,197 epoch 1 - iter 2/4 - loss 0.01903839
2019-03-27 10:20:00,143 epoch 1 - iter 3/4 - loss 0.02098880
2019-03-27 10:20:00,155 ----------------------------------------------------------------------------------------------------
2019-03-27 10:20:00,158 EPOCH 1 done: loss 0.0210 - lr 0.1000 - bad epochs 0
2019-03-27 10

In [18]:
# Train on the `label_c` labels: label column first, then the report text.
columns = ['label_c', 'normalized']

# Names of the split files that trainFlairClassifier creates in its temporary directory.
trainNameCsv, testNameCsv, devNameCsv = '/train_c.csv', '/test_c.csv', '/dev_c.csv'

# Where the trained classifier is saved.
classifierFileName = './classifier_c'

trainFlairClassifier(
    df_labeled=df_labeled,
    columns=columns,
    trainNameCsv=trainNameCsv,
    testNameCsv=testNameCsv,
    devNameCsv=devNameCsv,
    classifierFileName=classifierFileName,
)

2019-03-27 16:49:40,236 Reading data from /tmp/tmp24g7r2nc
2019-03-27 16:49:40,237 Train: /tmp/tmp24g7r2nc/train_c.csv
2019-03-27 16:49:40,237 Dev: /tmp/tmp24g7r2nc/dev_c.csv
2019-03-27 16:49:40,238 Test: /tmp/tmp24g7r2nc/test_c.csv
2019-03-27 16:49:42,605 ----------------------------------------------------------------------------------------------------
2019-03-27 16:49:42,611 Evaluation method: MICRO_F1_SCORE
2019-03-27 16:49:42,613 ----------------------------------------------------------------------------------------------------
2019-03-27 16:49:51,658 epoch 1 - iter 0/4 - loss 0.02113067
2019-03-27 16:50:01,964 epoch 1 - iter 1/4 - loss 0.02046596
2019-03-27 16:50:13,360 epoch 1 - iter 2/4 - loss 0.01940820
2019-03-27 16:50:17,114 epoch 1 - iter 3/4 - loss 0.02068288
2019-03-27 16:50:17,134 ----------------------------------------------------------------------------------------------------
2019-03-27 16:50:17,148 EPOCH 1 done: loss 0.0207 - lr 0.1000 - bad epochs 0
2019-03-27 16