In [1]:
import os
import numpy as np
import pandas as pd 
import re
from collections import Counter

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# !pip install scispacy
# !pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.3.0/en_core_sci_lg-0.3.0.tar.gz
import scispacy
import spacy
import en_core_sci_lg


[nltk_data] Downloading package stopwords to C:\Users\Yash
[nltk_data]     Shinge\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
input_path = "HFI_redo/input/"
filename = "assignment_data.csv"
ex_file = "example_output.csv"

In [3]:
example_data = pd.read_csv(os.path.join(input_path, ex_file))

In [4]:
data = pd.read_csv(os.path.join(input_path, filename))

In [5]:
train = example_data.copy()
test = data.copy()

In [6]:
print(f'Shape of example file: {train.shape}')
print(f'Shape of main file: {test.shape}')

Shape of example file: (10, 3)
Shape of main file: (990, 3)


In [7]:
#error_check in main file
index = test[(test['Adverse Reactions'] == 'ERROR1') | (test['Adverse Reactions'] == '') |
                  (test['Adverse Reactions'] == 'Adverse Reactions')].index 

In [8]:
index

Int64Index([953], dtype='int64')

In [9]:
test.drop(index, inplace=True)
test.reset_index(drop=True, inplace=True)

In [10]:
print(f'Shape of main file: {test.shape}')

Shape of main file: (989, 3)


### Text Cleaning

In [11]:
stop_words = set(stopwords.words('english')) 

In [12]:
def text_cleaner(text):
    new_text = text.lower()
    new_text = re.sub(r'\([^)]*\)', '', new_text)
    new_text = re.sub('"','', new_text)
    new_text = re.sub(r"'s\b","",new_text)
    new_text = re.sub("[^a-zA-Z]", " ", new_text) 
    new_text = re.sub(r'[_"\-;%()|,+&=*%.!?:#$@\[\]/]', '', new_text)
   
    # remove stopwords:
    tokens = word_tokenize(new_text)
    tokens = [word for word in tokens if word not in stop_words]
    
    # remove short words
    long_words=[]
    for i in tokens:
        if len(i)>2:                                                
            long_words.append(i)   
    return (" ".join(long_words)).strip()

In [13]:
cleaned_ex_text = []
for t in train['Adverse Reactions']:
    cleaned_ex_text.append(text_cleaner(t))
cleaned_ex_text

['common adverse reactions including laboratory abnormalities leukopenia lymphopenia fatigue anemia neutropenia increased creatinine increased alanine aminotransferase increased glucose thrombocytopenia nausea decreased appetite musculoskeletal pain decreased albumin constipation dyspnea decreased sodium increased aspartate aminotransferase vomiting cough decreased magnesium diarrhea',
 'adverse reactions adverse reactions isosorbide dinitrate generally dose related almost reactions result isosorbide dinitrate activity vasodilator headache may severe commonly reported side effect headache may recurrent daily dose especially higher doses transient episodes lightheadedness occasionally related blood pressure changes may also occur hypotension occurs infrequently patients may severe enough warrant discontinuation therapy syncope crescendo angina rebound hypertension reported uncommon extremely rarely ordinary doses organic nitrates caused methemoglobinemia normal seeming patients methemog

In [14]:
cleaned_text = []
for t in test['Adverse Reactions']:
    cleaned_text.append(text_cleaner(t))

In [15]:
cleaned_text[0]

'following adverse events observed reported patients using propranolol cardiovascular bradycardia congestive heart failure intensification block hypotension paresthesia hands thrombocytopenic purpura arterial insufficiency usually raynaud type central nervous system light headedness mental depression manifested insomnia lassitude weakness fatigue catatonia visual disturbances hallucinations vivid dreams acute reversible syndrome characterized disorientation time place short term memory loss emotional lability slightly clouded sensorium decreased performance neuropsychometrics immediate release formulations fatigue lethargy vivid dreams appear dose related gastrointestinal nausea vomiting epigastric distress abdominal cramping diarrhea constipation mesenteric arterial thrombosis ischemic colitis allergic hypersensitivity reactions including anaphylactic anaphylactoid reactions pharyngitis agranulocytosis erythematous rash fever combined aching sore throat laryngospasm respiratory distre

In [16]:
train.loc[:, 'cleaned_adverse_reactions'] = cleaned_ex_text

In [17]:
test.loc[:, 'cleaned_adverse_reactions'] = cleaned_text

In [18]:
train.head()

Unnamed: 0,SetID,Adverse Reactions,Summary,cleaned_adverse_reactions
0,632bb50c-3bcb-4c85-9056-fc33410550ae,The most common adverse reactions including la...,"Leukopenia lymphopenia, fatigue, anemia, neutr...",common adverse reactions including laboratory ...
1,723d9f78-9d77-4575-af27-1aa117e6b8d7,ADVERSE REACTIONS Adverse reactions to isosorb...,"Headache, lightheadedness in response to blood...",adverse reactions adverse reactions isosorbide...
2,8589d376-ac10-4ddb-9c53-2e0c8d5675c4,The most common adverse reactions (incidence 5...,"Instillation-site irritation, dysegeusia, decr...",common adverse reactions following use xiidra ...
3,9087c92f-c753-4bd4-82e4-5aeee31e0ec3,Most common adverse reactions (>>10%): constip...,"constipation, nausea, and sedation.",common adverse reactions constipation nausea s...
4,a500b8db-fed5-7a0e-e053-2995a90ab877,Most common adverse reaction to amlodipine is ...,"Edema. Fatigue, nausea, abdominal pain, and s...",common adverse reaction amlodipine edema occur...


In [19]:
test.head()

Unnamed: 0,SetID,Adverse Reactions,Summary,cleaned_adverse_reactions
0,a834d1cf-72fc-93bf-e053-2995a90a6191,The following adverse events were observed and...,,following adverse events observed reported pat...
1,a835b697-2beb-1ba8-e053-2995a90a470c,The following serious adverse reactions are de...,,following serious adverse reactions described ...
2,a837f13e-fafc-0535-e053-2995a90a5070,ADVERSE REACTIONS Clinical Trials Experience I...,,adverse reactions clinical trials experience c...
3,a838204b-9564-9aa6-e053-2a95a90af02f,ADVERSE REACTIONS Clinical Trials Experience I...,,adverse reactions clinical trials experience c...
4,f265e6dd-f47e-4511-9468-282184bcd1b1,The most common adverse reactions leading to d...,,common adverse reactions leading discontinuati...


----
### Loading the model

In [20]:
nlp = en_core_sci_lg.load()

In [21]:
summ = []
for i in range(len(train)):
    docs_ex = nlp(train['cleaned_adverse_reactions'][i])
    sent = []
    for t in docs_ex.ents:
        sent.append(str(t))
    summ.append(', '.join(sent))

In [22]:
train.loc[:, 'Summary_0'] = summ

In [23]:
train.head()

Unnamed: 0,SetID,Adverse Reactions,Summary,cleaned_adverse_reactions,Summary_0
0,632bb50c-3bcb-4c85-9056-fc33410550ae,The most common adverse reactions including la...,"Leukopenia lymphopenia, fatigue, anemia, neutr...",common adverse reactions including laboratory ...,"adverse reactions, laboratory abnormalities, l..."
1,723d9f78-9d77-4575-af27-1aa117e6b8d7,ADVERSE REACTIONS Adverse reactions to isosorb...,"Headache, lightheadedness in response to blood...",adverse reactions adverse reactions isosorbide...,"adverse reactions, adverse reactions, isosorbi..."
2,8589d376-ac10-4ddb-9c53-2e0c8d5675c4,The most common adverse reactions (incidence 5...,"Instillation-site irritation, dysegeusia, decr...",common adverse reactions following use xiidra ...,"adverse reactions, xiidra, instillation, site,..."
3,9087c92f-c753-4bd4-82e4-5aeee31e0ec3,Most common adverse reactions (>>10%): constip...,"constipation, nausea, and sedation.",common adverse reactions constipation nausea s...,"adverse reactions, nausea sedation"
4,a500b8db-fed5-7a0e-e053-2995a90ab877,Most common adverse reaction to amlodipine is ...,"Edema. Fatigue, nausea, abdominal pain, and s...",common adverse reaction amlodipine edema occur...,"adverse reaction, amlodipine, dose, adverse ex..."


In [24]:
test.head()

Unnamed: 0,SetID,Adverse Reactions,Summary,cleaned_adverse_reactions
0,a834d1cf-72fc-93bf-e053-2995a90a6191,The following adverse events were observed and...,,following adverse events observed reported pat...
1,a835b697-2beb-1ba8-e053-2995a90a470c,The following serious adverse reactions are de...,,following serious adverse reactions described ...
2,a837f13e-fafc-0535-e053-2995a90a5070,ADVERSE REACTIONS Clinical Trials Experience I...,,adverse reactions clinical trials experience c...
3,a838204b-9564-9aa6-e053-2a95a90af02f,ADVERSE REACTIONS Clinical Trials Experience I...,,adverse reactions clinical trials experience c...
4,f265e6dd-f47e-4511-9468-282184bcd1b1,The most common adverse reactions leading to d...,,common adverse reactions leading discontinuati...


In [25]:
summ_test = []
for i in range(len(test)):
    docs = nlp(test['cleaned_adverse_reactions'][i])
    sent = []
    for t in docs.ents:
        sent.append(str(t))
    summ_test.append(', '.join(sent))

In [26]:
summ_test[0]

'adverse events, patients, propranolol, cardiovascular bradycardia congestive heart failure, intensification, thrombocytopenic purpura arterial insufficiency, raynaud, type central nervous system, insomnia lassitude weakness, fatigue catatonia, visual disturbances, hallucinations, dreams, acute reversible syndrome, disorientation time, loss, emotional lability, sensorium, performance, neuropsychometrics, formulations, fatigue lethargy, dreams, dose, gastrointestinal nausea vomiting, epigastric distress, abdominal cramping diarrhea constipation, mesenteric arterial thrombosis, ischemic colitis, allergic hypersensitivity reactions, anaphylactic anaphylactoid reactions, pharyngitis agranulocytosis, erythematous rash, fever, aching sore throat, laryngospasm, respiratory distress, respiratory bronchospasm, hematologic agranulocytosis, nonthrombocytopenic purpura thrombocytopenic purpura autoimmune systemic lupus erythematosus, skin mucous membranes, johnson syndrome, toxic, epidermal necrol

----
### Text cleaning 2

In [27]:
single_string = ''.join(summ_test)

In [28]:
vocab1 = Counter(single_string.split(', '))
vocab2 = Counter(single_string.split())

In [29]:
vocab1 = {k:v for k,v in sorted(vocab1.items(), key=lambda x:-x[1])}
high_freq1 = []
for k,v in vocab1.items():
    if v > 100:
        print(k,v)
        high_freq1.append(k)

patients 2176
adverse reactions 981
increased 614
treated 556
adverse events 514
incidence 454
placebo 432
treatment 412
clinical trials 409
abnormal 299
decreased 263
reactions 236
dose 203
events 200
symptoms 197
tablets 185
relationship 181
dry mouth 174
frequency 169
vision 166
cases 164
injection 162
olanzapine 151
infrequent 150
doses 149
hypotension 146
drug 143
administration 143
therapy 139
patient 134
rare 134
discontinuation 133
increase 133
pediatric 130
day 130
transient 130
reports 116
pioglitazone 113
post 111
nervous system 110
metabolic 107
dosage 103


In [30]:
custom_high_freq1 = ['patients',
 'adverse reactions',
 'increased',
 'treated',
 'adverse events',
 'incidence',
 'treatment',
 'clinical trials',
 'abnormal',
 'decreased',
 'reactions',
 'dose',
 'events',
 'symptoms',
 'tablets',
 'relationship',
 'frequency',
 'vision',
 'cases',
 'injection',
 'olanzapine',
 'infrequent',
 'doses',
 'drug',
 'administration',
 'therapy',
 'patient',
 'rare',
 'discontinuation',
 'increase',
 'pediatric',
 'day',
 'transient',
 'reports',
 'pioglitazone',
 'post',
 'dosage']

In [31]:
vocab2 = {k:v for k,v in sorted(vocab2.items(), key=lambda x:-x[1])}
high_freq2 = []
for k,v in vocab2.items():
    if v > 100:
        print(k,v)
        high_freq2.append(k)

reactions, 2555
patients, 2180
adverse 1899
clinical 767
events, 764
trials, 617
increased, 614
nausea 577
system, 560
treated, 556
skin 496
incidence, 455
treatment, 452
placebo, 445
disorders, 397
edema, 392
therapy, 385
disorder, 381
pain, 365
gastrointestinal 363
rash, 355
syndrome, 351
respiratory 349
dizziness 343
drug 341
abdominal 338
pain 331
diarrhea, 315
abnormal, 303
nervous 294
headache 291
symptoms, 289
dose, 287
tablets, 284
dry 283
reaction, 279
vomiting, 270
decreased, 264
studies, 247
failure, 238
urinary 238
depression, 233
renal 225
effects, 223
cardiovascular 222
liver 218
constipation, 217
infection, 216
hypotension, 212
allergic 208
mouth, 200
doses, 198
administration, 197
relationship, 194
drug, 192
vomiting 191
frequency, 188
hydrochloride, 185
headache, 183
controlled 180
musculoskeletal 178
thrombocytopenia, 178
peripheral 177
body 173
injection, 173
blood 171
vision, 171
heart 165
cases, 165
hypertension, 162
fatigue 155
discontinuation, 152
olanzapine, 152

In [32]:
custom_high_freq2 = ['adverse', 'reactions,', 'patients,','clinical', 'events,', 'trials,', 'increased,', 'treated,',
 'incidence,', 'treatment,', 'therapy,', 'disorder,', 'dose,', 'symptoms,', 'tablets,', 'reaction,', 'decreased,', 'studies,', 'failure,',
 'depression,', 'effects,', 'administration,', 'doses,', 'relationship,', 'drug,', 'frequency,', 'hydrochloride,', 'controlled',
 'injection,', 'cases,', 'olanzapine,', 'discontinuation,', 'serum', 'infrequent,', 'central', 'reports,', 'weight', 'loss,',
 'patient,', 'rare,', 'increase,', 'pediatric,', 'day,', 'transient,', 'discontinued,', 'propranolol,', 'dosage,', 'postmarketing,',
 'function,', 'pioglitazone,',  'confusion,', 'side', 'adults,', 'male', 'event,', 'experiences,', 'study,', 'johnson', 'somnolence', 'group,',
 'appetite', 'elevated,', 'serious,', 'warnings', 'precautions', 'section,', 'labeling,', 'data']

In [33]:
custom_stop_words = set(custom_high_freq1 + custom_high_freq2)

In [34]:
def post_cleaner(text):
    new_text = text.lower()
    tokens = [word for word in new_text.split() if word not in custom_stop_words]

    return (" ".join(tokens)).strip()

In [35]:
clean_summ = []
for i in summ_test:
    clean_summ.append(post_cleaner(i))

In [36]:
len(clean_summ)

989

----
#### Saving the output

In [37]:
test['Summary'] = clean_summ

In [38]:
test.drop('cleaned_adverse_reactions', axis=1, inplace=True)

In [39]:
test.to_csv('HFI_redo/output/submission_11_10.csv', index=False)