## Data Read-in

Reading in the data and looking through the first few rows and ACR groups

In [1]:
#importing the needed packages
import pandas as pd
import numpy as np

#loading the two source tables; both files are decoded with the same encoding
CSV_ENCODING = 'latin-1'

test = pd.read_csv('clinical grouping.csv', encoding = CSV_ENCODING) #clinical library
acr = pd.read_csv('acr.csv', encoding = CSV_ENCODING) #dictionary of clinical groups

In [2]:
#previewing the first rows of the clinical library (codes, diagnoses and any ACR group already assigned)
test.head()

Unnamed: 0,CODE,DIAGNOSIS,ACR group,ICD 10 Chapter Number,ICD 10 chapter description,ICD 10 block,NOTES
0,A00.0,"CHOLERA DUE TO VIBRIO CHOLERAE 01, BIOVAR CHOL...",ACUTE GASTROENTERITIS,1,Certain infectious and parasitic diseases,Intestinal Infectious disease,1
1,A00.1,"CHOLERA DUE TO VIBRIO CHOLERAE 01, BIOVAR ELTO...",ACUTE GASTROENTERITIS,1,Certain infectious and parasitic diseases,Intestinal Infectious disease,1
2,A00.9,"CHOLERA, UNSPECIFIED",ACUTE GASTROENTERITIS,1,Certain infectious and parasitic diseases,Intestinal Infectious disease,1
3,A00+G53.1*,MULTIPLE CRANIAL NERVE PALSIES IN CHOLERA,,1,Certain infectious and parasitic diseases,Intestinal Infectious disease,1
4,A00+G94.0*,HYDROCEPHALUS IN CHOLERA,,1,Certain infectious and parasitic diseases,Intestinal Infectious disease,1


In [3]:
#previewing the first rows of the ACR dictionary (ICD 10 code, description and its group)
acr.head()

Unnamed: 0,ICD 10 CODE,DESCRIPTION,GROUP,Unnamed: 3
0,P91.3,Neonatal cerebral irritability,ABNORMAL SENSORIUM IN THE NEWBORN,
1,P91.4,Neonatal cerebral depression,ABNORMAL SENSORIUM IN THE NEWBORN,
2,P91.6,Hypoxic ischaemic encephalopathy of newborn,ABNORMAL SENSORIUM IN THE NEWBORN,
3,P91.8,Other specified disturbances of cerebral statu...,ABNORMAL SENSORIUM IN THE NEWBORN,
4,P91.9,"Disturbance of cerebral status of newborn, uns...",ABNORMAL SENSORIUM IN THE NEWBORN,


In [4]:
#distinct ACR group labels, in order of first appearance in the dictionary
groups = acr['GROUP'].unique()

#NOTE(review): this value is not echoed, only a cell's last expression is displayed
len(groups)

#NOTE(review): the slice starts at position 1, so the very first group is not shown
groups[1:20]

array(['ABSCESS OF RESPIRATORY TRACT', 'ACUTE GASTROENTERITIS',
       'ACUTE RENAL FAILURE', 'ADULT RESPIRATORY DISTRESS SYNDROME',
       'ALLERGIC REACTIONS', 'AMOEBIASIS, HEPATIC',
       'AMOEBIASIS, NONHEPATIC', 'AMYLOIDOSIS', 'ANAPHYLACTIC SHOCK',
       'ANEMIA', 'ARTHRITIS INFECTIOUS', 'ARTHRITIS NONINFECTIOUS',
       'ARTHROPOD BORNE VIRAL FEVER', 'ARTHROSIS',
       'ASTHMA IN ACUTE EXACERBATION', 'ATELECTASIS',
       'BACK PAIN, RADICULOPATHY, SCIATICA', 'BLADDER DYSFUNCTION',
       'BOTULISM'], dtype=object)

In [5]:
#total number of diagnoses in the clinical library
print(test.shape[0])

#shape of the subset whose 'ACR group' is already filled in
test.dropna(subset = ['ACR group']).shape

23027


(4694, 7)

### Data Preparation

1. Come up with a new dataframe containing all diagnoses listed per ACR group. This will be used as the dataset upon which a classification model will be trained.
2. The testing set will be all entries in the clinical groupings dataset that have not yet been matched to an ACR group.

In [6]:
import re
def split_phrases(text):
    """Split a ';'-separated description into stripped, lower-cased phrases.

    Non-string input (e.g. NaN read from a blank CSV cell) and empty fragments
    (left by a doubled or trailing ';') produce no phrase, so they can neither
    crash the loop nor end up as blank training samples.
    """
    if not isinstance(text, str):
        return []
    return [part.strip().lower() for part in text.split(';') if part.strip()]


diagnoses = []
classes = []

#collect all diagnoses with stated ACR groups into a single dataframe
#this dataframe will be used for training the text classification model
for group in groups:
    #dictionary descriptions first, then library rows already labelled with this group
    sources = (
        acr.loc[acr['GROUP'] == group, 'DESCRIPTION'],
        test.loc[test['ACR group'] == group, 'DIAGNOSIS'],
    )

    for descs in sources:
        for desc in descs:
            phrases = split_phrases(desc)
            diagnoses.extend(phrases)
            #one label per phrase keeps both lists the same length
            classes.extend([group] * len(phrases))

#kept as Series, as before, in case they are inspected further down
diagnoses = pd.Series(diagnoses, dtype = object)
classes = pd.Series(classes, dtype = object)

#build the training frame in one step instead of filling an empty frame column by column
train = pd.DataFrame({'Diagnoses': diagnoses, 'Groups': classes},
                     columns = ['Diagnoses', 'Groups'])

train.head()

Unnamed: 0,Diagnoses,Groups
0,neonatal cerebral irritability,ABNORMAL SENSORIUM IN THE NEWBORN
1,neonatal cerebral depression,ABNORMAL SENSORIUM IN THE NEWBORN
2,hypoxic ischaemic encephalopathy of newborn,ABNORMAL SENSORIUM IN THE NEWBORN
3,other specified disturbances of cerebral statu...,ABNORMAL SENSORIUM IN THE NEWBORN
4,"disturbance of cerebral status of newborn, uns...",ABNORMAL SENSORIUM IN THE NEWBORN


In [7]:
#stripping stopwords from train and test sets
import nltk
from nltk.corpus import stopwords
#the stop-word corpus is a separate download; on an environment where it is
#missing the lookup raises LookupError, so fetch it once and retry
try:
    stops = set(stopwords.words('english'))
except LookupError:
    nltk.download('stopwords')
    stops = set(stopwords.words('english'))

def clean_words(diag, stop_words=None):
    """Remove stop words from a diagnosis string.

    Parameters
    ----------
    diag : str
        Diagnosis text. Any non-string value (e.g. NaN from a blank cell)
        is treated as empty text.
    stop_words : collection of str, optional
        Lower-case words to drop. When omitted, the notebook-level ``stops``
        set is used.

    Returns
    -------
    str
        The remaining words joined by single spaces, with their original
        casing preserved.
    """
    if not isinstance(diag, str):
        return ""
    if stop_words is None:
        stop_words = stops
    #match on the lower-cased word: the library diagnoses are upper-case, so a
    #case-sensitive test would strip stop words from the training text only
    diag_new = [w for w in diag.split() if w.lower() not in stop_words]
    return " ".join(diag_new)

#run the same stop-word filter over the text column of each frame
for frame, text_column in ((train, 'Diagnoses'), (test, 'DIAGNOSIS')):
    frame[text_column] = frame[text_column].apply(clean_words)

In [8]:
#stripping classified entries from the test set
#.copy() makes the result an independent frame, so the 'preds' column added
#further down is written to this frame rather than to a slice of the original
test = test[pd.isnull(test['ACR group'])].copy()

### Preparing the text classification model

In [None]:
#bag of words representation for the train and test sets
from sklearn.feature_extraction.text import CountVectorizer

#word-level counts; no custom tokenizer, preprocessor or stop-word list is supplied
vectorizer_options = dict(analyzer = "word",
                          tokenizer = None,
                          preprocessor = None,
                          stop_words = None)
vectorizer = CountVectorizer(**vectorizer_options)

#the vocabulary is fitted on the training text only and then reused for the test text
train_features = vectorizer.fit_transform(train['Diagnoses'])
test_features = vectorizer.transform(test['DIAGNOSIS'])

#array copies of both feature matrices
train_features_a, test_features_a = (m.toarray() for m in (train_features, test_features))

In [None]:
#preparing the random forest classifier
from sklearn.ensemble import RandomForestClassifier

#a fixed seed makes the fitted forest, and therefore the ranked predictions,
#reproducible across kernel restarts
forest = RandomForestClassifier(n_estimators = 500, n_jobs = 1, max_depth = 1000,
                                random_state = 42)
forest = forest.fit(train_features_a, train['Groups'])

### Identifying the five most probable ACR groups for each diagnostic category

In [None]:
#per-group probabilities for every row of the test set; each row is ranked and
#mapped back to group names through forest.classes_ in the following cell
#NOTE(review): this passes test_features (the vectorizer output), not the
#test_features_a array copy, while the model was fitted on train_features_a
predictions = forest.predict_proba(test_features)

In [None]:
def top_groups(probabilities, labels, k=5):
    """Return, for each row of ``probabilities``, the ``k`` highest-scoring labels, best first.

    ``labels`` must be an array indexable by an integer index array, with one
    entry per probability column. Rows with fewer than ``k`` columns return
    all of their labels.
    """
    return [list(labels[row.argsort()[-k:][::-1]]) for row in probabilities]


#build the whole column in one assignment; writing cell by cell through
#test['preds'].iloc[...] is chained assignment and may never reach the frame.
#index = test.index is required because the filtered test set keeps its original row labels
test['preds'] = pd.Series(top_groups(predictions, forest.classes_),
                          index = test.index, dtype = object)

In [None]:
#writing the test set, including its 'preds' column, to an Excel workbook
test.to_excel('test_out.xlsx')