## Competition

[Competition](https://www.kaggle.com/t/18595a086f0346969b9cf443604ef801)

> Your task is to classify medical transcription into types of medical speciality with use of dependency parsing.

In [43]:
import pandas as pd
import spacy

from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import f1_score

In [44]:
# Directory that holds the competition CSV files.
data_path = "data"

# Read the labelled training split and preview its first rows.
train_csv = f"{data_path}/train.csv"
df = pd.read_csv(train_csv)
df.head()

Unnamed: 0,id,medical_specialty,transcription
0,0,Cardiovascular / Pulmonary,"PREOPERATIVE DIAGNOSIS: , Persistent pneumonia..."
1,1,General Medicine,"REASON FOR VISIT: , Mr. ABC is a 30-year-old m..."
2,2,Cardiovascular / Pulmonary,"REASON FOR CONSULTATION: , Mesothelioma.,HISTO..."
3,3,General Medicine,"DISCHARGE DIAGNOSES:,1. Chronic obstructive pu..."
4,4,Cardiovascular / Pulmonary,"CHIEF COMPLAINT:, The patient complains of che..."


In [45]:
# Per-column count of missing values.
missing_mask = df.isna()
missing_mask.sum()

id                    0
medical_specialty     0
transcription        18
dtype: int64

In [46]:
# Drop every row that contains a missing value, mutating `df` in place.
# Re-running the cell is harmless: a second call finds nothing left to drop.
df.dropna(inplace=True)

I think I've seen this dataset before.

In [47]:
# Inspect the raw text of the first specialty label.
specialty_labels = df["medical_specialty"].tolist()
specialty_labels[0]

' Cardiovascular / Pulmonary'

In [48]:
def remove_space(row: str) -> str:
    """Return the specialty label with surrounding whitespace removed.

    The raw labels carry a leading space. Stripping whitespace, instead of
    unconditionally slicing off the first character, keeps already-clean
    labels intact, so re-running the cell that applies this function cannot
    eat real characters.

    Args:
        row: A single label string.

    Returns:
        The label without leading or trailing whitespace.
    """
    return row.strip()


# Normalise every label in the target column with remove_space.
raw_specialty = df["medical_specialty"]
df["medical_specialty"] = raw_specialty.apply(remove_space)

In [49]:
# Specialty names listed in class-id order: the position of a name is its id.
CLASS_ID_TO_NAME = dict(
    enumerate(
        (
            "Cardiovascular / Pulmonary",
            "Consult - History and Phy.",
            "Gastroenterology",
            "General Medicine",
            "Surgery",
        )
    )
)
# Inverse lookup: specialty name -> integer class id.
NAME_TO_CLASS_ID = {name: class_id for class_id, name in CLASS_ID_TO_NAME.items()}

In [50]:
def preprocess(__y: str) -> int:
    """Translate a specialty name into its integer class id.

    Args:
        __y: Specialty name; must be a key of NAME_TO_CLASS_ID.

    Returns:
        The class id registered for that name.

    Raises:
        KeyError: If the name is not present in NAME_TO_CLASS_ID.
    """
    class_id = NAME_TO_CLASS_ID[__y]
    return class_id


# Features are the raw transcription texts; targets are integer class ids.
x, y = df["transcription"], df["medical_specialty"]
y = y.apply(preprocess)

# A fixed seed makes the validation score reproducible across kernel restarts;
# stratifying keeps the class proportions equal in both splits, which matters
# for the macro-averaged F1 computed on the validation part.
x_train, x_val, y_train, y_val = train_test_split(
    x, y, test_size=0.2, random_state=42, stratify=y
)

In [51]:
# Fresh Naive Bayes classifier that is fitted on the dependency-label counts.
clf = MultinomialNB()
# spaCy pipeline used to parse each transcription into a Doc.
nlp = spacy.load("en_core_web_lg")


def extract_features(doc):
    """Count dependency labels over all head -> child arcs of a parsed doc.

    Every token in ``doc`` is visited as a potential head, and the ``dep_``
    label of each of its children is tallied.

    Args:
        doc: Iterable of tokens exposing ``children`` (an iterable of tokens)
            and ``dep_`` (the dependency label).

    Returns:
        dict mapping each dependency label to how many times it occurred,
        with keys in order of first occurrence.
    """
    dep_counts = {}
    child_labels = (kid.dep_ for head in doc for kid in head.children)
    for label in child_labels:
        dep_counts[label] = dep_counts.get(label, 0) + 1
    return dep_counts

In [52]:
# One dict of dependency-label counts per transcription, for each split.
train_rows = [extract_features(nlp(text)) for text in x_train]
val_rows = [extract_features(nlp(text)) for text in x_val]

# Labels that never occur in a document become NaN in the frame; treat as 0.
x_train_vectorized = pd.DataFrame(train_rows).fillna(0)
# Align the validation columns with the training columns before zero-filling,
# so both matrices share the same feature layout.
x_val_vectorized = (
    pd.DataFrame(val_rows)
    .reindex(columns=x_train_vectorized.columns)
    .fillna(0)
)

clf.fit(x_train_vectorized, y_train)

# Macro-averaged F1 on the held-out split.
y_pred = clf.predict(x_val_vectorized)
f1_score(y_val, y_pred, average="macro")

0.4351898102495605

In [53]:
# Refit on all labelled data for the final submission.
# The spaCy pipeline `nlp` is reused from the cell that defines
# `extract_features`; loading the model a second time only costs time and
# memory. The classifier is re-created so no state from the validation fit
# carries over.
clf = MultinomialNB()

# Dependency-label counts for every labelled transcription; labels absent from
# a document show up as NaN in the frame and are treated as zero counts.
x_vectorized = pd.DataFrame([extract_features(nlp(text)) for text in x])
x_vectorized.fillna(0, inplace=True)
clf.fit(x_vectorized, y)

In [54]:
# Read the unlabelled test split and preview its first rows.
test_csv = f"{data_path}/test.csv"
test_df = pd.read_csv(test_csv)
test_df.head()

Unnamed: 0,id,transcription
0,0,"INDICATIONS FOR PROCEDURE:, The patient has pr..."
1,1,"CLINICAL HISTORY: ,This 78-year-old black woma..."
2,2,"PREOPERATIVE DIAGNOSIS: , Penoscrotal abscess...."
3,3,"INDICATIONS:, Ischemic cardiomyopathy, status..."
4,4,"PREOPERATIVE DIAGNOSIS: , Ruptured distal bice..."


In [55]:
# Every test row needs a prediction, so rows cannot be dropped: a missing
# transcription is parsed as an empty string and yields an all-zero feature row.
test_texts = test_df["transcription"].fillna("")

# Align the columns with the training matrix FIRST and zero-fill afterwards.
# Reindexing creates NaN for any dependency label seen in training but absent
# from the test texts; filling before reindexing would leave those NaNs in
# place and make `clf.predict` fail.
x_test_vectorized = (
    pd.DataFrame([extract_features(nlp(text)) for text in test_texts])
    .reindex(columns=x_vectorized.columns)
    .fillna(0)
)

y_test = clf.predict(x_test_vectorized)

In [56]:
# Take the ids from the test file itself instead of assuming they are exactly
# 0..n-1 in file order; predictions in `y_test` follow the row order of
# `test_df`, so the two columns line up positionally.
submission_data = pd.DataFrame({"id": test_df["id"].to_numpy(), "class_id": y_test})
submission_data.to_csv("submission.csv", index=False)