In [1]:
from google.colab import drive
drive.mount('/content/drive')

import pandas as pd
import numpy as np

X_train = pd.read_csv('/content/drive/MyDrive/ds310 /project 3/X_train.csv')
y_train = pd.read_csv('/content/drive/MyDrive/ds310 /project 3/y_train.csv')
X_test = pd.read_csv('/content/drive/MyDrive/ds310 /project 3/X_test.csv')
sample_sub =  pd.read_csv('/content/drive/MyDrive/ds310 /project 3/sampleSubmission.csv')

Mounted at /content/drive


# data preprocessing

## lower case & removing special characters

In [27]:
X_train.head()

Unnamed: 0,Sl. No.,transcription
0,480,"CC:, Orthostatic lightheadedness.,HX:, This 76..."
1,374,"PREOPERATIVE DIAGNOSES,1. Bowel obstruction.,..."
2,114,"PROCEDURE: , Newborn circumcision.,INDICATIONS..."
3,729,"CC: ,Episodic confusion.,HX: ,This 65 y/o RHM ..."
4,764,"HX: ,This 46y/o RHM with HTN was well until 2 ..."


In [2]:
# make text lower case
X_train['transcription'] = X_train['transcription'].str.lower()
X_test['transcription'] = X_test['transcription'].str.lower()

# remove special characters
X_train['transcription'] = X_train['transcription'].str.replace('[^a-zA-Z0-9\s]', '', regex=True)
X_test['transcription'] = X_test['transcription'].str.replace('[^a-zA-Z0-9\s]', '', regex=True)

In [29]:
X_train.head()

Unnamed: 0,Sl. No.,transcription
0,480,cc orthostatic lightheadednesshx this 76 yo ma...
1,374,preoperative diagnoses1 bowel obstruction2 c...
2,114,procedure newborn circumcisionindications pa...
3,729,cc episodic confusionhx this 65 yo rhm reporte...
4,764,hx this 46yo rhm with htn was well until 2 wee...


## removing stopwords (I, he, am, is, etc.)

In [3]:
# remove stop words
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
  words = text.split()
  filtered_words = [word for word in words if word not in stop_words]
  return ' '.join(filtered_words)

X_train['transcription'] = X_train['transcription'].apply(remove_stopwords)
X_test['transcription'] = X_test['transcription'].apply(remove_stopwords)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [6]:
X_train.head()

Unnamed: 0,Sl. No.,transcription
0,480,cc orthostatic lightheadednesshx 76 yo male co...
1,374,preoperative diagnoses1 bowel obstruction2 cen...
2,114,procedure newborn circumcisionindications pare...
3,729,cc episodic confusionhx 65 yo rhm reportedly s...
4,764,hx 46yo rhm htn well 2 weeks prior exam experi...


## lemmatize (to noun form)

In [4]:
# lemmatize the words
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')

lemmatizer = WordNetLemmatizer()

# WordNetLemmatizer only works on individual words, not on entire strings
def lemmatize_words(text):
  words = text.split()
  lemmatized_words = [lemmatizer.lemmatize(word, pos = 'n') for word in words]
  return ' '.join(lemmatized_words)

X_train['transcription'] = X_train['transcription'].apply(lemmatize_words)
X_test['transcription'] = X_test['transcription'].apply(lemmatize_words)

[nltk_data] Downloading package wordnet to /root/nltk_data...


In [17]:
X_train.head()

Unnamed: 0,Sl. No.,transcription
0,480,cc orthostatic lightheadednesshx 76 yo male co...
1,374,preoperative diagnoses1 bowel obstruction2 cen...
2,114,procedure newborn circumcisionindications pare...
3,729,cc episodic confusionhx 65 yo rhm reportedly s...
4,764,hx 46yo rhm htn well 2 week prior exam experie...


## tokenization and vectorization (tf-idf vectorizer has tokenization on its own)

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_features=1000, min_df=5, max_df=0.7)

X_train_v = vectorizer.fit_transform(X_train['transcription'])
X_test_v = vectorizer.transform(X_test['transcription'])

In [20]:
print(X_train_v)

  (0, 258)	0.06595154696392722
  (0, 777)	0.07472498255963922
  (0, 345)	0.07806531578134777
  (0, 241)	0.08105077066440206
  (0, 194)	0.09194272855188064
  (0, 872)	0.0648063402938391
  (0, 372)	0.07651999038587738
  (0, 831)	0.05290078020141891
  (0, 728)	0.09426182853960348
  (0, 359)	0.06458404764036384
  (0, 105)	0.08853567546916061
  (0, 545)	0.056949213176627614
  (0, 915)	0.08198089230868141
  (0, 190)	0.07151053262086858
  (0, 533)	0.06017325470310224
  (0, 566)	0.0792972274833733
  (0, 210)	0.07438234728703298
  (0, 121)	0.08557243883872438
  (0, 889)	0.09194272855188064
  (0, 351)	0.08671191708231989
  (0, 499)	0.06595154696392722
  (0, 495)	0.04171662090827811
  (0, 547)	0.0977741234549479
  (0, 782)	0.11203818306469654
  (0, 57)	0.045984586466009794
  :	:
  (643, 387)	0.08412824842383823
  (643, 661)	0.18054071631724003
  (643, 445)	0.09346697491011993
  (643, 226)	0.09027035815862001
  (643, 549)	0.07912286753284835
  (643, 807)	0.07589452678704338
  (643, 852)	0.11132652

# Stacking Algorithm

## create stacking algorithm model

In [10]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import StackingClassifier
from sklearn.naive_bayes import MultinomialNB
from xgboost import XGBClassifier

base_models = [
    ('lr', LogisticRegression(max_iter = 500)),
    ('nb', MultinomialNB()),
    ('xgb', XGBClassifier(use_label_encoder=False, objective='multi:softmax', num_class = 6, eval_metric='mlogloss')) # multi:softmax bc we are classifying into one of 6
]

meta_model = LogisticRegression()

stacking_clf = StackingClassifier(estimators=base_models, final_estimator=meta_model)

## hyperparameter tuning

In [None]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'lr__C': [0.1, 1, 10], # regularization prevent overfitting
    'xgb__learning_rate': [0.01, 0.1, 0.2], # step sizes for updating the model weights (bigger step will learn faster but less deatailed)
    'xgb__max_depth': [3, 6, 9], # increasing makes the model more complex and capable of learning more detailed patterns
    'xgb__n_estimators': [100, 200, 300] # number of trees in the ensemble (each tree is built to correct the errors made by the previous trees)
}

grid_search = GridSearchCV(estimator=stacking_clf, param_grid=param_grid, cv=3, scoring='f1_weighted', verbose=2, n_jobs=-1)

grid_search.fit(X_train_v, y_train['medical_specialty'])

best_params = grid_search.best_params_
print("best parameters:", grid_search.best_params_)

Fitting 3 folds for each of 81 candidates, totalling 243 fits


## make prediction with the best parameters

In [None]:
best_model = grid_search.best_estimator_

y_pred = best_model.predict(X_test_v)

# make submission

In [24]:
sample_sub['medical_specialty'] = y_pred
sample_sub.to_csv('/content/drive/MyDrive/ds310 /project 3/sampleSubmission.csv', index=False)