In [13]:
from google.colab import drive
drive.mount('/content/drive')

import pandas as pd
import numpy as np

X_train = pd.read_csv('/content/drive/MyDrive/ds310 /project 3/X_train.csv')
y_train = pd.read_csv('/content/drive/MyDrive/ds310 /project 3/y_train.csv')
X_test = pd.read_csv('/content/drive/MyDrive/ds310 /project 3/X_test.csv')
sample_sub =  pd.read_csv('/content/drive/MyDrive/ds310 /project 3/sampleSubmission.csv')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# data preprocessing

## lower case & removing special characters

In [14]:
# make text lower case
X_train['transcription'] = X_train['transcription'].str.lower()
X_test['transcription'] = X_test['transcription'].str.lower()

# remove special characters
X_train['transcription'] = X_train['transcription'].str.replace('[^a-zA-Z0-9\s]', '', regex=True)
X_test['transcription'] = X_test['transcription'].str.replace('[^a-zA-Z0-9\s]', '', regex=True)

## removing stopwords (I, he, am, is, etc.)

In [15]:
# remove stop words
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
  words = text.split()
  filtered_words = [word for word in words if word not in stop_words]
  return ' '.join(filtered_words)

X_train['transcription'] = X_train['transcription'].apply(remove_stopwords)
X_test['transcription'] = X_test['transcription'].apply(remove_stopwords)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## lemmatize (to verb form)

In [16]:
# lemmatize the words
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')

lemmatizer = WordNetLemmatizer()

# WordNetLemmatizer only works on individual words, not on entire strings
def lemmatize_words(text):
  words = text.split()
  lemmatized_words = [lemmatizer.lemmatize(word, 'n') for word in words]
  return ' '.join(lemmatized_words)

X_train['transcription'] = X_train['transcription'].apply(lemmatize_words)
X_test['transcription'] = X_test['transcription'].apply(lemmatize_words)

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [19]:
X_train.head()

Unnamed: 0,Sl. No.,transcription
0,480,cc orthostatic lightheadednesshx 76 yo male co...
1,374,preoperative diagnoses1 bowel obstruction2 cen...
2,114,procedure newborn circumcisionindications pare...
3,729,cc episodic confusionhx 65 yo rhm reportedly s...
4,764,hx 46yo rhm htn well 2 week prior exam experie...


In [17]:
X_test.head()

Unnamed: 0,Sl. No.,transcription
0,718,reason visit elevated psa nocturia occasional ...
1,544,reason consultation newly diagnosed cholangioc...
2,871,preoperative diagnosis penile skin bridge circ...
3,627,preoperative diagnosis acute abdominal pain ru...
4,352,subjective patient seen today nursing home mul...


## tokenization and vectorization (tf-idf vectorizer has tokenization on its own)

In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_features=1000, min_df=5, max_df=0.7)

X_train_v = vectorizer.fit_transform(X_train['transcription'])
X_test_v = vectorizer.transform(X_test['transcription'])

# Stacking Algorithm

In [25]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.naive_bayes import MultinomialNB
from xgboost import XGBClassifier

base_models = [
    ('lr', LogisticRegression(max_iter=2000)),
    ('rf', RandomForestClassifier(n_estimators=300)),
    ('nb', MultinomialNB()),
    ('xgb', XGBClassifier(learning_rate = 0.1, use_label_encoder=False, eval_metric='mlogloss'))
]

meta_model = LogisticRegression()

stacking_clf = StackingClassifier(estimators=base_models, final_estimator=meta_model, cv=5)

stacking_clf.fit(X_train_v, y_train['medical_specialty'])

y_pred = stacking_clf.predict(X_test_v)

# make submission

In [24]:
sample_sub['medical_specialty'] = y_pred
sample_sub.to_csv('/content/drive/MyDrive/ds310 /project 3/sampleSubmission.csv', index=False)