# dataset loading

In [885]:
import pandas as pd
import numpy as np

X_train_text = pd.read_csv('/content/X_train.csv')
X_test_text = pd.read_csv('/content/X_test.csv')
y_train = pd.read_csv('/content/y_train.csv')

In [886]:
X_train_text.head()

Unnamed: 0,Sl. No.,transcription
0,480,"CC:, Orthostatic lightheadedness.,HX:, This 76..."
1,374,"PREOPERATIVE DIAGNOSES,1. Bowel obstruction.,..."
2,114,"PROCEDURE: , Newborn circumcision.,INDICATIONS..."
3,729,"CC: ,Episodic confusion.,HX: ,This 65 y/o RHM ..."
4,764,"HX: ,This 46y/o RHM with HTN was well until 2 ..."


In [887]:
X_test_text.head()

Unnamed: 0,Sl. No.,transcription
0,718,"REASON FOR VISIT: ,Elevated PSA with nocturia..."
1,544,"REASON FOR CONSULTATION:, Newly diagnosed cho..."
2,871,"PREOPERATIVE DIAGNOSIS: , Penile skin bridges ..."
3,627,"PREOPERATIVE DIAGNOSIS: , Acute abdominal pain..."
4,352,"SUBJECTIVE:, The patient was seen today by me..."


In [888]:
y_train.head()

Unnamed: 0,Sl. No.,medical_specialty
0,480,1
1,374,5
2,114,3
3,729,1
4,764,1


In [889]:
# length of X_train and X_test
len_train = len(X_train_text)
len_test = len(X_test_text)

print(len_train, len_test)

644 276


In [890]:
# merge X_train_text and X_test_text
corpus = X_train_text.append(X_test_text)
corpus.head()

Unnamed: 0,Sl. No.,transcription
0,480,"CC:, Orthostatic lightheadedness.,HX:, This 76..."
1,374,"PREOPERATIVE DIAGNOSES,1. Bowel obstruction.,..."
2,114,"PROCEDURE: , Newborn circumcision.,INDICATIONS..."
3,729,"CC: ,Episodic confusion.,HX: ,This 65 y/o RHM ..."
4,764,"HX: ,This 46y/o RHM with HTN was well until 2 ..."


# **tfidf**

In [891]:
import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [892]:
#preprocessing
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

import string
string.punctuation

stop_words = set(stopwords.words('english')+["CC","HX","EXAM","CN","LAB",
                                             'preoperative', 'postoperative', 'diagnosis', 'procedure', 
                                               'anesthesia', 'indication', 'indications', 'findings', 'technique', 
                                              'cc', 'chief complaint', 'complaint', 'complains', 'hx', 'history', 
                                               'meds', 'medication', 'medications', 'pmh', 'prior medical history',
                                               'fhx', 'shx', 'exam', 'labs', 'course', 'complications', 'summary',
                                               'study', 'operation', 'performed', 'clinical', 'note', 'description'])
 
stemmer = SnowballStemmer("english", ignore_stopwords=True)

from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem.porter import PorterStemmer

def tokenize(text):
  
    # remove punctuation
    text="".join([i for i in text if i not in string.punctuation])

    # lower
    text=text.lower()

    # remove stop words
    tokens = nltk.word_tokenize(text)
    filtered_sentence = [w for w in tokens if not w.lower() in stop_words]
    filtered_sentence = []

    # stem
    stems = []
    for item in tokens:
        stems.append(stemmer.stem(item))
    
    for w in stems:
      if w not in stop_words:
        filtered_sentence.append(w)

    return filtered_sentence


In [893]:
# Convert text data to numerical features using sklearn.feature_extraction: Feature Extraction
# feature_extraction.text.CountVectorizer or feature_extraction.text.TfidfVectorizer
# In this sample code, we use TfidfVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# auto remove punctuation
vectorizer = TfidfVectorizer(tokenizer=tokenize,min_df = 10) # you can tune the value of min_df, here I just try 10
X = vectorizer.fit_transform(corpus['transcription'])

# get the features of X_train and X_test
x_train_feature = X[:len_train]
x_test_feature = X[len_train:]

In [894]:
x_train_feature

<644x2276 sparse matrix of type '<class 'numpy.float64'>'
	with 73063 stored elements in Compressed Sparse Row format>

# Model 1: KNN

In [895]:
from sklearn.model_selection import cross_val_score

In [896]:
# train your machine learning model such as KNN
from sklearn.neighbors import KNeighborsClassifier
neigh = KNeighborsClassifier(n_neighbors=7)
neigh.fit(x_train_feature, y_train['medical_specialty'])

scores = cross_val_score(neigh, x_train_feature, y_train['medical_specialty'], cv=5, scoring="f1_weighted")
scores.mean()


0.7952403259679091

# Model 2: MultinomNB

In [897]:
from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB(alpha=0.6).fit(x_train_feature, y_train['medical_specialty'])
scores = cross_val_score(nb, x_train_feature, y_train['medical_specialty'], cv=5, scoring="f1_weighted")
scores.mean()

0.7713655454458039

# Model 3: Random Forest

In [898]:
from sklearn.ensemble import RandomForestClassifier
  
# create classifer object
rf = RandomForestClassifier(n_estimators = 1000, random_state = 0)
  
# fit the classifer with x and y data
rf.fit(x_train_feature, y_train['medical_specialty']) 

scores = cross_val_score(rf, x_train_feature, y_train['medical_specialty'], cv=5)
scores.mean()

0.7080547480620154

# Model 4: perceptron

In [899]:
from sklearn.linear_model import Perceptron
from sklearn.model_selection import cross_val_score
per = Perceptron(tol=0.000001, random_state=3, penalty='l2',alpha=0.000001)
per.fit(x_train_feature, y_train['medical_specialty'])

scores = cross_val_score(per, x_train_feature, y_train['medical_specialty'], cv=5)
scores.mean()

0.7375121124031008

# Model 5: passive agg

In [900]:
from sklearn.linear_model import PassiveAggressiveClassifier
pa = PassiveAggressiveClassifier(C=0.0001,max_iter=100000, random_state=0,
                                 tol=1e-5, validation_fraction=0.1)
pa.fit(x_train_feature, y_train['medical_specialty'])

scores = cross_val_score(pa, x_train_feature, y_train['medical_specialty'], cv=5)
scores.mean()

0.7888202519379844

# Model 6: SGDC

In [901]:
from sklearn.linear_model import SGDClassifier

svm=SGDClassifier(loss='hinge', penalty='l2',alpha=8e-3,
                  random_state=42,tol=1e-6)
svm.fit(x_train_feature, y_train['medical_specialty'])

scores = cross_val_score(svm, x_train_feature, y_train['medical_specialty'], cv=5, scoring="f1_weighted")
scores.mean()

0.7913622984421049

# Model 7: MLPC

In [902]:
from sklearn.neural_network import MLPClassifier
mlpc= MLPClassifier(solver='lbfgs', alpha=1e-5,
                    hidden_layer_sizes=(30,), 
                    random_state=1,
                    max_iter=1000000)
mlpc.fit(x_train_feature, y_train['medical_specialty'])

scores = cross_val_score(mlpc, x_train_feature, y_train['medical_specialty'], cv=5)
scores.mean()

0.7623425387596898

In [903]:
#from sklearn.model_selection import GridSearchCV
#parameters = {'clf__alpha': (1e-2, 1e-3)}

# Model 8: gradient boost

In [904]:
from sklearn.ensemble import GradientBoostingClassifier

gb = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1,
                                 max_depth=3, random_state=0)

gb.fit(x_train_feature, y_train['medical_specialty'])

scores = cross_val_score(gb, x_train_feature, y_train['medical_specialty'], cv=5)
scores.mean()

0.725109011627907

# Model 9: logistic

In [905]:
from sklearn.linear_model import LogisticRegression
lg = LogisticRegression(C=500, penalty="l1", solver="saga", tol=0.01,max_iter=100000)
lg.fit(x_train_feature, y_train['medical_specialty'])

scores = cross_val_score(lg, x_train_feature, y_train['medical_specialty'], cv=5)
scores.mean()

0.7608284883720929

# Model 10: xgboost

In [906]:
import xgboost as xgb
from xgboost import XGBClassifier

xgb= XGBClassifier(base_score=0.5,
                 booster="gbtree",
                 colsample_bylevel=0.5,
                 colsample_bynode=0.5,
                 colsample_bytree=0.5,
                 gamma=0.0000001,
                 learning_rate=1e-1,
                 max_delta_step=0,
                 max_depth=40,
                 min_child_weight=1,
                 missing=None,
                 n_estiamtors=1000,
                 n_jobs=1,
                 nthread=None,
                 objective='multi:softprob',
                 randome_state=42,
                 reg_alpha=0.001,
                 reg_lambda=0.001,
                 scale_pos_weight=1,
                 sub_sample=0.3,
                 verbosity=1
                 )
xgb.fit(x_train_feature, y_train['medical_specialty'])

XGBClassifier(colsample_bylevel=0.5, colsample_bynode=0.5, colsample_bytree=0.5,
              gamma=1e-07, max_depth=40, n_estiamtors=1000,
              objective='multi:softprob', randome_state=42, reg_alpha=0.001,
              reg_lambda=0.001, sub_sample=0.3)

In [907]:
scores = cross_val_score(xgb, x_train_feature, y_train['medical_specialty'], cv=5)
scores.mean()

0.7422238372093023

#linearSVC

In [908]:
from sklearn.svm import LinearSVC
svc = LinearSVC(random_state=0, tol=1e-5)
svc.fit(x_train_feature, y_train['medical_specialty'])

scores = cross_val_score(svc, x_train_feature, y_train['medical_specialty'], cv=5,scoring="f1_weighted")
scores.mean()

0.7785332968232701

# Ensemble: vote

In [909]:
from sklearn.ensemble import VotingClassifier
vote = VotingClassifier(
 estimators=[('KNN', neigh), ('Navie Bayes', nb),
             ('perceptron', per), ('Passive Aggressive', pa), 
             ('SGDC', svm), ('MLPC', mlpc), 
             ('logistic', lg)],
 voting='hard'
)

In [910]:
vote.fit(x_train_feature, y_train['medical_specialty'])

VotingClassifier(estimators=[('KNN', KNeighborsClassifier(n_neighbors=7)),
                             ('Navie Bayes', MultinomialNB(alpha=0.6)),
                             ('perceptron',
                              Perceptron(alpha=1e-06, penalty='l2',
                                         random_state=3, tol=1e-06)),
                             ('Passive Aggressive',
                              PassiveAggressiveClassifier(C=0.0001,
                                                          max_iter=100000,
                                                          random_state=0,
                                                          tol=1e-05)),
                             ('SGDC',
                              SGDClassifier(alpha=0.008, random_state=42,
                                            tol=1e-06)),
                             ('MLPC',
                              MLPClassifier(alpha=1e-05,
                                            hidden_layer_sizes=(30,

In [911]:
scores = cross_val_score(vote, x_train_feature, y_train['medical_specialty'], cv=5,scoring="f1_weighted")
scores.mean()

0.7883786606569986

# submission

In [916]:
for clf, label in zip([neigh, nb, rf, per,pa,svm,mlpc,gb,lg,xgb,vote], 
                      ['KNN', 'Naive Bayes', 'Random Forest', 'perceptron',
                       'passive aggresive','SGDC','MLPC','gradient','logistic',
                       'xgboost','Ensemble']):

 scores = cross_val_score(clf, x_train_feature, y_train['medical_specialty'], scoring='f1_weighted', cv=5)

 print("F1-weighted: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label))

F1-weighted: 0.80 (+/- 0.03) [KNN]
F1-weighted: 0.77 (+/- 0.02) [Naive Bayes]
F1-weighted: 0.71 (+/- 0.02) [Random Forest]
F1-weighted: 0.74 (+/- 0.03) [perceptron]
F1-weighted: 0.79 (+/- 0.04) [passive aggresive]
F1-weighted: 0.79 (+/- 0.04) [SGDC]
F1-weighted: 0.76 (+/- 0.05) [MLPC]
F1-weighted: 0.73 (+/- 0.02) [gradient]
F1-weighted: 0.78 (+/- 0.04) [logistic]
F1-weighted: 0.74 (+/- 0.02) [xgboost]
F1-weighted: 0.79 (+/- 0.04) [Ensemble]


In [913]:
# make predictions on the test data
pred = neigh.predict(x_test_feature)
pred

array([3, 5, 3, 2, 4, 2, 1, 1, 5, 5, 5, 3, 1, 3, 1, 2, 3, 3, 1, 1, 6, 3,
       5, 3, 2, 3, 2, 3, 2, 4, 2, 3, 3, 6, 4, 2, 2, 1, 6, 4, 5, 5, 5, 1,
       2, 5, 6, 5, 4, 4, 6, 5, 1, 4, 1, 1, 4, 2, 5, 1, 4, 5, 1, 1, 1, 5,
       1, 1, 1, 6, 4, 5, 4, 1, 4, 4, 5, 5, 3, 2, 6, 4, 3, 2, 5, 4, 3, 6,
       1, 1, 3, 6, 6, 3, 6, 1, 5, 3, 6, 4, 4, 6, 6, 5, 3, 2, 2, 5, 6, 3,
       2, 1, 3, 4, 6, 3, 5, 1, 1, 2, 6, 5, 4, 5, 2, 6, 6, 5, 3, 2, 5, 4,
       3, 3, 1, 1, 3, 2, 1, 6, 5, 2, 2, 6, 1, 1, 1, 2, 1, 6, 1, 5, 5, 5,
       6, 6, 5, 4, 2, 3, 4, 1, 4, 5, 4, 3, 4, 1, 5, 3, 1, 6, 1, 4, 1, 6,
       5, 5, 4, 6, 6, 2, 5, 4, 3, 1, 3, 5, 6, 4, 2, 6, 6, 6, 6, 5, 5, 6,
       2, 5, 6, 1, 5, 1, 3, 2, 1, 2, 4, 3, 3, 4, 1, 4, 1, 1, 6, 4, 6, 3,
       5, 3, 2, 1, 3, 5, 3, 4, 2, 4, 5, 6, 5, 5, 1, 3, 1, 4, 3, 6, 6, 4,
       5, 2, 5, 5, 2, 5, 1, 4, 3, 4, 6, 1, 6, 1, 4, 6, 4, 1, 3, 6, 3, 3,
       2, 5, 1, 5, 2, 3, 2, 6, 1, 6, 3, 2])

In [914]:
# write your predictions to the submission file and submit it to Kaggle
submission = pd.read_csv('sampleSubmission.csv')
submission['medical_specialty'] = pred
submission.to_csv('sample_submission.csv', index=False)