In [1]:
import pandas as pd

# Read the transcription CSV and narrow it straight away to the two columns
# used below: the free-text note and its specialty label.
df = (
    pd.read_csv("../data/medical_transcription.csv")
    .loc[:, ["transcription", "medical_specialty"]]
)

# Preview the first rows.
df.head()


Unnamed: 0,transcription,medical_specialty
0,"SUBJECTIVE:, This 23-year-old white female pr...",Allergy / Immunology
1,"PAST MEDICAL HISTORY:, He has difficulty climb...",Bariatrics
2,"HISTORY OF PRESENT ILLNESS: , I have seen ABC ...",Bariatrics
3,"2-D M-MODE: , ,1. Left atrial enlargement wit...",Cardiovascular / Pulmonary
4,1. The left ventricular cavity size and wall ...,Cardiovascular / Pulmonary


In [3]:
# Count the missing values in each column.
df.isnull().sum()

transcription        33
medical_specialty     0
dtype: int64

In [7]:
# Keep only the rows in which every column holds a value.
df = df.loc[df.notna().all(axis=1)]

In [9]:
from sklearn.model_selection import train_test_split

# Inputs are the raw transcription text; targets are the specialty labels.
X, y = df["transcription"], df["medical_specialty"]

# Hold out 20% of the rows, stratified on the label, with a fixed seed so the
# split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features with English stop words removed and a 5000-term cap.
vectorizer = TfidfVectorizer(stop_words="english", max_features=5000)

# Fit on the training text only, then reuse the fitted vectorizer to
# transform the test text.
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)


In [13]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with the solver iteration cap set to 1000, fitted on
# the TF-IDF training matrix.
model = LogisticRegression(max_iter=1000)
model.fit(X=X_train_vec, y=y_train)


In [15]:
from sklearn.metrics import accuracy_score, classification_report

# Predict specialties for the held-out transcriptions.
y_pred = model.predict(X_test_vec)

print("Accuracy:", accuracy_score(y_test, y_pred))
# Several specialties never appear among the predictions, which leaves their
# precision undefined. zero_division=0 reports those scores as 0.0 without
# emitting an undefined-metric warning for each averaging mode.
print(classification_report(y_test, y_pred, zero_division=0))


Accuracy: 0.2515090543259557
                                precision    recall  f1-score   support

          Allergy / Immunology       0.00      0.00      0.00         1
                       Autopsy       0.00      0.00      0.00         2
                    Bariatrics       0.00      0.00      0.00         4
    Cardiovascular / Pulmonary       0.25      0.22      0.23        74
                  Chiropractic       0.00      0.00      0.00         3
    Consult - History and Phy.       0.23      0.46      0.30       103
    Cosmetic / Plastic Surgery       0.00      0.00      0.00         5
                     Dentistry       0.00      0.00      0.00         5
                   Dermatology       0.00      0.00      0.00         6
          Diets and Nutritions       0.00      0.00      0.00         2
             Discharge Summary       0.31      0.23      0.26        22
          ENT - Otolaryngology       0.00      0.00      0.00        19
        Emergency Room Reports    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [17]:
# Draw the demo document from the held-out split, so that the prediction is
# not made on a row the model may have been fitted on.
sample_text = X_test.iloc[10]

# The vectorizer expects an iterable of documents, hence the one-item list.
sample_vec = vectorizer.transform([sample_text])
prediction = model.predict(sample_vec)

# Predicted specialty label for the single sample.
prediction[0]


' Surgery'

Classification Notes:
- Problem treated as multi-class text classification
- Used TF-IDF for explainable feature extraction
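- Logistic Regression chosen as a strong baseline model family (accuracy on the held-out split above is about 0.25, so there is clear room to improve)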
- Goal is document categorization, not medical inference
