Import Dependencies

In [140]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
# Fetch the NLTK resources this notebook relies on; a package that is already
# present locally is reported as up-to-date rather than downloaded again.
nltk.download('stopwords')  # stop-word lists read through stopwords.words(...)
nltk.download('punkt')  # NOTE(review): presumably for word_tokenize — confirm it is still needed

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

Load the CSV into a DataFrame

In [141]:
# Location of the labelled dataset; kept in one place so it is easy to change.
DATA_PATH = "/content/refined_medical_dataset.csv"
df = pd.read_csv(DATA_PATH)

In [142]:
# Preview the first rows to check that the columns were read as expected.
df.head()

Unnamed: 0,text,label
0,"The patient is other, age: 78, height: 196.3 c...",patient history
1,"The patient was diagnosed with inflammation, f...",diagnosis
2,"The treatment provided was amlodipine, for 30 ...",treatment
3,"The patient is female, age: 57, height: 195.6 ...",patient history
4,"The patient was diagnosed with depression, fat...",diagnosis


In [143]:
# Count the rows per label to check the class balance.
label_counts = df['label'].value_counts()
print(label_counts)

label
patient history    1000
diagnosis          1000
treatment          1000
Name: count, dtype: int64


In [144]:
# (number of rows, number of columns) of the loaded frame.
df.shape

(3000, 2)

In [145]:
# Select the rows in which every column is missing.
all_null_mask = df.isna().all(axis=1)
empty_rows = df.loc[all_null_mask]
print(empty_rows)

Empty DataFrame
Columns: [text, label]
Index: []


In [146]:
# Stop-word sets keyed by language, filled on first use so the NLTK corpus file
# is read once instead of once per word.
_STOPWORDS_CACHE = {}


def _english_stopwords():
  """Return the NLTK English stop words as a frozenset, loading them only once."""
  if 'english' not in _STOPWORDS_CACHE:
    _STOPWORDS_CACHE['english'] = frozenset(stopwords.words('english'))
  return _STOPWORDS_CACHE['english']


def clean_text(text, stop_words=None):
  """Normalise a piece of text before it is vectorised.

  The text is lower-cased, every character that is not a-z or whitespace is
  removed (digits and punctuation are deleted, not replaced by a space), and
  stop words are dropped. The remaining words are joined with single spaces.

  Args:
    text: the raw text. Anything that is not a ``str`` (for example ``None``
      or a NaN from a missing cell) is treated as empty.
    stop_words: optional collection of words to drop. Defaults to the NLTK
      English stop-word list.

  Returns:
    The cleaned text as a single space-separated string ('' when nothing is
    left).
  """
  # Missing cells reach this function as None/NaN when it is applied to a
  # column; there is nothing to clean, so return an empty document.
  if not isinstance(text, str):
    return ''

  if stop_words is None:
    stop_words = _english_stopwords()

  letters_only = re.sub(r'[^a-z\s]', '', text.lower())
  return ' '.join(word for word in letters_only.split() if word not in stop_words)

In [147]:
# Run the cleaning function over every note and keep the result in a new column.
df['cleaned_text'] = df['text'].map(clean_text)

In [148]:
# Preview the frame again to compare the raw and cleaned text side by side.
df.head()

Unnamed: 0,text,label,cleaned_text
0,"The patient is other, age: 78, height: 196.3 c...",patient history,patient age height cm weight kg bmi none chron...
1,"The patient was diagnosed with inflammation, f...",diagnosis,patient diagnosed inflammation fever
2,"The treatment provided was amlodipine, for 30 ...",treatment,treatment provided amlodipine days none
3,"The patient is female, age: 57, height: 195.6 ...",patient history,patient female age height cm weight kg bmi hyp...
4,"The patient was diagnosed with depression, fat...",diagnosis,patient diagnosed depression fatigue headache ...


In [149]:
# Turn the cleaned text into TF-IDF features, with the vocabulary capped at
# 5000 terms.
# NOTE(review): `x` comes from a vectorizer fitted on every row of the frame,
# so any held-out subset taken from `x` has already influenced the vocabulary
# and IDF weights — confirm this is acceptable for evaluation.
vectorizer = TfidfVectorizer(max_features = 5000)
x = vectorizer.fit_transform(df['cleaned_text'])
y = df['label']  # target: the class label of each note

In [150]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report

In [151]:
# List the distinct class labels present in the data.
y_label = df['label'].unique()
print(y_label)

['patient history' 'diagnosis' 'treatment']


In [152]:
# Split the cleaned text (not the already-vectorised matrix `x`) so that the
# TF-IDF vocabulary and IDF weights are learned from the training rows only;
# `x` was produced by a vectorizer fitted on every row, which would leak
# test-set statistics into the features. `stratify=y` keeps the class ratio
# the same in the training and test parts.
text_train, text_test, y_train, y_test = train_test_split(
    df['cleaned_text'], y, test_size=0.2, random_state=42, stratify=y)
x_train = vectorizer.fit_transform(text_train)  # re-fit on the training text only
x_test = vectorizer.transform(text_test)  # reuse the training vocabulary

# `multi_class` is left out: it is deprecated since scikit-learn 1.5, and the
# lbfgs solver already fits a multinomial model for multi-class targets.
model = LogisticRegression(max_iter=1000, solver='lbfgs')
model.fit(x_train, y_train)



In [153]:
# Accuracy on the rows the model was fitted on (an optimistic upper bound).
x_train_pred = model.predict(x_train)
training_accuracy = accuracy_score(y_true=y_train, y_pred=x_train_pred)

In [154]:
# Report the accuracy measured on the training rows.
print('training data accuracy', training_accuracy)

training data accuracy 1.0


In [155]:
# Accuracy on the held-out rows.
y_test_pred = model.predict(x_test)
test_accuracy = accuracy_score(y_true=y_test, y_pred=y_test_pred)
print('test data accuracy', test_accuracy)

test data accuracy 1.0


Model deployment

In [156]:
import joblib

# Save both artefacts: new text has to go through the fitted vectorizer before
# the model can score it, so the two files belong together.
MODEL_PATH = 'medical_model.pkl'
VECTORIZER_PATH = 'tfidf_vectorizer.pkl'

joblib.dump(model, MODEL_PATH)
joblib.dump(vectorizer, VECTORIZER_PATH)


['tfidf_vectorizer.pkl']

In [157]:
def text_classification(input_data):
  """Predict the class label for one piece of free text.

  The text goes through the same steps as the training data: `clean_text`,
  then the fitted `vectorizer`, then `model.predict`.

  Args:
    input_data: the raw text to classify.

  Returns:
    The predicted label (the first element of the array returned by
    `model.predict`). The label is also printed.
  """
  # Step 1: preprocess the raw text
  cleaned = clean_text(input_data)

  # Step 2: text to vector; transform expects a collection of documents, so
  # the single string is wrapped in a list
  input_vector = vectorizer.transform([cleaned])

  # Step 3: predict using the model
  predicted_label = model.predict(input_vector)[0]

  print("Predicted class:", predicted_label)

  # Return the label as well as printing it, so a caller that assigns the
  # result receives the prediction instead of None.
  return predicted_label

In [164]:
!pip install streamlit
import streamlit as st

def main():
  st.title("Medical Text Classifier web App")

!pip install streamlit
import streamlit as st

def main():
  st.title("Medical Text Classifier web App")

  user_input = st.text_area("Enter patient diagnosis note:")

  diagnosis_class = ''

# creating a button for prediction
  if st.button('classify the input'):
    diagnosis_class = text_classification(user_input)

  st.success(diagnosis_class)



In [162]:
# Build the page when this code is run as the main module.
# NOTE(review): Streamlit apps are normally started with `streamlit run <script>`;
# calling main() from a notebook kernel presumably does not serve the page — verify.
if __name__ == '__main__':
  main()

