<a href="https://colab.research.google.com/github/astikasinha/Adverse_Medical_Event_Prediction_System/blob/Model-training/veersawork.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive; later cells read the dataset from
# and write the model to paths under this mount point.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
# Step 1: Import required libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import joblib

# Steps 2-3: Read the symptom CSV from the mounted Drive folder and discard
# any row that contains a missing value (adjust the path if the file moves).
df = pd.read_csv('/content/drive/My Drive/veersahack/symptom_dataset.csv').dropna()

# Step 4: Hold out 20% of the rows for evaluation; the fixed seed keeps the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    df['text'],
    df['label'],
    test_size=0.2,
    random_state=42,
)

# Step 5: TF-IDF features feeding a logistic regression, fitted as a single
# pipeline so that prediction accepts raw text directly.
model = make_pipeline(
    TfidfVectorizer(),
    LogisticRegression(max_iter=1000),
).fit(X_train, y_train)

# Step 6: Report per-class precision/recall/F1 on the held-out rows.
y_pred = model.predict(X_test)
print("Classification Report:\n")
print(classification_report(y_test, y_pred))

# Step 7: Persist the fitted pipeline in the notebook's working directory.
joblib.dump(model, 'symptom_classifier_model.pkl')


Classification Report:

              precision    recall  f1-score   support

     adverse       1.00      1.00      1.00        12
    moderate       1.00      0.88      0.93         8
 not serious       0.91      1.00      0.95        10

    accuracy                           0.97        30
   macro avg       0.97      0.96      0.96        30
weighted avg       0.97      0.97      0.97        30



['symptom_classifier_model.pkl']

In [None]:
# Load model
# This reads back the file written by joblib.dump in the training cell.
# joblib files are pickle-based, so only load files from a trusted source.
model = joblib.load('symptom_classifier_model.pkl')

# Predict example
# predict() is given a one-element list of raw text; [0] takes the single
# resulting label.
user_input = "Slight dizziness after walking"
prediction = model.predict([user_input])[0]
print(f"Prediction: {prediction}")


Prediction: moderate


In [None]:
# Save a copy of the current `model` object into the mounted Drive folder.
joblib.dump(model, '/content/drive/My Drive/veersahack/symptom_classifier_model.pkl')


['/content/drive/My Drive/veersahack/symptom_classifier_model.pkl']

In [None]:
# Trigger a download of the model file saved in the working directory.
from google.colab import files
files.download('symptom_classifier_model.pkl')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import joblib
import pandas as pd

# Re-read the dataset from Drive and discard rows with missing values.
df = pd.read_csv('/content/drive/My Drive/veersahack/symptom_dataset.csv').dropna()

# Same 80/20 split and seed as the pipeline-based training cell.
X_train, X_test, y_train, y_test = train_test_split(
    df['text'],
    df['label'],
    test_size=0.2,
    random_state=42,
)

# Fit the vectorizer and the classifier as two separate objects so the
# vocabulary and the coefficients can be inspected individually.
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
model = LogisticRegression(max_iter=1000).fit(X_train_tfidf, y_train)

# Persist both pieces.
# NOTE: this overwrites 'symptom_classifier_model.pkl', which the earlier
# training cell wrote as a full text pipeline. From here on the local file
# holds only the classifier and must be used together with
# 'symptom_vectorizer.pkl'.
joblib.dump(model, 'symptom_classifier_model.pkl')
joblib.dump(vectorizer, 'symptom_vectorizer.pkl')


['symptom_vectorizer.pkl']

In [None]:
import numpy as np

def get_top_keywords(text, model, vectorizer, top_n=5):
    """Rank the words of ``text`` by their contribution to the predicted class.

    A word's contribution is its TF-IDF weight in ``text`` multiplied by the
    model coefficient of that word for the class with the highest predicted
    probability.

    Args:
        text: Raw input string to explain.
        model: Fitted linear classifier exposing ``predict_proba`` and
            ``coef_`` (e.g. ``LogisticRegression``).
        vectorizer: Fitted vectorizer exposing ``transform`` and
            ``get_feature_names_out`` (e.g. ``TfidfVectorizer``).
        top_n: Maximum number of ``(word, score)`` pairs to return.

    Returns:
        A list of ``(word, score)`` tuples ordered by decreasing absolute
        score. The list is empty when no word of ``text`` has a non-zero
        weight in the vectorizer output.
    """
    # Transform input text into a single-row feature matrix
    text_tfidf = vectorizer.transform([text])
    feature_names = np.asarray(vectorizer.get_feature_names_out())

    # predict_proba yields one row per input; there is exactly one input here
    class_index = int(np.argmax(np.asarray(model.predict_proba(text_tfidf))[0]))

    coef = np.asarray(model.coef_)
    if coef.shape[0] == 1:
        # Binary problem: coef_ has a single row that describes the positive
        # class (index 1); the negative class uses the negated coefficients.
        # Indexing coef_[class_index] directly would raise IndexError for
        # class 1 and give the wrong sign for class 0.
        coefficients = coef[0] if class_index == 1 else -coef[0]
    else:
        coefficients = coef[class_index]

    # Densify the single row once instead of indexing the sparse matrix per word
    row = np.asarray(text_tfidf.toarray())[0]
    word_indices = row.nonzero()[0]
    word_scores = [(feature_names[i], row[i] * coefficients[i]) for i in word_indices]

    # Largest absolute contribution first; sorted() is stable, so ties keep
    # vocabulary order
    return sorted(word_scores, key=lambda pair: -abs(pair[1]))[:top_n]


In [None]:
# Classify one example sentence with the separately fitted vectorizer and
# classifier, then list the words that contributed most to that prediction.
sample = "Paralysis in limbs"
predicted = model.predict(vectorizer.transform([sample]))[0]
keywords = get_top_keywords(sample, model, vectorizer)

print("Predicted Label:", predicted)
print("Top Keywords Contributing to Prediction:")
# Each entry is a (word, score) pair; scores are shown to four decimals.
for word, score in keywords:
    print(f"{word}: {score:.4f}")


Predicted Label: adverse
Top Keywords Contributing to Prediction:
limbs: 0.3342
paralysis: 0.3342
in: 0.1019


In [None]:
!pip install fpdf


Collecting fpdf
  Downloading fpdf-1.7.2.tar.gz (39 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: fpdf
  Building wheel for fpdf (setup.py) ... [?25l[?25hdone
  Created wheel for fpdf: filename=fpdf-1.7.2-py2.py3-none-any.whl size=40704 sha256=2177295fea77339b7b6b4bfac8d815dba8808046fc43a69c712bfdbbc668c250
  Stored in directory: /root/.cache/pip/wheels/65/4f/66/bbda9866da446a72e206d6484cd97381cbc7859a7068541c36
Successfully built fpdf
Installing collected packages: fpdf
Successfully installed fpdf-1.7.2


In [None]:
from fpdf import FPDF

def generate_report(text, predicted_label, keywords, probs, filename="report.pdf"):
    """Write a one-page PDF summarising a single symptom prediction.

    Args:
        text: The raw symptom description that was classified.
        predicted_label: The label predicted for ``text``.
        keywords: Iterable of ``(word, score)`` pairs; ``score`` must be
            numeric because it is formatted with four decimals.
        probs: Mapping of class label to probability (numeric, formatted
            with two decimals).
        filename: Path the PDF is written to.

    Note:
        The built-in "Arial" font only supports Latin-1. Characters outside
        that range are replaced with "?" so that writing the file does not
        fail with ``UnicodeEncodeError``.
    """
    def _latin1(value):
        # Built-in (non-TTF) fpdf fonts are Latin-1 only; replace anything
        # that cannot be encoded rather than crashing in pdf.output().
        return str(value).encode("latin-1", "replace").decode("latin-1")

    pdf = FPDF()
    pdf.add_page()

    # Title. Width 0 stretches the cell to the right margin, so the text is
    # centred between the margins; a fixed width of 200 overshoots the margin
    # and shifts the title off centre.
    pdf.set_font("Arial", 'B', size=16)
    pdf.cell(0, 10, txt="Symptom Analysis Report", ln=True, align='C')

    # Input text (multi_cell wraps long descriptions across lines)
    pdf.set_font("Arial", size=12)
    pdf.ln(10)
    pdf.multi_cell(0, 10, _latin1(f"Input Text: {text}"))
    pdf.ln(5)

    # Prediction
    pdf.set_font("Arial", 'B', size=12)
    pdf.cell(0, 10, _latin1(f"Predicted Label: {predicted_label}"), ln=True)

    # Probabilities
    pdf.set_font("Arial", size=12)
    pdf.ln(5)
    pdf.cell(0, 10, "Class Probabilities:", ln=True)
    for label, prob in probs.items():
        pdf.cell(0, 10, _latin1(f"  {label}: {prob:.2f}"), ln=True)

    # Keywords
    pdf.ln(5)
    pdf.cell(0, 10, "Top Contributing Keywords:", ln=True)
    for word, score in keywords:
        pdf.cell(0, 10, _latin1(f"  {word}: {score:.4f}"), ln=True)

    # Save PDF
    pdf.output(filename)


In [None]:
# Score one example sentence with the separately fitted vectorizer/classifier.
sample = "Paralysis in limbs"
X_tfidf = vectorizer.transform([sample])
predicted = model.predict(X_tfidf)[0]
probs = model.predict_proba(X_tfidf)[0]

# Pair each class label with its predicted probability for the report.
class_probs = dict(zip(model.classes_, probs))
keywords = get_top_keywords(sample, model, vectorizer)

# Write the summary PDF into the working directory.
generate_report(
    sample,
    predicted,
    keywords,
    class_probs,
    filename="symptom_report.pdf",
)


In [None]:
# Trigger a download of the PDF report written by generate_report.
from google.colab import files
files.download("symptom_report.pdf")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Trigger a download of the local classifier file.
# NOTE: if the cells above ran in order, this file was last written by the
# cell that saves the classifier and vectorizer separately, so it holds the
# bare LogisticRegression and needs symptom_vectorizer.pkl to be usable.
from google.colab import files
files.download("symptom_classifier_model.pkl")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Trigger a download of the fitted TF-IDF vectorizer saved alongside the classifier.
from google.colab import files
files.download("symptom_vectorizer.pkl")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>