In [2]:
# Task 1: Data Preparation & Normalization
import pandas as pd
import re

# Load raw data
data = pd.read_excel("O-Health_Task_Inputs.xlsx")

# Normalize text
data['Symptoms'] = data['Symptoms'].str.lower()
data['Symptoms'] = data['Symptoms'].apply(lambda x: re.sub(r'[^\w\s]', '', x))

# Handle negations
def handle_negations(text):
    if "but" in text:
        return "no " + text.split("but")[1].strip()
    return text

data['Symptoms'] = data['Symptoms'].apply(handle_negations)

# Save cleaned data
data.to_csv("cleaned_data.csv", index=False)

print("Task 1: Data Preparation & Normalization Complete")
print("Cleaned data saved to 'cleaned_data.csv'")

Task 1: Data Preparation & Normalization Complete
Cleaned data saved to 'cleaned_data.csv'


In [3]:
# Task 2: Symptom Extraction Model

import spacy
from spacy.matcher import PhraseMatcher

# Load spaCy model
nlp = spacy.load("en_core_web_sm")

# Symptom dictionary
symptom_dict = {
    "chest pain": ["chest pain", "pain in chest", "aching chest"],
    "headache": ["headache", "mild headache"],
    "stomach pain": ["stomach pain", "stomach ache"],
    "knee pain": ["knee pain", "pain in knee"],
    "back pain": ["back pain", "lower back pain"],
}

# Create PhraseMatcher object
matcher = PhraseMatcher(nlp.vocab)
for symptom, patterns in symptom_dict.items():
    patterns = [nlp(text) for text in patterns]
    matcher.add(symptom, None, *patterns)

# Function to extract symptoms
def extract_symptoms(text):
    doc = nlp(text)
    matches = matcher(doc)
    symptoms = set()
    for match_id, start, end in matches:
        symptoms.add(doc[start:end].text)
    return list(symptoms)

# Test symptom extraction
test_text = "I have a mild headache and pain in my chest."
extracted_symptoms = extract_symptoms(test_text)
print("Extracted Symptoms:", extracted_symptoms)

print("Task 2: Symptom Extraction Model Complete")

Extracted Symptoms: ['headache', 'mild headache']
Task 2: Symptom Extraction Model Complete


In [4]:
# Task 3: Severity & Sentiment Analysis

# Severity detection
severity_terms = {"mild": 1, "moderate": 2, "severe": 3}

def detect_severity(text):
    for term, score in severity_terms.items():
        if term in text:
            return score
    return 0

# Risk categorization
def assign_risk(severity, duration):
    if severity == 3 and duration > 7:
        return "High"
    elif severity == 2 and duration > 3:
        return "Moderate"
    else:
        return "Low"

# Test severity and risk categorization
test_text = "I have had severe chest pain for 10 days."
severity = detect_severity(test_text)
risk = assign_risk(severity, duration=10)
print("Severity Score:", severity)
print("Risk Category:", risk)

print("Task 3: Severity & Sentiment Analysis Complete")

Severity Score: 3
Risk Category: High
Task 3: Severity & Sentiment Analysis Complete


In [8]:
import spacy

# Load spaCy model
nlp = spacy.load("en_core_web_sm")

# Function to extract cause
def extract_cause(text):
    doc = nlp(text)
    cause = ""

    # Look for causal indicators like "after", "because", "due to"
    for token in doc:
        if token.text.lower() in ["after", "because", "due to", "since"]:
            # Extract the subtree of the token to get the full cause phrase
            cause = " ".join([t.text for t in token.subtree])
            break  # Stop after finding the first cause indicator

    return cause

# Test cause extraction
test_text = "My lower back started aching after lifting a heavy box."
cause = extract_cause(test_text)
print("Extracted Cause:", cause)

print("Task 4: Reasoning/Root Cause Extraction Complete")

Extracted Cause: after lifting a heavy box
Task 4: Reasoning/Root Cause Extraction Complete


In [13]:
!pip install -U openai-whisper
# Load model directly
from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq

processor = AutoProcessor.from_pretrained("openai/whisper-small")
model = AutoModelForSpeechSeq2Seq.from_pretrained("openai/whisper-small")



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/185k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/283k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/836k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.48M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/494k [00:00<?, ?B/s]

normalizer.json:   0%|          | 0.00/52.7k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/34.6k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.19k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.97k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/967M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/3.87k [00:00<?, ?B/s]

In [15]:
# Step 1: Install Required Libraries
!pip install torch torchaudio transformers datasets soundfile librosa
!pip install jiwer  # For WER calculation

# Step 2: Load Pre-trained Whisper Model Locally
from transformers import WhisperForConditionalGeneration, WhisperProcessor

# Load Whisper model and processor from local directory
model_path = "./whisper-small"  # Replace with the path to your downloaded model
processor = WhisperProcessor.from_pretrained(model_path)
model = WhisperForConditionalGeneration.from_pretrained(model_path)

# Step 3: Load Dogri Dataset (Example: Using Hugging Face Datasets)
from datasets import load_dataset, Audio

# Load a sample dataset (replace with Dogri dataset)
dataset = load_dataset("mozilla-foundation/common_voice_11_0", "hi", split="train[:10%]")  # Use Hindi as a proxy
dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))

# Step 4: Preprocess Data
def preprocess_function(batch):
    # Resample audio to 16kHz
    audio = batch["audio"]["array"]
    inputs = processor(audio, sampling_rate=16000, return_tensors="pt")
    return inputs

# Apply preprocessing
dataset = dataset.map(preprocess_function, remove_columns=["audio"])

# Step 5: Fine-tune Whisper on Dogri Data
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer

# Define training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="./whisper-dogri",
    per_device_train_batch_size=8,
    gradient_accumulation_steps=4,
    learning_rate=1e-5,
    num_train_epochs=3,
    fp16=True,  # Use mixed precision for faster training
    save_steps=500,
    eval_steps=500,
    logging_dir="./logs",
    evaluation_strategy="steps",
    predict_with_generate=True,
)

# Define trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    tokenizer=processor.tokenizer,
)

# Fine-tune the model
trainer.train()

# Step 6: Evaluate the Model
from jiwer import wer

# Evaluate on a test set
test_dataset = load_dataset("mozilla-foundation/common_voice_11_0", "hi", split="test[:5%]")
test_dataset = test_dataset.cast_column("audio", Audio(sampling_rate=16000))

def evaluate(batch):
    inputs = processor(batch["audio"]["array"], sampling_rate=16000, return_tensors="pt")
    predicted_ids = model.generate(inputs.input_features)
    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
    batch["predicted"] = transcription[0]
    return batch

# Apply evaluation
test_dataset = test_dataset.map(evaluate)
wer_score = wer(test_dataset["sentence"], test_dataset["predicted"])
print(f"Word Error Rate (WER): {wer_score}")

# Step 7: Convert to TensorFlow Lite for Edge Deployment
from transformers import TFWhisperForConditionalGeneration
import tensorflow as tf

# Convert PyTorch model to TensorFlow
tf_model = TFWhisperForConditionalGeneration.from_pretrained("./whisper-dogri", from_pt=True)

# Save as TFLite model
converter = tf.lite.TFLiteConverter.from_keras_model(tf_model)
tflite_model = converter.convert()

with open("whisper-dogri.tflite", "wb") as f:
    f.write(tflite_model)

print("TFLite model saved to 'whisper-dogri.tflite'")



OSError: Incorrect path_or_model_id: './whisper-small'. Please provide either the path to a local folder or the repo_id of a model on the Hub.