In [1]:
import sys
import subprocess
import pkgutil

print("Python:", sys.version)
try:
    import numpy as np
    print("Current NumPy:", np.__version__)
except Exception as e:
    print("NumPy import failed:", e)

# If NumPy 2.x is installed, install a NumPy <2 to be compatible with some extensions
import re
version = None
try:
    import numpy as np
    version = np.__version__
except Exception:
    pass

if version and re.match(r"^2\.", version):
    print("Detected NumPy 2.x; installing numpy<2. This may take a minute...")
    subprocess.check_call([sys.executable, "-m", "pip", "install", "numpy<2"], shell=False)
    print("numpy<2 installed. Please restart the kernel and re-run the notebook cells.")
else:
    print("NumPy version is compatible or not installed; no action taken.")

Python: 3.12.7 | packaged by Anaconda, Inc. | (main, Oct  4 2024, 08:22:19) [Clang 14.0.6 ]
Current NumPy: 1.26.4
NumPy version is compatible or not installed; no action taken.


In [2]:

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score


In [3]:
import pandas as pd
import random

random.seed(42)

DIAGNOSES = [
    "BPPV",
    "RECURRENT_BPPV",
    "VESTIBULAR_NEURITIS",
    "LABYRINTHITIS",
    "MENIERES_DISEASE",
    "PPPD",
    "OTOTOXICITY",
    "VESTIBULAR_MIGRAINE",
    "SCDS",
    "ALTERNOBARIC_VERTIGO",
    "VESTIBULAR_PAROXYSMIA",
    "CENTRAL_CAUSES",
    "PERILYMPH_FISTULA",
    "AUTOIMMUNE"
]

def generate_row(diagnosis):
    row = {}

    # Defaults
    row["onset_type"] = random.choice(["sudden", "gradual"])
    row["vertigo_duration"] = random.choice(["seconds", "minutes", "hours", "days", "months"])
    row["motion_type"] = random.choice(["spinning", "back_forth"])
    row["pattern_type"] = random.choice(["episodic", "persistent"])
    row["episode_duration"] = random.choice(["seconds", "minutes", "hours"])
    row["remission_type"] = random.choice(["complete", "partial", "none"])

    triggers = [
        "trigger_head_movement",
        "trigger_pressure_change",
        "trigger_loud_sound",
        "trigger_visual",
        "trigger_air_travel",
        "trigger_anxiety"
    ]

    for t in triggers:
        row[t] = 0

    row["recent_head_injury"] = 0

    # Ear symptoms
    row["hearing_loss"] = 0
    row["tinnitus"] = 0
    row["aural_fullness"] = 0
    row["otalgia"] = 0
    row["otorrhea"] = 0
    row["hearing_laterality"] = "none"
    row["hearing_onset"] = "none"
    row["hearing_progression"] = "none"

    # Neuro
    row["cerebellar_symptoms"] = 0
    row["cranial_nerve_symptoms"] = 0
    row["unsteady_gait"] = 0

    row["ototoxic_drug_use"] = 0

    # ---- DIAGNOSIS RULES ----
    if diagnosis == "BPPV":
        row.update({
            "onset_type": "sudden",
            "pattern_type": "episodic",
            "episode_duration": "seconds",
            "motion_type": "spinning",
            "trigger_head_movement": 1,
            "remission_type": "complete"
        })

    elif diagnosis == "MENIERES_DISEASE":
        row.update({
            "pattern_type": "episodic",
            "episode_duration": "hours",
            "hearing_loss": 1,
            "tinnitus": 1,
            "aural_fullness": 1,
            "hearing_laterality": "unilateral",
            "hearing_progression": "fluctuating"
        })

    elif diagnosis == "VESTIBULAR_NEURITIS":
        row.update({
            "pattern_type": "persistent",
            "vertigo_duration": "days",
            "motion_type": "spinning"
        })

    elif diagnosis == "LABYRINTHITIS":
        row.update({
            "hearing_loss": 1,
            "tinnitus": 1,
            "pattern_type": "persistent"
        })

    elif diagnosis == "CENTRAL_CAUSES":
        row.update({
            "cerebellar_symptoms": 1,
            "cranial_nerve_symptoms": 1,
            "unsteady_gait": 1,
            "pattern_type": "persistent"
        })

    elif diagnosis == "OTOTOXICITY":
        row.update({
            "hearing_loss": 1,
            "tinnitus": 1,
            "ototoxic_drug_use": 1
        })

    elif diagnosis == "VESTIBULAR_MIGRAINE":
        row.update({
            "trigger_visual": 1,
            "trigger_anxiety": 1
        })

    row["diagnosis"] = diagnosis
    return row


rows = []
SAMPLES_PER_CLASS = 120

for d in DIAGNOSES:
    for _ in range(SAMPLES_PER_CLASS):
        rows.append(generate_row(d))

df = pd.DataFrame(rows)

df.to_csv("vertigo_training_data.csv", index=False)
print("Dataset created:", df.shape)
df.head()


Dataset created: (1680, 26)


Unnamed: 0,onset_type,vertigo_duration,motion_type,pattern_type,episode_duration,remission_type,trigger_head_movement,trigger_pressure_change,trigger_loud_sound,trigger_visual,...,otalgia,otorrhea,hearing_laterality,hearing_onset,hearing_progression,cerebellar_symptoms,cranial_nerve_symptoms,unsteady_gait,ototoxic_drug_use,diagnosis
0,sudden,seconds,spinning,episodic,seconds,complete,1,0,0,0,...,0,0,none,none,none,0,0,0,0,BPPV
1,sudden,months,spinning,episodic,seconds,complete,1,0,0,0,...,0,0,none,none,none,0,0,0,0,BPPV
2,sudden,minutes,spinning,episodic,seconds,complete,1,0,0,0,...,0,0,none,none,none,0,0,0,0,BPPV
3,sudden,minutes,spinning,episodic,seconds,complete,1,0,0,0,...,0,0,none,none,none,0,0,0,0,BPPV
4,sudden,hours,spinning,episodic,seconds,complete,1,0,0,0,...,0,0,none,none,none,0,0,0,0,BPPV


In [4]:
df = pd.read_csv("../../../../Downloads/vertigo_training_data.csv")

X = df.drop("diagnosis", axis=1)
y = df["diagnosis"]


In [5]:
categorical_cols = X.select_dtypes(include="object").columns
numeric_cols = X.select_dtypes(exclude="object").columns

preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_cols),
        ("num", "passthrough", numeric_cols)
    ]
)


In [6]:
model = RandomForestClassifier(
    n_estimators=200,
    max_depth=10,
    random_state=42
)

pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("model", model)
    ]
)


In [7]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

pipeline.fit(X_train, y_train)

preds = pipeline.predict(X_test)
print("Accuracy:", accuracy_score(y_test, preds))


Accuracy: 0.5625


In [8]:
def predict_with_confidence(input_dict):
    input_df = pd.DataFrame([input_dict])
    probs = pipeline.predict_proba(input_df)[0]
    classes = pipeline.classes_

    best_idx = np.argmax(probs)

    return {
        "predictedDiagnosis": classes[best_idx],
        "confidenceScore": float(probs[best_idx]),
        "top3": sorted(
            zip(classes, probs),
            key=lambda x: x[1],
            reverse=True
        )[:3]
    }


In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score


In [10]:
# Load dataset
df = pd.read_csv("../../../../Downloads/vertigo_training_data.csv")

# Separate features & target
X = df.drop("diagnosis", axis=1)
y = df["diagnosis"]

# Encode categorical columns
label_encoders = {}
for col in X.columns:
    if X[col].dtype == "object":
        le = LabelEncoder()
        X[col] = le.fit_transform(X[col])
        label_encoders[col] = le

# Encode target
target_encoder = LabelEncoder()
y = target_encoder.fit_transform(y)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


In [11]:
model = RandomForestClassifier(
    n_estimators=200,
    max_depth=14,
    min_samples_split=4,
    random_state=42
)

model.fit(X_train, y_train)


In [12]:
y_pred = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n")
print(classification_report(y_test, y_pred, target_names=target_encoder.classes_))


Accuracy: 0.5476190476190477

Classification Report:

                       precision    recall  f1-score   support

 ALTERNOBARIC_VERTIGO       0.23      0.25      0.24        24
           AUTOIMMUNE       0.07      0.04      0.05        24
                 BPPV       1.00      1.00      1.00        24
       CENTRAL_CAUSES       1.00      1.00      1.00        24
        LABYRINTHITIS       1.00      1.00      1.00        24
     MENIERES_DISEASE       1.00      1.00      1.00        24
          OTOTOXICITY       1.00      1.00      1.00        24
    PERILYMPH_FISTULA       0.09      0.08      0.09        24
                 PPPD       0.09      0.08      0.09        24
       RECURRENT_BPPV       0.07      0.08      0.08        24
                 SCDS       0.13      0.12      0.13        24
  VESTIBULAR_MIGRAINE       1.00      1.00      1.00        24
  VESTIBULAR_NEURITIS       0.69      0.92      0.79        24
VESTIBULAR_PAROXYSMIA       0.08      0.08      0.08        24


In [13]:
sample = X_test.iloc[[0]]
pred = model.predict(sample)
confidence = max(model.predict_proba(sample)[0])

print("Predicted Diagnosis:", target_encoder.inverse_transform(pred)[0])
print("Confidence Score:", round(confidence, 2))


Predicted Diagnosis: ALTERNOBARIC_VERTIGO
Confidence Score: 0.54


In [14]:
DIAGNOSES = [
    "BPPV",
    "Recurrent BPPV",
    "Vestibular Neuritis",
    "Labyrinthitis Right",
    "Labyrinthitis Left",
    "Menieres Left",
    "Menieres Right",
    "Vestibular Migraine",
    "PPPD",
    "Central Causes"
]

FEATURES = {
    "onset_sudden": [0, 1],
    "episode_duration_sec": [5, 30, 120, 600, 3600, 86400],
    "episodic": [0, 1],
    "persistent_days": [0, 1, 3, 7, 30],
    "trigger_head_movement": [0, 1],
    "trigger_sound": [0, 1],
    "trigger_pressure": [0, 1],
    "visual_motion_sensitivity": [0, 1],
    "nausea": [0, 1],
    "vomiting": [0, 1],
    "hearing_loss": [0, 1],
    "hearing_loss_progressive": [0, 1],
    "tinnitus": [0, 1],
    "aural_fullness": [0, 1],
    "unilateral": [0, 1],
    "laterality": ["none", "left", "right"],
    "migraine_features": [0, 1],
    "neuro_red_flags": [0, 1],
    "ototoxic_exposure": [0, 1]
}


CLINICAL_RULES = {
    "BPPV": {
    "onset_sudden": 1,
    "episodic": 1,
    "episode_duration_sec": 30,
    "trigger_head_movement": 1,
    "hearing_loss": 0,
    "neuro_red_flags": 0
},
    "Central Causes": {
    "persistent_days": 7,
    "episodic": 0,
    "neuro_red_flags": 1,
    "hearing_loss": 0
},
    "Menieres Left": {
        "onset": "sudden",
        "episodic": 1,
        "duration": "hours",
        "hearing_loss": 1,
        "tinnitus": 1,
        "aural_fullness": 1,
        "laterality": "left"
    },
    "Vestibular Migraine": {
        "episodic": 1,
        "duration": "minutes",
        "visual_trigger": 1,
        "anxiety": 1,
        "hearing_loss": 0
    }
}


In [15]:
import random
import pandas as pd

NUM_SAMPLES = 1000000

def generate_patient(diagnosis):
    base = CLINICAL_RULES.get(diagnosis, {})
    patient = {}

    for feature, values in FEATURES.items():
        if feature in base:
            patient[feature] = base[feature]
        else:
            patient[feature] = random.choice(values)

    patient["diagnosis"] = diagnosis
    return patient

data = []

for _ in range(NUM_SAMPLES):
    diag = random.choice(DIAGNOSES)
    data.append(generate_patient(diag))

df = pd.DataFrame(data)
df.head()


Unnamed: 0,onset_sudden,episode_duration_sec,episodic,persistent_days,trigger_head_movement,trigger_sound,trigger_pressure,visual_motion_sensitivity,nausea,vomiting,hearing_loss,hearing_loss_progressive,tinnitus,aural_fullness,unilateral,laterality,migraine_features,neuro_red_flags,ototoxic_exposure,diagnosis
0,1,30,1,0,1,1,0,1,1,1,0,0,1,0,1,left,1,0,1,BPPV
1,0,5,0,30,0,1,1,1,1,0,1,0,1,0,0,none,0,0,0,Labyrinthitis Left
2,1,5,0,30,1,1,1,0,0,0,0,0,1,1,1,left,1,0,1,Labyrinthitis Right
3,1,600,0,1,1,0,1,0,0,0,1,1,0,0,1,right,1,1,1,Recurrent BPPV
4,1,30,0,7,0,0,0,0,0,1,0,1,1,0,1,right,1,1,0,Central Causes


In [16]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Encode categorical features
df_encoded = df.copy()
encoders = {}

for col in df.columns:
    if df[col].dtype == 'object':
        le = LabelEncoder()
        df_encoded[col] = le.fit_transform(df[col])
        encoders[col] = le

X = df_encoded.drop("diagnosis", axis=1)
y = df_encoded["diagnosis"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

model = RandomForestClassifier(
    n_estimators=300,
    max_depth=20,
    class_weight="balanced",
    random_state=42
)

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.95      1.00      0.98     19891
           1       0.87      0.99      0.92     19825
           2       0.16      0.13      0.14     20184
           3       0.16      0.12      0.14     19887
           4       0.89      0.99      0.94     20006
           5       0.16      0.12      0.14     19949
           6       0.16      0.12      0.14     20237
           7       0.16      0.13      0.14     19956
           8       0.40      0.81      0.53     19977
           9       0.16      0.13      0.14     20088

    accuracy                           0.45    200000
   macro avg       0.41      0.45      0.42    200000
weighted avg       0.41      0.45      0.42    200000



In [17]:
import joblib

joblib.dump(model, "vertease_model_v1.pkl")
joblib.dump(encoders, "encoders.pkl")


['encoders.pkl']