In [1]:
import functools
import json
import os
import pickle

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Read the MedQuAD question/answer records from the pre-converted JSON file.
dataset_path = "medquads.json"  # Make sure this file is already converted
with open(dataset_path, "r", encoding="utf-8") as source:
    data = json.loads(source.read())

# Split every record into its question text and its answer text.
questions = [record["question"] for record in data]
answers = [record["answer"] for record in data]

# Keep both columns side by side so a row position maps a question to its answer.
df = pd.DataFrame(dict(question=questions, answer=answers))

# Fit TF-IDF on the question column; X holds one row per question.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df["question"])

# Persist the fitted vectorizer first, then the question matrix.
for artifact_path, artifact in (
    ("tfidf_vectorizer.pkl", vectorizer),
    ("tfidf_matrix.pkl", X),
):
    with open(artifact_path, "wb") as sink:
        pickle.dump(artifact, sink)

print("Model trained and saved successfully!")

# Load the model and test it with a similarity threshold
@functools.lru_cache(maxsize=4)
def _load_pickle_cached(path, mtime):
    """Unpickle the file at *path*.

    *mtime* is not used in the body; it is part of the cache key so that a
    file rewritten on disk is loaded again instead of served stale.
    """
    # SECURITY: pickle.load can execute arbitrary code. Only load artifact
    # files produced by this script, never files from an untrusted source.
    with open(path, "rb") as artifact_file:
        return pickle.load(artifact_file)


def _load_artifact(path):
    """Return the unpickled artifact at *path*, re-reading it only if the file changed."""
    return _load_pickle_cached(path, os.path.getmtime(path))


def get_answer(user_query, threshold=0.3):  # Set a similarity threshold
    """Return the stored answer whose question is most similar to *user_query*.

    The query is vectorized with the saved TF-IDF vectorizer and compared by
    cosine similarity against the saved question matrix. The answer is read
    from the module-level ``df`` at the position of the best-scoring row, so
    ``df`` must be row-aligned with the saved matrix.

    Args:
        user_query: The question text to look up.
        threshold: Minimum cosine similarity the best match must reach.

    Returns:
        The matching answer from ``df``, or a fixed apology message when the
        query shares no terms with the vocabulary, the matrix has no rows, or
        the best score is below *threshold*.

    Raises:
        FileNotFoundError: If either saved artifact file is missing.
    """
    no_match = "Sorry, I couldn't find an exact answer. Please try rephrasing your question."

    # Cached loads: the artifacts are not re-read and unpickled on every query.
    vectorizer = _load_artifact("tfidf_vectorizer.pkl")
    X = _load_artifact("tfidf_matrix.pkl")

    user_vec = vectorizer.transform([user_query])
    # An all-zero query vector (blank or fully out-of-vocabulary query) scores
    # 0 against every row; argmax would then pick row 0 arbitrarily, which a
    # threshold <= 0 would let through as a bogus answer.
    if user_vec.nnz == 0:
        return no_match

    similarities = cosine_similarity(user_vec, X).flatten()
    # np.argmax raises on an empty array; treat "no stored questions" as no match.
    if similarities.size == 0:
        return no_match

    best_match_idx = np.argmax(similarities)
    best_score = similarities[best_match_idx]

    if best_score < threshold:
        return no_match

    return df.iloc[best_match_idx]["answer"]

# Example test: send one sample question through get_answer and show the reply.
user_question = "What are the symptoms of diabetes?"
print(f"User Question: {user_question}")
print(f"Answer: {get_answer(user_question)}")

Model trained and saved successfully!
User Question: What are the symptoms of diabetes?
Answer: What are the symptoms of brittle diabetes? The main symptom of brittle diabetes is severe instability of blood glucose levels with frequent and unpredictable episodes of hypoglycemia and/or ketoacidosis that cause a disruption of daily activities. Three clinical presentations have been described: Predominant hyperglycemia with recurrent ketoacidosis, Predominant hypoglycemia, and Mixed hyper- and hypoglycemia. Patients with brittle diabetes have wide swings in their blood sugar levels and often experience differing blood sugar responses to the same dose and type of insulin. Complications such as neuropathy, nephropathy, and retinopathy are common. Most patients are females in their twenties of thirties, though any age or gender can be affected.
