In [7]:
import os
import pandas as pd
import numpy as np
import re
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm import tqdm 
import warnings

# Install a blanket "ignore" filter so no warnings are shown while the data
# is processed.
warnings.filterwarnings('ignore')

# --- CONFIGURATION ---
# Absolute, machine-specific location of the raw MedQuAD answers CSV. Because
# the path is absolute it does not depend on where the notebook is started.
INPUT_FILE_NAME = r"C:\Users\yashm\OneDrive\Desktop\NullClass\Task2\data\All-2479-Answers-retrieved-from-MedQuAD.csv"
# Generated artifacts; these relative paths are resolved against the current
# working directory.
QA_KNOWLEDGE_BASE_PATH = '../data/qa_knowledge_base.csv'
VECTORIZER_MODEL_PATH = '../models/vectorizer.pkl'
ANSWER_VECTORS_PATH = '../models/answer_vectors.npy'

# Make sure the output folders exist before anything is written into them.
# ANSWER_VECTORS_PATH shares its folder with VECTORIZER_MODEL_PATH.
for _artifact_path in (QA_KNOWLEDGE_BASE_PATH, VECTORIZER_MODEL_PATH):
    os.makedirs(os.path.dirname(_artifact_path), exist_ok=True)


# ====================================================================
# STEP 1: DATA LOADING AND STRUCTURING
# ====================================================================

print(f"Loading data from: {INPUT_FILE_NAME}")

try:
    # Only the read itself is guarded; it is the call that raises when the
    # file is missing.
    df = pd.read_csv(INPUT_FILE_NAME)
except FileNotFoundError:
    # Tell the user which path was tried, then re-raise so execution stops
    # with the original traceback.
    print(f"Error: Dataset file '{INPUT_FILE_NAME}' not found.")
    print("Please ensure the file is at that exact path.")
    raise
else:
    print(f"Successfully loaded {len(df)} initial records.")

# Patterns for the tagged layout "Question: ... URL: ... Answer: ...".
# DOTALL lets every field span several lines. They are compiled once here
# because the function is called for every record.
_QUESTION_RE = re.compile(r'Question:\s*(.*?)(?=\s*URL:)', re.DOTALL)
_URL_RE = re.compile(r'URL:\s*(.*?)(?=\s*Answer:)', re.DOTALL)
_ANSWER_RE = re.compile(r'Answer:\s*(.*)', re.DOTALL)
_TRAILING_PARENS_RE = re.compile(r'\s*\([^)]*\)$')


def _captured_text(match):
    """Return the stripped first group of *match*, or None if absent or empty."""
    if match is None:
        return None
    return match.group(1).strip() or None


def extract_qa_parts(text):
    """Split a combined 'Answer' cell into its question, answer and source URL.

    The text is expected to carry the explicit tags ``Question:``, ``URL:``
    and ``Answer:`` in that order. A trailing parenthesised group on the
    question (e.g. an "Also called: ..." note) is removed.

    Args:
        text: Raw cell value. Anything that is not a ``str`` (NaN, None,
            numbers, ...) is treated as missing.

    Returns:
        A ``(question, answer, url)`` tuple. Each element is a stripped string,
        or ``None`` when the field is missing or empty, so that callers can
        drop incomplete records with ``dropna``.
    """
    # Missing cells arrive as NaN/None; other non-string values cannot be
    # searched by the regexes, so they are treated as missing too.
    if not isinstance(text, str):
        return None, None, None

    question = _captured_text(_QUESTION_RE.search(text))
    url = _captured_text(_URL_RE.search(text))
    answer = _captured_text(_ANSWER_RE.search(text))

    # Drop a trailing "(...)" group from the question; if nothing is left the
    # question counts as missing.
    if question and question.endswith(')'):
        question = _TRAILING_PARENS_RE.sub('', question).strip() or None

    return question, answer, url

# Apply the extraction function to create structured columns
print("Structuring Q&A data...")
# Parse every raw record once and assemble the three new columns in a single
# DataFrame. This avoids constructing a pd.Series per row and, because the
# column names and index are given explicitly, also works for a zero-row input.
df[['question', 'answer', 'source']] = pd.DataFrame(
    [extract_qa_parts(raw_text) for raw_text in df['Answer']],
    columns=['question', 'answer', 'source'],
    index=df.index,
)

# Clean up the DataFrame: records without a question or an answer are unusable,
# and the raw columns are no longer needed once the structured ones exist.
df.dropna(subset=['question', 'answer'], inplace=True)
df.drop(columns=['AnswerID', 'Answer'], errors='ignore', inplace=True)
print(f"Structured and cleaned data now contains {len(df)} Q&A pairs.")

# Save the structured knowledge base as CSV (without the index column)
df[['question', 'answer', 'source']].to_csv(QA_KNOWLEDGE_BASE_PATH, index=False)
print(f"Saved Knowledge Base CSV to {QA_KNOWLEDGE_BASE_PATH}")


# ====================================================================
# STEP 2: VECTORIZER TRAINING AND MODEL SAVING
# ====================================================================

# The retrieval index is built from the 'answer' column, so there is nothing
# to fit when no rows survived the cleaning step.
if df.empty:
    raise SystemExit("Data is empty after cleaning. Cannot train model.")

print("\nStarting TF-IDF Vectorizer training...")
# 1. Build the vectorizer: English stop words are filtered and the vocabulary
#    is capped at 5000 features.
vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')

# 2. Learn the vocabulary from the ANSWER text and vectorize it in one pass
answer_vectors = vectorizer.fit_transform(df['answer'])
print("Vectorizer training complete.")


# 3. Persist the fitted model and the vectorized answers

# The fitted vectorizer is pickled to disk
with open(VECTORIZER_MODEL_PATH, mode='wb') as model_file:
    pickle.dump(vectorizer, model_file)
print(f"Saved TF-IDF Vectorizer to {VECTORIZER_MODEL_PATH}")

# The answer matrix is converted with toarray() and written as a .npy file
np.save(file=ANSWER_VECTORS_PATH, arr=answer_vectors.toarray())
print(f"Saved Answer Vectors to {ANSWER_VECTORS_PATH}")

print("\nTraining and data preparation successful! 🎉 You can now proceed to develop the retrieval and chatbot modules.")

Loading data from: C:\Users\yashm\OneDrive\Desktop\NullClass\Task2\data\All-2479-Answers-retrieved-from-MedQuAD.csv
Successfully loaded 2479 initial records.
Structuring Q&A data...
Structured and cleaned data now contains 2479 Q&A pairs.
Saved Knowledge Base CSV to ../data/qa_knowledge_base.csv

Starting TF-IDF Vectorizer training...
Vectorizer training complete.
Saved TF-IDF Vectorizer to ../models/vectorizer.pkl
Saved Answer Vectors to ../models/answer_vectors.npy

Training and data preparation successful! 🎉 You can now proceed to develop the retrieval and chatbot modules.
