In [6]:
import nltk
# Fetch the NLTK 'stopwords' corpus; it is read later in this notebook through
# nltk.corpus.stopwords when the TF-IDF vectorizer is configured.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/jupyter/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [7]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.metrics import classification_report
from nltk.corpus import stopwords

In [8]:
# Location of the CSV file that holds the dataset
data_path = 'medquad.csv'  # Update this path
# Parse the CSV into a DataFrame used by every later cell
data = pd.read_csv(filepath_or_buffer=data_path)

In [9]:
# Define text preprocessing function
def preprocess_text(text):
    """Normalize free text before it is vectorized.

    Steps, in order:
      1. lowercase the text;
      2. replace every non-word character (anything that is not a letter,
         digit or underscore) with a space;
      3. drop stand-alone single ASCII letters, wherever they occur;
      4. collapse whitespace runs to one space and trim both ends.

    Args:
        text: Raw text. ``None`` and float NaN (e.g. a missing table cell)
            are treated as empty text; any other non-string value is
            converted with ``str()``.

    Returns:
        str: The cleaned text, tokens separated by single spaces, with no
        leading or trailing whitespace.
    """
    # Missing value: None, or NaN (the only float that is not equal to itself).
    if text is None or (isinstance(text, float) and text != text):
        return ''
    # Convert to lowercase
    text = str(text).lower()
    # Replace punctuation and other non-word characters with spaces
    text = re.sub(r'\W', ' ', text)
    # Remove all single characters. The lookarounds test the neighbouring
    # whitespace without consuming it, so consecutive single letters
    # ("a b c") and letters at the very start or end are all removed.
    text = re.sub(r'(?<!\S)[a-zA-Z](?!\S)', ' ', text)
    # Substitute multiple spaces with single space and trim the ends
    return re.sub(r'\s+', ' ', text).strip()

In [10]:
# Apply preprocessing to questions and answers.
# Missing cells are replaced with '' before the cast to str, and both columns
# are treated the same way: a missing question no longer reaches
# preprocess_text as a float NaN, and a missing answer is no longer turned
# into the literal text 'nan'.
data['processed_question'] = data['question'].fillna('').astype(str).apply(preprocess_text)
data['processed_answer'] = data['answer'].fillna('').astype(str).apply(preprocess_text)

In [11]:
# 'focus_area' is the prediction target, so rows without it cannot be used:
# discard every row whose value in that column is missing.
label_columns = ['focus_area']
data = data.dropna(axis='index', subset=label_columns)

In [12]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
split_options = {'test_size': 0.2, 'random_state': 42}
train_data, test_data = train_test_split(data, **split_options)

In [13]:
# TF-IDF features: NLTK's English stop words are excluded and the vocabulary
# is capped at 1000 terms
english_stop_words = stopwords.words('english')
vectorizer = TfidfVectorizer(max_features=1000, stop_words=english_stop_words)

In [14]:
# Chain the TF-IDF vectorizer and a logistic-regression classifier into one estimator
classifier = LogisticRegression(random_state=42)
model = make_pipeline(vectorizer, classifier)

In [15]:
# Train the pipeline: the processed question text is the input, the focus area is the label
train_questions = train_data['processed_question']
train_labels = train_data['focus_area']
model.fit(train_questions, train_labels)

In [16]:
# Predict a focus area for each held-out question and score against the true labels
test_labels = test_data['focus_area']
predictions = model.predict(test_data['processed_question'])
# zero_division=0: report 0 (without warnings) where a metric would divide by zero
report = classification_report(test_labels, predictions, zero_division=0)

In [17]:
# Print the classification report: one row per focus area with its
# precision, recall, f1-score and support.
print(report)

                                                                                                                                          precision    recall  f1-score   support

                                                                                                               16p11.2 deletion syndrome       0.00      0.00      0.00         1
                                                                                                          16q24.3 microdeletion syndrome       0.00      0.00      0.00         2
                                                                                                     17q23.1q23.2 microdeletion syndrome       0.00      0.00      0.00         1
                                                                                                                   18q deletion syndrome       0.00      0.00      0.00         1
                                                                                                             