In [1]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string


In [7]:
# Download the NLTK data packages requested by this notebook: the 'punkt' and
# 'punkt_tab' tokenizer packages and the 'stopwords' corpus.
# NOTE(review): presumably these back the word_tokenize / stopwords.words calls
# made further down — confirm both punkt variants are really needed.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [9]:
# Read the unprocessed question/answer data from medquad.csv into a DataFrame
df = pd.read_csv(filepath_or_buffer='medquad.csv')

In [10]:
# Count missing values in the two text columns
df.loc[:, ['question', 'answer']].isna().sum()

Unnamed: 0,0
question,0
answer,5


In [11]:
# Discard every row that lacks a question or an answer (in place), then
# re-count the missing values to confirm none remain
df.dropna(axis='index', subset=['question', 'answer'], inplace=True)
df.loc[:, ['question', 'answer']].isna().sum()

Unnamed: 0,0
question,0
answer,0


In [12]:
# Lookup table used to expand medical abbreviations and measurement units
# into their spelled-out form (abbreviation -> expansion)
normalization_dict = dict([
    ('HTN', 'hypertension'),
    ('BP', 'blood pressure'),
    ('DM', 'diabetes mellitus'),
    ('mg/dL', 'milligrams per deciliter'),
    ('mmHg', 'millimeters of mercury'),
])

# Stopwords treated as medically relevant; these are kept in the text
preserve_stopwords = {
    'no', 'not', 'without',
    'between', 'before', 'after', 'during',
    'should', 'could',
}

# NLTK's English stopword list with the preserved words taken out
custom_stopwords = {
    word
    for word in stopwords.words('english')
    if word not in preserve_stopwords
}

def normalize_text(text, mapping=None):
    """Expand abbreviations and units in ``text`` to their spelled-out form.

    Each key of ``mapping`` is matched case-insensitively as a whole token
    (it must not be directly preceded or followed by a word character) and
    replaced by the corresponding value. Replacements are applied one key at
    a time, in the mapping's iteration order.

    Args:
        text: The string to normalize.
        mapping: Optional ``{abbreviation: expansion}`` dict. When omitted,
            the module-level ``normalization_dict`` is used.

    Returns:
        The text with every matched abbreviation replaced.
    """
    if mapping is None:
        mapping = normalization_dict

    for abbr, full in mapping.items():
        # re.escape keeps regex metacharacters in a key literal ('a.b' must
        # not match 'axb'). The lookarounds behave like \b for keys that
        # start/end with a word character, but unlike \b they also work for
        # keys that start/end with a symbol such as '%'.
        pattern = rf'(?<!\w){re.escape(abbr)}(?!\w)'
        # A callable replacement is inserted verbatim, so a backslash in the
        # expansion is never interpreted as a group reference.
        text = re.sub(pattern, lambda _match, _full=full: _full, text,
                      flags=re.IGNORECASE)
    return text

# Translation table for ASCII punctuation, built once instead of per call.
# Apostrophes are deleted so a contraction stays one token ("don't" -> "dont");
# every other mark (brackets included) becomes a space so that neighbouring
# words or numbers are not fused ("120/80" -> "120 80", not "12080").
_PUNCT_TABLE = str.maketrans(
    {mark: (None if mark == "'" else ' ') for mark in string.punctuation}
)


def preprocess_text(text):
    """Clean one question/answer string for downstream use.

    Steps, in order:
      1. Expand abbreviations and units via ``normalize_text``.
      2. Lowercase.
      3. Strip ASCII punctuation: apostrophes are removed, every other mark
         is replaced by a space.
      4. Collapse runs of whitespace.
      5. Tokenize with ``word_tokenize`` and drop tokens found in
         ``custom_stopwords``.

    Known limitation: decimal points are punctuation too, so "7.5" comes out
    as the two tokens "7 5".

    Args:
        text: The raw string to clean.

    Returns:
        The remaining tokens joined by single spaces.
    """
    text = normalize_text(text).lower()
    text = text.translate(_PUNCT_TABLE)
    # Punctuation-to-space substitution can leave runs of blanks behind
    text = re.sub(r'\s+', ' ', text).strip()
    tokens = word_tokenize(text)
    return ' '.join(token for token in tokens if token not in custom_stopwords)

# Run the cleaning function over both text columns, storing each result in a
# new column next to the original
preprocessed_columns = {
    'question': 'question_preprocessed',
    'answer': 'answer_preprocessed',
}
for source_col, target_col in preprocessed_columns.items():
    df[target_col] = df[source_col].apply(preprocess_text)

# Write the augmented frame to disk, leaving the index out of the file
df.to_csv('medquad_preprocessed.csv', index=False)
