In [2]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string

# Make sure the NLTK data packages are present locally before they are used.
# NOTE(review): no tokenizer is called in the visible cells (text is split
# with str.split), so 'punkt' may be unnecessary — confirm before removing.
nltk.download('punkt')
# Backs stopwords.words('english') used by DataCleaner.
nltk.download('stopwords')
# Presumably the corpus WordNetLemmatizer reads from — verify against NLTK docs.
# As the last expression of the cell, this call's return value is displayed.
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     /home/sam_curryokee/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/sam_curryokee/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/sam_curryokee/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:

# Read the question/answer table from the CSV file in the working directory.
df = pd.read_csv(filepath_or_buffer="medquad.csv")
# Show the last five rows as a quick sanity check of the load.
df.tail(n=5)

Unnamed: 0,question,answer,source,focus_area
16407,What is (are) Diabetic Neuropathies: The Nerve...,Focal neuropathy appears suddenly and affects ...,NIDDK,Diabetic Neuropathies: The Nerve Damage of Dia...
16408,How to prevent Diabetic Neuropathies: The Nerv...,The best way to prevent neuropathy is to keep ...,NIDDK,Diabetic Neuropathies: The Nerve Damage of Dia...
16409,How to diagnose Diabetic Neuropathies: The Ner...,Doctors diagnose neuropathy on the basis of sy...,NIDDK,Diabetic Neuropathies: The Nerve Damage of Dia...
16410,What are the treatments for Diabetic Neuropath...,The first treatment step is to bring blood glu...,NIDDK,Diabetic Neuropathies: The Nerve Damage of Dia...
16411,What to do for Diabetic Neuropathies: The Nerv...,- Diabetic neuropathies are nerve disorders ca...,NIDDK,Diabetic Neuropathies: The Nerve Damage of Dia...


In [4]:
# Count the missing entries in every column of the raw frame and print the
# per-column totals.
missing_values = df.isna().sum()
print(missing_values)

question       0
answer         5
source         0
focus_area    14
dtype: int64


In [5]:
class DataCleaner:
    """Chainable text-cleaning helper for string columns of a DataFrame.

    Every cleaning method returns ``self`` so calls can be chained; the
    cleaned frame is retrieved with :meth:`get_clean_data`.

    The cleaner works on its own copy of the frame passed in, so the
    caller's DataFrame is never modified. Cells that are not strings
    (e.g. NaN/None) are left untouched by the text transforms instead of
    raising.
    """

    # Translation table that deletes every character in string.punctuation;
    # built once instead of on every remove_punctuation call.
    _PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

    def __init__(self, df):
        """Store a private copy of ``df`` and set up the NLTK resources.

        Args:
            df: DataFrame whose columns will be cleaned.
        """
        # Copy so that dropping rows / rewriting columns does not silently
        # alter the frame the caller still holds.
        self.df = df.copy()
        self.stop_words = set(stopwords.words('english'))
        self.lemmatizer = WordNetLemmatizer()

    def _map_text(self, column, func):
        """Apply ``func`` to each string cell of ``column``; skip other cells."""
        self.df[column] = self.df[column].apply(
            lambda value: func(value) if isinstance(value, str) else value)
        return self

    def lowercase(self, column):
        """Lower-case the text in ``column``. Returns ``self``."""
        self.df[column] = self.df[column].str.lower()
        return self

    def remove_punctuation(self, column):
        """Delete all ``string.punctuation`` characters from ``column``.

        Characters are removed, not replaced by spaces, so hyphenated words
        are joined together. Returns ``self``.
        """
        self.df[column] = self.df[column].str.translate(self._PUNCTUATION_TABLE)
        return self

    def remove_stopwords(self, column):
        """Drop whitespace-separated tokens that are in ``self.stop_words``.

        The comparison is exact and case-sensitive, so call
        :meth:`lowercase` first if the stop-word set is lower-case.
        Remaining tokens are re-joined with single spaces. Returns ``self``.
        """
        return self._map_text(
            column,
            lambda text: ' '.join(
                word for word in text.split() if word not in self.stop_words))

    def lemmatize(self, column):
        """Replace each whitespace-separated token by its lemma.

        Uses ``self.lemmatizer.lemmatize`` with its default arguments and
        re-joins the tokens with single spaces. Returns ``self``.
        """
        return self._map_text(
            column,
            lambda text: ' '.join(
                self.lemmatizer.lemmatize(word) for word in text.split()))

    def handle_missing_data(self, subset=None):
        """Drop rows that contain missing values.

        Args:
            subset: Optional list of column labels to inspect. With the
                default ``None`` a row is dropped if *any* column is
                missing, which matches the previous behaviour.

        Returns:
            ``self``, for chaining.
        """
        self.df = self.df.dropna(subset=subset)
        return self

    def get_clean_data(self):
        """Return the cleaner's DataFrame in its current state."""
        return self.df

In [6]:
# Build the cleaner around the loaded frame.
cleaner = DataCleaner(df)

# Run the cleaning stages one after another (same order as a single fluent
# chain): drop incomplete rows, then normalise both text columns stage by stage.
cleaner.handle_missing_data()
cleaner.lowercase('question').lowercase('answer')
cleaner.remove_punctuation('question').remove_punctuation('answer')
cleaner.remove_stopwords('question').remove_stopwords('answer')

# Grab the resulting frame for the following cells.
clean_df = cleaner.get_clean_data()

In [7]:
# Display the cleaned frame; as the cell's last expression it is rendered
# by the notebook.
clean_df


Unnamed: 0,question,answer,source,focus_area
0,glaucoma,glaucoma group diseases damage eyes optic nerv...,NIHSeniorHealth,Glaucoma
1,causes glaucoma,nearly 27 million people glaucoma leading caus...,NIHSeniorHealth,Glaucoma
2,symptoms glaucoma,symptoms glaucoma glaucoma develop one eyes co...,NIHSeniorHealth,Glaucoma
3,treatments glaucoma,although openangle glaucoma cannot cured usual...,NIHSeniorHealth,Glaucoma
4,glaucoma,glaucoma group diseases damage eyes optic nerv...,NIHSeniorHealth,Glaucoma
...,...,...,...,...
16407,diabetic neuropathies nerve damage diabetes,focal neuropathy appears suddenly affects spec...,NIDDK,Diabetic Neuropathies: The Nerve Damage of Dia...
16408,prevent diabetic neuropathies nerve damage dia...,best way prevent neuropathy keep blood glucose...,NIDDK,Diabetic Neuropathies: The Nerve Damage of Dia...
16409,diagnose diabetic neuropathies nerve damage di...,doctors diagnose neuropathy basis symptoms phy...,NIDDK,Diabetic Neuropathies: The Nerve Damage of Dia...
16410,treatments diabetic neuropathies nerve damage ...,first treatment step bring blood glucose level...,NIDDK,Diabetic Neuropathies: The Nerve Damage of Dia...


In [8]:
# Persist the cleaned rows as a JSON array with one object per row.
clean_df.to_json(path_or_buf='clean_data.json', orient='records')
