# CS5821 - INDIVIDUAL PROJECT
### Measuring Sentence Similarity in Biomedical Domain using Deep Learning Models
#### 3. Experiment
#### 3.2 Preprocessing


In [68]:
import pandas as pd
import os
from nltk.corpus import stopwords
from nltk.tokenize.treebank import TreebankWordTokenizer, TreebankWordDetokenizer
from nltk.stem import WordNetLemmatizer
from string import punctuation


In [69]:
data_path = "/Users/aswathshakthi/PycharmProjects/MLOps/Semantic Analysis/ClinicalSTS"
df0 = pd.read_csv(
    os.path.join(data_path, "clinicalSTS.train.txt"),
    sep='\t',
    header=None,
    names=["Sent1", "Sent2", "Score"]
)
df0


Unnamed: 0,Sent1,Sent2,Score
0,Insulin NPH Human [NOVOLIN N] 100 unit/mL susp...,Insulin NPH Human [NOVOLIN N] 100 unit/mL sus...,3.50
1,"Patient arrives ambulatory, Gait steady, Hist...","Complex assessment performed, Patient arrives...",2.50
2,"Peripheral IV site, established in the right ...","Peripheral IV site, present prior to arrival,...",3.45
3,No: new confusion or inability to stay alert ...,No: new confusion or inability to stay alert ...,4.00
4,Spent 15 minutes with the patient and greater ...,"Nurse visit ten minutes, over half of which w...",3.00
...,...,...,...
745,Discussed the necessity of other members of th...,"We discussed the risks, the goals, the altern...",3.00
746,"Barriers to learning: (cultural, religious/sp...","Barriers assessed (Cultural, Religious/Spirit...",2.75
747,Unable to assess if there is a history or con...,Unable to assess if there is a history or con...,3.90
748,"Discussed the risks, benefits, alternatives, ...","Informed consent: Discussed risks, goals, alt...",3.25


In [70]:
data_path = "/Users/aswathshakthi/PycharmProjects/MLOps/Semantic Analysis/ClinicalSTS"
df1 = pd.read_csv(
    os.path.join(data_path, "clinicalSTS.test.txt"),
    sep='\t',
    header=None,
    names=["Sent1", "Sent2", "Score"]
)
df1["Score"] = pd.read_csv("/Users/aswathshakthi/PycharmProjects/MLOps/Semantic Analysis/ClinicalSTS/clinicalSTS.test.gs.sim.txt",header=None,names=["Score"])
df1

Unnamed: 0,Sent1,Sent2,Score
0,No: new confusion or inability to stay alert ...,No: new confusion or inability to stay alert ...,3.00
1,"Negative gastrointestinal review of systems, ...","Negative ears, nose, throat review of systems...",1.00
2,No: new confusion or inability to stay alert ...,No: new confusion or inability to stay alert ...,3.00
3,Procedural pause conducted to verify: correct...,"Before procedure, pause conducted and patient...",3.25
4,Instructions: Take 2 tablets on first day foll...,"Instructions: Take 1 tab TID for 3 days, then...",1.50
...,...,...,...
313,Explained diagnosis and treatment plan as dire...,Explained diagnosis and treatment plan; patie...,4.90
314,Insulin Regular Human [NOVOLIN R] 100 unit/mL ...,Lantus Solostar 100 unit/mL (3 mL) Insulin Pe...,3.50
315,"Patient discharged to home, ambulating withou...","Patient discharged to home, carried, family d...",3.25
316,The client verbalized understanding and consen...,Those present reported an adequate understand...,1.25


In [71]:
df = pd.concat([df0, df1],ignore_index=True)
df

Unnamed: 0,Sent1,Sent2,Score
0,Insulin NPH Human [NOVOLIN N] 100 unit/mL susp...,Insulin NPH Human [NOVOLIN N] 100 unit/mL sus...,3.50
1,"Patient arrives ambulatory, Gait steady, Hist...","Complex assessment performed, Patient arrives...",2.50
2,"Peripheral IV site, established in the right ...","Peripheral IV site, present prior to arrival,...",3.45
3,No: new confusion or inability to stay alert ...,No: new confusion or inability to stay alert ...,4.00
4,Spent 15 minutes with the patient and greater ...,"Nurse visit ten minutes, over half of which w...",3.00
...,...,...,...
1063,Explained diagnosis and treatment plan as dire...,Explained diagnosis and treatment plan; patie...,4.90
1064,Insulin Regular Human [NOVOLIN R] 100 unit/mL ...,Lantus Solostar 100 unit/mL (3 mL) Insulin Pe...,3.50
1065,"Patient discharged to home, ambulating withou...","Patient discharged to home, carried, family d...",3.25
1066,The client verbalized understanding and consen...,Those present reported an adequate understand...,1.25


In [72]:
def preprocess_text(series, custom_stopwords=[]):
    """
    Perform the full preprocessing pipeline on a Pandas Series:
    - Lowercasing
    - Removing numbers and punctuation
    - Tokenizing
    - Lemmatizing
    - Removing stopwords
    - Detokenizing
    """
    # Initialize tools
    tokenizer = TreebankWordTokenizer()
    detokenizer = TreebankWordDetokenizer()
    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english')).union(custom_stopwords)

    def clean_and_process(sentence):
        # Lowercase and clean
        sentence = sentence.lower()
        sentence = ''.join(c for c in sentence if not c.isdigit())
        sentence = ''.join(c for c in sentence if c not in punctuation)

        # Tokenize
        tokens = tokenizer.tokenize(sentence)

        # Lemmatize
        tokens = [lemmatizer.lemmatize(word) for word in tokens]

        # Remove stopwords
        tokens = [word for word in tokens if word not in stop_words]

        # Detokenize
        return detokenizer.detokenize(tokens)

    return series.apply(clean_and_process)


In [73]:
custom_stopwords_path = os.path.join(data_path, "stopwords")
custom_stopwords = (
    [line.strip() for line in open(custom_stopwords_path, 'r')]
    if os.path.exists(custom_stopwords_path)
    else []
)


In [74]:
df["Sent1_Processed"] = preprocess_text(df["Sent1"], custom_stopwords)
df["Sent2_Processed"] = preprocess_text(df["Sent2"], custom_stopwords)


In [75]:
sent1_tokens = sum(len(sentence.split()) for sentence in df["Sent1_Processed"])
sent2_tokens = sum(len(sentence.split()) for sentence in df["Sent2_Processed"])
print(f"Sentence 1 has {sent1_tokens} tokens.")
print(f"Sentence 2 has {sent2_tokens} tokens.")
print(f"Total tokens: {sent1_tokens + sent2_tokens}")


Sentence 1 has 17808 tokens.
Sentence 2 has 17996 tokens.
Total tokens: 35804


In [76]:
output_train_path = os.path.join(data_path, "train.csv")
output_test_path = os.path.join(data_path, "test.csv")
output_combined_path = os.path.join(data_path, "clinic_c.csv")

# Split into train and test
train_df = df.iloc[:750]
test_df = df.iloc[750:]

train_df[["Sent1", "Sent2", "Score"]].to_csv(output_train_path, index=False)
test_df[["Sent1", "Sent2", "Score"]].to_csv(output_test_path, index=False)
df[["Sent1", "Sent2", "Score"]].to_csv(output_combined_path, index=False)

print("Data exported successfully.")


Data exported successfully.
