In [None]:
####### Preprocessing Starts HERE ##########

In [None]:
import csv
import spacy

In [None]:
# Mount Google Drive inside the Colab runtime at /content/drive; the dataset
# path used below lives under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
def read_tsv_file(file_path):
    """Read a UTF-8 tab-separated file and return all of its rows.

    Args:
        file_path: Path of the TSV file to read.

    Returns:
        A list with one entry per record; each entry is the list of column
        strings of that record. The header line, if the file has one, is
        returned as the first entry. An empty file yields an empty list.
    """
    # newline='' hands raw line endings to the csv module, as its documentation
    # requires; otherwise line breaks inside quoted fields get rewritten by
    # universal-newline translation before the parser sees them.
    with open(file_path, 'r', encoding='utf-8', newline='') as tsv_file:
        return list(csv.reader(tsv_file, delimiter='\t'))

# Load the validation split from the mounted Drive. Every line of the file
# becomes one list of column strings; the first row is treated as the header
# by the export loop further down.
dataset = read_tsv_file("/content/drive/MyDrive/biocreative/dataset/BioCreativeVIII3_ValSet.tsv")

In [None]:
# convert offsets to word spans
def _find_span_tokens(doc, text, start_char, end_char):
    """Locate the first and last token covered by the range [start_char, end_char).

    Returns a ``(start_token, end_token)`` pair; either element is ``None``
    when the corresponding offset cannot be aligned with the tokenization.
    """
    start_token = None
    end_token = None
    last_index = len(doc) - 1

    for token in doc:
        if token.idx == start_char:
            start_token = token

        if token.idx >= end_char:
            # This token begins at or after the exclusive end offset, so the
            # span stops at the previous token. If the token begins *past* the
            # end offset, the offset must sit on whitespace; otherwise it falls
            # in the middle of a token and the range is left unresolved.
            # The `token.i > 0` guard avoids doc[-1] wrapping to the last token.
            if token.i > 0 and (token.idx == end_char or text[end_char].isspace()):
                end_token = doc[token.i - 1]
            break

        if token.i == last_index:
            # The end offset lies inside or after the final token.
            end_token = token
            break

    return start_token, end_token


def convert_span_to_words(text, span, pipeline=None):
    """Convert character-offset ranges into the text of the tokens they cover.

    ``span`` is a comma-separated list of ``start-end`` character ranges into
    ``text``; several ranges describe the parts of one disjoint span. The end
    offset is treated as exclusive. The tokens of every range that can be
    aligned with the tokenization are joined with single spaces, the parts are
    concatenated in order, and trailing spaces and ``/;.,`` characters are
    stripped from the result.

    Args:
        text: The sentence the offsets refer to.
        span: Offsets such as ``"14-32"`` or ``"34-40,50-56"``. Parts that are
            not a numeric ``start-end`` range (for example ``"NA"``) are
            ignored.
        pipeline: Optional callable that turns ``text`` into a sequence of
            tokens exposing ``.idx`` (character offset), ``.i`` (token index)
            and ``.text``. Defaults to the module-level ``nlp`` spaCy pipeline.

    Returns:
        A one-element list holding the span text, or ``None`` when no range
        could be resolved.
    """
    # Parse the ranges first so the (comparatively expensive) pipeline is only
    # run for rows that actually carry offsets.
    char_ranges = []
    for span_part in span.split(','):
        bounds = span_part.split('-')
        if len(bounds) < 2:
            continue
        try:
            char_ranges.append((int(bounds[0]), int(bounds[1])))
        except ValueError:
            # Non-numeric bounds are treated like any other unusable part.
            continue

    if not char_ranges:
        return None

    tokenizer = nlp if pipeline is None else pipeline
    doc = tokenizer(text)

    pieces = []
    for start_char, end_char in char_ranges:
        start_token, end_token = _find_span_tokens(doc, text, start_char, end_char)
        # Compare against None explicitly instead of relying on token truthiness,
        # and reject ranges that resolve to an empty token slice.
        if start_token is None or end_token is None or end_token.i < start_token.i:
            continue
        span_words = doc[start_token.i: end_token.i + 1]
        pieces.append(' '.join(token.text for token in span_words))

    if not pieces:
        return None

    # Finalize once, after every part has been processed, so that a repeated
    # or unresolved last part cannot duplicate or discard the result.
    fullspan = ' '.join(pieces).rstrip(' ').rstrip('/;.,').rstrip(' ')
    return [fullspan] if fullspan else None

nlp = spacy.load("en_core_web_sm")  # Load the English language model

# Column layout of the raw rows:
#   0 = ObservationID, 1 = Text, 2 = HPO Term, 3 = Polarity, 4 = span offsets.
# `dataset` is deliberately left untouched (the columns are read from it
# instead of replacing it with a two-column copy) so this cell can be re-run
# without reloading the file.
polarity = [row[3] for row in dataset]
HPO = [row[2] for row in dataset]
Obs = [row[0] for row in dataset]

# Write one output row per resolved span; rows whose offsets cannot be
# resolved get "NA" in the Spans column. The encoding matches the UTF-8 input.
with open('validation.tsv', 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile, delimiter='\t')
    for idx, row in enumerate(dataset):
        text, span = row[1], row[4]

        if idx == 0:
            # Header row: copy the column names and rename the offsets column.
            # It is written before any span conversion, so the header text is
            # never sent through the spaCy pipeline.
            writer.writerow([Obs[idx],text,HPO[idx],polarity[idx], "Spans"])
            continue

        span_data = convert_span_to_words(text, span)
        if span_data:
            for span_text in span_data:
                writer.writerow([Obs[idx],text,HPO[idx],polarity[idx], span_text])
        else:
            writer.writerow([Obs[idx],text,HPO[idx],polarity[idx], "NA"])

print("CSV file created successfully")

CSV file created successfully


In [None]:
# Sanity check: print the column names found in the header line of
# validation.tsv. DictReader reads the first line of the file when
# `fieldnames` is accessed.
with open('/content/validation.tsv', 'r') as file:
    reader = csv.DictReader(file,delimiter='\t')
    print(reader.fieldnames)

['ObservationID', 'Text', 'HPO Term', 'Polarity', 'Spans']


In [None]:
# add the tags based on the Polarity

# Prefix every resolved span with a tag derived from the Polarity column:
#   Polarity == 'NA' -> 'KEYF: <span>'
#   Polarity == 'X'  -> 'NORMF: <span>'
# Rows whose span is 'NA' are left as 'NA' for both polarities, so an
# unresolved span never turns into the pseudo-span 'NORMF: NA'.
with open('/content/validation.tsv', 'r', newline='', encoding='utf-8') as file:
    reader = csv.DictReader(file,delimiter='\t')
    rows = list(reader)

    for row in rows:
        polarity = row['Polarity']
        span = row['Spans']

        if span == 'NA':
            continue

        if polarity == 'NA':
            row['Spans'] = 'KEYF: ' + str(span)
        elif polarity == 'X':
            row['Spans'] = 'NORMF: ' + str(span)

# `reader.fieldnames` was cached while the file was being read, so it is still
# available after the input file has been closed.
with open('Modified_Validation.tsv', 'w', newline='', encoding='utf-8') as file:
    writer = csv.DictWriter(file, fieldnames=reader.fieldnames,delimiter='\t')
    writer.writeheader()
    writer.writerows(rows)

In [None]:
# Next step: collapse rows that share the same "Text" into a single row, merging all of their spans into one field separated by "; ".

In [None]:
import csv

# Load the tagged rows. newline='' lets the csv module handle line endings itself.
with open("/content/Modified_Validation.tsv", "r", newline="", encoding="utf-8") as csvfile:
    reader = csv.DictReader(csvfile, delimiter='\t')
    data = list(reader)

# Group the rows by their "Text" value. The first row seen for a text supplies
# the ObservationID; the "Spans" and "HPO Term" values of every row with that
# text are collected in input order. Keying the dict by text is what removes
# the duplicates, so no separate de-duplication pass is needed.
merged_data = {}
for row in data:
    text = row["Text"]
    entry = merged_data.setdefault(
        text,
        {"ObservationID": [row["ObservationID"]], "Spans": [], "HPO Term": []},
    )
    entry["Spans"].append(row["Spans"])
    entry["HPO Term"].append(row["HPO Term"])

# Write one row per unique text, joining the collected values with "; ".
fieldnames = ["ObservationID","Text", "Spans", "HPO Term"]
with open("MergedData.tsv", "w", newline="", encoding="utf-8") as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames,delimiter='\t')
    writer.writeheader()
    for text, values in merged_data.items():
        writer.writerow({
            "ObservationID": "".join(values["ObservationID"]),
            "Text": text,
            "Spans": "; ".join(values["Spans"]),
            "HPO Term": "; ".join(values["HPO Term"]),
        })

# The message names the file that is actually written (a .tsv, not a .csv).
print("Merged data saved to MergedData.tsv")


Merged data saved to MergedData.csv


In [None]:
# Preview only the first few merged entries; printing the whole dictionary
# floods the cell output and gets truncated anyway.
print(list(merged_data.items())[:5])

dict_items([('MOUTH: Mildly high arched palate. Normal lips and tongue.', {'ObservationID': ['000f780da593b746a7cc4753de22a2ce'], 'Spans': ['KEYF: high arched palate', 'NORMF: Normal tongue', 'NORMF: Normal lips'], 'HPO Term': ['HP:0000218', 'HP:0030809', 'HP:0000159']}), ('EYES: Prominent infraorbital creases.', {'ObservationID': ['001d6c29f4e6ab6d37e2a4b0b84db25c'], 'Spans': ['KEYF: Prominent infraorbital creases'], 'HPO Term': ['HP:0100876']}), ('EYES: Mild epicanthus, mild up-slant', {'ObservationID': ['018fa5440489fec2f79b944c77c14a2b'], 'Spans': ['KEYF: epicanthus', 'KEYF: EYES : mild up - slant'], 'HPO Term': ['HP:0000286', 'HP:0000582']}), ('NEUROLOGIC: cooperative with exam and follows all instructions in both English and Spanish.', {'ObservationID': ['028ec43d4e6aabeac24997ad73c4b4e0'], 'Spans': ['NA'], 'HPO Term': ['NA']}), ('EYES: Bluish hue to sclerae.', {'ObservationID': ['033f7020c6330defb4a15d018bf8634e'], 'Spans': ['KEYF: Bluish hue to sclerae'], 'HPO Term': ['HP:00005

In [None]:
# This cell removes the extra spaces around punctuation marks in the span text.

import csv

def process_span(span):
    """Close up the padding spaces next to punctuation marks in ``span``.

    The rules are applied one after another, in the order listed:
    the space before and after ``-`` and ``/``, the space before ``%``,
    ``)``, ``:``, ``,`` and ``'``, and the space after ``+``.

    Args:
        span: The span text to tidy up.

    Returns:
        The span text with those spaces removed.
    """
    # (padded form, tight form) pairs; str.replace is a no-op when the padded
    # form does not occur, so no membership test is needed beforehand.
    spacing_fixes = (
        (" -", "-"),
        ("- ", "-"),
        (" /", "/"),
        ("/ ", "/"),
        (" %", "%"),
        (" )", ")"),
        (" :", ":"),
        ("+ ", "+"),
        (" ,", ","),
        (" '", "'"),
    )

    cleaned = span
    for padded, tight in spacing_fixes:
        cleaned = cleaned.replace(padded, tight)
    return cleaned

def main():
    """Clean the punctuation spacing of the "Spans" column of the merged TSV.

    Reads ``/content/MergedData.tsv``, passes every row's "Spans" value
    through ``process_span`` and writes all columns to
    ``Final_Validation.tsv`` with the same header.
    """
    input_file = "/content/MergedData.tsv"
    output_file = "Final_Validation.tsv"
    column_to_process = "Spans"

    # Read the TSV file. The header is taken from the reader itself rather
    # than from the first data row, so an input that has a header but no data
    # rows still produces a valid (header-only) output instead of failing.
    with open(input_file, "r", newline='', encoding='utf-8') as csvfile:
        reader = csv.DictReader(csvfile,delimiter='\t')
        fieldnames = list(reader.fieldnames or [])
        rows = list(reader)

    for row in rows:
        row[column_to_process] = process_span(row[column_to_process])

    # Write the modified data to a new TSV file
    with open(output_file, "w", newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames,delimiter='\t')
        writer.writeheader()
        writer.writerows(rows)

# Run the cleanup when this code is executed directly; inside a notebook cell
# __name__ is "__main__" as well, so running the cell calls main().
if __name__ == "__main__":
    main()


In [None]:
####### Preprocessing Ends HERE ##########