## Imports

In [None]:
from dataclasses import dataclass
import traceback
from typing import List
import json
from tqdm import tqdm
import re
import pandas as pd
import traceback
# NOTE: the dataclass representing each row in the file is defined in the "Data" section below.




## Data 

In [None]:
@dataclass
class RelationData:
    """Typed container with one field per attribute of a relation record.

    NOTE(review): this class is not referenced by any other cell visible in
    this notebook; the field meanings below are inferred from their names
    only -- confirm against the file it is meant to mirror.
    """
    index: int  # presumably the row number of the record -- TODO confirm
    entity_1: str  # presumably the surface text of the first entity -- TODO confirm
    entity_2: str  # presumably the surface text of the second entity -- TODO confirm
    text: str  # presumably the sentence containing both entities -- TODO confirm
    prediction: str  # presumably the predicted relation type -- TODO confirm
    label: str  # presumably the reference relation type -- TODO confirm
    
# NOTE: these are absolute paths on the author's machine; adjust before running elsewhere.
# Relation corpus, read line by line as one JSON object per line.
denv_file_path = "D:/CSE498R_Resources/D3N/Dengue-Drug-Discovery-Network-D3N/Data/output/ner/jnlpba/filtered/d3n.json"
# Tab-separated relation-extraction predictions (read with pandas).
file_path = "D:/CSE498R_Resources/D3N/Dengue-Drug-Discovery-Network-D3N/Data/output/ner/jnlpba/RE/_ac/predict_outputs.txt"
# Output directory. It must end with "/" because the saving cells build file
# names as f"{output_path}<name>.csv"; without the separator the CSVs are
# written next to the directory as ".../filteredfiltered.csv".
output_path = "D:/CSE498R_Resources/D3N/Dengue-Drug-Discovery-Network-D3N/Data/output/ner/jnlpba/filtered/"

## Helper Functions

In [None]:
def set_ids(data):
    """Annotate every record in place with the numbers encoded in its "id".

    The id is split at the FIRST underscore: the prefix is the PMID and the
    remainder carries three numbers that are stored, in order, under
    "sent_no", "entity_no" and "sent_iter_no".  Both id layouts are accepted:

    * underscore separated, e.g. "34696397_5_2_0" (multi-digit numbers are
      kept whole, e.g. "1_12_3_10" -> 12, 3, 10);
    * compact single digits, e.g. "34696397_520" -> 5, 2, 0.

    Parameters
    ----------
    data : iterable of dict
        Records that each contain an "id" string.

    Returns
    -------
    The same `data` object, with "pmid", "sent_no", "entity_no" and
    "sent_iter_no" (all int) added to every record.

    Raises
    ------
    ValueError
        If an id contains no underscore or its PMID part is not an integer.
    IndexError
        If fewer than three numbers follow the PMID.
    """
    for d in tqdm(data, desc="Processing sentences"):
        # Split only once so that ids with several underscores
        # ("pmid_sent_entity_iter") no longer fail to unpack.
        pmid, sent_info = d["id"].split("_", 1)

        # Prefer whole numbers so that values >= 10 are not cut into digits.
        sent_info_list = re.findall(r"\d+", sent_info)
        if len(sent_info_list) < 3:
            # Compact layout ("520"): fall back to one digit per field.
            sent_info_list = re.findall(r"\d", sent_info)

        d["pmid"] = int(pmid)
        d["sent_no"] = int(sent_info_list[0])
        d["entity_no"] = int(sent_info_list[1])
        d["sent_iter_no"] = int(sent_info_list[2])

    return data


def explode_data(data_dict):
    """Flatten one corpus record into one row (dict) per relation.

    Parameters
    ----------
    data_dict : dict
        Record providing the keys read below: "id", "text",
        "text_with_entity_marker", "directed", "reverse" and "relation"
        (an iterable of dicts with "entity_1", "entity_1_idx",
        "entity_1_type", "entity_2", "entity_2_idx", "entity_2_type",
        "relation_id" and "relation_type").

    Returns
    -------
    list of dict
        One row per entry of data_dict["relation"], in the same order.  Each
        row repeats the record-level fields, adds the four integers parsed
        from the id ("pmid", "sent_no", "entity_no", "sent_iter_no") and
        exposes the record's "reverse" flag under the key "reversed".
        A record without relations yields an empty list.
    """

    def split_pmid_and_sentence_no(record_id):
        """Parse "pmid_sent_entity_iter" into four ints.

        Missing or non-numeric parts become 0.  If int() rejects a part,
        the id is reported and all four values fall back to 0.
        """
        parts = record_id.split('_')
        try:
            numbers = tuple(
                int(parts[pos]) if len(parts) > pos and parts[pos].isdigit() else 0
                for pos in range(4)
            )
        except ValueError:
            # str.isdigit() accepts characters (e.g. superscript digits) that
            # int() cannot convert.  Only the raw id is printed here: the
            # parsed values do not exist when parsing failed.
            print(f"PMID Before: {record_id}")
            traceback.print_exc()
            numbers = (0, 0, 0, 0)
        return numbers

    relations = list(data_dict["relation"])
    if not relations:
        return []

    # The id is the same for every relation of the record, so parse it once.
    pmid, sentence_no, entity_no, iter_no = split_pmid_and_sentence_no(data_dict["id"])

    result = []
    for relation in relations:
        row = {
            "pmid": pmid,
            "sent_no": sentence_no,
            "entity_no": entity_no,
            "sent_iter_no": iter_no,
            "text": data_dict["text"],
            "text_with_entity_marker": data_dict["text_with_entity_marker"],
            "entity_1": relation["entity_1"],
            "entity_1_idx": relation["entity_1_idx"],
            "entity_1_type": relation["entity_1_type"],
            "entity_2": relation["entity_2"],
            "entity_2_idx": relation["entity_2_idx"],
            "entity_2_type": relation["entity_2_type"],
            "relation_id": relation["relation_id"],
            "relation_type": relation["relation_type"],
            "directed": data_dict["directed"],
            "reversed": data_dict["reverse"]
        }
        result.append(row)

    return result


## Implementation

In [None]:
# Load the tab-separated predictions; the "prediction" column is used below.
prediction_df = pd.read_csv(file_path, delimiter='\t')
print(f"Number of predictions extracted: {len(prediction_df)} rows. \n")

# Extracting relation corpus (one JSON object per line)
data = []
with open(denv_file_path, 'r', encoding='utf-8') as jsonl_file:
    for line in jsonl_file:
        line = line.strip()
        if line:  # tolerate blank lines, e.g. a trailing newline at end of file
            data.append(json.loads(line))
print(f"Example: {data[0]}\n")

# One output row per relation of every record.
all_rows = []
for d in tqdm(data, desc="Processing.."):
    all_rows.extend(explode_data(d))

relation_df = pd.DataFrame(all_rows)
print(f"Number of relations: {len(relation_df)} rows. \n")
print(f"Shape of relation_df: {relation_df.shape}. \n")

# Predictions are matched to relations purely by row position, so a count
# mismatch would silently mislabel rows or fill them with NaN.
if len(relation_df) != len(prediction_df):
    raise ValueError(
        f"Cannot assign predictions: {len(prediction_df)} predictions "
        f"for {len(relation_df)} relations."
    )

print("Assigning Predictions to Relation Df")
# Overwrites the corpus-provided relation_type with the model prediction.
relation_df['relation_type'] = prediction_df['prediction']
relation_df.head(10)

Number of predictions extracted: 66490 rows. 

Example: {'id': '34696397_5_2_0', 'text': 'Here in this review article, we will be discussing different stages of the Dengue virus infection cycle inside mammalian host cells and how host proteins are exploited by the virus in the course of infection as well as how the host counteracts the virus by eliciting different antiviral responses.', 'text_with_entity_marker': 'Here in this review article, we will be discussing different stages of the Dengue virus infection cycle inside [E1]mammalian host cells [/E1]and how [E2]host proteins [/E2]are exploited by the virus in the course of infection as well as how the host counteracts the virus by eliciting different antiviral responses.', 'relation': [{'relation_type': 'Negative', 'relation_id': 2, 'entity_1': 'mammalian host cells', 'entity_1_idx': [111, 132], 'entity_1_idx_in_text_with_entity_marker': [111, 141], 'entity_1_type': '@CELL_TYPE', 'entity_1_type_id': 0, 'entity_2': 'host proteins', '

Processing..: 100%|██████████| 66490/66490 [00:00<00:00, 239499.15it/s]


Number of relations: 66490 rows. 

Shape of relation_df: (66490, 16). 

Assigning Predictions to Relation Df


Unnamed: 0,pmid,sent_no,entity_no,sent_iter_no,text,text_with_entity_marker,entity_1,entity_1_idx,entity_1_type,entity_2,entity_2_idx,entity_2_type,relation_id,relation_type,directed,reversed
0,34696397,5,2,0,"Here in this review article, we will be discus...","Here in this review article, we will be discus...",mammalian host cells,"[111, 132]",@CELL_TYPE,host proteins,"[140, 154]",@PROTEIN,2,negative,False,False
1,38649998,8,4,0,This review aims to summarize the complex inte...,This review aims to summarize the complex inte...,"host genome,","[198, 210]",@DNA,"host genes,","[228, 239]",@DNA,2,negative,False,False
2,38649998,8,4,1,This review aims to summarize the complex inte...,This review aims to summarize the complex inte...,"host genome,","[198, 210]",@DNA,"viral and host mRNAs,","[255, 276]",@RNA,2,negative,False,False
3,38649998,8,4,2,This review aims to summarize the complex inte...,This review aims to summarize the complex inte...,"host genome,","[198, 210]",@DNA,"viral proteins,","[369, 384]",@PROTEIN,2,negative,False,False
4,38649998,8,4,3,This review aims to summarize the complex inte...,This review aims to summarize the complex inte...,"host genes,","[228, 239]",@DNA,"viral and host mRNAs,","[255, 276]",@RNA,2,negative,False,False
5,38649998,8,4,4,This review aims to summarize the complex inte...,This review aims to summarize the complex inte...,"host genes,","[228, 239]",@DNA,"viral proteins,","[369, 384]",@PROTEIN,2,negative,False,False
6,38649998,8,4,5,This review aims to summarize the complex inte...,This review aims to summarize the complex inte...,"viral and host mRNAs,","[255, 276]",@RNA,"viral proteins,","[369, 384]",@PROTEIN,2,negative,False,False
7,11893341,3,2,0,Difference maps indicate the location of the s...,Difference maps indicate the location of the [...,small membrane protein M,"[45, 70]",@PROTEIN,E dimers.,"[109, 117]",@PROTEIN,2,negative,False,False
8,11893341,4,3,0,"The structure suggests that flaviviruses, and ...","The structure suggests that flaviviruses, and ...",distal beta barrels,"[115, 135]",@PROTEIN,domain II,"[138, 148]",@PROTEIN,2,structural,False,False
9,11893341,4,3,1,"The structure suggests that flaviviruses, and ...","The structure suggests that flaviviruses, and ...",distal beta barrels,"[115, 135]",@PROTEIN,glycoprotein E,"[155, 170]",@PROTEIN,2,negative,False,False


## Saving

In [4]:
# Save the merged relation table as "<output_path>/filtered.csv".
# The separator is normalised here so the file lands inside the output
# directory whether or not output_path ends with a slash.
output_dir = output_path.rstrip("/\\")
filtered_csv_path = f"{output_dir}/filtered.csv"
relation_df.to_csv(filtered_csv_path, index=False)
print(f"Dataframe Saved: '{filtered_csv_path}'.")

Dataframe Saved: 'D:/CSE498R_Resources/D3N/Dengue-Drug-Discovery-Network-D3N/Data/d3n_processed_data/filtered.csv.


In [6]:
# Preview the first rows of relation_df after the predictions were assigned.
relation_df.head()

Unnamed: 0,pmid,sent_no,entity_no,sent_iter_no,text,text_with_entity_marker,entity_1,entity_1_idx,entity_1_type,entity_2,entity_2_idx,entity_2_type,relation_id,relation_type,directed,reversed
0,34696397,5,2,0,"Here in this review article, we will be discus...","Here in this review article, we will be discus...",mammalian host cells,"[111, 132]",@CELL_TYPE,host proteins,"[140, 154]",@PROTEIN,2,negative,False,False
1,38649998,8,4,0,This review aims to summarize the complex inte...,This review aims to summarize the complex inte...,"host genome,","[198, 210]",@DNA,"host genes,","[228, 239]",@DNA,2,negative,False,False
2,38649998,8,4,1,This review aims to summarize the complex inte...,This review aims to summarize the complex inte...,"host genome,","[198, 210]",@DNA,"viral and host mRNAs,","[255, 276]",@RNA,2,negative,False,False
3,38649998,8,4,2,This review aims to summarize the complex inte...,This review aims to summarize the complex inte...,"host genome,","[198, 210]",@DNA,"viral proteins,","[369, 384]",@PROTEIN,2,negative,False,False
4,38649998,8,4,3,This review aims to summarize the complex inte...,This review aims to summarize the complex inte...,"host genes,","[228, 239]",@DNA,"viral and host mRNAs,","[255, 276]",@RNA,2,negative,False,False


## Filter Conditions

In [7]:
# Boolean row masks over relation_df, keyed on the (entity_1_type, entity_2_type) pair.
first_type = relation_df["entity_1_type"]
second_type = relation_df["entity_2_type"]

# Protein-protein pairs.
ppi_conditions = (first_type == "@PROTEIN") & (second_type == "@PROTEIN")

# Every pair drawn from {DNA, RNA, PROTEIN} except protein-protein.
gi_types = ["@DNA", "@RNA", "@PROTEIN"]
gi_conditions = first_type.isin(gi_types) & second_type.isin(gi_types) & ~ppi_conditions

# Every pair drawn from {CELL_TYPE, RNA, PROTEIN} except protein-protein.
# NOTE(review): RNA-RNA and RNA-PROTEIN pairs satisfy both this mask and
# gi_conditions, so those rows appear in both subsets -- confirm intended.
metabolic_types = ["@CELL_TYPE", "@RNA", "@PROTEIN"]
metabolic_conditions = (
    first_type.isin(metabolic_types) & second_type.isin(metabolic_types) & ~ppi_conditions
)


# One independent copy of relation_df per entity-type category.
ppi_df = relation_df.loc[ppi_conditions].copy()
gi_df = relation_df.loc[gi_conditions].copy()
metabolic_df = relation_df.loc[metabolic_conditions].copy()

# Every row whose relation_type is "negative", regardless of category.
negative_condition = relation_df["relation_type"].eq("negative")
negative_df = relation_df.loc[negative_condition]

# Per-category subsets with the "negative" rows removed.
ppi_df_filtered = ppi_df.loc[ppi_df["relation_type"].ne("negative")].copy()
gi_df_filtered = gi_df.loc[gi_df["relation_type"].ne("negative")].copy()
metabolic_df_filtered = metabolic_df.loc[metabolic_df["relation_type"].ne("negative")].copy()

# Write each subset to its own CSV inside the output directory.  The
# separator is normalised so the files land in the directory whether or not
# output_path ends with a slash.
output_dir = output_path.rstrip("/\\")
ppi_df_filtered.to_csv(f"{output_dir}/ppi_filtered.csv", index=False)
gi_df_filtered.to_csv(f"{output_dir}/gi_filtered.csv", index=False)
metabolic_df_filtered.to_csv(f"{output_dir}/metabolic_filtered.csv", index=False)
negative_df.to_csv(f"{output_dir}/negative_filtered.csv", index=False)
# Number of rows whose relation_type is "negative".
print(len(negative_df))

46335


## EDA

In [15]:
# PPI subset: row count before vs. after removing "negative" rows, then a preview.
print(f"Total PPI: {len(ppi_df)}, total positive PPI: {len(ppi_df_filtered)}")
ppi_df_filtered.head()

Total PPI: 38378, total positive PPI: 13442


Unnamed: 0,pmid,sent_no,entity_no,sent_iter_no,text,text_with_entity_marker,entity_1,entity_1_idx,entity_1_type,entity_2,entity_2_idx,entity_2_type,relation_id,relation_type,directed,reversed
8,11893341,4,3,0,"The structure suggests that flaviviruses, and ...","The structure suggests that flaviviruses, and ...",distal beta barrels,"[115, 135]",@PROTEIN,domain II,"[138, 148]",@PROTEIN,2,structural,False,False
12,23389466,3,3,0,DENV binds to its receptor molecules mediated ...,DENV binds to its [E1]receptor molecules [/E1]...,receptor molecules,"[18, 37]",@PROTEIN,"viral envelope (E) protein,","[56, 83]",@PROTEIN,2,structural,False,False
13,23389466,3,3,1,DENV binds to its receptor molecules mediated ...,DENV binds to its [E1]receptor molecules [/E1]...,receptor molecules,"[18, 37]",@PROTEIN,virus-receptor complex,"[117, 140]",@PROTEIN,2,structural,False,False
14,23389466,3,3,2,DENV binds to its receptor molecules mediated ...,DENV binds to its receptor molecules mediated ...,"viral envelope (E) protein,","[56, 83]",@PROTEIN,virus-receptor complex,"[117, 140]",@PROTEIN,2,structural,False,False
27,25157370,8,3,0,The viral entry process is mediated by viral p...,The viral entry process is mediated by [E1]vir...,viral proteins,"[39, 54]",@PROTEIN,cellular receptor molecules,"[58, 86]",@PROTEIN,2,structural,False,False


In [16]:
# GI subset: row count before vs. after removing "negative" rows, then a preview.
print(f"Total GI: {len(gi_df)}, total positive GI: {len(gi_df_filtered)}")
gi_df_filtered.head()

Total GI: 13016, total positive GI: 3674


Unnamed: 0,pmid,sent_no,entity_no,sent_iter_no,text,text_with_entity_marker,entity_1,entity_1_idx,entity_1_type,entity_2,entity_2_idx,entity_2_type,relation_id,relation_type,directed,reversed
28,25157370,8,3,1,The viral entry process is mediated by viral p...,The viral entry process is mediated by [E1]vir...,viral proteins,"[39, 54]",@PROTEIN,viral RNA,"[181, 191]",@RNA,2,structural,False,False
29,25157370,8,3,2,The viral entry process is mediated by viral p...,The viral entry process is mediated by viral p...,cellular receptor molecules,"[58, 86]",@PROTEIN,viral RNA,"[181, 191]",@RNA,2,structural,False,False
30,37175867,1,4,0,Mature virions are composed of an icosahedral ...,Mature virions are composed of an icosahedral ...,envelope (E) and membrane (M) proteins,"[55, 94]",@PROTEIN,approximately,"[166, 180]",@RNA,2,structural,False,False
31,37175867,1,4,1,Mature virions are composed of an icosahedral ...,Mature virions are composed of an icosahedral ...,envelope (E) and membrane (M) proteins,"[55, 94]",@PROTEIN,11 kb genomic RNA,"[180, 198]",@RNA,2,structural,False,False
33,37175867,1,4,3,Mature virions are composed of an icosahedral ...,Mature virions are composed of an icosahedral ...,approximately,"[166, 180]",@RNA,11 kb genomic RNA,"[180, 198]",@RNA,2,structural,False,False


In [17]:
# Metabolic subset: row count before vs. after removing "negative" rows, then a preview.
print(f"Total Metabolic: {len(metabolic_df)}, total positive Metabolic: {len(metabolic_df_filtered)}")
metabolic_df_filtered.head()

Total Metabolic: 12328, total positive Metabolic: 3376


Unnamed: 0,pmid,sent_no,entity_no,sent_iter_no,text,text_with_entity_marker,entity_1,entity_1_idx,entity_1_type,entity_2,entity_2_idx,entity_2_type,relation_id,relation_type,directed,reversed
28,25157370,8,3,1,The viral entry process is mediated by viral p...,The viral entry process is mediated by [E1]vir...,viral proteins,"[39, 54]",@PROTEIN,viral RNA,"[181, 191]",@RNA,2,structural,False,False
29,25157370,8,3,2,The viral entry process is mediated by viral p...,The viral entry process is mediated by viral p...,cellular receptor molecules,"[58, 86]",@PROTEIN,viral RNA,"[181, 191]",@RNA,2,structural,False,False
30,37175867,1,4,0,Mature virions are composed of an icosahedral ...,Mature virions are composed of an icosahedral ...,envelope (E) and membrane (M) proteins,"[55, 94]",@PROTEIN,approximately,"[166, 180]",@RNA,2,structural,False,False
31,37175867,1,4,1,Mature virions are composed of an icosahedral ...,Mature virions are composed of an icosahedral ...,envelope (E) and membrane (M) proteins,"[55, 94]",@PROTEIN,11 kb genomic RNA,"[180, 198]",@RNA,2,structural,False,False
33,37175867,1,4,3,Mature virions are composed of an icosahedral ...,Mature virions are composed of an icosahedral ...,approximately,"[166, 180]",@RNA,11 kb genomic RNA,"[180, 198]",@RNA,2,structural,False,False


In [21]:
# Rows containing CELL-LINES. Hypothesis: Are they all negative interactions?
# A row qualifies when either of its two entity types is "@CELL_LINE".
cell_line_conditions = (
    relation_df[["entity_1_type", "entity_2_type"]].eq("@CELL_LINE").any(axis=1)
)

cell_line_df = relation_df.loc[cell_line_conditions]

# Split the cell-line rows into "negative" and everything else.
cell_line_is_negative = cell_line_df["relation_type"].eq("negative")
cell_line_df_negative = cell_line_df.loc[cell_line_is_negative]
cell_line_df_filtered = cell_line_df.loc[~cell_line_is_negative]

print(f"Length of Cell_Line: {len(cell_line_df)}, Length of Cell_Line Negative: {len(cell_line_df_negative)}, Length of Cell_Line Positive: {len(cell_line_df_filtered)}")

Length of Cell_Line: 4965, Length of Cell_Line Negative: 4251, Length of Cell_Line Positive: 714


In [22]:
# Preview the cell-line rows whose relation_type is not "negative".
cell_line_df_filtered.head()

Unnamed: 0,pmid,sent_no,entity_no,sent_iter_no,text,text_with_entity_marker,entity_1,entity_1_idx,entity_1_type,entity_2,entity_2_idx,entity_2_type,relation_id,relation_type,directed,reversed
192,30463971,8,4,1,The expression of RHA or RHA-K417R mutant prot...,The expression of [E1]RHA [/E1]or RHA-K417R mu...,RHA,"[18, 22]",@PROTEIN,RHA-knockdown cells,"[86, 106]",@CELL_LINE,2,enzyme,False,False
194,30463971,8,4,3,The expression of RHA or RHA-K417R mutant prot...,The expression of RHA or [E1]RHA-K417R mutant ...,RHA-K417R mutant protein,"[25, 50]",@PROTEIN,RHA-knockdown cells,"[86, 106]",@CELL_LINE,2,enzyme,False,False
196,30463971,8,4,5,The expression of RHA or RHA-K417R mutant prot...,The expression of RHA or RHA-K417R mutant prot...,RHA-knockdown cells,"[86, 106]",@CELL_LINE,RHA,"[195, 199]",@PROTEIN,2,enzyme,False,False
359,27313500,5,4,0,Pretreatment of B16 cells with synthetase inhi...,Pretreatment of [E1]B16 cells [/E1]with synthe...,B16 cells,"[16, 26]",@CELL_LINE,B16,"[86, 90]",@CELL_TYPE,2,enzyme,False,False
368,27313500,7,2,0,"Furthermore, GM3 was colocalized with DENV vir...","Furthermore, GM3 was colocalized with [E1]DENV...",DENV viral replication complex,"[38, 69]",@PROTEIN,B16 cells.,"[110, 119]",@CELL_LINE,2,enzyme,False,False
