In [None]:
# Install and import required libraries
!pip install pandas
import pandas as pd
import numpy as np
import re


In [None]:
# Mount Google Drive at /content/drive so that the CSV files under
# '/content/drive/My Drive/...' used by the following cells can be read and written.
from google.colab import drive
drive.mount('/content/drive')


In [None]:
# Load the dataset that will be added to PrimeKG. Each celltype is processed from its own
# CSV file (this one is ex.csv).
# Manually add the celltype to nodes.csv before proceeding.
cell_csv_path = '/content/drive/My Drive/primekg_files/ex.csv'
cell = pd.read_csv(cell_csv_path)

def looks_like_date_or_nan(name):
    """Tell whether a gene-name cell should be discarded.

    Returns True when ``name`` is missing (``pd.isna``) or is a string shaped like
    a date, i.e. one or two digits, a hyphen, then three or four letters
    (for example '12-Sept', as found in the mathys dataset). Every other value,
    including non-string values, returns False.
    """
    # Missing entries are rejected outright.
    if pd.isna(name):
        return True
    # Only text can match the date-like pattern.
    if not isinstance(name, str):
        return False
    return re.match(r"^\d{1,2}-[A-Za-z]{3,4}$", name) is not None


# Positions of the (gene name, p-value, log fold change) columns for each comparison
column_indices1 = [0, 1, 4]   # no-pathology vs pathology
column_indices2 = [11, 12, 15]  # no-pathology vs early-pathology
column_indices3 = [22, 23, 26]  # early-pathology vs late-pathology

# no-pathology vs pathology: slice the three columns, then discard rows whose
# gene name is missing or looks like a date
cell_1 = cell.iloc[:, column_indices1]
bad_name_1 = cell_1['Unnamed: 0'].apply(looks_like_date_or_nan)
cell_1 = cell_1[~bad_name_1]

# no-pathology vs early-pathology
cell_2 = cell.iloc[:, column_indices2]
bad_name_2 = cell_2['Unnamed: 11'].apply(looks_like_date_or_nan)
cell_2 = cell_2[~bad_name_2]

# early-pathology vs late-pathology
cell_3 = cell.iloc[:, column_indices3]
bad_name_3 = cell_3['Unnamed: 22'].apply(looks_like_date_or_nan)
cell_3 = cell_3[~bad_name_3]

In [None]:
# Keep only rows whose adjusted p-value is present and below 0.05, then discard the
# p-value column because nothing downstream reads it.
pvals_1 = cell_1['IndModel.adj.pvals']
significant_1 = (pvals_1 < 0.05) & ~np.isnan(pvals_1)
cell_1 = cell_1[significant_1].copy()
cell_1 = cell_1.drop(columns='IndModel.adj.pvals')

pvals_2 = cell_2['IndModel.adj.pvals.1']
significant_2 = (pvals_2 < 0.05) & ~np.isnan(pvals_2)
cell_2 = cell_2[significant_2].copy()
cell_2 = cell_2.drop(columns='IndModel.adj.pvals.1')

pvals_3 = cell_3['IndModel.adj.pvals.2']
significant_3 = (pvals_3 < 0.05) & ~np.isnan(pvals_3)
cell_3 = cell_3[significant_3].copy()
cell_3 = cell_3.drop(columns='IndModel.adj.pvals.2')

In [None]:
# Convert fold change values to numeric and categorize into edge types
def _fold_change_to_relation(fold_change, comparison):
    """Turn a fold-change column into '<comparison> up/down mathys' relation labels.

    Parameters
    ----------
    fold_change : pd.Series
        Raw fold-change values (numbers or numeric strings).
    comparison : str
        Comparison name used as the label prefix, e.g. 'no-pathology vs pathology'.

    Returns
    -------
    pd.Series
        Same index as ``fold_change``. Values > 0 become '<comparison> up mathys';
        every other numeric value (including exactly 0) becomes
        '<comparison> down mathys'. Values that cannot be parsed as numbers become
        NaN instead of being labelled 'down'. Values that already are one of the two
        labels are passed through unchanged, so running the cell twice is harmless.
    """
    up_label = f'{comparison} up mathys'
    down_label = f'{comparison} down mathys'

    # Labels written by an earlier run of this cell must survive the numeric coercion below.
    already_labelled = fold_change.isin([up_label, down_label])

    numeric_fc = pd.to_numeric(fold_change, errors='coerce')
    relation = pd.Series(
        np.where(numeric_fc > 0, up_label, down_label),
        index=fold_change.index,
        dtype=object,
    )
    # NaN > 0 is False, so without this mask unparseable values would be labelled 'down'.
    relation = relation.where(numeric_fc.notna())
    return relation.where(~already_labelled, fold_change)


cell_1['IndModel.FC'] = _fold_change_to_relation(cell_1['IndModel.FC'], 'no-pathology vs pathology')
cell_2['IndModel.FC.1'] = _fold_change_to_relation(cell_2['IndModel.FC.1'], 'no-pathology vs early-pathology')
cell_3['IndModel.FC.2'] = _fold_change_to_relation(cell_3['IndModel.FC.2'], 'early-pathology vs late-pathology')

# A row without a usable fold change has no direction, so it cannot become an edge.
cell_1 = cell_1.dropna(subset=['IndModel.FC']).copy()
cell_2 = cell_2.dropna(subset=['IndModel.FC.1']).copy()
cell_3 = cell_3.dropna(subset=['IndModel.FC.2']).copy()

#print df head to check
#print (cell_1.head())
#print (cell_2.head())
#print (cell_3.head())

In [None]:
# Load existing PrimeKG nodes data
nodes = pd.read_csv('/content/drive/My Drive/primekg_files/nodes.csv')

# Name -> ID lookup for nodes that already exist.
# NOTE(review): the lookup is not restricted to node_type == 'gene/protein', so a gene
# symbol that equals the name of another kind of node would receive that node's ID --
# confirm whether nodes.csv can contain such name clashes.
node_id_dict = nodes.set_index('node_name')['node_id'].to_dict()

# Map gene names in cell_1 to existing node IDs; genes without a match are left as NaN
cell_1['node_id'] = cell_1['Unnamed: 0'].map(node_id_dict)

# Continue the "mathys<N>" numbering after any such IDs already present in nodes.csv
# (starting at 1 when there are none), so an ID that is already in use is never re-issued.
existing_mathys_numbers = pd.to_numeric(
    nodes['node_id'].astype(str).str.extract(r'^mathys(\d+)$', expand=False),
    errors='coerce',
)
mathys_index = 1 if existing_mathys_numbers.isna().all() else int(existing_mathys_numbers.max()) + 1

# Determine the starting index for new nodes based on the existing data
new_node_index = nodes['node_index'].max() + 1

# Create exactly one new node per distinct unmatched gene, in order of first appearance,
# so a gene listed more than once does not end up with several nodes.
missing_genes = cell_1.loc[cell_1['node_id'].isna(), 'Unnamed: 0'].drop_duplicates()
new_rows = [
    {
        'node_index': new_node_index + offset,
        'node_id': f"mathys{mathys_index + offset}",
        'node_type': 'gene/protein',
        'node_name': gene_name,
        'node_source': 'mathys et al. 2019'
    }
    for offset, gene_name in enumerate(missing_genes)
]
mathys_index += len(new_rows)
new_node_index += len(new_rows)

# Give the previously unmatched rows their newly created IDs
new_node_ids = {new_row['node_name']: new_row['node_id'] for new_row in new_rows}
cell_1['node_id'] = cell_1['node_id'].where(
    cell_1['node_id'].notna(),
    cell_1['Unnamed: 0'].map(new_node_ids),
)

# Combine the newly created node data with the existing nodes DataFrame
nodes_1 = pd.concat([nodes, pd.DataFrame(new_rows)], ignore_index=True)

# Output information about the updated nodes for verification
print("Final mathys_index:", mathys_index)
print("check nodes:\n", pd.DataFrame(new_rows).head())


Final mathys_index: 753
check nodes:
    node_index  node_id     node_type node_name         node_source
0      129381  mathys1  gene/protein   NGFRAP1  mathys et al. 2019
1      129382  mathys2  gene/protein    ATPIF1  mathys et al. 2019
2      129383  mathys3  gene/protein   C14orf2  mathys et al. 2019
3      129384  mathys4  gene/protein   FAM153C  mathys et al. 2019
4      129385  mathys5  gene/protein    ATP5G3  mathys et al. 2019


In [None]:
# Update the node ID dictionary with the latest nodes data from nodes_1
node_id_dict = nodes_1.set_index('node_name')['node_id'].to_dict()

# Map gene names in cell_2 to existing node IDs (including the nodes created for cell_1);
# genes without a match are left as NaN
cell_2['node_id'] = cell_2['Unnamed: 11'].map(node_id_dict)

# Determine the starting index for new nodes based on the updated nodes data
new_node_index = nodes_1['node_index'].max() + 1

# Continue the "mathys<N>" numbering right after the highest number present in nodes_1.
# Deriving it from the data (instead of pasting the value printed by an earlier run)
# keeps the IDs unique when the input files change.
existing_mathys_numbers = pd.to_numeric(
    nodes_1['node_id'].astype(str).str.extract(r'^mathys(\d+)$', expand=False),
    errors='coerce',
)
mathys_index = 1 if existing_mathys_numbers.isna().all() else int(existing_mathys_numbers.max()) + 1

# Create exactly one new node per distinct unmatched gene, in order of first appearance
missing_genes = cell_2.loc[cell_2['node_id'].isna(), 'Unnamed: 11'].drop_duplicates()
new_rows = [
    {
        'node_index': new_node_index + offset,
        'node_id': f"mathys{mathys_index + offset}",
        'node_type': 'gene/protein',
        'node_name': gene_name,
        'node_source': 'mathys et al. 2019'
    }
    for offset, gene_name in enumerate(missing_genes)
]
mathys_index += len(new_rows)
new_node_index += len(new_rows)

# Give the previously unmatched rows their newly created IDs
new_node_ids = {new_row['node_name']: new_row['node_id'] for new_row in new_rows}
cell_2['node_id'] = cell_2['node_id'].where(
    cell_2['node_id'].notna(),
    cell_2['Unnamed: 11'].map(new_node_ids),
)

# Combine the newly created node data with the previously updated nodes DataFrame (nodes_1)
nodes_2 = pd.concat([nodes_1, pd.DataFrame(new_rows)], ignore_index=True)

# Output information about the updated nodes for verification
print("Final mathys_index for cell_2 processing:", mathys_index)
print("check nodes:\n", pd.DataFrame(new_rows).head())


Final mathys_index for cell_2 processing: 810
check index: 130190
check nodes:
    node_index    node_id     node_type  node_name         node_source
0      130133  mathys753  gene/protein   C11orf84  mathys et al. 2019
1      130134  mathys754  gene/protein  HIST1H2AC  mathys et al. 2019
2      130135  mathys755  gene/protein   C17orf96  mathys et al. 2019
3      130136  mathys756  gene/protein    TMEM159  mathys et al. 2019
4      130137  mathys757  gene/protein  HIST1H2BD  mathys et al. 2019


In [None]:
# Update the node ID dictionary with the latest nodes data from nodes_2
node_id_dict = nodes_2.set_index('node_name')['node_id'].to_dict()

# Map gene names in cell_3 to existing node IDs (including the nodes created for
# cell_1 and cell_2); genes without a match are left as NaN
cell_3['node_id'] = cell_3['Unnamed: 22'].map(node_id_dict)

# Determine the starting index for new nodes based on the updated nodes data
new_node_index = nodes_2['node_index'].max() + 1

# Continue the "mathys<N>" numbering right after the highest number present in nodes_2.
# Deriving it from the data (instead of pasting the value printed by an earlier run)
# keeps the IDs unique when the input files change.
existing_mathys_numbers = pd.to_numeric(
    nodes_2['node_id'].astype(str).str.extract(r'^mathys(\d+)$', expand=False),
    errors='coerce',
)
mathys_index = 1 if existing_mathys_numbers.isna().all() else int(existing_mathys_numbers.max()) + 1

# Create exactly one new node per distinct unmatched gene, in order of first appearance
missing_genes = cell_3.loc[cell_3['node_id'].isna(), 'Unnamed: 22'].drop_duplicates()
new_rows = [
    {
        'node_index': new_node_index + offset,
        'node_id': f"mathys{mathys_index + offset}",
        'node_type': 'gene/protein',
        'node_name': gene_name,
        'node_source': 'mathys et al. 2019'
    }
    for offset, gene_name in enumerate(missing_genes)
]
mathys_index += len(new_rows)
new_node_index += len(new_rows)

# Give the previously unmatched rows their newly created IDs
new_node_ids = {new_row['node_name']: new_row['node_id'] for new_row in new_rows}
cell_3['node_id'] = cell_3['node_id'].where(
    cell_3['node_id'].notna(),
    cell_3['Unnamed: 22'].map(new_node_ids),
)

# Combine the newly created node data with the previously updated nodes DataFrame (nodes_2)
nodes_3 = pd.concat([nodes_2, pd.DataFrame(new_rows)], ignore_index=True)

# Output information about the updated nodes for verification
print("Final mathys_index for cell_3 processing:", mathys_index)
print("check nodes:\n", pd.DataFrame(new_rows).head())


Final mathys_index for cell_3 processing: 832
check nodes:
    node_index    node_id     node_type     node_name         node_source
0      130190  mathys810  gene/protein       FAM231D  mathys et al. 2019
1      130191  mathys811  gene/protein    AC087350.1  mathys et al. 2019
2      130192  mathys812  gene/protein         SEPP1  mathys et al. 2019
3      130193  mathys813  gene/protein      HIST1H4E  mathys et al. 2019
4      130194  mathys814  gene/protein  CTD-2370N5.3  mathys et al. 2019


In [None]:
# Convert DataFrame 'cell_1' to PrimeKG edge format for integration.
# x_* columns describe the celltype node, y_* columns describe the gene node.
relation_1 = cell_1['IndModel.FC']
y_ids_1 = cell_1['node_id']

# IDs are compared as text so that an ID read from CSV as a number cannot break the
# prefix check; IDs created in this notebook start with 'mathys', all others are
# attributed to NCBI.
y_is_new_1 = y_ids_1.astype(str).str.startswith('mathys')

new_df1 = pd.DataFrame({
    'relation': relation_1,
    'display_relation': relation_1.map({
        'no-pathology vs pathology up mathys': 'np-p-up m',
        'no-pathology vs pathology down mathys': 'np-p-down m'
    }),
    'x_id': 'mathys_1',                # Unique identifier for the x_node
    'x_type': 'celltype',              # Type of the x_node
    'x_name': 'Ex',                    # Name of the x_node
    'x_source': 'mathys et al. 2019',  # Source of the x_node data
    'y_id': y_ids_1,                   # Unique identifier for the y_node
    'y_type': 'gene/protein',          # Type of the y_node
    'y_name': cell_1['Unnamed: 0'],    # Name of the y_node
    'y_source': np.where(y_is_new_1, 'mathys et al. 2019', 'NCBI'),
})

# Print the first few rows of the new DataFrame to verify the structure
print(new_df1.head())


                                relation display_relation      x_id    x_type  \
0    no-pathology vs pathology up mathys        np-p-up m  mathys_1  celltype   
1  no-pathology vs pathology down mathys      np-p-down m  mathys_1  celltype   
2    no-pathology vs pathology up mathys        np-p-up m  mathys_1  celltype   
3  no-pathology vs pathology down mathys      np-p-down m  mathys_1  celltype   
4    no-pathology vs pathology up mathys        np-p-up m  mathys_1  celltype   

  x_name            x_source     y_id        y_type    y_name  \
0     Ex  mathys et al. 2019   153020  gene/protein  RASGEF1B   
1     Ex  mathys et al. 2019  mathys1  gene/protein   NGFRAP1   
2     Ex  mathys et al. 2019    84894  gene/protein    LINGO1   
3     Ex  mathys et al. 2019    55859  gene/protein      BEX1   
4     Ex  mathys et al. 2019     1811  gene/protein   SLC26A3   

             y_source  
0                NCBI  
1  mathys et al. 2019  
2                NCBI  
3                NCBI  
4 

In [None]:
# Convert DataFrame 'cell_2' to PrimeKG edge format for integration.
# x_* columns describe the celltype node, y_* columns describe the gene node.
relation_2 = cell_2['IndModel.FC.1']
y_ids_2 = cell_2['node_id']

# IDs are compared as text so that an ID read from CSV as a number cannot break the
# prefix check; IDs created in this notebook start with 'mathys', all others are
# attributed to NCBI.
y_is_new_2 = y_ids_2.astype(str).str.startswith('mathys')

new_df2 = pd.DataFrame({
    'relation': relation_2,
    'display_relation': relation_2.map({
        'no-pathology vs early-pathology up mathys': 'np-ep-up m',
        'no-pathology vs early-pathology down mathys': 'np-ep-down m'
    }),
    'x_id': 'mathys_1',                # Unique identifier for the x_node
    'x_type': 'celltype',              # Type of the x_node
    'x_name': 'Ex',                    # Name of the x_node
    'x_source': 'mathys et al. 2019',  # Source of the x_node data
    'y_id': y_ids_2,                   # Unique identifier for the y_node
    'y_type': 'gene/protein',          # Type of the y_node
    'y_name': cell_2['Unnamed: 11'],   # Name of the y_node
    'y_source': np.where(y_is_new_2, 'mathys et al. 2019', 'NCBI'),
})

# Print the first few rows of 'new_df2' to verify the structure and data mapping
print(new_df2.head())


                                      relation display_relation      x_id  \
0  no-pathology vs early-pathology down mathys     np-ep-down m  mathys_1   
1  no-pathology vs early-pathology down mathys     np-ep-down m  mathys_1   
2  no-pathology vs early-pathology down mathys     np-ep-down m  mathys_1   
3    no-pathology vs early-pathology up mathys       np-ep-up m  mathys_1   
4  no-pathology vs early-pathology down mathys     np-ep-down m  mathys_1   

     x_type x_name            x_source     y_id        y_type    y_name  \
0  celltype     Ex  mathys et al. 2019     3925  gene/protein     STMN1   
1  celltype     Ex  mathys et al. 2019  mathys2  gene/protein    ATPIF1   
2  celltype     Ex  mathys et al. 2019     6138  gene/protein     RPL15   
3  celltype     Ex  mathys et al. 2019   153020  gene/protein  RASGEF1B   
4  celltype     Ex  mathys et al. 2019     3094  gene/protein     HINT1   

             y_source  
0                NCBI  
1  mathys et al. 2019  
2             

In [None]:
# Convert DataFrame 'cell_3' to PrimeKG edge format for integration.
# x_* columns describe the celltype node, y_* columns describe the gene node.
relation_3 = cell_3['IndModel.FC.2']
y_ids_3 = cell_3['node_id']

# IDs are compared as text so that an ID read from CSV as a number cannot break the
# prefix check; IDs created in this notebook start with 'mathys', all others are
# attributed to NCBI.
y_is_new_3 = y_ids_3.astype(str).str.startswith('mathys')

new_df3 = pd.DataFrame({
    'relation': relation_3,
    'display_relation': relation_3.map({
        'early-pathology vs late-pathology up mathys': 'ep-lp-up m',
        'early-pathology vs late-pathology down mathys': 'ep-lp-down m'
    }),
    'x_id': 'mathys_1',                # Identifier for the x_node
    'x_type': 'celltype',              # Type of the x_node
    'x_name': 'Ex',                    # Name of the x_node
    'x_source': 'mathys et al. 2019',  # Data source for the x_node
    'y_id': y_ids_3,                   # Unique identifier for each y_node
    'y_type': 'gene/protein',          # Type of the y_node
    'y_name': cell_3['Unnamed: 22'],   # Name of the y_node
    'y_source': np.where(y_is_new_3, 'mathys et al. 2019', 'NCBI'),
})

# Print the first few rows of 'new_df3' to verify correct data formatting and mapping
print(new_df3.head())


                                      relation display_relation      x_id  \
0  early-pathology vs late-pathology up mathys       ep-lp-up m  mathys_1   
1  early-pathology vs late-pathology up mathys       ep-lp-up m  mathys_1   
2  early-pathology vs late-pathology up mathys       ep-lp-up m  mathys_1   
3  early-pathology vs late-pathology up mathys       ep-lp-up m  mathys_1   
4  early-pathology vs late-pathology up mathys       ep-lp-up m  mathys_1   

     x_type x_name            x_source   y_id        y_type    y_name y_source  
0  celltype     Ex  mathys et al. 2019   8655  gene/protein    DYNLL1     NCBI  
1  celltype     Ex  mathys et al. 2019   4697  gene/protein    NDUFA4     NCBI  
2  celltype     Ex  mathys et al. 2019   3320  gene/protein  HSP90AA1     NCBI  
3  celltype     Ex  mathys et al. 2019   1350  gene/protein     COX7C     NCBI  
4  celltype     Ex  mathys et al. 2019  83442  gene/protein  SH3BGRL3     NCBI  


In [None]:
# Read in kgraw (the existing edge table that the new edges are appended to).
# low_memory=False makes pandas infer each column's type from the whole file at once
# rather than chunk by chunk, which avoids columns holding a mix of numbers and strings.
kgraw = pd.read_csv('/content/drive/My Drive/primekg_files/kg_raw.csv', low_memory=False)


  kgraw = pd.read_csv('/content/drive/My Drive/primekg_files/kg_raw.csv')


In [None]:
# PrimeKG stores every edge in both directions: for each (x, y) edge there is also a
# (y, x) edge. Stack the three comparisons, then build the mirrored copy by swapping
# the x_* and y_* columns.
combined_df = pd.concat([new_df1, new_df2, new_df3], ignore_index=True)

columns_to_interchange = ['id', 'type', 'name', 'source']
swapped_names = {}
for suffix in columns_to_interchange:
    swapped_names[f'x_{suffix}'] = f'y_{suffix}'
    swapped_names[f'y_{suffix}'] = f'x_{suffix}'

# Rename x_* <-> y_*, then restore the original column order
interchanged_df = combined_df.rename(columns=swapped_names)[list(combined_df.columns)]

# Append the forward and mirrored edges to the existing edge table
final = pd.concat([kgraw, combined_df, interchanged_df], ignore_index=True)

# Write the extended edge table and the extended node table back to Drive
final.to_csv('/content/drive/My Drive/primekg_files/kgraw_final_1.csv', index=False, header=True)
nodes_3.to_csv('/content/drive/My Drive/primekg_files/nodes_final_1.csv', index=False, header=True)

