#### Alliance_of_Genome_Resources 

In [None]:
# The compressed dataset files must be decompressed before the cells below can read them.
# Run these commands in a terminal, not in a Jupyter Notebook cell; the paths are
# written relative to the directory that contains `data/`.

# Example terminal commands (copy them without the leading "# "):
# sudo gunzip -k data/Network/Alliance_of_Genome_Resources/INTERACTION-MOL_MGI.tsv.gz
# sudo gunzip -k data/Network/Alliance_of_Genome_Resources/INTERACTION-MOL_HUMAN.tsv.gz

In [5]:
# Preview the first 20 lines and count the lines of each decompressed Alliance
# molecular-interaction file (mouse/MGI first, then human).
! head -n 20 ../../data/Network/Alliance_of_Genome_Resources/INTERACTION-MOL_MGI.tsv
! wc -l ../../data/Network/Alliance_of_Genome_Resources/INTERACTION-MOL_MGI.tsv
! head -n 20 ../../data/Network/Alliance_of_Genome_Resources/INTERACTION-MOL_HUMAN.tsv
! wc -l ../../data/Network/Alliance_of_Genome_Resources/INTERACTION-MOL_HUMAN.tsv

##########################################################################
#
# Data type: Molecular Interactions
# Data format: PSI-MI TAB 2.7 Format
# README: https://github.com/HUPO-PSI/miTab/blob/master/PSI-MITAB27Format.md
# Source: Alliance of Genome Resources (Alliance)
# Source URL: http://alliancegenome.org/downloads
# Help Desk: help@alliancegenome.org
# TaxonIDs: NCBI:txid10090
# Species: Mus musculus
# Alliance Database Version: 7.2.0
# File generated (UTC): 2024-06-10 00:57
#
##########################################################################
#ID(s) interactor A	ID(s) interactor B	Alt. ID(s) interactor A	Alt. ID(s) interactor B	Alias(es) interactor A	Alias(es) interactor B	Interaction detection method(s)	Publication 1st author(s)	Publication Identifier(s)	Taxid interactor A	Taxid interactor B	Interaction type(s)	Source database(s)	Interaction identifier(s)	Confidence value(s)	Expansion method(s)	Biological role(s) interactor A	Biological role(s) interactor B	Experime

In [13]:
# Molecular Interaction.
# Mouse 
import pandas as pd

file_path = '../../data/Network/Alliance_of_Genome_Resources/INTERACTION-MOL_MGI.tsv'

# The file opens with a '#'-prefixed metadata banner; the column names are on the
# line that begins with '#ID'. Find that line's 0-based position in the file.
# Defaulting to None ensures a `header_line` left over from another cell is never
# reused silently when this file has no such line.
with open(file_path, 'r') as file:
    header_line = next(
        (i for i, line in enumerate(file) if line.startswith('#ID')),
        None,
    )

if header_line is None:
    raise ValueError(f"No column-header line starting with '#ID' found in {file_path}")

# Skip the banner by physical line count and use the next line as the header.
# (header=<n> ignores blank lines, so it could disagree with the index found above
# if the banner ever contained one; skiprows counts raw lines.)
data = pd.read_csv(file_path, sep='\t', skiprows=header_line, header=0)

# Display the first two rows across all columns
print(data.iloc[:2, :])


          #ID(s) interactor A           ID(s) interactor B  \
0  entrez gene/locuslink:4087  entrez gene/locuslink:75141   
1  entrez gene/locuslink:4087  entrez gene/locuslink:19376   

                             Alt. ID(s) interactor A  \
0  biogrid:110262|entrez gene/locuslink:SMAD2|uni...   
1  biogrid:110262|entrez gene/locuslink:SMAD2|uni...   

                             Alt. ID(s) interactor B  \
0  biogrid:217250|entrez gene/locuslink:Rasd2|uni...   
1  biogrid:202576|entrez gene/locuslink:Rab34|ent...   

                              Alias(es) interactor A  \
0  entrez gene/locuslink:JV18(gene name synonym)|...   
1  entrez gene/locuslink:JV18(gene name synonym)|...   

                              Alias(es) interactor B  \
0  entrez gene/locuslink:4930526B11Rik(gene name ...   
1  entrez gene/locuslink:AI326479(gene name synon...   

                     Interaction detection method(s)  \
0  psi-mi:"MI:0004"(affinity chromatography techn...   
1  psi-mi:"MI:0004"(affin

In [16]:
# Keep only the two interactor ID columns; copy so the loaded frame stays untouched
selected_data = data.iloc[:, [0, 1]].copy()

# Each ID has the form '<namespace>:<identifier>'; keep the token that follows the
# first colon.
# NOTE(review): the namespace is not checked here - confirm every row uses
# 'entrez gene/locuslink:' before treating the result as Entrez gene IDs.
for id_column in ['#ID(s) interactor A', 'ID(s) interactor B']:
    extracted_ids = selected_data[id_column].str.split(':').str[1]
    # Missing cells and values without a colon come back as NaN; stop with a clear
    # message instead of failing inside a lambda or writing blank IDs later.
    if extracted_ids.isna().any():
        raise ValueError(
            f"{int(extracted_ids.isna().sum())} value(s) in {id_column!r} "
            "are missing or lack the 'namespace:identifier' form"
        )
    selected_data.loc[:, id_column] = extracted_ids

# Display the first few rows of the selected data and its shape (number of rows and columns)
print(selected_data.head())
print(selected_data.shape)


  #ID(s) interactor A ID(s) interactor B
0                4087              75141
1                4087              19376
2                4087              69159
3                4087              72433
4                4087              69288
(114779, 2)


In [18]:
# Save the selected gene interaction data to a TSV file
# The file is saved without including the index or column headers
from pathlib import Path

mgi_net_file = '../../dataset/pre_data/network/AGR_MGI.tsv'
# to_csv does not create missing folders, so make sure the output directory exists
Path(mgi_net_file).parent.mkdir(parents=True, exist_ok=True)
selected_data.to_csv(mgi_net_file, sep='\t', index=False, header=False)

In [19]:
# Molecular Interaction.
# Human

import pandas as pd

# Define the file path for the data
file_path = '../../data/Network/Alliance_of_Genome_Resources/INTERACTION-MOL_HUMAN.tsv'

# The file opens with a '#'-prefixed metadata banner; the column names are on the
# line that begins with '#ID'. Find that line's 0-based position in the file.
# Defaulting to None ensures a `header_line` computed for another file (for
# example by the mouse cell) is never reused silently.
with open(file_path, 'r') as file:
    header_line = next(
        (i for i, line in enumerate(file) if line.startswith('#ID')),
        None,
    )

if header_line is None:
    raise ValueError(f"No column-header line starting with '#ID' found in {file_path}")

# Skip the banner by physical line count and use the next line as the header.
# (header=<n> ignores blank lines, so it could disagree with the index found above
# if the banner ever contained one; skiprows counts raw lines.)
data = pd.read_csv(file_path, sep='\t', skiprows=header_line, header=0)

# Display the first two rows across all columns
print(data.iloc[:2, :])


           #ID(s) interactor A          ID(s) interactor B  \
0   entrez gene/locuslink:6416  entrez gene/locuslink:2318   
1  entrez gene/locuslink:84665    entrez gene/locuslink:88   

                             Alt. ID(s) interactor A  \
0  biogrid:112315|entrez gene/locuslink:MAP2K4|un...   
1  biogrid:124185|entrez gene/locuslink:MYPN|unip...   

                             Alt. ID(s) interactor B  \
0  biogrid:108607|entrez gene/locuslink:FLNC|unip...   
1  biogrid:106603|entrez gene/locuslink:ACTN2|uni...   

                              Alias(es) interactor A  \
0  entrez gene/locuslink:JNKK(gene name synonym)|...   
1  entrez gene/locuslink:CMD1DD(gene name synonym...   

                              Alias(es) interactor B  \
0  entrez gene/locuslink:ABP-280(gene name synony...   
1    entrez gene/locuslink:CMD1AA(gene name synonym)   

  Interaction detection method(s) Publication 1st author(s)  \
0    psi-mi:"MI:0018"(two hybrid)            Marti A (1997)   
1    psi-mi

In [20]:
# Keep only the two interactor ID columns; copy so the loaded frame stays untouched
selected_data = data.iloc[:, [0, 1]].copy()

# Each ID has the form '<namespace>:<identifier>'; keep the token that follows the
# first colon.
# NOTE(review): the namespace is not checked here - confirm every row uses
# 'entrez gene/locuslink:' before treating the result as Entrez gene IDs.
for id_column in ['#ID(s) interactor A', 'ID(s) interactor B']:
    extracted_ids = selected_data[id_column].str.split(':').str[1]
    # Missing cells and values without a colon come back as NaN; stop with a clear
    # message instead of failing inside a lambda or writing blank IDs later.
    if extracted_ids.isna().any():
        raise ValueError(
            f"{int(extracted_ids.isna().sum())} value(s) in {id_column!r} "
            "are missing or lack the 'namespace:identifier' form"
        )
    selected_data.loc[:, id_column] = extracted_ids

# Display the first few rows of the selected data and its shape (number of rows and columns)
print(selected_data.head())
print(selected_data.shape)

  #ID(s) interactor A ID(s) interactor B
0                6416               2318
1               84665                 88
2                  90               2339
3                2624               5371
4                6118               6774
(1261933, 2)


In [21]:
# Save the selected gene interaction data to a TSV file
# The file is saved without including the index or column headers
from pathlib import Path

human_net_file = '../../dataset/pre_data/network/AGR_human.tsv'
# to_csv does not create missing folders, so make sure the output directory exists
Path(human_net_file).parent.mkdir(parents=True, exist_ok=True)
selected_data.to_csv(human_net_file, sep='\t', index=False, header=False)

#### BIOGRID

In [12]:
# Preview the first 5 lines and count the lines of each BioGRID 4.4.235 tab3
# organism file (human first, then mouse).
! head -n 5 ../../data/Network/BIOGRID/BIOGRID-ORGANISM-Homo_sapiens-4.4.235.tab3.txt
! wc -l ../../data/Network/BIOGRID/BIOGRID-ORGANISM-Homo_sapiens-4.4.235.tab3.txt
! head -n 5 ../../data/Network/BIOGRID/BIOGRID-ORGANISM-Mus_musculus-4.4.235.tab3.txt
! wc -l ../../data/Network/BIOGRID/BIOGRID-ORGANISM-Mus_musculus-4.4.235.tab3.txt

#BioGRID Interaction ID	Entrez Gene Interactor A	Entrez Gene Interactor B	BioGRID ID Interactor A	BioGRID ID Interactor B	Systematic Name Interactor A	Systematic Name Interactor B	Official Symbol Interactor A	Official Symbol Interactor B	Synonyms Interactor A	Synonyms Interactor B	Experimental System	Experimental System Type	Author	Publication Source	Organism ID Interactor A	Organism ID Interactor B	Throughput	Score	Modification	Qualifications	Tags	Source Database	SWISS-PROT Accessions Interactor A	TREMBL Accessions Interactor A	REFSEQ Accessions Interactor A	SWISS-PROT Accessions Interactor B	TREMBL Accessions Interactor B	REFSEQ Accessions Interactor B	Ontology Term IDs	Ontology Term Names	Ontology Term Categories	Ontology Term Qualifier IDs	Ontology Term Qualifier Names	Ontology Term Types	Organism Name Interactor A	Organism Name Interactor B
103	6416	2318	112315	108607	-	-	MAP2K4	FLNC	JNKK|JNKK1|MAPKK4|MEK4|MKK4|PRKMK4|SAPKK-1|SAPKK1|SEK1|SERK1|SKK1	ABP-280|ABP280A|ABPA|ABPL|FLN2|M

In [16]:
import numpy as np
import pandas as pd

# Read the BioGRID mouse tab3 export; the first line is the header and every
# column is kept as text
net_df = pd.read_csv(
    '../../data/Network/BIOGRID/BIOGRID-ORGANISM-Mus_musculus-4.4.235.tab3.txt',
    sep='\t',
    header=0,
    dtype=str,
)

# Preview the top ten rows
print(net_df.head(10))

# Report the frame size as (rows, columns)
print(net_df.shape)

# Keep the column index in `cols` and show it
cols = net_df.columns
print(cols)


  #BioGRID Interaction ID Entrez Gene Interactor A Entrez Gene Interactor B  \
0                  117015                     4087                    75141   
1                  117016                     4087                    19376   
2                  117017                     4087                    69159   
3                  117018                     4087                    72433   
4                  117019                     4087                    69288   
5                  117020                     4087                    54126   
6                  117021                     4087                    78294   
7                  117022                     4087                    57443   
8                  117023                     4087                    18412   
9                  117024                     4087                    52432   

  BioGRID ID Interactor A BioGRID ID Interactor B  \
0                  110262                  217250   
1                  11026

In [18]:
# Pull out the columns at positions 1 and 2 (the second and third columns)
data_df = net_df.take([1, 2], axis=1)
# Show the first ten rows, then the (rows, columns) shape
print(data_df.head(10), data_df.shape, sep='\n')


  Entrez Gene Interactor A Entrez Gene Interactor B
0                     4087                    75141
1                     4087                    19376
2                     4087                    69159
3                     4087                    72433
4                     4087                    69288
5                     4087                    54126
6                     4087                    78294
7                     4087                    57443
8                     4087                    18412
9                     4087                    52432
(101492, 2)


In [19]:
# Check if there are any missing values in data_df.
# The file was read with dtype=str and uses "-" for fields that have no value;
# pandas does not convert "-" to NaN, so isnull() alone would never flag an
# absent ID. Treat both NaN and "-" as missing.
missing_mask = data_df.isnull() | data_df.eq('-')
missing_values = missing_mask.sum().sum()  # Total number of missing values

# Display total number of missing values
print(f"Total missing values: {missing_values}")

# Identify the rows with missing values
rows_with_missing = data_df[missing_mask.any(axis=1)]
print(f"Rows with missing values:\n{rows_with_missing}")


Total missing values: 0
Rows with missing values:
Empty DataFrame
Columns: [Entrez Gene Interactor A, Entrez Gene Interactor B]
Index: []


In [20]:
# Save the gene interaction data as a TSV file
# Exclude index and header in the saved file
from pathlib import Path

mouse_net_file = '../../dataset/pre_data/network/BIOGRID_mouse.tsv'
# to_csv does not create missing folders, so make sure the output directory exists
Path(mouse_net_file).parent.mkdir(parents=True, exist_ok=True)
data_df.to_csv(mouse_net_file, sep='\t', index=False, header=False)


In [23]:
import numpy as np
import pandas as pd

# Read the BioGRID human tab3 export; the first line is the header and every
# column is kept as text
net_df = pd.read_csv(
    '../../data/Network/BIOGRID/BIOGRID-ORGANISM-Homo_sapiens-4.4.235.tab3.txt',
    sep='\t',
    header=0,
    dtype=str,
)

# Preview the top ten rows
print(net_df.head(10))

# Report the frame size as (rows, columns)
print(net_df.shape)

# Keep the column index in `cols` and show it
cols = net_df.columns
print(cols)


  #BioGRID Interaction ID Entrez Gene Interactor A Entrez Gene Interactor B  \
0                     103                     6416                     2318   
1                     117                    84665                       88   
2                     183                       90                     2339   
3                     278                     2624                     5371   
4                     418                     6118                     6774   
5                     586                      375                    23163   
6                     612                      377                    23647   
7                     617                      377                    27236   
8                     663                    54464                      226   
9                     866                      351                    10513   

  BioGRID ID Interactor A BioGRID ID Interactor B  \
0                  112315                  108607   
1                  12418

In [24]:
# Pull out the columns at positions 1 and 2 (the second and third columns)
data_df = net_df.take([1, 2], axis=1)
# Show the first ten rows, then the (rows, columns) shape
print(data_df.head(10), data_df.shape, sep='\n')

  Entrez Gene Interactor A Entrez Gene Interactor B
0                     6416                     2318
1                    84665                       88
2                       90                     2339
3                     2624                     5371
4                     6118                     6774
5                      375                    23163
6                      377                    23647
7                      377                    27236
8                    54464                      226
9                      351                    10513
(1237327, 2)


In [25]:
# Check if there are any missing values in data_df.
# The file was read with dtype=str and uses "-" for fields that have no value;
# pandas does not convert "-" to NaN, so isnull() alone would never flag an
# absent ID. Treat both NaN and "-" as missing.
missing_mask = data_df.isnull() | data_df.eq('-')
missing_values = missing_mask.sum().sum()  # Total number of missing values

# Display total number of missing values
print(f"Total missing values: {missing_values}")

# Identify the rows with missing values
rows_with_missing = data_df[missing_mask.any(axis=1)]
print(f"Rows with missing values:\n{rows_with_missing}")


Total missing values: 0
Rows with missing values:
Empty DataFrame
Columns: [Entrez Gene Interactor A, Entrez Gene Interactor B]
Index: []


In [26]:
# Save the gene interaction data to a TSV file
# Exclude index and header in the saved file
from pathlib import Path

human_net_file = '../../dataset/pre_data/network/BIOGRID_human.tsv'
# to_csv does not create missing folders, so make sure the output directory exists
Path(human_net_file).parent.mkdir(parents=True, exist_ok=True)
data_df.to_csv(human_net_file, sep='\t', index=False, header=False)