#### Alliance_of_Genome_Resources 

In [None]:
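# The raw Alliance of Genome Resources files are distributed gzip-compressed and must be
# decompressed before the cells below can read them.
# The commands are meant to be run in a terminal, from the directory that contains `data/`
# (to run them from a notebook cell instead, prefix each with `!` and adjust the relative path).
# `-k` keeps the original .gz file; prepend `sudo` only if the files are not writable by your user.
# NOTE: the cells below read the decompressed files from data/raw/network/..., so the example
# paths point there - adjust them if your copy of the dataset lives elsewhere.

# Example terminal commands:
# gunzip -k data/raw/network/Alliance_of_Genome_Resources/INTERACTION-MOL_MGI.tsv.gz
# gunzip -k data/raw/network/Alliance_of_Genome_Resources/INTERACTION-MOL_HUMAN.tsv.gz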

In [None]:
# Quick sanity check of the decompressed Alliance of Genome Resources files via shell commands:
# for each file, show its first 20 lines (`head -n 20`) and count its total lines (`wc -l`).
# First the MGI (mouse) file, then the HUMAN file.
! head -n 20 ../../data/raw/network/Alliance_of_Genome_Resources/INTERACTION-MOL_MGI.tsv
! wc -l ../../data/raw/network/Alliance_of_Genome_Resources/INTERACTION-MOL_MGI.tsv
! head -n 20 ../../data/raw/network/Alliance_of_Genome_Resources/INTERACTION-MOL_HUMAN.tsv
! wc -l ../../data/raw/network/Alliance_of_Genome_Resources/INTERACTION-MOL_HUMAN.tsv

In [None]:
# Molecular Interaction.
# Mouse
import pandas as pd

file_path = '../../data/raw/network/Alliance_of_Genome_Resources/INTERACTION-MOL_MGI.tsv'

# Locate the 0-based physical line that holds the column names (it starts with '#ID').
# The explicit None default prevents a NameError on a fresh kernel, and prevents silently
# reusing a `header_line` left over from another cell when the marker is absent.
# UTF-8 is given explicitly so this scan decodes the file the same way read_csv does by default.
with open(file_path, 'r', encoding='utf-8') as file:
    header_line = next(
        (i for i, line in enumerate(file) if line.startswith('#ID')),
        None,
    )

if header_line is None:
    raise ValueError(f"No header line starting with '#ID' found in {file_path}")

# Skip everything above the header by physical line count, then use the next line as the header.
# Passing header=header_line directly would pick the wrong row if blank lines precede the
# header, because read_csv ignores blank lines when numbering header rows.
data = pd.read_csv(file_path, sep='\t', skiprows=header_line, header=0)

# Display the first two rows (all columns) of the data
print(data.iloc[:2, :])


In [None]:
# Select the first two columns of the data (interactor A and interactor B)
selected_data = data.iloc[:, [0, 1]].copy()

# Extract the gene IDs (Entrez IDs, per the original notebook) by splitting each value at the
# colon and keeping the part after the first colon; values are assumed to look like 'prefix:id'.
# The vectorised .str accessor returns NaN for missing values or values without a colon,
# where a per-element `x.split(':')[1]` would abort the whole cell with an exception.
for column in ('#ID(s) interactor A', 'ID(s) interactor B'):
    selected_data[column] = selected_data[column].str.split(':').str[1]

# Drop interactions where either ID could not be extracted so that no incomplete edge is
# written downstream; this is a no-op when every value is well-formed.
n_rows_before = len(selected_data)
selected_data = selected_data.dropna()
n_dropped = n_rows_before - len(selected_data)
if n_dropped:
    print(f"Dropped {n_dropped} rows without a parsable ID in both interactor columns")

# Display the first few rows of the selected data and its shape (number of rows and columns)
print(selected_data.head())
print(selected_data.shape)


In [None]:
# Save the selected gene interaction data to a TSV file
# The file is saved without including the index or column headers
from pathlib import Path

mgi_net_file = '../../data/pre_data/network/AGR_MGI.tsv'

# to_csv does not create missing parent folders, so make sure the output directory exists
# (harmless when it is already there).
Path(mgi_net_file).parent.mkdir(parents=True, exist_ok=True)
selected_data.to_csv(mgi_net_file, sep='\t', index=False, header=False)

In [None]:
# Molecular Interaction.
# Human

import pandas as pd

# Define the file path for the data
file_path = '../../data/raw/network/Alliance_of_Genome_Resources/INTERACTION-MOL_HUMAN.tsv'

# Locate the 0-based physical line that holds the column names (it starts with '#ID').
# The explicit None default matters here: without it, a file lacking the marker would silently
# reuse the `header_line` computed for a different file in an earlier cell.
# UTF-8 is given explicitly so this scan decodes the file the same way read_csv does by default.
with open(file_path, 'r', encoding='utf-8') as file:
    header_line = next(
        (i for i, line in enumerate(file) if line.startswith('#ID')),
        None,
    )

if header_line is None:
    raise ValueError(f"No header line starting with '#ID' found in {file_path}")

# Skip everything above the header by physical line count, then use the next line as the header.
# Passing header=header_line directly would pick the wrong row if blank lines precede the
# header, because read_csv ignores blank lines when numbering header rows.
data = pd.read_csv(file_path, sep='\t', skiprows=header_line, header=0)

# Display the first two rows (all columns) of the data
print(data.iloc[:2, :])


In [None]:
# Select the first two columns of the data (interactor A and interactor B)
selected_data = data.iloc[:, [0, 1]].copy()

# Extract the gene ID (Entrez ID, per the original notebook) by splitting each value at the
# colon and keeping the part after the first colon; values are assumed to look like 'prefix:id'.
# The vectorised .str accessor returns NaN for missing values or values without a colon,
# where a per-element `x.split(':')[1]` would abort the whole cell with an exception.
for column in ('#ID(s) interactor A', 'ID(s) interactor B'):
    selected_data[column] = selected_data[column].str.split(':').str[1]

# Drop interactions where either ID could not be extracted so that no incomplete edge is
# written downstream; this is a no-op when every value is well-formed.
n_rows_before = len(selected_data)
selected_data = selected_data.dropna()
n_dropped = n_rows_before - len(selected_data)
if n_dropped:
    print(f"Dropped {n_dropped} rows without a parsable ID in both interactor columns")

# Display the first few rows of the selected data and its shape (number of rows and columns)
print(selected_data.head())
print(selected_data.shape)

In [None]:
# Save the selected gene interaction data to a TSV file
# The file is saved without including the index or column headers
from pathlib import Path

human_net_file = '../../data/pre_data/network/AGR_human.tsv'

# to_csv does not create missing parent folders, so make sure the output directory exists
# (harmless when it is already there).
Path(human_net_file).parent.mkdir(parents=True, exist_ok=True)
selected_data.to_csv(human_net_file, sep='\t', index=False, header=False)

#### BIOGRID

In [None]:
# Quick sanity check of the raw BIOGRID organism files via shell commands:
# for each file, show its first 5 lines (`head -n 5`) and count its total lines (`wc -l`).
# First the Homo_sapiens file, then the Mus_musculus file.
! head -n 5 ../../data/raw/network/BIOGRID/BIOGRID-ORGANISM-Homo_sapiens-4.4.235.tab3.txt
! wc -l ../../data/raw/network/BIOGRID/BIOGRID-ORGANISM-Homo_sapiens-4.4.235.tab3.txt
! head -n 5 ../../data/raw/network/BIOGRID/BIOGRID-ORGANISM-Mus_musculus-4.4.235.tab3.txt
! wc -l ../../data/raw/network/BIOGRID/BIOGRID-ORGANISM-Mus_musculus-4.4.235.tab3.txt

In [None]:
import numpy as np
import pandas as pd

# Read the BIOGRID Mus_musculus interaction table: tab-separated, column names on the
# first row, every column kept as a string.
net_df = pd.read_csv(
    '../../data/raw/network/BIOGRID/BIOGRID-ORGANISM-Mus_musculus-4.4.235.tab3.txt',
    sep='\t',
    header=0,
    dtype=str,
)

# Preview the first 10 rows, then report the table's (rows, columns) shape
print(net_df.head(10), net_df.shape, sep='\n')

# Keep the column index around and show which columns are available
cols = net_df.columns
print(cols)


In [None]:
# Keep only the columns at positions 1 and 2 (the second and third columns) of the table
data_df = net_df.iloc[:, [1, 2]]

# Preview the first 10 extracted rows followed by the (rows, columns) shape
print(data_df.head(10), data_df.shape, sep='\n')


In [None]:
# Boolean mask flagging every missing cell of data_df; computed once and reused below
null_mask = data_df.isnull()

# Total number of missing cells across all columns
missing_values = null_mask.sum().sum()
print(f"Total missing values: {missing_values}")

# Rows that have a missing value in at least one column
rows_with_missing = data_df[null_mask.any(axis=1)]
print(f"Rows with missing values:\n{rows_with_missing}")


In [None]:
# Save the gene interaction data as a TSV file
# Exclude index and header in the saved file
from pathlib import Path

mouse_net_file = '../../data/pre_data/network/BIOGRID_mouse.tsv'

# to_csv does not create missing parent folders, so make sure the output directory exists
# (harmless when it is already there).
Path(mouse_net_file).parent.mkdir(parents=True, exist_ok=True)
data_df.to_csv(mouse_net_file, sep='\t', index=False, header=False)


In [None]:
import numpy as np
import pandas as pd

# Read the BIOGRID Homo_sapiens interaction table: tab-separated, column names on the
# first row, every column kept as a string.
net_df = pd.read_csv(
    '../../data/raw/network/BIOGRID/BIOGRID-ORGANISM-Homo_sapiens-4.4.235.tab3.txt',
    sep='\t',
    header=0,
    dtype=str,
)

# Preview the first 10 rows, then report the table's (rows, columns) shape
print(net_df.head(10), net_df.shape, sep='\n')

# Keep the column index around and show which columns are available
cols = net_df.columns
print(cols)


In [None]:
# Keep only the columns at positions 1 and 2 (the second and third columns) of the table
data_df = net_df.iloc[:, [1, 2]]

# Preview the first 10 extracted rows followed by the (rows, columns) shape
print(data_df.head(10), data_df.shape, sep='\n')

In [None]:
# Boolean mask flagging every missing cell of data_df; computed once and reused below
null_mask = data_df.isnull()

# Total number of missing cells across all columns
missing_values = null_mask.sum().sum()
print(f"Total missing values: {missing_values}")

# Rows that have a missing value in at least one column
rows_with_missing = data_df[null_mask.any(axis=1)]
print(f"Rows with missing values:\n{rows_with_missing}")


In [None]:
# Save the gene interaction data to a TSV file
# Exclude index and header in the saved file
from pathlib import Path

human_net_file = '../../data/pre_data/network/BIOGRID_human.tsv'

# to_csv does not create missing parent folders, so make sure the output directory exists
# (harmless when it is already there).
Path(human_net_file).parent.mkdir(parents=True, exist_ok=True)
data_df.to_csv(human_net_file, sep='\t', index=False, header=False)