In [56]:
import pandas as pd

In [57]:
def count_elements_in_columns(file_path):
    """Report how many distinct values each column of a TSV file holds.

    Args:
        file_path: Location of the tab-separated file; handed straight to
            ``pd.read_csv`` with ``sep='\\t'``.

    Returns:
        The result of ``DataFrame.nunique()`` on the loaded table: one
        entry per column, labelled by column name.
    """
    # Load the tab-separated table and count per-column distinct values in one chain.
    return pd.read_csv(file_path, sep='\t').nunique()

if __name__ == "__main__":
    # Location of the interactions table, relative to the notebook.
    file_path = "../data/interactions.tsv"

    try:
        result = count_elements_in_columns(file_path)
        # Heading and per-column counts, each on its own line.
        print("Number of unique elements in each column:", result, sep="\n")
    except FileNotFoundError:
        # Missing input file: report the path that was tried.
        print(f"File not found: {file_path}")
    except Exception as e:
        # Any other failure is reported as a message rather than a traceback.
        print(f"An error occurred: {e}")


Number of unique elements in each column:
gene_name                    3952
gene_claim_name              8464
entrez_id                    3952
interaction_claim_source       20
interaction_types              25
drug_claim_name             33250
drug_claim_primary_name     30123
drug_name                   11048
drug_concept_id             11080
interaction_group_score       662
PMIDs                       10769
dtype: int64


In [58]:
def create_interaction_matrix(file_path, min_gene_count=50, min_drug_count=10):
    """Build a gene-by-drug score matrix from a tab-separated interactions file.

    Rows are kept only when their ``gene_claim_name`` occurs at least
    ``min_gene_count`` times and their ``drug_claim_name`` occurs at least
    ``min_drug_count`` times, both counted over the whole, unfiltered file.
    The surviving rows are pivoted on ``interaction_group_score`` with
    ``pivot_table``'s default aggregation, and empty cells are filled with 0.

    Args:
        file_path: Location of the TSV file; passed to ``pd.read_csv``.
        min_gene_count: Minimum number of rows a gene claim name must have.
        min_drug_count: Minimum number of rows a drug claim name must have.

    Returns:
        A tuple ``(interaction_matrix, positive_elements_count)``: the pivoted
        DataFrame (genes as index, drugs as columns) and the number of its
        cells that are strictly greater than 0.
    """
    gene_col = 'gene_claim_name'
    drug_col = 'drug_claim_name'

    records = pd.read_csv(file_path, sep='\t')

    def occurs_at_least(column, threshold):
        # True for each row whose value in `column` appears >= threshold times.
        names = records[column]
        return names.map(names.value_counts()) >= threshold

    keep = (
        occurs_at_least(gene_col, min_gene_count)
        & occurs_at_least(drug_col, min_drug_count)
    )

    # Restrict to frequent gene/drug pairs, then discard rows missing either name.
    frequent = records.loc[keep].dropna(subset=[drug_col, gene_col])

    # Genes down the side, drugs across the top; 0 marks "no interaction".
    matrix = frequent.pivot_table(
        values='interaction_group_score',
        index=gene_col,
        columns=drug_col,
        fill_value=0,
    )

    # Tally cells holding a strictly positive score.
    n_positive = (matrix > 0).sum().sum()

    return matrix, n_positive

In [59]:
if __name__ == "__main__":
    # Assigned at notebook scope: the cell that calls
    # create_interaction_matrix(file_path) reads this variable.
    file_path = "../data/interactions.tsv"

    try:
        result = count_elements_in_columns(file_path)
        # Heading and per-column counts, each on its own line.
        print("Number of unique elements in each column:", result, sep="\n")
    except FileNotFoundError:
        # Missing input file: report the path that was tried.
        print(f"File not found: {file_path}")
    except Exception as e:
        # Any other failure is reported as a message rather than a traceback.
        print(f"An error occurred: {e}")


Number of unique elements in each column:
gene_name                    3952
gene_claim_name              8464
entrez_id                    3952
interaction_claim_source       20
interaction_types              25
drug_claim_name             33250
drug_claim_primary_name     30123
drug_name                   11048
drug_concept_id             11080
interaction_group_score       662
PMIDs                       10769
dtype: int64


In [60]:
# Build the filtered gene-by-drug matrix using the function's default thresholds.
interaction_matrix, positive_elements_count = create_interaction_matrix(file_path)

# Heading first, then pandas' text rendering of the matrix on the next line.
print("Interaction Matrix:", interaction_matrix, sep="\n")

# How many gene/drug cells carry a score above zero.
print("\nNumber of elements more than 0 in the matrix:", positive_elements_count)

Interaction Matrix:
drug_claim_name                                (-)-EPICATECHIN  \
gene_claim_name                                                  
1128                                                       0.0   
1129                                                       0.0   
1131                                                       0.0   
1132                                                       0.0   
1133                                                       0.0   
...                                                        ...   
VDR                                                        0.0   
Vascular endothelial growth factor A                       0.0   
Vascular endothelial growth factor receptor 2              0.0   
WRN                                                        0.0   
YES1                                                       0.0   

drug_claim_name                                (R,S)-INDATRALINE  \
gene_claim_name                                      