In [9]:
import pickle
import pandas as pd

# Protein variants to analyse; "wt" comes first and the remaining entries are mutants.
# Each name is also the prefix of the pickle file read for that variant.
proteins = ["wt", "S718C", "P720R", "P720Q", "S719del", "PD720_721R", "T739A"]

# Maps a protein name to its interaction DataFrame (filled in by later cells)
protein_dfs = dict()

# Frequency cut-offs used when filtering interactions
frequency_threshold_1000, frequency_threshold_3000 = 1000, 3000

def extract_protein_number(protein):
    """Return the residue number embedded in a label such as ``"THR777"``.

    Only the first contiguous run of digits is used, so a label that carries
    more digits after the number (e.g. ``"PD720_721R"``) yields 720 instead
    of the concatenation of every digit found in the string.

    Args:
        protein: Text label containing a residue number.

    Returns:
        The number as an ``int``, or ``None`` when the label has no digits.
    """
    digits = []
    for char in protein:
        if char.isdigit():
            digits.append(char)
        elif digits:
            # The first digit run has ended; ignore whatever follows it.
            break
    try:
        return int(''.join(digits))
    except ValueError:
        # No digits at all (empty join) or digit-like characters int() rejects.
        return None


In [10]:
# Read the WT pickle and sum the loaded object over axis 0
# NOTE(review): pickle.load can execute arbitrary code; only open trusted files.
with open("wt_dub_domain.pkl", 'rb') as f:
    wt_interactions_sum = pickle.load(f).sum(axis=0)

# One row per three-part interaction key, with its summed frequency appended
wt_all_interactions = [
    (dub_residue, partner_residue, contact_type, total)
    for (dub_residue, partner_residue, contact_type), total in wt_interactions_sum.items()
]
wt_df = pd.DataFrame(
    wt_all_interactions,
    columns=["DUB", "Protein", "Interaction", "Frequency"],
)
protein_dfs["wt"] = wt_df

# Show the WT table together with its row count
print(f"\nAll interactions for WT ({len(wt_df)}):")
print(wt_df)


All interactions for WT (375):
         DUB Protein Interaction  Frequency
0     GLY778  THR777  VdWContact      10001
1     ARG780  LEU765     HBDonor          7
2     ARG780  LEU765    Cationic        248
3     ARG780  LEU765  VdWContact         18
4     ARG780  ASN766     HBDonor        742
..       ...     ...         ...        ...
370  PHE1083  ALA775  VdWContact       2162
371  PHE1083  LEU776  VdWContact          1
372  PHE1083  THR777  VdWContact        577
373  VAL1088  LEU776  VdWContact        699
374  VAL1088  THR777  VdWContact         10

[375 rows x 4 columns]


In [11]:
# Build one interaction table per mutant (every entry after the leading "wt")
# NOTE(review): pickle.load can execute arbitrary code; only open trusted files.
for protein in proteins[1:]:
    file_name = f"{protein}_dub_domain.pkl"
    with open(file_name, 'rb') as f:
        mutant_interactions_sum = pickle.load(f).sum(axis=0)

    # One row per three-part interaction key, with its summed frequency appended
    mutant_all_interactions = [
        (dub_residue, partner_residue, contact_type, total)
        for (dub_residue, partner_residue, contact_type), total in mutant_interactions_sum.items()
    ]
    mutant_df = pd.DataFrame(
        mutant_all_interactions,
        columns=["DUB", "Protein", "Interaction", "Frequency"],
    )
    protein_dfs[protein] = mutant_df

    # Show the table together with its row count
    print(f"\nAll interactions for {protein} ({len(mutant_df)}):")
    print(mutant_df)



All interactions for S718C (582):
         DUB  Protein Interaction  Frequency
0     GLY778   THR777  VdWContact      10001
1     ARG780   ASN766     HBDonor         25
2     ARG780   ASN766    Cationic        298
3     ARG780   ASN766  VdWContact         39
4     ARG780   PRO767    Cationic          3
..       ...      ...         ...        ...
577  LYS1082  ASP1090    Cationic        838
578  LYS1082  ASP1090  VdWContact        798
579  PHE1083   ALA775  VdWContact       2556
580  PHE1083   THR777  VdWContact        160
581  VAL1088   LEU776  VdWContact       1168

[582 rows x 4 columns]

All interactions for P720R (586):
         DUB  Protein Interaction  Frequency
0     GLY778   THR777  VdWContact      10001
1     ARG780   MET456    Cationic         16
2     ARG780   LEU457     HBDonor        322
3     ARG780   LEU457    Cationic        695
4     ARG780   LEU457  VdWContact        395
..       ...      ...         ...        ...
581  HIS1086   ASP459     HBDonor          7
582  H

In [12]:
# Write every protein's full interaction table to its own CSV, without the index column
for protein, table in protein_dfs.items():
    output_file = f"{protein}_out_interactions_all.csv"
    table.to_csv(output_file, index=False)
    print(f"All interactions for {protein} saved in {output_file}")

All interactions for wt saved in wt_out_interactions_all.csv
All interactions for S718C saved in S718C_out_interactions_all.csv
All interactions for P720R saved in P720R_out_interactions_all.csv
All interactions for P720Q saved in P720Q_out_interactions_all.csv
All interactions for S719del saved in S719del_out_interactions_all.csv
All interactions for PD720_721R saved in PD720_721R_out_interactions_all.csv
All interactions for T739A saved in T739A_out_interactions_all.csv


In [13]:
# Report the sum of the "Frequency" column for each protein's table
for protein in protein_dfs:
    total_frequency = protein_dfs[protein]["Frequency"].sum()
    print(f"Total interaction frequency for {protein}: {total_frequency}")

Total interaction frequency for wt: 546671
Total interaction frequency for S718C: 581980
Total interaction frequency for P720R: 645204
Total interaction frequency for P720Q: 560222
Total interaction frequency for S719del: 643905
Total interaction frequency for PD720_721R: 585012
Total interaction frequency for T739A: 575272


In [14]:
# Count interactions and frequency for the 687-776 region, then filter them with
# the two frequency thresholds (frequency_threshold_1000 / frequency_threshold_3000).
# NOTE(review): filtered_protein_dfs is created here but never filled in this cell.
filtered_protein_dfs = {}

for protein, df in protein_dfs.items():
    # Keep rows whose "Protein" residue number lies between 687 and 776
    # (Series.between includes both end points by default).
    in_region = df["Protein"].apply(extract_protein_number).between(687, 776)
    # rename() returns a new frame, so the table stored in protein_dfs is not modified.
    dub_vs_687_776 = df[in_region].rename(columns={"Protein": "Interdomain"})
    dub_vs_687_776_frequency = dub_vs_687_776["Frequency"].sum()

    print(f"\nInteractions + Frequency for DUB domain vs 687-776 for {protein}:")
    print(dub_vs_687_776)
    print(f"Total frequency for {protein}: {dub_vs_687_776_frequency}")

    # Strictly-greater-than filters for each configured threshold
    interactions_above_1000 = dub_vs_687_776[dub_vs_687_776["Frequency"] > frequency_threshold_1000]
    interactions_above_3000 = dub_vs_687_776[dub_vs_687_776["Frequency"] > frequency_threshold_3000]

    # Pair every filtered table with the threshold constant that produced it, so the
    # printed labels and CSV names cannot drift from the values actually applied.
    for threshold, filtered_df in [
        (frequency_threshold_1000, interactions_above_1000),
        (frequency_threshold_3000, interactions_above_3000),
    ]:
        print(f"\nSignificant interactions for {protein} (region 687-776, frequency > {threshold}):")
        print(filtered_df)
        print(f"Total frequency of significant interactions (frequency > {threshold}) for {protein}: {filtered_df['Frequency'].sum()}")
        print(f"Number of significant interactions (frequency > {threshold}) for {protein}: {len(filtered_df)}")

        # Save filtered interactions to CSV (index column omitted)
        output_file = f"{protein}_filtered_interactions_687_776_above_{threshold}.csv"
        filtered_df.to_csv(output_file, index=False)
        print(f"Filtered interactions for {protein} (frequency > {threshold}) saved in {output_file}")


Interactions + Frequency for DUB domain vs 687-776 for wt:
         DUB Interdomain Interaction  Frequency
1     ARG780      LEU765     HBDonor          7
2     ARG780      LEU765    Cationic        248
3     ARG780      LEU765  VdWContact         18
4     ARG780      ASN766     HBDonor        742
5     ARG780      ASN766    Cationic       1813
..       ...         ...         ...        ...
330  ARG1077      SER772  VdWContact       3868
331  ARG1077      PRO774    Cationic         16
370  PHE1083      ALA775  VdWContact       2162
371  PHE1083      LEU776  VdWContact          1
373  VAL1088      LEU776  VdWContact        699

[97 rows x 4 columns]
Total frequency for wt: 133092

Significant interactions for wt (region 687-776, frequency > 1000):
         DUB Interdomain Interaction  Frequency
5     ARG780      ASN766    Cationic       1813
60    LYS835      SER755  VdWContact       1673
72    GLY840      GLY773  VdWContact       5764
83    TYR842      VAL768  VdWContact       6716
8

In [16]:
# Residue ranges to process, keyed by a "start-end" label built from the bounds
regions = {
    f"{first}-{last}": (first, last)
    for first, last in [
        (402, 644),
        (610, 615),
        (645, 684),
        (685, 776),
        (687, 776),
        (1089, 1118),
    ]
}

# Process interactions for each region with the two frequency thresholds
# (frequency_threshold_1000 / frequency_threshold_3000).

# Residue numbers depend only on the protein table, so parse them once here
# instead of once per (region, protein) pair inside the loops.
residue_numbers = {
    protein: df["Protein"].apply(extract_protein_number)
    for protein, df in protein_dfs.items()
}

for region_name, (start, end) in regions.items():
    for protein, df in protein_dfs.items():
        # Keep rows whose residue number lies between start and end
        # (Series.between includes both end points by default).
        in_region = residue_numbers[protein].between(start, end)
        # Label the partner-residue column with the region name; rename() returns
        # a new frame, so the table stored in protein_dfs is not modified.
        region_interactions = df[in_region].rename(columns={"Protein": region_name})

        # Calculate total interactions and frequency for the region
        total_interactions = len(region_interactions)
        total_frequency = region_interactions["Frequency"].sum()

        print(f"\nInteractions DUB domain vs {region_name} for {protein}:")
        print(region_interactions)
        print(f"Total interactions for {protein} in {region_name}: {total_interactions}")
        print(f"Total frequency for {protein} in {region_name}: {total_frequency}")

        # Strictly-greater-than filters for each configured threshold
        interactions_above_1000 = region_interactions[region_interactions["Frequency"] > frequency_threshold_1000]
        interactions_above_3000 = region_interactions[region_interactions["Frequency"] > frequency_threshold_3000]

        # Pair every filtered table with the threshold constant that produced it, so the
        # printed labels and CSV names cannot drift from the values actually applied.
        for threshold, filtered_df in [
            (frequency_threshold_1000, interactions_above_1000),
            (frequency_threshold_3000, interactions_above_3000),
        ]:
            print(f"\nSignificant interactions (frequency > {threshold}) for {protein} in {region_name}:")
            print(filtered_df)
            print(f"Total significant interactions for {protein} (frequency > {threshold}) in {region_name}: {len(filtered_df)}")
            print(f"Total frequency of significant interactions (frequency > {threshold}) for {protein} in {region_name}: {filtered_df['Frequency'].sum()}")

            # Save filtered interactions to CSV (index column omitted)
            output_file = f"{protein}_filtered_interactions_{region_name}_above_{threshold}.csv"
            filtered_df.to_csv(output_file, index=False)
            print(f"Filtered interactions for {protein} (frequency > {threshold}) in {region_name} saved in {output_file}")



Interactions DUB domain vs 402-644 for wt:
        DUB 402-644 Interaction  Frequency
35   ASP812  ARG628  VdWContact         21
36   ASP812  ASN629  VdWContact         14
39   SER817  ASP625  VdWContact          2
109  LYS848  THR428    Cationic          2
110  LYS848  HIS430    Cationic          2
..      ...     ...         ...        ...
249  GLN916  PRO589     HBDonor         21
250  GLN916  PRO589  VdWContact        130
251  GLN916  HIS590     HBDonor       3685
252  GLN916  HIS590  VdWContact       4001
253  GLN916  THR591  VdWContact         18

[132 rows x 4 columns]
Total interactions for wt in 402-644: 132
Total frequency for wt in 402-644: 68326

Significant interactions (frequency > 1000) for wt in 402-644:
        DUB 402-644 Interaction  Frequency
142  ARG890  THR623    Cationic       2376
144  ARG890  ASP624     HBDonor       2200
145  ARG890  ASP624    Cationic       6587
146  ARG890  ASP624  VdWContact       3021
155  LYS891  THR591    Cationic       1053
156  LYS891