In [2]:
import os
import time

from Bio import Entrez, SeqIO

# NCBI asks E-utilities clients to supply a contact e-mail address. Read it from the
# ENTREZ_EMAIL environment variable when that is set (and non-empty), so the notebook
# can be run by someone else without editing this cell; otherwise fall back to the
# original address.
Entrez.email = os.environ.get("ENTREZ_EMAIL") or "brianna.flynn@utexas.edu"

def fetch_protein_fasta_from_gene(gene_name, organism="Homo sapiens"):
    """Fetch the first NCBI protein record matching a gene symbol, as FASTA text.

    Runs an Entrez ``esearch`` against the ``protein`` database with the term
    ``{gene_name}[Gene Name] AND {organism}[Organism]`` and then ``efetch``es
    the first returned ID in FASTA format.

    Args:
        gene_name: Gene symbol to look up (e.g. ``"SRP68"``).
        organism: Organism name used in the ``[Organism]`` filter.

    Returns:
        The FASTA record (header line plus sequence lines) with surrounding
        whitespace stripped, or ``None`` when the search has no hits, the
        fetched text is empty, or any step raises.

    Note:
        Only the first search hit is requested (``retmax=1``), so the record
        is whichever one the search returns first, which is not necessarily
        the isoform you want.
    """
    try:
        search_handle = Entrez.esearch(
            db="protein",
            term=f"{gene_name}[Gene Name] AND {organism}[Organism]",
            retmode="xml",
            retmax=1
        )
        try:
            search_results = Entrez.read(search_handle)
        finally:
            # Close even when parsing fails so the handle is not leaked.
            search_handle.close()

        id_list = search_results["IdList"]
        if not id_list:
            return None

        fetch_handle = Entrez.efetch(
            db="protein",
            id=id_list[0],
            rettype="fasta",
            retmode="text"
        )
        try:
            fasta_data = fetch_handle.read()
        finally:
            # Same guarantee for the fetch handle if reading the body fails.
            fetch_handle.close()

        # An empty or whitespace-only response is reported as "nothing found".
        return fasta_data.strip() or None
    except Exception as e:
        # Best-effort lookup: report the failure and let the caller treat the
        # gene as missing instead of aborting a long batch run.
        print(f"Error retrieving {gene_name}: {e}")
        return None

# Smoke test: look up one gene and show the raw FASTA text that comes back.
test_fasta = fetch_protein_fasta_from_gene(gene_name="SRP68")
print(test_fasta)

>NP_001247432.1 signal recognition particle subunit SRP68 isoform 3 [Homo sapiens]
MLSECRDAIQVVREELKPDQKQRDYILEGEPGKVSNLQYLHSYLTYIKLSTAIKRNENMAKGLQRALLQQ
QPEDDSKRSPRPQDLIRLYDIILQNLVELLQLPGLEEDKAFQKEIGLKTLVFKAYRCFFIAQSYVLVKKW
SEALVLYDRVLKYANEVNSDAGAFKNSLKDLPDVQELITQVRSEKCSLQAAAILDANDAHQTETSSSQVK
DNKPLVERFETFCLDPSLVTKQANLVHFPPGFQPIPCKPLFFDLALNHVAFPPLEDKLEQKTKSGLTGYI
KGIFGFRS


In [5]:
import pandas as pd

# Load the candidate gene-pair table (one pair per row, gene symbols in
# candidate_gene_1 / candidate_gene_2) and display it.
candidates_csv = '/home/ubuntu/afm_candidates_533.csv'
afm_candidates = pd.read_csv(candidates_csv)
afm_candidates

Unnamed: 0,candidate_gene_1,candidate_gene_2,weighted_degree_1,betweenness_1,weighted_degree_2,betweenness_2,neighbor_type,shared_neighbors
0,RAB1A,RAB11A,58.577790,0.001466,62.715677,0.000621,first,3
1,RAB1A,ATP6V1D,58.577790,0.001466,50.998125,0.004554,first,4
2,RAB1A,TRAPPC3,58.577790,0.001466,54.574739,0.010212,first,3
3,RAB1A,ARF4,58.577790,0.001466,40.075296,0.000155,first,3
4,RAB1A,CSNK1D,58.577790,0.001466,25.037478,0.002191,first,2
...,...,...,...,...,...,...,...,...
528,CRB3,TAS2R43,9.270838,0.016405,15.175787,0.046369,first,2
529,CRB3,HAVCR1,9.270838,0.016405,14.301102,0.006089,first,1
530,CRB3,DISC1,9.270838,0.016405,8.635891,0.015146,first,1
531,CRB3,PKD1L1,9.270838,0.016405,23.913688,0.002243,second,3


In [6]:
# Collect every distinct gene symbol that appears in either column of the pair table.
pair_columns = afm_candidates[["candidate_gene_1", "candidate_gene_2"]]
unique_genes = pd.unique(pair_columns.values.ravel())

# Look each gene up once; genes with no usable record are reported and left out of the map.
gene_to_fasta = {}
for gene in unique_genes:
    fasta = fetch_protein_fasta_from_gene(gene)
    if not fasta:
        print(f"No FASTA found for {gene}")
    else:
        gene_to_fasta[gene] = fasta
    time.sleep(1)  # pause between lookups to be polite to NCBI

In [7]:
# Number of genes for which a FASTA record was retrieved.
len(gene_to_fasta)

245

In [12]:
import os

output_dir = "/home/ubuntu/AFM_CANDIDATES_533"
os.makedirs(output_dir, exist_ok=True)

# One FASTA file per candidate pair; pairs lacking either sequence are reported and skipped.
for idx, row in afm_candidates.iterrows():
    gene_a, gene_b = row["candidate_gene_1"], row["candidate_gene_2"]
    fasta_a = gene_to_fasta.get(gene_a)
    fasta_b = gene_to_fasta.get(gene_b)

    if not (fasta_a and fasta_b):
        print(f"Skipping {gene_a} and {gene_b}| MISSING sequence")
        continue

    # Drop each record's header line and glue the wrapped sequence lines together.
    seq_a = "".join(fasta_a.strip().splitlines()[1:])
    seq_b = "".join(fasta_b.strip().splitlines()[1:])

    # File layout: ">GENE1_GENE2", then the first sequence ending in ":",
    # then the second sequence on its own line.
    pair_path = f"{output_dir}/pair_{idx}_{gene_a}_{gene_b}.fasta"
    with open(pair_path, "w") as f:
        f.write(f">{gene_a}_{gene_b}\n{seq_a}:\n{seq_b}\n")
        print(f"Writing {gene_a} and {gene_b}")

Writing RAB1A and RAB11A
Writing RAB1A and ATP6V1D
Writing RAB1A and TRAPPC3
Writing RAB1A and ARF4
Writing RAB1A and CSNK1D
Writing RAB1A and ARL2
Writing TRAPPC3 and ATP6V1D
Writing TRAPPC3 and RAB1A
Writing TRAPPC3 and MAPRE1
Writing TRAPPC3 and ARL2
Writing TRAPPC3 and ARF4
Writing TRAPPC3 and RANBP1
Writing TRAPPC3 and RAB11A
Writing TRAPPC3 and ACTR3
Writing TUBB2B and TUBB2A
Writing TUBB2B and TUBA1A
Writing TUBB2B and RAN
Writing TUBB2B and TUBB3
Writing TUBB2B and TBCC
Writing RAN and HSPA8
Writing RAN and TUBA1C
Writing RAN and ACTR3
Writing RAN and RANBP1
Writing RAN and RAB11A
Writing RAN and ARF4
Writing RAN and MAPRE1
Writing ACTR3 and MAPRE1
Writing ACTR3 and RAN
Writing ACTR3 and TUBA1C
Writing ACTR3 and RAB8A
Writing ACTR3 and ARF4
Writing ACTR3 and TRAPPC3
Writing ACTR3 and RANBP1
Writing ACTR3 and DPYSL2
Writing ACTR3 and ATP6V1D
Writing ACTR3 and RAB11A
Writing RAB11A and RAB1A
Writing RAB11A and ATP6V1D
Writing RAB11A and RAN
Writing RAB11A and ARF4
Writing RAB11A 

In [13]:
# Repeat the same pipeline on an even smaller candidate list (afm_candidates_182.csv)

In [14]:
import pandas as pd

# Load the smaller candidate gene-pair table (same columns as before) and display it.
# Note: this rebinds `afm_candidates`, so later cells now operate on this table.
candidates_csv = '/home/ubuntu/afm_candidates_182.csv'
afm_candidates = pd.read_csv(candidates_csv)
afm_candidates

Unnamed: 0,candidate_gene_1,candidate_gene_2,weighted_degree_1,betweenness_1,weighted_degree_2,betweenness_2,neighbor_type,shared_neighbors
0,ACTR3,MAPRE1,68.630193,0.023840,63.631303,0.002243,first,3
1,ACTR3,RAN,68.630193,0.023840,55.312205,0.000759,first,2
2,ACTR3,TUBA1C,68.630193,0.023840,41.219445,0.000052,first,2
3,ACTR3,RANBP1,68.630193,0.023840,37.579426,0.000776,second,4
4,ACTR3,ATP6V1D,68.630193,0.023840,50.998125,0.004554,second,4
...,...,...,...,...,...,...,...,...
177,PKD1L1,DISC1,23.913688,0.002243,8.635891,0.015146,first,2
178,PKD1L1,EVC,23.913688,0.002243,24.280329,0.052562,second,4
179,CRB3,CABYR,9.270838,0.016405,11.855550,0.003623,first,0
180,CRB3,TAS2R43,9.270838,0.016405,15.175787,0.046369,first,2


In [15]:
# Collect every distinct gene symbol that appears in either column of the pair table.
pair_columns = afm_candidates[["candidate_gene_1", "candidate_gene_2"]]
unique_genes = pd.unique(pair_columns.values.ravel())

# Start from an empty map and look each gene up once; genes with no usable
# record are reported and left out of the map.
gene_to_fasta = {}
for gene in unique_genes:
    fasta = fetch_protein_fasta_from_gene(gene)
    if not fasta:
        print(f"No FASTA found for {gene}")
    else:
        gene_to_fasta[gene] = fasta
    time.sleep(1)  # pause between lookups to be polite to NCBI

In [16]:
# Number of genes for which a FASTA record was retrieved.
len(gene_to_fasta)

142

In [17]:
import os

output_dir = "/home/ubuntu/AFM_CANDIDATES_182"
os.makedirs(output_dir, exist_ok=True)

# One FASTA file per candidate pair; pairs lacking either sequence are reported and skipped.
for idx, row in afm_candidates.iterrows():
    gene_a, gene_b = row["candidate_gene_1"], row["candidate_gene_2"]
    fasta_a = gene_to_fasta.get(gene_a)
    fasta_b = gene_to_fasta.get(gene_b)

    if not (fasta_a and fasta_b):
        print(f"Skipping {gene_a} and {gene_b}| MISSING sequence")
        continue

    # Drop each record's header line and glue the wrapped sequence lines together.
    seq_a = "".join(fasta_a.strip().splitlines()[1:])
    seq_b = "".join(fasta_b.strip().splitlines()[1:])

    # File layout: ">GENE1_GENE2", then the first sequence ending in ":",
    # then the second sequence on its own line.
    pair_path = f"{output_dir}/pair_{idx}_{gene_a}_{gene_b}.fasta"
    with open(pair_path, "w") as f:
        f.write(f">{gene_a}_{gene_b}\n{seq_a}:\n{seq_b}\n")
        print(f"Writing {gene_a} and {gene_b}")

Writing ACTR3 and MAPRE1
Writing ACTR3 and RAN
Writing ACTR3 and TUBA1C
Writing ACTR3 and RANBP1
Writing ACTR3 and ATP6V1D
Writing ACTR3 and RAB11A
Writing RAB11A and RAB1A
Writing RAB11A and ATP6V1D
Writing RAB11A and RAN
Writing RAB11A and TRAPPC3
Writing RAB11A and ACTR3
Writing TBCC and TUBB2A
Writing TBCC and TUBA4A
Writing TBCC and TUBB3
Writing SLC47A2 and ATP6V1D
Writing SLC47A2 and ACTR3
Writing SLC47A2 and TRAPPC3
Writing MAPRE1 and TUBA1C
Writing MAPRE1 and ACTR3
Writing MAPRE1 and RAB8A
Writing MAPRE1 and SSNA1
Writing PCM1 and SDCCAG8
Writing PCM1 and NIN
Writing PCM1 and TTC17
Writing PCM1 and FNBP1L
Writing SYNE2 and PCM1
Writing SYNE2 and ABLIM1
Writing SYNE2 and FNBP1L
Writing EXOC4 and EXOC6
Writing EXOC4 and EXOC5
Writing EXOC4 and EXOC3
Writing EXOC4 and PARD3
Writing EXOC4 and TTC17
Writing EXOC4 and HTT
Writing EXOC4 and TTBK2
Writing TTC17 and WDPCP
Writing TTC17 and RFX3
Writing TTC17 and HTT
Writing TTC17 and PARD3
Writing TTC17 and EXOC4
Writing TTC17 and CEP2