In [1]:
### Functions
def extract_seq_pep(file_path, keyword="_entity_poly.pdbx_seq_one_letter_code"):
    """
    Concatenate the lines that follow a keyword line, up to a line holding only ";".

    Reading starts on the line after the first line containing `keyword` and
    stops at the first line that is exactly ";" once stripped. Every collected
    line is stripped and the pieces are joined without a separator.

    The result is returned verbatim: nothing is removed from the first
    collected line, so any leading delimiter character it carries (callers in
    this notebook drop the first character with `[1:]`) is still present.

    Args:
        file_path (str): Path to the file to be read.
        keyword (str): Text marking the line before the block to extract.
            Default is "_entity_poly.pdbx_seq_one_letter_code".

    Returns:
        str: The joined block. An empty string is returned when the keyword is
        not found, or when the file is missing or cannot be read (an error
        message is printed in the latter cases).
    """
    collected = []
    # Defined before the try block so the error paths below still return a
    # string instead of failing with UnboundLocalError on `return res`.
    res = ""
    start_extracting = False

    try:
        with open(file_path, 'r') as file:
            for line in file:
                if start_extracting:
                    stripped = line.strip()
                    # An isolated semicolon closes the block
                    if stripped == ";":
                        break
                    collected.append(stripped)
                elif keyword in line:
                    # Extraction begins with the line after the keyword line
                    start_extracting = True

        res = "".join(collected)

    except FileNotFoundError:
        print(f"Error: The file at {file_path} was not found.")
    except Exception as e:
        print(f"An error occurred: {e}")

    return res

# Example _atom_site row: ATOM 1    N N   . MET A 1 1    ? 43.991  49.112  24.050  1.0 27.03 ? 1    MET A N   1 Q53S48 UNP 1    M
# Here the _atom_site.B_iso_or_equiv field holds the pLDDT score (for AlphaFold files); it is the 15th field (index 14 in Python): 27.03
def extract_plddt(file_path, keyword="_atom_site.pdbx_sifts_xref_db_res"):
    """
    Collect one [residue number, residue code, pLDDT] entry per distinct residue.

    Reading starts on the line after the first line containing `keyword` and
    stops at the first line that is exactly "#" once stripped. Each line in
    between is split on whitespace and three fields are kept:

        index 23 -> residue number (converted to int)
        index 24 -> residue code
        index 14 -> pLDDT score (converted to float)

    Identical entries are kept once only, in order of first appearance.

    Args:
        file_path (str): Path to the file to be read.
        keyword (str): Text marking the line before the first atom row.
            Default is "_atom_site.pdbx_sifts_xref_db_res".

    Returns:
        list: A list of [int, str, float] lists, one per distinct entry.

    Raises:
        OSError: If the file cannot be opened.
        IndexError, ValueError: If a row between the keyword and "#" has fewer
            than 25 fields or non-numeric values at indexes 23 / 14.
    """
    residues = []
    seen = set()  # tuples already emitted; constant-time duplicate check
    start_extracting = False

    with open(file_path, 'r') as file:
        for line in file:
            if start_extracting:
                stripped = line.strip()
                if stripped == "#":
                    break

                fields = stripped.split()  # split once, reuse for all three fields
                entry = (int(fields[23]), fields[24], float(fields[14]))  # AA number, AA code, pLDDT
                if entry not in seen:
                    seen.add(entry)
                    residues.append(list(entry))

            elif keyword in line:
                start_extracting = True

    return residues

In [7]:
### Dics 
import pandas as pd 

def _load_structure_ids(path):
    """
    Read a whitespace-separated two-column file into a dict {column 1: column 2}.

    Rows whose second column is "NA" (no AlphaFold file for the exon) are
    skipped, as are blank lines. A non-blank row with a single column raises
    IndexError.
    """
    mapping = {}
    with open(path) as handle:
        for line in handle:
            fields = line.strip().split()
            if not fields:
                continue  # blank line (e.g. trailing newline at end of file)
            if fields[1] == "NA":  # no file in alphafold for exon
                continue
            mapping[fields[0]] = fields[1]
    return mapping


def _load_peptides(*paths):
    """
    Merge whitespace-separated two-column files into one dict {column 1: column 2}.

    Files are read in the order given, so a key present in several files keeps
    the value from the last one. Rows with fewer than two columns (no sequence
    found for the exon) are skipped.
    """
    sequences = {}
    for path in paths:
        with open(path) as handle:
            for line in handle:
                fields = line.strip().split()
                if len(fields) < 2:  # No sequences found for exon
                    continue
                sequences[fields[0]] = fields[1]
    return sequences


#alphafold files id
dic_hg_file = _load_structure_ids("/home/mouren/Data/alphafold/hsap/ee_neg_uniprotID_file_hg.tsv")
dic_mm_file = _load_structure_ids("/home/mouren/Data/alphafold/mm/ee_neg_uniprotID_file_mm.tsv")
dic_dm_file = _load_structure_ids("/home/mouren/Data/alphafold/dm/ee_neg_uniprotID_file_dm.tsv")
dic_tair_file = _load_structure_ids("/home/mouren/Data/alphafold/tair/ee_neg_uniprotID_file_tair.tsv")

#aa sequences (EE file first, then the negative/control file, for each genome)
dic_hg_pep = _load_peptides(
    "/home/mouren/Data/final_files_tokeep/aa_sequences/sequences/ee_aa_seqs_hg38.txt",
    "/home/mouren/Data/final_files_tokeep/aa_sequences/sequences/neg_aa_seqs_hg38.txt",
)
dic_mm_pep = _load_peptides(
    "/home/mouren/Data/final_files_tokeep/aa_sequences/sequences/ee_aa_seqs_mm39.txt",
    "/home/mouren/Data/final_files_tokeep/aa_sequences/sequences/neg_aa_seqs_mm39.txt",
)
dic_dm_pep = _load_peptides(
    "/home/mouren/Data/final_files_tokeep/aa_sequences/sequences/ee_aa_seqs_dm6.txt",
    "/home/mouren/Data/final_files_tokeep/aa_sequences/sequences/neg_aa_seqs_dm6.txt",
)
dic_tair_pep = _load_peptides(
    "/home/mouren/Data/final_files_tokeep/aa_sequences/sequences/ee_aa_seqs_tair10.txt",
    "/home/mouren/Data/final_files_tokeep/aa_sequences/sequences/neg_aa_seqs_tair10.txt",
)

In [9]:
###HSAP
# For every exon listed in the EE and control files, compute the mean pLDDT of
# the residues that the exon's peptide covers in its AlphaFold model, and write
# one "<exon id>\t<mean or NA>" row per exon.
#
# "NA" is written when the exon has no AlphaFold model, no peptide sequence,
# when the peptide is not found in the model sequence, or when no residue of
# the model falls inside the peptide's range.
hg_region_paths = (
    "/home/mouren/Data/final_files_tokeep/other_species/hg38_EE.bed",
    "/home/mouren/Data/final_files_tokeep/control/control_neg_NoTF_NoTSS_TES_prom.tsv",
)

ee_neg_mean_plddt = {}
for region_path in hg_region_paths:
    with open(region_path) as region_file:
        for line in region_file:
            if not line.strip():
                continue  # ignore blank lines
            exon_id = line.strip().split("\t")[3]  # 4th tab-separated column is the exon id

            if exon_id not in dic_hg_file or exon_id not in dic_hg_pep:
                ee_neg_mean_plddt[exon_id] = "NA"
                continue

            path_file = "/home/mouren/Data/alphafold/hsap/files/" + dic_hg_file[exon_id] + "-model_v4.cif"
            seq_file = extract_seq_pep(path_file)[1:]  # total AA seq of gene in file, first character dropped
            exon_pep = dic_hg_pep[exon_id]

            offset = seq_file.find(exon_pep)  # 0-based position of exon AA sequence in gene seq
            if offset == -1:
                ee_neg_mean_plddt[exon_id] = "NA"
                continue

            # Inclusive residue range covered by the peptide. Assumes the
            # residue numbers returned by extract_plddt are 1-based, as the
            # original `start_index += 1` implies -- TODO confirm.
            first_res = offset + 1
            last_res = offset + len(exon_pep)

            # Keep only the residues inside the exon (the previous
            # `>= start or <= start` test accepted every residue of the protein).
            lst_scores = [
                score
                for res_num, _aa, score in extract_plddt(path_file)
                if first_res <= res_num <= last_res
            ]
            ee_neg_mean_plddt[exon_id] = (sum(lst_scores) / len(lst_scores)) if lst_scores else "NA"

with open("/home/mouren/Data/alphafold/hsap/ee_neg_plddt_mean_score_hg.tsv", 'w') as out_file:
    for key, value in ee_neg_mean_plddt.items():
        out_file.write(f'{key}\t{value}\n')

In [12]:
###MM
# For every exon listed in the EE and control files, compute the mean pLDDT of
# the residues that the exon's peptide covers in its AlphaFold model, and write
# one "<exon id>\t<mean or NA>" row per exon.
#
# "NA" is written when the exon has no AlphaFold model, no peptide sequence,
# when the peptide is not found in the model sequence, or when no residue of
# the model falls inside the peptide's range.
mm_region_paths = (
    "/home/mouren/Data/final_files_tokeep/other_species/mm39_EE.bed",
    "/home/mouren/Data/final_files_tokeep/other_species/control/mm39_control_neg_NoTF_NoTSS_TES_prom.tsv",
)

ee_neg_mean_plddt = {}
for region_path in mm_region_paths:
    with open(region_path) as region_file:
        for line in region_file:
            if not line.strip():
                continue  # ignore blank lines
            exon_id = line.strip().split("\t")[3]  # 4th tab-separated column is the exon id

            if exon_id not in dic_mm_file or exon_id not in dic_mm_pep:
                ee_neg_mean_plddt[exon_id] = "NA"
                continue

            path_file = "/home/mouren/Data/alphafold/mm/files/" + dic_mm_file[exon_id] + "-model_v4.cif"
            seq_file = extract_seq_pep(path_file)[1:]  # total AA seq of gene in file, first character dropped
            exon_pep = dic_mm_pep[exon_id]

            offset = seq_file.find(exon_pep)  # 0-based position of exon AA sequence in gene seq
            if offset == -1:
                ee_neg_mean_plddt[exon_id] = "NA"
                continue

            # Inclusive residue range covered by the peptide. Assumes the
            # residue numbers returned by extract_plddt are 1-based, as the
            # original `start_index += 1` implies -- TODO confirm.
            first_res = offset + 1
            last_res = offset + len(exon_pep)

            # Keep only the residues inside the exon (the previous
            # `>= start or <= start` test accepted every residue of the protein).
            lst_scores = [
                score
                for res_num, _aa, score in extract_plddt(path_file)
                if first_res <= res_num <= last_res
            ]
            ee_neg_mean_plddt[exon_id] = (sum(lst_scores) / len(lst_scores)) if lst_scores else "NA"

with open("/home/mouren/Data/alphafold/mm/ee_neg_plddt_mean_score_mm.tsv", 'w') as out_file:
    for key, value in ee_neg_mean_plddt.items():
        out_file.write(f'{key}\t{value}\n')

In [13]:
###DM
# For every exon listed in the EE and control files, compute the mean pLDDT of
# the residues that the exon's peptide covers in its AlphaFold model, and write
# one "<exon id>\t<mean or NA>" row per exon.
#
# "NA" is written when the exon has no AlphaFold model, no peptide sequence,
# when the peptide is not found in the model sequence, or when no residue of
# the model falls inside the peptide's range.
dm_region_paths = (
    "/home/mouren/Data/final_files_tokeep/other_species/dm6_EE.bed",
    "/home/mouren/Data/final_files_tokeep/other_species/control/dm6_control_neg_NoTF_NoTSS_TES.tsv",
)

ee_neg_mean_plddt = {}
for region_path in dm_region_paths:
    with open(region_path) as region_file:
        for line in region_file:
            if not line.strip():
                continue  # ignore blank lines
            exon_id = line.strip().split("\t")[3]  # 4th tab-separated column is the exon id

            if exon_id not in dic_dm_file or exon_id not in dic_dm_pep:
                ee_neg_mean_plddt[exon_id] = "NA"
                continue

            path_file = "/home/mouren/Data/alphafold/dm/files/" + dic_dm_file[exon_id] + "-model_v4.cif"
            seq_file = extract_seq_pep(path_file)[1:]  # total AA seq of gene in file, first character dropped
            exon_pep = dic_dm_pep[exon_id]

            offset = seq_file.find(exon_pep)  # 0-based position of exon AA sequence in gene seq
            if offset == -1:
                ee_neg_mean_plddt[exon_id] = "NA"
                continue

            # Inclusive residue range covered by the peptide. Assumes the
            # residue numbers returned by extract_plddt are 1-based, as the
            # original `start_index += 1` implies -- TODO confirm.
            first_res = offset + 1
            last_res = offset + len(exon_pep)

            # Keep only the residues inside the exon (the previous
            # `>= start or <= start` test accepted every residue of the protein).
            lst_scores = [
                score
                for res_num, _aa, score in extract_plddt(path_file)
                if first_res <= res_num <= last_res
            ]
            ee_neg_mean_plddt[exon_id] = (sum(lst_scores) / len(lst_scores)) if lst_scores else "NA"

with open("/home/mouren/Data/alphafold/dm/ee_neg_plddt_mean_score_dm.tsv", 'w') as out_file:
    for key, value in ee_neg_mean_plddt.items():
        out_file.write(f'{key}\t{value}\n')

In [14]:
###TAIR
# For every exon listed in the EE and control files, compute the mean pLDDT of
# the residues that the exon's peptide covers in its AlphaFold model, and write
# one "<exon id>\t<mean or NA>" row per exon.
#
# "NA" is written when the exon has no AlphaFold model, no peptide sequence,
# when the peptide is not found in the model sequence, or when no residue of
# the model falls inside the peptide's range.
tair_region_paths = (
    "/home/mouren/Data/final_files_tokeep/other_species/tair10_EE.bed",
    "/home/mouren/Data/final_files_tokeep/other_species/control/tair10_control_neg_NoTF_NoTSS_TES.tsv",
)

ee_neg_mean_plddt = {}
for region_path in tair_region_paths:
    with open(region_path) as region_file:
        for line in region_file:
            if not line.strip():
                continue  # ignore blank lines
            exon_id = line.strip().split("\t")[3]  # 4th tab-separated column is the exon id

            if exon_id not in dic_tair_file or exon_id not in dic_tair_pep:
                ee_neg_mean_plddt[exon_id] = "NA"
                continue

            # Single leading slash for both input files (the EE loop previously used "//home/...").
            path_file = "/home/mouren/Data/alphafold/tair/files/" + dic_tair_file[exon_id] + "-model_v4.cif"
            seq_file = extract_seq_pep(path_file)[1:]  # total AA seq of gene in file, first character dropped
            exon_pep = dic_tair_pep[exon_id]

            offset = seq_file.find(exon_pep)  # 0-based position of exon AA sequence in gene seq
            if offset == -1:
                ee_neg_mean_plddt[exon_id] = "NA"
                continue

            # Inclusive residue range covered by the peptide. Assumes the
            # residue numbers returned by extract_plddt are 1-based, as the
            # original `start_index += 1` implies -- TODO confirm.
            first_res = offset + 1
            last_res = offset + len(exon_pep)

            # Keep only the residues inside the exon (the previous
            # `>= start or <= start` test accepted every residue of the protein).
            lst_scores = [
                score
                for res_num, _aa, score in extract_plddt(path_file)
                if first_res <= res_num <= last_res
            ]
            ee_neg_mean_plddt[exon_id] = (sum(lst_scores) / len(lst_scores)) if lst_scores else "NA"

with open("/home/mouren/Data/alphafold/tair/ee_neg_plddt_mean_score_tair.tsv", 'w') as out_file:
    for key, value in ee_neg_mean_plddt.items():
        out_file.write(f'{key}\t{value}\n')