In [None]:
import pandas as pd
from pathlib import Path
import numpy as np
import re

### Load the SNP annotation table: first column (SNP ID in fabian format) -> second column
file = "/mnt/project/exonhancer/ZENODO_REPO/gnomAD_v3/2_snps_dictionaries/SNP_annot_allExons_fabian_gnomadV3_all.tsv"
dic_fabian_id = {}

with open(file) as annot_handle:
    for record in annot_handle:
        # Split each tab-separated record once; only the first two columns are used.
        columns = record.strip().split('\t')
        snp_annotation = columns[1]
        dic_fabian_id[columns[0]] = snp_annotation

In [None]:
### Build the mutated ("EESNP") version of the K562 reference sequences.
#
# Each row of the input file is whitespace-separated and read as:
#   [0] sequence name (also used to locate its fabian result table)
#   [1] copied unchanged to the output (presumably the chromosome -- TODO confirm)
#   [2] start (not included in the sequence), [3] end, [4] strand, [5] sequence
# For every SNP column of the matching fabian table that is a single-base
# substitution inside ]start, end], annotated "synonymous" in dic_fabian_id and
# with at least one score above 0.66 or below -0.66, one output row is produced
# holding the sequence with the alternate base written in lowercase.
file = Path("K562_REF_sequences_raw.tsv")

_FABIAN_RESULT_PREFIX = "/mnt/project/exonhancer/ZENODO_REPO/gnomAD_v3/1_compute_fabian/pipeline_exons/fabian_result/"
_FABIAN_RESULT_SUFFIX = ".vcf_fabian_bilan.tsv"
_STRONG_SCORE = 0.66
_COMPLEMENT = {"A": "T", "T": "A", "C": "G", "G": "C"}
_DIGITS = re.compile(r'\d+')
_LETTERS = re.compile(r'[a-zA-Z]+')


def _parse_variant(column_name):
    """Return (position, ref, alt) parsed from a fabian column name.

    The text between the first ':' and the first '>' carries the position
    digits followed by the reference allele; the alternate allele is the text
    between the first '>' and the next '.'.
    """
    locus = column_name.split(":")[1].split(">")[0]
    position = int(_DIGITS.search(locus).group())
    ref = _LETTERS.search(locus).group()
    alt = column_name.split(">")[1].split(".")[0]
    return position, ref, alt


def _apply_snp(seq, strand, offset, ref, alt):
    """Substitute the SNP into seq and return (mutated_seq, observed_base).

    offset is the 0-based distance of the SNP from the first base after
    `start`. On the "+" strand the sequence is indexed directly. On any other
    strand the stored sequence is the reverse complement, so the position is
    counted from the end and both alleles are complemented (callers must make
    sure they are keys of _COMPLEMENT). mutated_seq is None when the base
    found in seq does not match the expected reference base.
    """
    if strand == "+":
        index, expected, replacement = offset, ref, alt
    else:
        index = len(seq) - 1 - offset
        expected, replacement = _COMPLEMENT[ref], _COMPLEMENT[alt]

    observed = seq[index]
    if observed != expected:
        return None, observed
    # The substituted base is lowercased so it can be spotted in the output.
    return seq[:index] + replacement.lower() + seq[index + 1:], observed


content = []
with open(file) as f:
    for line in f:
        fields = line.split()
        if not fields:  # tolerate blank lines (e.g. a trailing newline)
            continue
        name, chrom, strand, seq = fields[0], fields[1], fields[4], fields[5]
        start = int(fields[2])
        end = int(fields[3])

        # Only the table read may legitimately fail: no fabian result for this sequence.
        try:
            df = pd.read_csv(_FABIAN_RESULT_PREFIX + name + _FABIAN_RESULT_SUFFIX, index_col=0, sep='\t')
        except FileNotFoundError:
            continue

        name_prefix = "_".join(name.split('_')[:3])

        for column_name in df.columns:
            position, ref, alt = _parse_variant(column_name)

            if len(ref) > 1 or len(alt) > 1:  # remove indel
                continue

            # > start because start is not included in the sequence
            if not start < position <= end:
                continue

            # If not in the dictionary it means it didn't pass the gnomAD filter
            if dic_fabian_id.get(column_name.split('.')[0]) != "synonymous":
                continue

            scores = df[column_name].astype(float)
            if not ((scores > _STRONG_SCORE).any() or (scores < -_STRONG_SCORE).any()):
                continue  # mutation is not strong

            # Alleles that cannot be complemented are skipped on the reverse strand.
            if strand != "+" and (ref not in _COMPLEMENT or alt not in _COMPLEMENT):
                continue

            # start + 1 is the coordinate of the first base of the sequence.
            offset = position - (start + 1)
            if offset >= len(seq):  # sequence shorter than the interval: nothing to mutate
                continue

            modified_seq, observed = _apply_snp(seq, strand, offset, ref, alt)
            if modified_seq is None:
                # Reference allele disagrees with the sequence: report once and skip this SNP.
                print(column_name, ref, alt)
                print(observed)
                print(line)
                continue

            # "L" when the most negative score dominates in magnitude, "G" otherwise.
            effect = "L" if abs(scores.min()) >= abs(scores.max()) else "G"
            name2 = f"{name_prefix}_{effect}-{position}-{ref}>{alt}_EESNP"
            content.append(["EESNP", name2, chrom, start, end, strand, modified_seq])

df1 = pd.DataFrame(content)
df1.to_csv("/home/mouren/Data/valid_exp_starr/list_raw_v3/K562_MUT_sequences.tsv",sep="\t",header=False,index=False)