___
### Provide input
- gRNA_csv = name of the CSV file (the file needs two columns: "GRNA NAME" and "GRNA")
- wild_type_seq_file = name of the txt-file that contains the wild-type Sanger sequence for the gene of interest (can be derived from the split_fasta notebook); the txt-file name needs to contain the word "wild" (see example below)
- include_protein_data = controls whether the final output includes AA-seq data (set include_protein_data to True or False)

In [None]:
# Name of the gRNA table; must be a CSV file.
gRNA_csv = "A2M_gRNAs.csv"
# Name of the wild-type sequence file; its content must be in FASTA format.
wild_type_seq_file = 'A2M_wild_type.txt'
# True -> include the amino-acid (AA) sequences in the final output.
include_protein_data = True

___

In [None]:
import gpe_module as gpe
import pandas as pd
#pd.set_option('display.max_colwidth', None)

#### Edited clones

In [None]:
# Split the gRNA table into edited and non-edited clones, keep only the edited
# rows whose CLONES entry names a .txt file, and attach the sequencing results.
edited_clones, non_edited_clones = gpe.edited_ko_clone(gRNA_csv)

# regex=False matches the literal ".txt" (as a regex the "." would match any
# character); na=False drops rows with a missing clone name instead of
# producing a mask that cannot be used for boolean indexing.
is_txt_clone = edited_clones["CLONES"].str.contains(".txt", regex=False, na=False)
edited_clones = edited_clones[is_txt_clone]
edited_clones_seq = gpe.add_seq_results_to_df(edited_clones)

if include_protein_data:
    # Translate the clones and write the per-clone AA sequences to their own CSV.
    df_aa_edited_clones = gpe.translate_df_entry_to_protein(edited_clones)
    # `sort` is a boolean flag of DataFrame.merge; the previous value "CLONES"
    # only behaved like True because a non-empty string is truthy.
    df_edited_clones_aa = edited_clones_seq.merge(
        df_aa_edited_clones, on="CLONES", sort=True
    )
    # Keep only the part of each AA sequence that precedes the first "_".
    for aa_column in ("AA_SEQ", "AA_SEQ_REV_COMPL"):
        df_edited_clones_aa[aa_column] = (
            df_edited_clones_aa[aa_column].str.split("_").str[0]
        )
    df_edited_clones_aa.drop_duplicates().to_csv("edited_clones_aa_seq.csv")

# NOTE(review): the downstream cells have always worked on the sequence-only
# frame, even when include_protein_data is True (the merged AA frame was
# overwritten right after being written to CSV). That behaviour is kept here;
# confirm whether the AA columns were meant to reach the final summary.
df_edited_clones_with_protein = edited_clones_seq

#### Non-edited clones

In [None]:
# non_edited_clones = non_edited_clones[non_edited_clones["CLONES"].str.contains('.txt')]
# non_edited_clones_seq = gpe.add_seq_results_to_df(non_edited_clones)
# df_aa_non_edited_clones = gpe.translate_df_entry_to_protein(non_edited_clones)
# df_non_edited_clones_with_protein = non_edited_clones_seq.merge(df_aa_non_edited_clones, on="CLONES", sort="CLONES")
# df_non_edited_clones_with_protein.drop_duplicates().to_csv("non_edited_clones_aa_seq.csv")
# df_non_edited_clones_with_protein.drop_duplicates()

#### Get wild-type sequence and extract upstream/downstream 150bp

In [None]:
# Build the wild-type reference frame: the helper receives the wild-type
# sequence file and the edited-clone frame and returns the sequence around
# each gRNA site.
df_wt_seq_guideRNA = gpe.extract_nt_from_sangerseq_around_grna(
    wild_type_seq_file,
    df_edited_clones_with_protein,
)

#### Manipulate and tidy up df

In [None]:
# Copy the wild-type reference columns onto every clone row with the same
# gRNA name.
#
# The three right-most columns of df_wt_seq_guideRNA are used, in order, as
# the wild-type sequence, the leading anchor and the trailing anchor.
column_name_wt, column_name_first_20, column_name_last_20 = (
    df_wt_seq_guideRNA.columns[-3:]
)
wt_columns = [column_name_wt, column_name_first_20, column_name_last_20]

# Rows without a matching gRNA name keep the empty-string default.
for wt_column in wt_columns:
    df_edited_clones_with_protein[wt_column] = ""

# One masked .loc assignment per wild-type row replaces the former
# row-by-row `df[col].iloc[row] = ...` writes: that chained form triggers
# SettingWithCopy/ChainedAssignment warnings and does not update the frame
# at all when pandas Copy-on-Write is active. Iterating the wild-type rows in
# order keeps the "last matching entry wins" outcome for duplicate gRNA names.
clone_grna_names = df_edited_clones_with_protein["GRNA NAME"]
for _, wt_row in df_wt_seq_guideRNA.iterrows():
    # Plain boolean array so the mask is applied by position, independent of
    # the frame's index labels.
    matches = (clone_grna_names == wt_row["GRNA NAME"]).to_numpy()
    for wt_column in wt_columns:
        df_edited_clones_with_protein.loc[matches, wt_column] = wt_row[wt_column]

In [None]:
# Clip the clone and wild-type sequences to the region spanned by the two
# anchor sequences and record the length difference between the two clips.


def _clip_between_anchors(sequence, first_anchor, last_anchor):
    """Return the part of *sequence* from the start of *first_anchor* to the
    end of *last_anchor*, or None when the region cannot be located.

    None is returned when any argument is not a non-empty string or when
    either anchor does not occur in *sequence*. Without this check
    ``str.find`` returning -1 (or matching an empty anchor at position 0)
    would silently produce a meaningless slice.
    """
    values = (sequence, first_anchor, last_anchor)
    if not all(isinstance(value, str) and value for value in values):
        return None
    start = sequence.find(first_anchor)
    last_start = sequence.find(last_anchor)
    if start == -1 or last_start == -1:
        return None
    # Use the anchor's own length rather than a hard-coded 20.
    return sequence[start:last_start + len(last_anchor)]


seq_clipped_list = []
wt_seq_clipped_list = []
diff_wt_clipped_clone_clipped_list = []

clone_rows = zip(
    df_edited_clones_with_protein["SEQUENCE"],
    df_edited_clones_with_protein[column_name_wt],
    df_edited_clones_with_protein[column_name_first_20],
    df_edited_clones_with_protein[column_name_last_20],
)
for clone_seq, wt_seq, first_anchor, last_anchor in clone_rows:
    clone_clip = _clip_between_anchors(clone_seq, first_anchor, last_anchor)
    wt_clip = _clip_between_anchors(wt_seq, first_anchor, last_anchor)
    if clone_clip is None or wt_clip is None:
        # No comparable region: store NaN so the row is not later classified
        # as an insertion or deletion on the basis of a bogus length.
        length_difference = float("nan")
    else:
        length_difference = len(clone_clip) - len(wt_clip)
    # Unlocated regions are stored as empty strings (length 0).
    seq_clipped_list.append(clone_clip or "")
    wt_seq_clipped_list.append(wt_clip or "")
    diff_wt_clipped_clone_clipped_list.append(length_difference)

df_edited_clones_with_protein["CLONE_SEQ_CLIPPED"] = seq_clipped_list
df_edited_clones_with_protein["LENGTH_CLONE_SEQ_CLIPPED"] = [
    len(clip) for clip in seq_clipped_list
]
df_edited_clones_with_protein["WT_SEQ_CLIPPED"] = wt_seq_clipped_list
df_edited_clones_with_protein["LENGTH_WT_SEQ_CLIPPED"] = [
    len(clip) for clip in wt_seq_clipped_list
]
df_edited_clones_with_protein["DIFF_LENGTH_CLONE_WT"] = (
    diff_wt_clipped_clone_clipped_list
)

In [None]:
# Label every clone by the length difference between its clipped sequence
# and the clipped wild-type sequence.


def _classify_indel(length_difference):
    """Return the INDEL label for a clone-minus-wild-type length difference.

    A difference divisible by 3 (including 0) is "In frame alteration";
    otherwise a negative difference is "Out of frame deletion" and a positive
    one "Out of frame insertion". Values that satisfy none of the comparisons
    (e.g. NaN) get an empty label.
    """
    if length_difference % 3 == 0:
        return "In frame alteration"
    if length_difference < 0:
        return "Out of frame deletion"
    if length_difference > 0:
        return "Out of frame insertion"
    return ""


# A single column assignment replaces the former per-row
# `df["INDEL"].iloc[row] = ...` writes, which are chained assignments: they
# raise warnings and leave the frame unchanged under pandas Copy-on-Write.
df_edited_clones_with_protein["INDEL"] = [
    _classify_indel(length_difference)
    for length_difference in df_edited_clones_with_protein["DIFF_LENGTH_CLONE_WT"]
]

In [None]:
# Build the final summary frame without the anchor, raw-sequence and
# clipped-sequence helper columns.
# NOTE(review): "FIRST_20NT" / "LAST_20NT" are hard-coded here; presumably they
# equal the anchor column names taken from the wild-type frame - confirm.
columns_to_drop = [
    "FIRST_20NT",
    "LAST_20NT",
    "SEQUENCE",
    "CLONE_SEQ_CLIPPED",
    "WT_SEQ_CLIPPED",
]
df_edited_clones_with_protein_final = df_edited_clones_with_protein.drop(
    columns_to_drop, axis="columns"
)

In [None]:
# Show the final summary frame as this cell's output.
df_edited_clones_with_protein_final

In [None]:
# Write the final summary frame to a CSV file in the working directory.
summary_csv_name = "Summary_edited_clones.csv"
df_edited_clones_with_protein_final.to_csv(summary_csv_name)