## TODO:
- check whether the percent tab is calculated correctly, i.e. whether the formula `df / cov * 100` works as intended
- add coverage to excel spreadsheet
- create bar chart for each excel file
- the path to the input files should be defined only once, in main(), so that it does not have to be changed in every function
- add time_stamp
- add a file counter (a local one is probably better, i.e. just a counter in main())
- add 


In [32]:
from Bio.SeqIO.FastaIO import SimpleFastaParser  # low level fast fasta parser
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
sns.set()
import os

from ipywidgets import FloatProgress
from IPython.display import display

    
    

In [2]:
def get_ref(in_file):
    """Return the sequence of the first record of the input FASTA file.

    The file is opened from the ``./input_data/`` directory and only its
    first record is read.

    Parameters
    ----------
    in_file : str
        Name of the FASTA file located in ``./input_data/``.

    Returns
    -------
    str
        Sequence string of the first record.

    Raises
    ------
    ValueError
        If the file does not contain any FASTA record.
    """
    with open("./input_data/" + in_file) as in_handle:
        # Only the first record is needed; None marks "no records at all"
        # so that an empty file is reported explicitly instead of failing
        # later with an unbound local variable.
        first_record = next(iter(SimpleFastaParser(in_handle)), None)

    if first_record is None:
        raise ValueError("no FASTA records found in '{}'".format(in_file))

    # the sequence is stored under index 1 of the record's tuple
    return first_record[1]
    
    

In [3]:
def get_duplex_posits(ref_seq, ref_nuc):
    """Collect the duplex positions of ``ref_nuc`` in the reference string.

    A duplex is an occurrence of ``ref_nuc`` together with the next
    character after it that is one of ``A``, ``T``, ``G`` or ``C``; any
    other characters in between (e.g. ``-``) are skipped. The last
    character of ``ref_seq`` is never used as the first duplex member.
    An occurrence of ``ref_nuc`` that is not followed by any of the four
    nucleotides is ignored, because no duplex can be formed for it.

    Parameters
    ----------
    ref_seq : str
        Reference sequence.
    ref_nuc : str
        Single character to look for as the first duplex member.

    Returns
    -------
    dict
        ``{'start_pos': [...], 'stop_pos': [...], 'ref_duplex': [...]}``
        where the three lists are parallel: index of the first member,
        index of the second member, and the two characters joined.
    """
    nucleotides = frozenset('ATGC')
    d = {'start_pos': [], 'stop_pos': [], 'ref_duplex': []}
    seq_len = len(ref_seq)

    # range(seq_len - 1) is empty for an empty or one-character sequence,
    # so those inputs simply yield no duplexes.
    for start in range(seq_len - 1):
        if ref_seq[start] != ref_nuc:
            continue

        # Scan forward by index (no slicing) for the partner nucleotide.
        for stop in range(start + 1, seq_len):
            if ref_seq[stop] in nucleotides:
                d['start_pos'].append(start)
                d['stop_pos'].append(stop)
                d['ref_duplex'].append(ref_nuc + ref_seq[stop])
                break
        # If the scan ends without a break there is no partner: nothing is
        # recorded, so no out-of-range stop position or stale partner from
        # a previous match can leak into the result.

    return d
                
    

In [4]:
def create_df(duplex_posits, snp_type):
    """Build the per-position table used for counting SNPs.

    The table is created from ``duplex_posits`` and gets one additional
    column per entry of ``snp_type``; every missing value in the table is
    then replaced with 0, so the added columns start out as zero counters.

    Parameters
    ----------
    duplex_posits : dict
        Column name -> list of values, passed to ``pd.DataFrame``.
    snp_type : iterable of str
        Names of the counter columns to add.

    Returns
    -------
    pandas.DataFrame
    """
    # add all counter columns in one go, initially filled with NaN
    empty_counters = dict.fromkeys(snp_type, np.nan)
    table = pd.DataFrame(duplex_posits).assign(**empty_counters)

    # turn the NaN placeholders into zeros
    table.fillna(value=0, inplace=True)
    return table
    

In [5]:
def count_snp(in_file, df, nuc_to_watch):
    """Count substitutions of ``nuc_to_watch`` at every duplex position.

    Every record of ``./input_data/<in_file>`` is compared against the
    rows of ``df``. For a row, a substitution is counted when the read's
    character at ``start_pos`` is neither ``nuc_to_watch`` nor ``'-'``
    while its character at ``stop_pos`` equals the second character of
    the row's ``ref_duplex``. The counter in the column named after the
    read's character at ``start_pos`` is then incremented.

    Parameters
    ----------
    in_file : str
        Name of the FASTA file located in ``./input_data/``.
    df : pandas.DataFrame
        Table with ``start_pos``, ``stop_pos``, ``ref_duplex`` columns and
        one counter column per substitution. It is modified in place.
    nuc_to_watch : str
        The reference nucleotide whose substitutions are counted.

    Returns
    -------
    tuple
        ``(df, coverage)`` where ``coverage`` is the number of records
        read from the file.
    """
    coverage = 0

    # The duplex table does not change while reading, so look the row
    # values up once instead of once per read.
    duplexes = [
        (
            row,
            df.loc[row, 'start_pos'],
            df.loc[row, 'stop_pos'],
            df.loc[row, 'ref_duplex'][1],
        )
        for row in df.index
    ]
    known_columns = set(df.columns)

    with open("./input_data/" + in_file) as in_handle:
        for record in SimpleFastaParser(in_handle):
            coverage += 1
            read = record[1]

            for row, start_pos, stop_pos, second_nuc in duplexes:
                snp = read[start_pos]
                if snp == nuc_to_watch or snp == '-':
                    continue
                if read[stop_pos] != second_nuc:
                    continue
                if snp not in known_columns:
                    # The read carries a character that has no counter
                    # column (for example an ambiguity code); there is
                    # nowhere to count it, so skip it instead of failing
                    # with a KeyError.
                    continue

                df.loc[row, snp] += 1

    return df, coverage
    

In [6]:
def create_pivot_df(df_snp):
    """Sum the SNP counters per reference duplex.

    Rows of ``df_snp`` are grouped by their ``ref_duplex`` value and
    summed; the summed ``start_pos`` and ``stop_pos`` columns are removed
    from the result.

    Parameters
    ----------
    df_snp : pandas.DataFrame
        Table containing ``ref_duplex``, ``start_pos`` and ``stop_pos``
        columns.

    Returns
    -------
    pandas.DataFrame
        One row per distinct ``ref_duplex`` (used as the index).
    """
    per_duplex_totals = df_snp.groupby('ref_duplex').sum()
    # summed positions carry no meaning, only the counters are kept
    return per_duplex_totals.drop(columns=['start_pos', 'stop_pos'])

    
    

In [7]:
def save_df(f, df_snp, df_duplex_context, df_duplex_context_percent, nuc):
    """Save the three dataframes into one Excel spreadsheet.

    The workbook is written to
    ``./output_apobec/<input name without extension>__<nuc>__.xlsx`` with
    the sheets ``snp``, ``context`` and ``context_percent``. The output
    directory is created when it does not exist yet.

    Parameters
    ----------
    f : str
        Name of the input file; everything after its last dot is dropped
        to build the output name.
    df_snp : pandas.DataFrame
        Written to the ``snp`` sheet.
    df_duplex_context : pandas.DataFrame
        Written to the ``context`` sheet.
    df_duplex_context_percent : pandas.DataFrame
        Written to the ``context_percent`` sheet.
    nuc : str
        Nucleotide label that becomes part of the output file name.
    """
    output_dir = "./output_apobec"
    # exist_ok avoids the separate existence check
    os.makedirs(output_dir, exist_ok=True)

    file_name = f.rsplit(".", 1)[0]
    out_path = output_dir + "/" + file_name + "__" + nuc + "__" + '.xlsx'

    # The context manager saves and closes the workbook on exit;
    # ExcelWriter.save() is no longer available in current pandas, and
    # sheet names are passed by keyword because positional use is deprecated.
    with pd.ExcelWriter(out_path) as writer:
        df_snp.to_excel(writer, sheet_name="snp")
        df_duplex_context.to_excel(writer, sheet_name="context")
        df_duplex_context_percent.to_excel(writer, sheet_name="context_percent")


In [8]:
def get_input_files_names(path_to_input="./input_data"):
    """List the FASTA files of the input directory.

    A directory entry is reported when its extension is ``.fasta``,
    ``.fa`` or ``.fas``. Entries without an extension are not reported,
    even if their whole name equals one of those words.

    Parameters
    ----------
    path_to_input : str, optional
        Directory to scan; defaults to ``./input_data``.

    Returns
    -------
    list of str
        Matching entry names (without the directory part), sorted so that
        the processing order is reproducible.
    """
    fasta_extensions = (".fasta", ".fa", ".fas")

    return sorted(
        name
        for name in os.listdir(path_to_input)
        if os.path.splitext(name)[1] in fasta_extensions
    )
    

## main()

Try to decompose main(): the part that deals with the input directory should be a separate function.

In [33]:
def main():
    """Run the SNP counting for every FASTA file of ``./input_data``.

    For each input file and for each of the four nucleotides the function
    extracts the duplex positions from the file's reference sequence,
    counts the substitutions over all reads, sums them per duplex,
    derives the percentage relative to the number of reads and saves the
    three tables to an Excel file. A progress bar widget is advanced once
    per processed file.

    When ``./input_data`` does not exist it is created and the function
    returns without processing anything.
    """
    # local import: only needed for the start-up message
    from datetime import datetime

    nucleotides = ["A", "T", "G", "C"]

    if not os.path.exists("./input_data"):
        os.mkdir("input_data")
        print("directory './input_data' was created; put the FASTA files there and run again")
        return

    input_files = get_input_files_names()
    num_files = len(input_files)

    print("job started at : " + datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
    progress_bar = FloatProgress(min=0, max=num_files)
    display(progress_bar)

    for f in input_files:
        # The reference depends on the file only, so read it once per
        # file rather than once per nucleotide.
        ref_seq = get_ref(f)

        for nuc in nucleotides:
            # every nucleotide except the watched one is a possible SNP
            snp_type = [other for other in nucleotides if other != nuc]

            duplex_posits = get_duplex_posits(ref_seq, nuc)
            df = create_df(duplex_posits, snp_type)
            df_snp, coverage = count_snp(f, df, nuc)

            df_duplex_context = create_pivot_df(df_snp)
            df_duplex_context_percent = df_duplex_context / coverage * 100

            save_df(f, df_snp, df_duplex_context, df_duplex_context_percent, nuc)

        progress_bar.value += 1
        
        
        
        
        
    

In [None]:
# Run the analysis defined in main() for the files in ./input_data.
main()

job started at : 


FloatProgress(value=0.0, max=3.0)

In [11]:
from IPython.display import HTML


In [13]:
# Render an HTML snippet whose script hides the notebook's input cells
# ('div.input') once the document is ready, plus a link that calls
# code_toggle() to show or hide them again.
HTML('''<script>
code_show=true; 
function code_toggle() {
 if (code_show){
 $('div.input').hide();
 } else {
 $('div.input').show();
 }
 code_show = !code_show
} 
$( document ).ready(code_toggle);
</script>
The raw code for this IPython notebook is by default hidden for easier reading.
To toggle on/off the raw code, click <a href="javascript:code_toggle()">here</a>.''')

##  progress bar pattern 

In [31]:
from time import sleep

from IPython.display import display
from ipywidgets import FloatProgress

# Minimal demo of the progress bar widget: create it with a 0..100 range
# and show it in the cell output.
progress_bar = FloatProgress(min=0, max=100)
display(progress_bar)

# Each increment of `value` moves the displayed widget one unit forward;
# the pause between increments makes the movement visible.
for step in range(100):
    progress_bar.value += 1
    sleep(1)

FloatProgress(value=0.0)