# 01. Amino Acid Count Preprocessing

<p>This notebook processes amino acid (AA) count data saved as .tsv files (e.g. Patient-Visit-PBMC.AACounts.tsv) and converts them into tables of raw counts and of frequencies (as percentages) for the genome regions of interest; here, the exclusive focus is Tat exon 1 and Tat exon 2.</p>
<p><b>Input:</b></p>
<ul>
<li>Full genome amino acid count TSV file for each sample
</ul>
<p><b>Output:</b></p>
<ul>
<li>Tat exon 1 amino acid count table CSV file for each sample
<li>Tat exon 1 amino acid frequency table CSV file for each sample
<li>Tat exon 2 amino acid count table CSV file for each sample
<li>Tat exon 2 amino acid frequency table CSV file for each sample
</ul>

# Import requirements and modules

In [1]:
import os
import sys
import glob
import time
import pandas as pd
pd.options.mode.chained_assignment = None

In [2]:
# Make the project's helper modules importable: add the code directory to the
# module search path unless it is already listed there.
module_directory = '../code'
modules = sys.path  # alias of sys.path, so appending below updates the real search path
if modules.count(module_directory) == 0:
    modules.append(module_directory)

# Project-local helpers used by the preprocessing step, referenced as `acp`
import aa_count_preprocess as acp

# Survey files

In [3]:
# The raw AA count tables live in ../data/raw_counts, i.e. in a data
# directory located one level above this notebook.
aa_count_files = glob.glob('../data/raw_counts/*.tsv')

# Report how many input files were found
n_count_files = len(aa_count_files)
print(n_count_files, 'AA count files')

145 AA count files


# Preprocessing

In [4]:
def preprocess_files(aa_count_files, target):
    """Write a frequency table and a count table for one target per input file.

    Each path in ``aa_count_files`` is loaded with
    ``acp.tsvfile_to_df(path, target)`` and passed through
    ``acp.remove_stops``. The result is converted with ``acp.makeFreqDict``
    and ``acp.makeCountDict``, each followed by ``acp.freqToDataframe``, and
    both resulting tables are saved as CSV files without the index column.

    Output paths are derived from the input path: ``/raw_counts/`` is replaced
    by ``/freq_tables/``, everything from ``.AACounts`` onwards is dropped,
    and ``-<target>.AAfreq.csv`` (frequencies) or
    ``-<target>.AAfreqcount.csv`` (counts) is appended. The output directory
    is created when it does not exist yet.

    Parameters
    ----------
    aa_count_files : list of str
        Paths of the AA count TSV files to process.
    target : str
        Region label forwarded to ``acp.tsvfile_to_df`` and embedded in the
        output file names (this notebook uses ``'Tat1'`` and ``'Tat2'``).

    Returns
    -------
    None
        Tables are written to disk and a one-line summary is printed.
    """
    for count_file in aa_count_files:
        # Both outputs share the same prefix, so derive it once.
        # NOTE(review): the replace only matches forward-slash separators; a
        # path without '/raw_counts/' is left in its original directory.
        out_prefix = count_file.replace('/raw_counts/', '/freq_tables/')
        out_prefix = out_prefix.split('.AACounts')[0] + '-' + target
        freq_file = out_prefix + '.AAfreq.csv'
        count_out_file = out_prefix + '.AAfreqcount.csv'

        # Writing the CSVs fails if the destination directory is missing, so
        # create it up front (skipped when the path has no directory part).
        out_dir = os.path.dirname(out_prefix)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)

        df = acp.tsvfile_to_df(count_file, target)
        nostop_df = acp.remove_stops(df)

        freq_df = acp.freqToDataframe(acp.makeFreqDict(nostop_df))
        freq_df.to_csv(freq_file, index=False)

        # freqToDataframe is used for the count dictionary as well
        count_df = acp.freqToDataframe(acp.makeCountDict(nostop_df))
        count_df.to_csv(count_out_file, index=False)

    print(target, len(aa_count_files), 'AA count files processed')

In [5]:
# Collect the raw AA count tables again from ../data/raw_counts so that this
# cell does not rely on the file list built in the survey cell.
aa_count_files = glob.glob('../data/raw_counts/*.tsv')

# Process both targets in turn ('Tat1' first, then 'Tat2'); per the notebook
# introduction these correspond to Tat exon 1 and Tat exon 2.
for tat_target in ('Tat1', 'Tat2'):
    preprocess_files(aa_count_files, tat_target)

Tat1 145 AA count files processed
Tat2 145 AA count files processed
