# Conservation Analysis and Epitope Prediction


#### Author: C. Mazzaferro, K. Fisch
#### Email: cmazzafe@ucsd.edu
#### Date: August 2016
 
## Outline of Notebook
<a id = "toc"></a>
1. <a href = "#background">Background</a>
2. <a href = "#ConsA">Conservation Analysis</a>
    * <a href = "#BLAST">BLAST-P</a>
    * <a href = "#MSA">Multiple Sequence Alignment</a>
    * <a href = "#Cons">Conservation Score Prediction</a>
3. <a href = "#ep_pred">Windowing and New Epitope Prediction</a>
    * <a href = "#clustering">Epitope Clustering Analysis</a>

This workflow aims to find substitute peptide candidates by swapping one amino acid (AA) at a time in each peptide of a prioritized list. The prioritized list was produced by the Base Workflow.

## Finding similar peptides by doing simple AA swaps
The peptides are written to a FASTA file, which is then sent to netMHCcons. The results are analyzed subsequently.


In [11]:
import pandas

# Table of prioritized peptides (presumably the output of the Base Workflow
# mentioned in the introduction -- confirm).
csv_path = '/Users/carlomazzaferro/Desktop/Test_IEDB/OtherProtsPrashant/high_aa_all_prots.csv'

# Loaded once here; later cells pass this frame to the swap generator and to
# the summary step.
high_aa_df = pandas.read_csv(filepath_or_buffer=csv_path)

In [14]:
from nepitope import pep_utils
import importlib
# Re-import pep_utils so that edits made to the module on disk since the
# kernel started are picked up without restarting the notebook.
importlib.reload(pep_utils)

# Directory handed to the swap generator together with the peptide table.
# NOTE(review): presumably one sub-directory of FASTA files is written per
# protein (later cells list this directory to get protein names) -- confirm
# against pep_utils.find_swaps_write_to_fasta.
fasta_files_dir = '/Users/carlomazzaferro/Desktop/Test_IEDB/OtherProtsPrashant/swaps/'
pep_utils.find_swaps_write_to_fasta(high_aa_df, fasta_files_dir)

### Load Results From netMHCcons
The swaps are sent to netMHCcons, and the results are aggregated by the class FileConsolidation in the module mhc_utils.

In [26]:
from nepitope import mhc_utils, summary_data
import glob
import os
# Peptide lengths (n-mers) of interest.
# NOTE(review): not referenced again in the cells visible in this notebook.
nmers = list(range(8, 12))

# Glob pattern matching every entry directly inside the swaps directory.
fasta_files_dir = '/Users/carlomazzaferro/Desktop/Test_IEDB/OtherProtsPrashant/swaps/*'

# FASTA file that is later passed to FileConsolidation.load_batches.
orig_fasta = '/Users/carlomazzaferro/Desktop/Test_IEDB/OtherProtsPrashant/prots_in_fasta.fasta'

# Keep only the last path component of each match; these are used as the
# protein names in the cells below.
possible_proteins = list(map(os.path.basename, glob.glob(fasta_files_dir)))
possible_proteins

['C__jejuni_Cas9',
 'Campylobacter_lari_Cas9',
 'Corynebacter_diphtheria_Cas9',
 'F__novicida_Cas9',
 'Lactobacillus_buchneri_Cas9',
 'Listeria_innocua_Cas9',
 'N__meningitidis_Cas9',
 'Neisseria_cinerea_Cas9',
 'P__multocida_Cas9',
 'Parvibaculum_lavamentivorans_Cas9',
 'S__mutans_Cas9',
 'S__pyogenes_Cas9',
 'S__thermophilus_CRISPR_3_Cas9',
 'Staphylococcus_aureus_Cas9',
 'Streptococcus_pasteurianus_Cas9',
 'T__denticola_Cas9']

In [41]:
#files saved from netMHCcons as split_*i*_prediction_*n*_mer for each split/nmer
def write_to_csv(df, path_name):
    """Save *df* to *path_name* as a comma-separated file.

    Thin wrapper around ``df.to_csv``: only the separator is set explicitly,
    every other option keeps its pandas default.

    Args:
        df: object exposing a pandas-style ``to_csv`` method.
        path_name: destination path of the CSV file.
    """
    df.to_csv(path_or_buf=path_name, sep=',')

basepath = '/Users/carlomazzaferro/Desktop/Test_IEDB/OtherProtsPrashant/swaps/'

# For every protein: load the netMHCcons result files found in
# <basepath>/<protein>/Results/, summarize them, and write the summaries
# (CSV + FASTA) back into that same Results directory.
for prot in possible_proteins:

    filepath = basepath + prot + '/Results/'
    # Bare file names (no directory part) of everything in the Results folder.
    file_names = [os.path.basename(path) for path in glob.glob(filepath + '*')]

    aggregate_all = mhc_utils.FileConsolidation.load_batches(filepath, file_names, orig_fasta)
    df_list = aggregate_all.return_df_list()
    # The return value was never used; the call itself is kept in case it has
    # side effects on `aggregate_all` / `df_list` -- TODO confirm and remove.
    aggregate_all.optimized_list_df_by_prot(df_list)

    # Recover the original peptide and its position from each file name.
    # Names are expected to look like
    #   _swap_<PEPTIDE>_Pos_<POSITION>_ID_<protein>_Allele_<allele>_nmer_<n>.xls
    # so after splitting on '_' the peptide is field 2 and the position field 4.
    #
    # NOTE(review): the original cell globbed `filepath + file_pattern` here,
    # but `file_pattern` is not defined anywhere in this notebook, so a fresh
    # kernel raised NameError. The listing already passed to load_batches is
    # reused instead, which also keeps both lists in the same order. Confirm
    # that load_batches returns one frame per entry of `file_names`, in order.
    original_peps_and_pos = []
    for file_name in file_names:
        name_fields = file_name.split('_')
        original_peps_and_pos.append([name_fields[2], name_fields[4]])

    list_results = aggregate_all.return_df_list()
    summ_data = summary_data.SummaryData()
    summ_data_analysis = summ_data.summarize_data_for_each_mhc_pred(list_results, original_peps_and_pos, high_aa_df)

    # All per-prediction frames stacked into one table, duplicates removed.
    # (Previously bound to `pd`, which shadows the conventional pandas alias.)
    all_preds = pandas.concat(summ_data_analysis.container)
    unique_preds = all_preds.drop_duplicates()
    unique_preds.to_csv(filepath + prot + '_preds_swaps' + '.csv')

    fasta_dir = filepath + 'fasta_swaps_top_20_percent.fasta'
    summ_data_analysis.write_peptides_to_fasta(fasta_dir)

    csv_path = filepath + 'summary_results_per_prediction.csv'
    summ_data_analysis.summary_df.sort_values(by='original pos').to_csv(path_or_buf=csv_path, sep=',')

    # exist_ok=True: rerunning the cell must not fail because the output
    # directory was already created by a previous run.
    csv_for_each_pred = filepath + 'csv_for_each_pred'
    os.makedirs(csv_for_each_pred, exist_ok=True)

    # One CSV per prediction, named <peptide>_<pos>_<allele>_<nmer>.csv.
    # Indexing (rather than zip) is deliberate: a length mismatch between the
    # two lists should raise instead of being silently truncated.
    for idx, val in enumerate(list_results):
        original_pep, pos = original_peps_and_pos[idx]
        allele = val.Allele.unique()[0]
        nmer = val['n-mer'].unique()[0]
        name = "_".join([original_pep, pos, allele, str(nmer)])
        # 'X' in the peptide column is replaced by '-' before export.
        val['Peptide'] = val['Peptide'].str.replace('X', '-')
        write_to_csv(val, csv_for_each_pred + '/' + name + '.csv')

    
    
    

['_swap_ALKDFSHLV_Pos_234_ID_C--jejuni-Cas9_Allele_HLA-A0201_nmer_9.xls', '_swap_ALPRRLARSA_Pos_44_ID_C--jejuni-Cas9_Allele_HLA-B0702_nmer_10.xls', '_swap_APKNSPLAF_Pos_254_ID_C--jejuni-Cas9_Allele_HLA-B0702_nmer_9.xls', '_swap_ARLNHLKHL_Pos_62_ID_C--jejuni-Cas9_Allele_HLA-B2705_nmer_9.xls', '_swap_ARLNHLKHLI_Pos_62_ID_C--jejuni-Cas9_Allele_HLA-B2705_nmer_10.xls', '_swap_ARLVLNYTK_Pos_643_ID_C--jejuni-Cas9_Allele_HLA-B2705_nmer_9.xls', '_swap_DSYMNKVLVF_Pos_568_ID_C--jejuni-Cas9_Allele_HLA-A2402_nmer_10.xls', '_swap_EIKLKKAL_Pos_364_ID_C--jejuni-Cas9_Allele_HLA-B0801_nmer_8.xls', '_swap_FEKYIVSAL_Pos_959_ID_C--jejuni-Cas9_Allele_HLA-B4001_nmer_9.xls', '_swap_FMFVALTRI_Pos_262_ID_C--jejuni-Cas9_Allele_HLA-A0201_nmer_9.xls', '_swap_FSHLVGNCSF_Pos_238_ID_C--jejuni-Cas9_Allele_HLA-B1501_nmer_10.xls', '_swap_GDMFRVDIFK_Pos_827_ID_C--jejuni-Cas9_Allele_HLA-A0301_nmer_10.xls', '_swap_GEIKDWILM_Pos_869_ID_C--jejuni-Cas9_Allele_HLA-B4001_nmer_9.xls', '_swap_GTYFIEFKK_Pos_325_ID_C--jejuni-Cas9_A

In [42]:
#extract files of interest

In [48]:
def make_result_dirs(results_root, names):
    """Create one sub-directory of *results_root* for every entry in *names*.

    Missing parent directories are created as well, and directories that
    already exist are left untouched, so the cell can be rerun safely.

    Args:
        results_root: directory under which the sub-directories are created.
        names: iterable of sub-directory names.
    """
    for name in names:
        os.makedirs(os.path.join(results_root, name), exist_ok=True)


fasta_files_dir = '/Users/carlomazzaferro/Desktop/Test_IEDB/OtherProtsPrashant/swaps/*'
# Last path component of every entry in the swaps directory = protein name.
possible_proteins = [os.path.basename(x) for x in glob.glob(fasta_files_dir)]
# The original used os.mkdir, which raised FileExistsError on a second run and
# FileNotFoundError when swap_results/ itself did not exist yet.
make_result_dirs('/Users/carlomazzaferro/Desktop/Test_IEDB/OtherProtsPrashant/swap_results/', possible_proteins)

### Return summary data in a dataframe
Other methods and attributes are present as well

### Write files out