In [4]:
%matplotlib inline

from Bio.Seq import Seq
from Bio import SeqIO
from Bio.Alphabet import IUPAC

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
from sklearn.ensemble import RandomForestClassifier

import shap

In [5]:
#pd.set_option('display.max_rows', 500)

In [6]:
# Clustal alignment of the consensus sequences for the "different CTs" run.
fasta_file = (
    '../DATA/ZIBRA 2/2019-01-30_ZIBRA2_YFV-RIO-Diferentes Cts/'
    'CONSENSUS/CONSENSUS.aln'
)

In [7]:
# Collect the record ID of every sequence in the alignment, in file order.
identifiers = list(map(lambda rec: rec.id, SeqIO.parse(fasta_file, "clustal")))

In [8]:
# Build a character array from the alignment: one row per record, one entry
# per character of that record's sequence.
seqs = np.array([
    [*str(record.seq)]
    for record in SeqIO.parse(fasta_file, "clustal")
])

In [11]:
# Dimensions of the character array: (number of records, characters per record).
np.shape(seqs)

(47, 10200)

In [9]:
# Column labels are 1-based positions (1 .. number of columns in `seqs`), so
# they agree with the sequence analysis, which counts bases starting at 1.
cols = [position + 1 for position in range(seqs.shape[1])]

In [7]:
# Assemble the alignment into a table: rows are labelled by sequence ID,
# columns by 1-based position. Preview the first five rows.
df = pd.DataFrame(data=seqs, columns=cols, index=identifiers)
df.head(n=5)

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,10191,10192,10193,10194,10195,10196,10197,10198,10199,10200
RJY01_CARIACICA_Alouatta_sp_08-03-2017,N,N,N,N,N,N,N,N,N,N,...,G,G,A,T,A,G,N,N,N,N
RJY03_SANTA_MARIA_MADALENA_Alouatta_sp_02-05-2017,N,N,N,N,N,N,N,N,N,N,...,G,G,A,T,A,G,N,N,N,N
RJY04_SUMIDOURO_Alouatta_sp_22-12-2017,N,N,N,N,N,N,N,N,N,N,...,G,G,A,T,A,G,N,N,N,N
RJY05_ANGRA_DOS_REIS_Alouatta_sp_18-01-2018,N,N,N,N,N,N,N,N,N,N,...,G,G,A,T,A,G,N,N,N,N
RJY09_ITAMARANDIBA_Alouatta_sp_13-02-2017,N,N,N,N,N,N,N,N,N,N,...,G,G,A,T,A,G,N,N,N,N


In [9]:
# Load the sample sheet indexed by its 'ZIBRA code' column and keep only the
# species, collection date ('Data da Coleta') and Ct columns.
samples = pd.read_excel(
    '../DATA/ZIBRA 2/2019-01-30_ZIBRA2_YFV-RIO-Diferentes Cts/'
    'SAMPLE DATA/Sample List_YFV_RIO_Diferentes_CTs_alvaro.xlsx',
    index_col='ZIBRA code',
).loc[:, ['species ML analysis', 'Data da Coleta', 'Ct ML analysis']]

# Preview the first five samples.
samples.head(n=5)

Unnamed: 0_level_0,species ML analysis,Data da Coleta,Ct ML analysis
ZIBRA code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
RJY01,Alouatta,2017-03-08,11.7
RJY02,Alouatta,2017-03-19,8.0
RJY03,Alouatta,2017-05-02,11.63
RJY04,Alouatta,2017-12-22,11.24
RJY05,Alouatta,2018-01-18,11.21


In [10]:
import re
#pattern = "^\d+"
#dic={}

In [11]:
#df['ID'] = ''
#df['species'] = ''
#df['date'] = ''
#df['Ct'] = ''

In [12]:
# Annotate each aligned sequence in `df` with the metadata of its sample.
#
# A sequence ID belongs to a sample when it starts with the sample's code (the
# index label of `samples`) followed by an underscore, e.g. code "RJY01"
# matches "RJY01_CARIACICA_Alouatta_sp_08-03-2017". Every matching row of `df`
# is given four columns: ID, species, date and Ct. Rows with no matching
# sample are not assigned.
#
# The match is a literal prefix test, so characters in a code that would be
# special in a regular expression are compared verbatim. Only the row labels
# of `df` are scanned; the (wide) row contents are never materialised.
metadata_columns = {
    'species': 'species ML analysis',
    'date': 'Data da Coleta',
    'Ct': 'Ct ML analysis',
}
for index_a in samples.index:
    if not isinstance(index_a, str):
        # A blank or non-text code cannot be the prefix of a sequence ID.
        continue
    prefix = index_a + '_'
    for index_b in df.index:
        if not str(index_b).startswith(prefix):
            continue
        # Log each match as "^<code>_ <code> <sequence id>".
        print('^' + prefix, index_a, index_b)
        df.loc[index_b, 'ID'] = index_a
        for target_column, source_column in metadata_columns.items():
            df.loc[index_b, target_column] = samples.loc[index_a, source_column]

^RJY01_ RJY01 RJY01_CARIACICA_Alouatta_sp_08-03-2017
^RJY03_ RJY03 RJY03_SANTA_MARIA_MADALENA_Alouatta_sp_02-05-2017
^RJY04_ RJY04 RJY04_SUMIDOURO_Alouatta_sp_22-12-2017
^RJY05_ RJY05 RJY05_ANGRA_DOS_REIS_Alouatta_sp_18-01-2018
^RJY09_ RJY09 RJY09_ITAMARANDIBA_Alouatta_sp_13-02-2017
^RJY10_ RJY10 RJY10_ANGRA_DOS_REIS_Alouatta_sp_19-02-2018
^RJY11_ RJY11 RJY11_SAO_ROQUE_Alouatta_sp_02-04-2018
^RJY12_ RJY12 RJY12_SANTA_TEREZA_Alouatta_sp_15-02-2017
^RJY13_ RJY13 RJY13_CARMO_Alouatta_sp_17-04-2017
^RJY14_ RJY14 RJY14_GUAPIMIRIM_Callithrix_sp_06-06-2017
^RJY15_ RJY15 RJY15_CARIACICA_Callithrix_sp_09-03-2017
^RJY16_ RJY16 RJY16_GUARAPARI_Callithrix_sp_02-03-2017
^RJY18_ RJY18 RJY18_UBERLANDIA_Callithrix_penicillata_07-02-2017
^RJY19_ RJY19 RJY19_SERRA_Callithrix_sp_16-02-2017
^RJY20_ RJY20 RJY20_RIO_DAS_FLORES_Callithrix_sp_20-03-2017
^RJY21_ RJY21 RJY21_ITUIUTABA_Callithrix_penicillata_14-02-2017
^RJY22_ RJY22 RJY22_SAO_DOMINGOS_DAS_DORES_Callithrix_flaviceps_13-02-2017
^RJY30_ RJY30 RJY30

In [13]:
# Show the table after the matching step; the trailing ID, species, date and
# Ct columns are the ones filled in from the sample sheet.
df

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,10195,10196,10197,10198,10199,10200,ID,species,date,Ct
RJY01_CARIACICA_Alouatta_sp_08-03-2017,N,N,N,N,N,N,N,N,N,N,...,A,G,N,N,N,N,RJY01,Alouatta,2017-03-08,11.7
RJY03_SANTA_MARIA_MADALENA_Alouatta_sp_02-05-2017,N,N,N,N,N,N,N,N,N,N,...,A,G,N,N,N,N,RJY03,Alouatta,2017-05-02,11.63
RJY04_SUMIDOURO_Alouatta_sp_22-12-2017,N,N,N,N,N,N,N,N,N,N,...,A,G,N,N,N,N,RJY04,Alouatta,2017-12-22,11.24
RJY05_ANGRA_DOS_REIS_Alouatta_sp_18-01-2018,N,N,N,N,N,N,N,N,N,N,...,A,G,N,N,N,N,RJY05,Alouatta,2018-01-18,11.21
RJY09_ITAMARANDIBA_Alouatta_sp_13-02-2017,N,N,N,N,N,N,N,N,N,N,...,A,G,N,N,N,N,RJY09,Alouatta,2017-02-13,21.85
RJY10_ANGRA_DOS_REIS_Alouatta_sp_19-02-2018,N,N,N,N,N,N,N,N,N,N,...,A,G,N,N,N,N,RJY10,Alouatta,2018-02-19,15.17
RJY11_SAO_ROQUE_Alouatta_sp_02-04-2018,N,N,N,N,N,N,N,N,N,N,...,N,N,N,N,N,N,RJY11,Alouatta,2018-04-02,23.55
RJY12_SANTA_TEREZA_Alouatta_sp_15-02-2017,N,N,N,N,N,N,N,N,N,N,...,A,G,N,N,N,N,RJY12,Alouatta,2017-02-15,16.27
RJY13_CARMO_Alouatta_sp_17-04-2017,N,N,N,N,N,N,N,N,N,N,...,A,G,N,N,N,N,RJY13,Alouatta,2017-04-17,14.34
RJY14_GUAPIMIRIM_Callithrix_sp_06-06-2017,N,N,N,N,N,N,N,N,N,N,...,A,G,N,N,N,N,RJY14,Callithrix,2017-06-06,20.77


In [18]:
# Select the rows whose species label is exactly 'Callithrix' and display them.
df_Callithrix = df.loc[df['species'].eq('Callithrix')]
df_Callithrix

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,10195,10196,10197,10198,10199,10200,ID,species,date,Ct
RJY14_GUAPIMIRIM_Callithrix_sp_06-06-2017,N,N,N,N,N,N,N,N,N,N,...,A,G,N,N,N,N,RJY14,Callithrix,2017-06-06,20.77
RJY15_CARIACICA_Callithrix_sp_09-03-2017,N,N,N,N,N,N,N,N,N,N,...,A,G,N,N,N,N,RJY15,Callithrix,2017-03-09,13.28
RJY16_GUARAPARI_Callithrix_sp_02-03-2017,N,N,N,N,N,N,N,N,N,N,...,A,G,N,N,N,N,RJY16,Callithrix,2017-03-02,27.05
RJY18_UBERLANDIA_Callithrix_penicillata_07-02-2017,N,N,N,N,N,N,N,N,N,N,...,A,G,N,N,N,N,RJY18,Callithrix,2017-02-07,25.9
RJY19_SERRA_Callithrix_sp_16-02-2017,N,N,N,N,N,N,N,N,N,N,...,A,G,N,N,N,N,RJY19,Callithrix,2017-02-16,12.58
RJY20_RIO_DAS_FLORES_Callithrix_sp_20-03-2017,N,N,N,N,N,N,N,N,N,N,...,N,N,N,N,N,N,RJY20,Callithrix,2017-03-20,35.38
RJY21_ITUIUTABA_Callithrix_penicillata_14-02-2017,N,N,N,N,N,N,N,N,N,N,...,N,N,N,N,N,N,RJY21,Callithrix,2017-02-14,33.17
RJY22_SAO_DOMINGOS_DAS_DORES_Callithrix_flaviceps_13-02-2017,N,N,N,N,N,N,N,N,N,N,...,A,G,N,N,N,N,RJY22,Callithrix,2017-02-13,18.17
RJY30_VALENCA_Callithrix_sp_22-01-2018,N,N,N,N,N,N,N,N,N,N,...,N,N,N,N,N,N,RJY30,Callithrix,2018-01-22,10.5
RJY31_ANGRA_DOS_REIS_Callithrix_sp_28-02-2018,N,N,N,N,N,N,N,N,N,N,...,A,G,N,N,N,N,RJY31,Callithrix,2018-02-28,10.49
