In [1]:
%matplotlib inline

from Bio.Seq import Seq
from Bio import SeqIO
from Bio.Alphabet import IUPAC

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
from sklearn.ensemble import RandomForestClassifier

import shap

In [2]:
shap.__version__

'0.29.3'

# Readme
With this notebook:
- Read the fasta files containing the sequences' DNAs and the excel files containing their metadata.
- Clean the data.
- Turn it into "one-hot encoded" data.
- Save it to csv and pkl.

### NHP Sequence alignment files (.aln format)

Put the files names (relative paths) in a list.

We will iterate over this list to read the sequences into dataframes.

In [3]:
file_1 = '../DATA/!CLEAN/2019-01-30_ZIBRA2_YFV-RIO-Diferentes_CTs'

file_2 = '../DATA/!CLEAN/NHP_65_outbreak'

file_3 = '../DATA/!CLEAN/2018-01_Salvador'

file_4 = '../DATA/!CLEAN/2018-03-04_LACEN_Bahia'

file_5 = '../DATA/!CLEAN/FUNED_AGOSTO-2018'

file_6 = '../DATA/!CLEAN/RIO_DE_JANEIRO'

file_7 = '../DATA/!CLEAN/YFV_LACEN_BAHIA'

In [4]:
file_list = [file_1,
            file_2,
            file_3,
            file_4,
            file_5,
            file_6,
            file_7]
seq_list = []
metadata_list = []

In [5]:
for file in file_list:
    seq_file = file + '.aln'
    metadata_file = file + '.xlsx'
    
    seq_list.append(seq_file)
    metadata_list.append(metadata_file)

### Dataframes containing NHP YFV sequences
A function to iterate over all file names and create a dataframe containing the **nucleotide sequences** for each one, putting them in a _dictionary_.

In [6]:
def create_seq_df(file):
    # Creates a dataframe based on a ".aln" file.
    
    # Gets the sequences IDs from a multi-fasta into a list
    identifiers = [seq_rec.id for seq_rec in SeqIO.parse(file, "clustal")]
    
    # Gets the sequences nucleotides, for each sequence in a multi-fasta
    seqs = np.array([list(str(seq_rec.seq)) for seq_rec in SeqIO.parse(file, "clustal")])
    
    # Creates columns names based on position, starting from 0.
    cols = list(range(seqs.shape[1]))
    
    # Creates dataframe with data
    seq_df = pd.DataFrame(seqs, index=identifiers, columns=cols)
    
    return seq_df

In [7]:
seq_dict = {}
for file in seq_list:
    print(file)
    df = create_seq_df(file)
    seq_dict[file] = df

../DATA/!CLEAN/2019-01-30_ZIBRA2_YFV-RIO-Diferentes_CTs.aln
../DATA/!CLEAN/NHP_65_outbreak.aln
../DATA/!CLEAN/2018-01_Salvador.aln
../DATA/!CLEAN/2018-03-04_LACEN_Bahia.aln
../DATA/!CLEAN/FUNED_AGOSTO-2018.aln
../DATA/!CLEAN/RIO_DE_JANEIRO.aln
../DATA/!CLEAN/YFV_LACEN_BAHIA.aln


In [8]:
sizes = [len(seq_dict[file]) for file in seq_list]
n_seqs = sum(sizes)
print(sizes)
print(n_seqs)

[32, 26, 16, 1, 12, 2, 9]
98


### NHP Metadata
The following code reads the excel spreadsheet containing the metadata related to the sequences and includes them in the dataframe.

One spreadsheet per group of sequences from ZIBRA database, all into a _dictionary_.

I only keep the information I'm going to use now, i.e., 'Host', 'Date' and 'Ct'.

I use regex to link the spreadsheet to the sequences.


In [9]:
def read_metadata(excel_file):
    metadata_df = pd.read_excel(excel_file, index_col='index')
    metadata_df = metadata_df[['Host', 'Date', 'Ct']]
    return metadata_df

In [10]:
metadata_dict = {}
for file in metadata_list:
    print(file)
    metadata_df = read_metadata(file)
    metadata_dict[file] = metadata_df

../DATA/!CLEAN/2019-01-30_ZIBRA2_YFV-RIO-Diferentes_CTs.xlsx
../DATA/!CLEAN/NHP_65_outbreak.xlsx
../DATA/!CLEAN/2018-01_Salvador.xlsx
../DATA/!CLEAN/2018-03-04_LACEN_Bahia.xlsx
../DATA/!CLEAN/FUNED_AGOSTO-2018.xlsx
../DATA/!CLEAN/RIO_DE_JANEIRO.xlsx
../DATA/!CLEAN/YFV_LACEN_BAHIA.xlsx


In [11]:
sizes = [len(metadata_dict[file]) for file in metadata_list]
n_meta = sum(sizes)
print(sizes)
print(n_meta)

[60, 27, 21, 11, 15, 2, 22]
158


In [12]:
indexes = [metadata_dict[file].index for file in metadata_list]
sum([len(index) for index in indexes])

158

# Regex - Merge information on metadata to dna sequence dataframe
Parse through metadata and sequences IDs, linking information and adding it to the seqs dataframes.

In [13]:
import re

In [14]:
index_bookeeping = {} # to avoid matching multiple sequences for the same ID.
index_search = {}     #Just an auxiliary variable I used to see if all indexes were being counted.
count = 0

for file in file_list: # compare seq_df to metadata_df in a pairwise manner.
    seq_file = file + '.aln'
    metadata_file = file + '.xlsx'
    
    # Here I hold both the sequence df and the metadata df, to merge information.
    seq_df = seq_dict[seq_file]
    metadata_df = metadata_dict[metadata_file]
    
    # Prepare seq_df to receive the metadata info.
    seq_df.insert(0, 'ID', 'id')
    seq_df.insert(1, 'Host', 'host')
    seq_df.insert(2, 'Ct', 'ct')
    seq_df.insert(3, 'Date', 'date')
    
    # For each ID in metadata (here in its index).
    # In the excel files, there is a column called "index".
    # This column was used as the "metadata dataframe" index.
    # So I iterate over these indexes and look for them (try to match them using regex) in the fasta file ID.
    
    # for each index, and each metadata related to that index.
    for index_meta, meta in metadata_df.iterrows():
        
        #Just an auxiliary variable I used to see if all indexes were being counted.
        if index_meta not in index_search:
            index_search[index_meta] = 1
        else:
            index_search[index_meta] += 1
            
        # I have pre-edited the fasta files ID fields to put the index values between vertical bars "|".
        # This was to make it easier to create a pattern and use regex.
        pattern = '\|' + str(index_meta) + '\|'
        regex = re.compile(pattern)
        
        # For each fasta ID (index_seq) in the file...
        for index_b, sample_b in seq_df.iterrows():
            # If the metadata index is in this fasta ID...
            if regex.search(index_b):
                # if this sequence still has no metadata values associated...
                if seq_df.loc[index_b,'ID'] == 'id':
                    # fill in metadata values to seq_df
                    seq_df.loc[index_b,'ID'] = index_meta
                    seq_df.loc[index_b,'Host'] = metadata_df.loc[index_meta, 'Host']
                    seq_df.loc[index_b,'Date'] = metadata_df.loc[index_meta, 'Date']
                    seq_df.loc[index_b,'Ct'] = metadata_df.loc[index_meta, 'Ct']
                    index_bookeeping[index_meta] = 1
                # else, if this sequence already has metadata values associated
                # (this happens because identical sequences are grouped together, and their fasta IDs 
                # keep all the information of all these sequences)
                # and if this is the first time this specific index_meta is matched...
                elif index_meta not in index_bookeeping:
                    # Copy the sequence, but with new metadata, and append it to seq_df
                    sample_copy = pd.Series(sample_b)
                    index_copy = str(index_meta+'_')+str(index_b)
                    sample_copy.name = index_copy
                    seq_df.append(sample_copy)
                    seq_df.loc[index_copy,'ID'] = index_meta
                    seq_df.loc[index_copy,'Host'] = metadata_df.loc[index_meta, 'Host']
                    seq_df.loc[index_copy,'Date'] = metadata_df.loc[index_meta, 'Date']
                    seq_df.loc[index_copy,'Ct'] = metadata_df.loc[index_meta, 'Ct']
                    index_bookeeping[index_meta] = 1
                else:
                    index_bookeeping[index_meta] += 1
    

In [15]:
#index_search

In [16]:
#len(index_search)

Both the `index_search` and the `len(index_search)` results show that there are duplicate metadata in the spreadsheets. I manually checked it, and it is indeed duplicated, with the exact values (it is consistent).

So there is no problem here.

The difference between the number of metadata and sequences is due to identical sequences and some low quality sequences I manually removed.

### Merge all dataframes into one 

In [15]:
dfs = list(seq_dict.values())

In [16]:
len(dfs)

7

In [17]:
for df in dfs:
    print(df.shape)

(32, 10084)
(26, 10084)
(16, 10084)
(1, 10084)
(12, 10084)
(2, 10084)
(9, 10084)


In [18]:
seq_df = pd.concat(dfs)

In [19]:
seq_df

Unnamed: 0,ID,Host,Ct,Date,0,1,2,3,4,5,...,10070,10071,10072,10073,10074,10075,10076,10077,10078,10079
2019-01-30_ZIBRA2_YFV-RIO-Diferentes_Cts|RJY01|_CARIACICA_Alouatta_sp_08-03-2017,RJY01,Alouatta,11.22,2017-03-08 00:00:00,A,A,A,A,C,C,...,C,A,G,A,G,C,C,A,C,T
2019-01-30_ZIBRA2_YFV-RIO-Diferentes_Cts|RJY03|_SANTA_MARIA_MADALENA_Alouatta_sp_02-05-2017,RJY03,Alouatta,11.56,2017-05-02 00:00:00,A,A,A,A,C,C,...,C,A,G,A,G,C,C,A,C,T
2019-01-30_ZIBRA2_YFV-RIO-Diferentes_Cts|RJY04|_SUMIDOURO_Alouatta_sp_22-12-2017,RJY04,Alouatta,11.45,2017-12-22 00:00:00,A,A,A,A,C,C,...,C,A,G,A,G,C,C,A,C,T
2019-01-30_ZIBRA2_YFV-RIO-Diferentes_Cts|RJY05|_ANGRA_DOS_REIS_Alouatta_sp_18-01-2018,RJY05,Alouatta,11.04,2018-01-18 00:00:00,A,A,A,A,C,C,...,C,A,G,A,G,C,C,A,C,T
2019-01-30_ZIBRA2_YFV-RIO-Diferentes_Cts|RJY09|_ITAMARANDIBA_Alouatta_sp_13-02-2017,RJY09,Alouatta,22.3,2017-02-13 00:00:00,A,A,A,A,C,C,...,C,A,G,A,G,C,C,A,C,T
2019-01-30_ZIBRA2_YFV-RIO-Diferentes_Cts|RJY10|_ANGRA_DOS_REIS_Alouatta_sp_19-02-2018,RJY10,Alouatta,14.335,2018-02-19 00:00:00,A,A,A,A,C,C,...,C,A,G,A,G,C,C,A,C,T
2019-01-30_ZIBRA2_YFV-RIO-Diferentes_Cts|RJY12|_SANTA_TEREZA_Alouatta_sp_15-02-2017,RJY12,Alouatta,16.425,2017-02-15 00:00:00,A,A,A,A,C,C,...,C,A,G,A,G,C,C,A,C,T
2019-01-30_ZIBRA2_YFV-RIO-Diferentes_Cts|RJY13|_CARMO_Alouatta_sp_17-04-2017,RJY13,Alouatta,14.43,2017-04-17 00:00:00,A,A,A,A,C,C,...,C,A,G,A,G,C,C,A,C,T
2019-01-30_ZIBRA2_YFV-RIO-Diferentes_Cts|RJY14|_GUAPIMIRIM_Callithrix_sp_06-06-2017,RJY14,Callithrix,20.715,2017-06-06 00:00:00,A,A,A,A,C,C,...,C,A,G,A,G,C,C,A,C,T
2019-01-30_ZIBRA2_YFV-RIO-Diferentes_Cts|RJY15|_CARIACICA_Callithrix_sp_09-03-2017,RJY15,Callithrix,13.395,2017-03-09 00:00:00,A,A,A,A,C,C,...,C,A,G,A,G,C,C,A,C,T


# Data Cleaning

## Clear missing values

- First, turn all "N" and "-" into `np.nan`. This will mark the missing values.

In [20]:
seq_df.replace('N', np.nan, inplace=True)
seq_df.replace('-', np.nan, inplace=True)

- Second, keep only rows (samples) containing less then 5% missing values (NaN).

In [21]:
threshold = int(seq_df.shape[1]*0.95)

In [22]:
seq_df.dropna(axis=0, how='any', thresh=threshold, inplace=True)

In [23]:
seq_df.shape

(77, 10084)

- Third, remove all columns that still containg missing values.

In [71]:
seq_df.dropna(axis=1, how='any', inplace=True)

In [73]:
seq_df.shape

(77, 6550)

In [24]:
# Remove rows containing NaN or empty values in the Ct column
seq_df = seq_df[seq_df['Ct'].notnull()]
seq_df = seq_df[seq_df['Ct'] != 'ct']

# Make sure values in Ct column are float numeric
seq_df['Ct'] = pd.to_numeric(seq_df['Ct'])
seq_df['Ct'] = seq_df['Ct'].astype(np.float16)

# Make sure values in Date are datetime
seq_df['Date'] = pd.to_datetime(seq_df['Date'])

# Correct some values
seq_df.replace('Allouata', 'Alouatta', inplace=True)
seq_df.replace('cebidae', 'Cebidae', inplace=True)
seq_df.replace('NHP (unk)', 'unk', inplace=True)
seq_df.replace('Sem informação','unk', inplace=True)
seq_df.replace('Leontopithecus rosalia','L. rosalia', inplace=True)

# Show all hosts present
print(seq_df['Host'].unique())

['Alouatta' 'Callithrix' 'Callicebus' 'Cebus' 'unk' 'Cebidae']


In [25]:
seq_df.head()

Unnamed: 0,ID,Host,Ct,Date,0,1,2,3,4,5,...,10070,10071,10072,10073,10074,10075,10076,10077,10078,10079
2019-01-30_ZIBRA2_YFV-RIO-Diferentes_Cts|RJY01|_CARIACICA_Alouatta_sp_08-03-2017,RJY01,Alouatta,11.21875,2017-03-08,A,A,A,A,C,C,...,C,A,G,A,G,C,C,A,C,T
2019-01-30_ZIBRA2_YFV-RIO-Diferentes_Cts|RJY03|_SANTA_MARIA_MADALENA_Alouatta_sp_02-05-2017,RJY03,Alouatta,11.5625,2017-05-02,A,A,A,A,C,C,...,C,A,G,A,G,C,C,A,C,T
2019-01-30_ZIBRA2_YFV-RIO-Diferentes_Cts|RJY04|_SUMIDOURO_Alouatta_sp_22-12-2017,RJY04,Alouatta,11.453125,2017-12-22,A,A,A,A,C,C,...,C,A,G,A,G,C,C,A,C,T
2019-01-30_ZIBRA2_YFV-RIO-Diferentes_Cts|RJY05|_ANGRA_DOS_REIS_Alouatta_sp_18-01-2018,RJY05,Alouatta,11.039062,2018-01-18,A,A,A,A,C,C,...,C,A,G,A,G,C,C,A,C,T
2019-01-30_ZIBRA2_YFV-RIO-Diferentes_Cts|RJY09|_ITAMARANDIBA_Alouatta_sp_13-02-2017,RJY09,Alouatta,22.296875,2017-02-13,A,A,A,A,C,C,...,C,A,G,A,G,C,C,A,C,T


## Seasons
Prepare data to perform "epidemic season" analysis

In [26]:
# Insert another column on the dataset to hold the epidemiologic season
# 2016/2017
# 2017/2018
seq_df.insert(4, 'Season', 'season')

In [27]:
# Fill season values based on date condition:
# season 1: before August 2017
# season 2: after August 2017
mask = seq_df['Date'] < pd.datetime(2017,8,1)
seq_df.loc[mask, 'Season'] = '2016/2017'

mask = seq_df['Date'] >= pd.datetime(2017,8,1)
seq_df.loc[mask, 'Season'] = '2017/2018'

seq_df.head()

Unnamed: 0,ID,Host,Ct,Date,Season,0,1,2,3,4,...,10070,10071,10072,10073,10074,10075,10076,10077,10078,10079
2019-01-30_ZIBRA2_YFV-RIO-Diferentes_Cts|RJY01|_CARIACICA_Alouatta_sp_08-03-2017,RJY01,Alouatta,11.21875,2017-03-08,2016/2017,A,A,A,A,C,...,C,A,G,A,G,C,C,A,C,T
2019-01-30_ZIBRA2_YFV-RIO-Diferentes_Cts|RJY03|_SANTA_MARIA_MADALENA_Alouatta_sp_02-05-2017,RJY03,Alouatta,11.5625,2017-05-02,2016/2017,A,A,A,A,C,...,C,A,G,A,G,C,C,A,C,T
2019-01-30_ZIBRA2_YFV-RIO-Diferentes_Cts|RJY04|_SUMIDOURO_Alouatta_sp_22-12-2017,RJY04,Alouatta,11.453125,2017-12-22,2017/2018,A,A,A,A,C,...,C,A,G,A,G,C,C,A,C,T
2019-01-30_ZIBRA2_YFV-RIO-Diferentes_Cts|RJY05|_ANGRA_DOS_REIS_Alouatta_sp_18-01-2018,RJY05,Alouatta,11.039062,2018-01-18,2017/2018,A,A,A,A,C,...,C,A,G,A,G,C,C,A,C,T
2019-01-30_ZIBRA2_YFV-RIO-Diferentes_Cts|RJY09|_ITAMARANDIBA_Alouatta_sp_13-02-2017,RJY09,Alouatta,22.296875,2017-02-13,2016/2017,A,A,A,A,C,...,C,A,G,A,G,C,C,A,C,T


## High/Low Ct
Prepare data to perform "high/low Ct" analysis

In [28]:
# Insert another column on the dataset to hold the Ct group
# high = 1
# low = 0
seq_df.insert(5, 'Ct_Group', 0)

In [29]:
Ct_threshold = 20

In [30]:
# Fill Ct groups based on:
# high: Ct > 20
# low: Ct <= 20
mask = seq_df['Ct'] <= Ct_threshold
seq_df.loc[mask, 'Ct_Group'] = 0

mask = seq_df['Ct'] > Ct_threshold
seq_df.loc[mask, 'Ct_Group'] = 1

seq_df.head()

Unnamed: 0,ID,Host,Ct,Date,Season,Ct_Group,0,1,2,3,...,10070,10071,10072,10073,10074,10075,10076,10077,10078,10079
2019-01-30_ZIBRA2_YFV-RIO-Diferentes_Cts|RJY01|_CARIACICA_Alouatta_sp_08-03-2017,RJY01,Alouatta,11.21875,2017-03-08,2016/2017,0,A,A,A,A,...,C,A,G,A,G,C,C,A,C,T
2019-01-30_ZIBRA2_YFV-RIO-Diferentes_Cts|RJY03|_SANTA_MARIA_MADALENA_Alouatta_sp_02-05-2017,RJY03,Alouatta,11.5625,2017-05-02,2016/2017,0,A,A,A,A,...,C,A,G,A,G,C,C,A,C,T
2019-01-30_ZIBRA2_YFV-RIO-Diferentes_Cts|RJY04|_SUMIDOURO_Alouatta_sp_22-12-2017,RJY04,Alouatta,11.453125,2017-12-22,2017/2018,0,A,A,A,A,...,C,A,G,A,G,C,C,A,C,T
2019-01-30_ZIBRA2_YFV-RIO-Diferentes_Cts|RJY05|_ANGRA_DOS_REIS_Alouatta_sp_18-01-2018,RJY05,Alouatta,11.039062,2018-01-18,2017/2018,0,A,A,A,A,...,C,A,G,A,G,C,C,A,C,T
2019-01-30_ZIBRA2_YFV-RIO-Diferentes_Cts|RJY09|_ITAMARANDIBA_Alouatta_sp_13-02-2017,RJY09,Alouatta,22.296875,2017-02-13,2016/2017,1,A,A,A,A,...,C,A,G,A,G,C,C,A,C,T


In [31]:
seq_df.groupby('Host')["ID"].count()

Host
Alouatta      26
Callicebus     4
Callithrix    27
Cebidae        8
Cebus          5
unk            6
Name: ID, dtype: int64

### We are left with a dataset containing 26 Alouatta samples and 27 Callithrix samples.
- of the Callithrix samples, 6 are high Ct $(> 20)$ and 21 are low Ct $(< 20)$

In [32]:
for host, host_data in seq_df.groupby('Host'):
    print(host)
    print(host_data.groupby('Ct_Group')['ID'].count(), '\n')
    #print(b)

Alouatta
Ct_Group
0    24
1     2
Name: ID, dtype: int64 

Callicebus
Ct_Group
0    4
Name: ID, dtype: int64 

Callithrix
Ct_Group
0    21
1     6
Name: ID, dtype: int64 

Cebidae
Ct_Group
0    8
Name: ID, dtype: int64 

Cebus
Ct_Group
0    5
Name: ID, dtype: int64 

unk
Ct_Group
0    5
1    1
Name: ID, dtype: int64 



# One hot encoding

In [33]:
nucleotides_df = seq_df.iloc[:, 6:]

In [34]:
nucleotides_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,10070,10071,10072,10073,10074,10075,10076,10077,10078,10079
2019-01-30_ZIBRA2_YFV-RIO-Diferentes_Cts|RJY01|_CARIACICA_Alouatta_sp_08-03-2017,A,A,A,A,C,C,C,T,G,G,...,C,A,G,A,G,C,C,A,C,T
2019-01-30_ZIBRA2_YFV-RIO-Diferentes_Cts|RJY03|_SANTA_MARIA_MADALENA_Alouatta_sp_02-05-2017,A,A,A,A,C,C,C,T,G,G,...,C,A,G,A,G,C,C,A,C,T
2019-01-30_ZIBRA2_YFV-RIO-Diferentes_Cts|RJY04|_SUMIDOURO_Alouatta_sp_22-12-2017,A,A,A,A,C,C,C,,,G,...,C,A,G,A,G,C,C,A,C,T
2019-01-30_ZIBRA2_YFV-RIO-Diferentes_Cts|RJY05|_ANGRA_DOS_REIS_Alouatta_sp_18-01-2018,A,A,A,A,C,C,C,T,G,G,...,C,A,G,A,G,C,C,A,C,T
2019-01-30_ZIBRA2_YFV-RIO-Diferentes_Cts|RJY09|_ITAMARANDIBA_Alouatta_sp_13-02-2017,A,A,A,A,C,C,C,T,G,G,...,C,A,G,A,G,C,C,A,C,T


In [35]:
seq_ohe_df = pd.get_dummies(nucleotides_df)
seq_ohe_df.head()

Unnamed: 0,0_A,1_A,2_A,3_A,4_C,5_C,6_C,7_T,8_G,9_G,...,10070_C,10071_A,10072_G,10073_A,10074_G,10075_C,10076_C,10077_A,10078_C,10079_T
2019-01-30_ZIBRA2_YFV-RIO-Diferentes_Cts|RJY01|_CARIACICA_Alouatta_sp_08-03-2017,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
2019-01-30_ZIBRA2_YFV-RIO-Diferentes_Cts|RJY03|_SANTA_MARIA_MADALENA_Alouatta_sp_02-05-2017,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
2019-01-30_ZIBRA2_YFV-RIO-Diferentes_Cts|RJY04|_SUMIDOURO_Alouatta_sp_22-12-2017,1,1,1,1,1,1,1,0,0,1,...,1,1,1,1,1,1,1,1,1,1
2019-01-30_ZIBRA2_YFV-RIO-Diferentes_Cts|RJY05|_ANGRA_DOS_REIS_Alouatta_sp_18-01-2018,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
2019-01-30_ZIBRA2_YFV-RIO-Diferentes_Cts|RJY09|_ITAMARANDIBA_Alouatta_sp_13-02-2017,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1


In [36]:
seq_ohe_df.shape

(76, 10404)

In [37]:
seq_df.shape

(76, 10086)

In [38]:
seq_ohe_df.index == seq_df.index

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True])

In [39]:
seq_ohe_df.insert(0, 'ID', seq_df['ID'])
seq_ohe_df.insert(1, 'Host', seq_df['Host'])
seq_ohe_df.insert(2, 'Ct', seq_df['Ct'])
seq_ohe_df.insert(3, 'Date', seq_df['Date'])
seq_ohe_df.insert(4, 'Season', seq_df['Season'])
seq_ohe_df.insert(5, 'Ct_Group', seq_df['Ct_Group'])

In [40]:
seq_ohe_df.head()

Unnamed: 0,ID,Host,Ct,Date,Season,Ct_Group,0_A,1_A,2_A,3_A,...,10070_C,10071_A,10072_G,10073_A,10074_G,10075_C,10076_C,10077_A,10078_C,10079_T
2019-01-30_ZIBRA2_YFV-RIO-Diferentes_Cts|RJY01|_CARIACICA_Alouatta_sp_08-03-2017,RJY01,Alouatta,11.21875,2017-03-08,2016/2017,0,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
2019-01-30_ZIBRA2_YFV-RIO-Diferentes_Cts|RJY03|_SANTA_MARIA_MADALENA_Alouatta_sp_02-05-2017,RJY03,Alouatta,11.5625,2017-05-02,2016/2017,0,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
2019-01-30_ZIBRA2_YFV-RIO-Diferentes_Cts|RJY04|_SUMIDOURO_Alouatta_sp_22-12-2017,RJY04,Alouatta,11.453125,2017-12-22,2017/2018,0,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
2019-01-30_ZIBRA2_YFV-RIO-Diferentes_Cts|RJY05|_ANGRA_DOS_REIS_Alouatta_sp_18-01-2018,RJY05,Alouatta,11.039062,2018-01-18,2017/2018,0,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
2019-01-30_ZIBRA2_YFV-RIO-Diferentes_Cts|RJY09|_ITAMARANDIBA_Alouatta_sp_13-02-2017,RJY09,Alouatta,22.296875,2017-02-13,2016/2017,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1


# Save to .csv and .pkl

In [41]:
seq_ohe_df.to_csv('../DATA/!CLEAN/YFV_seq_ohe_df.csv', index=True, header=True, decimal='.', sep=',', float_format='%.2f')

In [42]:
seq_ohe_df.to_pickle('../DATA/!CLEAN/YFV_seq_ohe_df.pkl')

In [43]:
seq_df.to_pickle('../DATA/!CLEAN/YFV_seq_df.pkl')