# PREPARE DATASETS FOR DOWNSTREAM ANALYSES

This Jupyter Notebook contains the main functions used to prepare gut microbiome time series data for downstream analyses:
1. Interpolate the data using the PCHIP interpolation function
2. Rarefy the interpolated data to a chosen threshold
3. Re-interpolate the timepoints that were removed during the rarefaction step

#### Requirements

We recommend running this code inside a conda environment with the qiime2 package installed.

In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from scipy.interpolate import pchip_interpolate
import qiime2
from qiime2.plugins.feature_table.methods import rarefy

### Read dataframes

In [2]:
# Root directory that holds all input and output data used in this notebook.
wd =  './data/'

# Every raw table is read with its first CSV column as the row index
# (presumably the sampling time point - TODO confirm) and sorted by it.

# male
male_df = pd.read_csv(wd + 'raw_files/male_assigned_sample_names.csv', 
                      index_col = [0]).sort_index()
# female
female_df = pd.read_csv(wd + 'raw_files/female_assigned_sample_names.csv', 
                        index_col = [0]).sort_index()
# donorA
donorA_df = pd.read_csv(wd + 'raw_files/donorA_assigned_sample_names.csv', 
                        index_col = [0]).iloc[:-1].sort_index() # drop the last row of the file, which is NaN
# Cast the index labels to int so they can be compared with integer ranges later.
donorA_df.index = donorA_df.index.astype(int) 
# keep=False flags every occurrence of a repeated index label, so all rows
# sharing a duplicated label are removed (none of them is kept).
donorA_df = donorA_df[~donorA_df.index.duplicated(keep=False)]

# donorB
donorB_df = pd.read_csv(wd + 'raw_files/donorB_assigned_sample_names.csv', 
                        index_col = [0]).iloc[:-1].sort_index() # drop the last observation of the file: the gap before it is too big to be interpolated
# Cast the index labels to int so they can be compared with integer ranges later.
donorB_df.index = donorB_df.index.astype(int)
# As for donorA: remove every row whose index label occurs more than once.
donorB_df = donorB_df[~donorB_df.index.duplicated(keep=False)]

### Interpolate data using PCHIP

In [None]:
def prepare_data_for_interpolation(df):
    '''
    Add empty rows for the integer index labels missing from df.

    df: dataframe with an integer index; its first and last rows delimit
        the range of labels that is checked for gaps

    Returns df reindexed on the union of its own index and the missing
    labels, so every added row contains only NaN.
    '''

    first_label = df.index[0]
    last_label = df.index[-1]

    # The range stops before last_label, which is already part of the index.
    expected_labels = set(range(first_label, last_label))
    present_labels = set(df.index.astype(int))
    absent_labels = list(expected_labels - present_labels)

    return df.reindex(df.index.union(absent_labels))
    
def pchip_interpolation(col, masked_df):
    '''
    Fill the NaN entries of one column with PCHIP-interpolated values.

    col: label of the column of masked_df to interpolate
    masked_df: dataframe whose index holds the x positions; NaN marks
               the entries to be interpolated

    Returns a one-column dataframe on the index of masked_df. The column
    is named str(col) and holds the observed values plus the interpolated
    ones.
    '''

    series = masked_df[col]
    observed = series.dropna()
    observed_x = observed.index
    gap_x = series[series.isna()].index

    # Interpolate at the NaN positions from the observed (x, y) pairs.
    gap_y = pchip_interpolate(observed_x, observed.values, gap_x)

    column_name = str(col)
    result = pd.DataFrame(index=masked_df.index)
    result.loc[observed_x, column_name] = observed.values
    result.loc[gap_x, column_name] = gap_y

    return result

def apply_interpolation(df, interpolation_function):
    '''
    Run interpolation_function on every column of df and join the results.

    df: dataframe whose columns are interpolated one at a time
    interpolation_function: callable invoked as
                            interpolation_function(column_label, df);
                            its results are concatenated column-wise

    Returns the column-wise concatenation of the per-column results.
    '''

    per_column_results = [
        interpolation_function(column_label, df) for column_label in df.columns
    ]

    return pd.concat(per_column_results, axis=1)
    
def interpolate_pchip(df, path, subject):
    '''
    Interpolate the missing time points of df with PCHIP, save the result
    as a tsv file and return it.

    df: dataframe with an integer index; gaps in the index are filled
    path: output location, prepended as-is to the file name (so it has to
          end with a path separator)
    subject: name used as prefix of the output file,
             '<subject>_interpolated.tsv'

    Returns the interpolated table exactly as written to disk: cast to
    int and transposed.
    '''

    df = prepare_data_for_interpolation(df)
    df_interpolated = apply_interpolation(df, pchip_interpolation)
    # astype(int) truncates the interpolated floats towards zero; the table is
    # then transposed so that the original columns become the rows of the file.
    df_interpolated = df_interpolated.astype(int).T
    df_interpolated.to_csv(path + f'{subject}_interpolated.tsv', sep = '\t')

    # Callers bind the result of this function to a variable, so the table
    # has to be returned (previously the function implicitly returned None).
    return df_interpolated

In [None]:
# Output directory for the first-pass interpolated tables.
path = wd + 'first_interpolation_files/'
# Interpolate every subject; each call writes '<subject>_interpolated.tsv'
# into `path`.
male_interpolated_df = interpolate_pchip(male_df, path, 'male')
female_interpolated_df = interpolate_pchip(female_df, path,  'female')
donorA_interpolated_df = interpolate_pchip(donorA_df, path, 'donorA')
donorB_interpolated_df = interpolate_pchip(donorB_df, path, 'donorB')

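### Change tsv to biom format

The interpolated `.tsv` files have to be converted to `.biom` format outside of this notebook (e.g. with `biom convert`) and saved as `biom_files/<subject>_pchip_interpolated.biom` before running the rarefaction step below.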

### Rarefy

In [None]:
def rarefy_table(biom_file, sampling_depth=18000):
    
    '''
    Rarefy a feature table stored in a biom file and return it as a dataframe.

    biom_file: path to biom file 
    sampling_depth: depth passed to the QIIME 2 `rarefy` method; defaults
                    to 18000, the value used throughout this notebook

    Returns the rarefied table as a pandas dataframe whose index labels
    are cast to int.
    '''
    
    unrarefied_table = qiime2.Artifact.import_data("FeatureTable[Frequency]", biom_file)
    # rarefy
    rarefy_result = rarefy(table=unrarefied_table, sampling_depth=sampling_depth)
    rarefied_table_df = rarefy_result.rarefied_table.view(pd.DataFrame)
    # The index labels are cast to int because the re-interpolation step
    # builds an integer range from them.
    # NOTE(review): presumably the labels are time points stored as strings
    # in the biom file - confirm.
    rarefied_table_df.index = rarefied_table_df.index.astype(int)
    return rarefied_table_df

In [None]:
# Rarefy the biom version of every subject's interpolated table.
# NOTE(review): these .biom files are not written by this notebook;
# presumably they are converted from the tsv files saved in
# 'first_interpolation_files/' - confirm.
male_rarefied_df = rarefy_table(wd + 'biom_files/male_pchip_interpolated.biom')
female_rarefied_df = rarefy_table(wd + 'biom_files/female_pchip_interpolated.biom')
donorA_rarefied_df = rarefy_table(wd + 'biom_files/donorA_pchip_interpolated.biom')
donorB_rarefied_df = rarefy_table(wd + 'biom_files/donorB_pchip_interpolated.biom')

### Interpolate after rarefaction

In [None]:
def re_interpolate_pchip(df):
    
    '''
    Fill the integer index labels missing from a rarefied table with
    PCHIP-interpolated values.

    df: rarefied dataframe

    Returns the gap-filled table cast to int (fractions are truncated).
    '''

    gap_filled = apply_interpolation(
        prepare_data_for_interpolation(df),
        pchip_interpolation,
    )

    return gap_filled.astype(int)

# Re-interpolate every rarefied table so that index labels missing between
# its first and last row are filled in again.
reinterpolated_male_rarefied_df = re_interpolate_pchip(male_rarefied_df)
reinterpolated_female_rarefied_df = re_interpolate_pchip(female_rarefied_df)
reinterpolated_donorA_rarefied_df = re_interpolate_pchip(donorA_rarefied_df)
reinterpolated_donorB_rarefied_df = re_interpolate_pchip(donorB_rarefied_df)

### Save as tsv, then convert to biom format for further analyses

In [None]:
def to_csv(df, path, subject):
    '''
    Write the transposed dataframe to a tab-separated file.

    df: dataframe to save; it is transposed before writing
    path: output location, prepended as-is to the file name
    subject: prefix of the output file name
    '''
    output_file = path + f'{subject}_rarefied_18000_interpolated_pchip.tsv'
    transposed = df.T
    transposed.to_csv(output_file, sep='\t')

In [None]:
# Output directory for the final tables; note that this rebinds `path`.
path = wd + 'ready_files/'
# Write every re-interpolated table, transposed, to
# '<subject>_rarefied_18000_interpolated_pchip.tsv' inside `path`.
to_csv(reinterpolated_male_rarefied_df, path, 'male')
to_csv(reinterpolated_female_rarefied_df, path, 'female')
to_csv(reinterpolated_donorA_rarefied_df, path, 'donorA')
to_csv(reinterpolated_donorB_rarefied_df, path, 'donorB')