## This notebook generates the simulated data used for testing.

In [4]:
import pandas as pd
import numpy as np

## simulated mapping files

In [2]:
# General mapping file: 120 subjects, the last 20 of which hold
# missing-value placeholders in every metadata column.
#   Age     -- continuous variable (placeholder: 'nan')
#   Gender  -- categorical variable with 2 levels
#              (placeholder: 'Not applicable')
#   Country -- categorical variable with 4 levels
#              (placeholder: 'Missing: Not provided')

# Seed the global NumPy generator so the simulated ages are reproducible.
np.random.seed(31)

# Sample IDs '10000.0000010101' ... '10000.000001010120'. Built once and
# used as the index of every mapping frame, so the frames cannot drift apart.
sample_ids = pd.Index(['10000.000001010' + str(i) for i in range(1, 121)],
                      name='#SampleID')

mapping = pd.DataFrame(
    {
        # 100 sorted, rounded draws from N(40, 5) followed by 20 'nan'
        # placeholders. Appending strings to the numeric array makes NumPy
        # store the whole column as strings, so 'nan' is written verbatim.
        'Age': np.append(
            np.sort(np.round(
                np.random.normal(loc=40, scale=5, size=100), decimals=0)),
            np.repeat('nan', 20)),
        # 50 'Male', then 50 'Female', then 20 placeholders.
        'Gender': np.append(
            np.repeat(np.array(['Male', 'Female']), 50),
            np.repeat('Not applicable', 20)),
        # 25 samples per country, then 20 placeholders.
        'Country': np.append(
            np.repeat(np.array(['US', 'France', 'Germany', 'Mexico']), 25),
            np.repeat('Missing: Not provided', 20)),
    },
    index=sample_ids)

# Separate single-variable mapping files: each one is a one-column view of
# the general mapping file, copied so the frames are independent objects.
mapping_age = mapping[['Age']].copy()
mapping_gender = mapping[['Gender']].copy()
mapping_country = mapping[['Country']].copy()

# Write the mapping files as tab-separated text into the working directory.
mapping.to_csv('mappings.txt', sep='\t')
mapping_age.to_csv('mapping_age.txt', sep='\t')
mapping_gender.to_csv('mapping_gender.txt', sep='\t')
mapping_country.to_csv('mapping_country.txt', sep='\t')

## simulated alpha diversities

In [3]:
# Multiple alpha diversity metrics for the same 120 samples:
#   Faith_PD ~ Uniform(0, 40) for the first 100 samples, followed by 20
#       'None' placeholders: negligible effect size
#   Shannon: a mixture of |N(100, 1)| and |N(3, 1)|, 60 samples each:
#       largest effect size
#   Observed_OTUs: a mixture of |N(5, 2)|, |N(15, 10)|, |N(20, 8)| and
#       |N(30, 2)|, 30 samples each: medium effect size
# Expected effect size (from smallest to biggest):
#   Faith_PD < Observed_OTUs < Shannon
# Expected p-value (from smallest to biggest):
#   Shannon < Observed_OTUs < Faith_PD
#
# NOTE: this cell sets no seed of its own. The draws continue from the global
# NumPy state seeded in the mapping-file cell, so run the cells in order to
# regenerate the same values. The draws below are made in a fixed order
# (Faith_PD, Shannon, Observed_OTUs); reordering them changes the output.

# Sample IDs '10000.0000010101' ... '10000.000001010120'. Built once and
# used as the index of every alpha diversity frame in this cell.
sample_ids = pd.Index(['10000.000001010' + str(i) for i in range(1, 121)],
                      name='#SampleID')

alpha_div = pd.DataFrame(
    {
        # Appending the 'None' strings to the numeric draws makes NumPy store
        # the whole column as strings, so 'None' is written verbatim.
        'Faith_PD': np.append(
            np.random.uniform(low=0, high=40, size=100),
            np.repeat('None', 20)),
        'Shannon': np.concatenate(
            (abs(np.random.normal(loc=100, scale=1, size=60)),
             abs(np.random.normal(loc=3, scale=1, size=60)))),
        'Observed_OTUs': np.concatenate(
            (abs(np.random.normal(loc=5, scale=2, size=30)),
             abs(np.random.normal(loc=15, scale=10, size=30)),
             abs(np.random.normal(loc=20, scale=8, size=30)),
             abs(np.random.normal(loc=30, scale=2, size=30)))),
    },
    index=sample_ids)

# Separate single-metric alpha diversity files: each one is a one-column view
# of the combined frame, copied so the frames are independent objects.
alpha_pd = alpha_div[['Faith_PD']].copy()
alpha_sn = alpha_div[['Shannon']].copy()
alpha_otu = alpha_div[['Observed_OTUs']].copy()

# Write the alpha diversity files as tab-separated text into the working
# directory.
alpha_div.to_csv('alphas.txt', sep='\t')
alpha_pd.to_csv('alpha_pd.txt', sep='\t')
alpha_sn.to_csv('alpha_sn.txt', sep='\t')
alpha_otu.to_csv('alpha_otu.txt', sep='\t')