# Procrustes transformation for comparing 90- vs 150-bp Deblur tables

In [None]:
import pandas as pd
import numpy as np

Note: this notebook operates on the original UniFrac distance matrices, which are very large.

To reproduce, download the following files from ftp://ftp.microbio.me/emp/release1/results/beta_diversity/deblur/:

```
emp_150_gg_1k_unweighted_unifrac.txt
emp_90_gg_1k_unweighted_unifrac.txt
```

In [34]:
# Collect the sample IDs from each UniFrac distance matrix. Only the first
# row of each file is read (nrows=1), so the large matrices are never loaded
# in full.

def _first_row_ids(distance_matrix_fp):
    """Return the first row of a tab-separated file, without its first cell, as a Series."""
    first_row = pd.read_csv(distance_matrix_fp, sep='\t', header=None, nrows=1)
    # Column 0 is dropped; the remaining cells are used downstream as sample IDs.
    return first_row.iloc[0, 1:]

samples_150 = _first_row_ids('./emp_150_gg_1k_unweighted_unifrac.txt')
samples_90 = _first_row_ids('./emp_90_gg_1k_unweighted_unifrac.txt')

In [18]:
# Load the sample mapping file (tab-separated, column names on the first
# line) and preview its first rows.

mapping_fp = '../../data/mapping-files/emp_qiime_mapping_subset_2k.tsv'
subset_2k_map = pd.read_csv(mapping_fp, sep='\t', header=0)
subset_2k_map.head()

Unnamed: 0,#SampleID,BarcodeSequence,LinkerPrimerSequence,Description,host_subject_id,study_id,title,principal_investigator,doi,ebi_accession,...,adiv_shannon,adiv_faith_pd,temperature_deg_c,ph,salinity_psu,oxygen_mg_per_l,phosphate_umol_per_l,ammonium_umol_per_l,nitrate_umol_per_l,sulfate_umol_per_l
0,550.L1S116.s.1.sequence,ATGCCTGAGCAG,GTGCCAGCMGCCGCGGTAA,sample_20 stool,F4,550,Moving pictures of the human microbiome,Rob Knight,10.1186/gb-2011-12-5-r50,ERP021896,...,3.867414,12.457989,Not applicable,Not applicable,Not applicable,Not applicable,Not applicable,Not applicable,Not applicable,Not applicable
1,550.L1S119.s.1.sequence,CAGCACTAAGCG,GTGCCAGCMGCCGCGGTAA,sample_23 stool,F4,550,Moving pictures of the human microbiome,Rob Knight,10.1186/gb-2011-12-5-r50,ERP021896,...,3.265164,10.719448,Not applicable,Not applicable,Not applicable,Not applicable,Not applicable,Not applicable,Not applicable,Not applicable
2,550.L1S164.s.1.sequence,ATGTACGGCGAC,GTGCCAGCMGCCGCGGTAA,sample_73 stool,M3,550,Moving pictures of the human microbiome,Rob Knight,10.1186/gb-2011-12-5-r50,ERP021896,...,3.661124,14.214158,Not applicable,Not applicable,Not applicable,Not applicable,Not applicable,Not applicable,Not applicable,Not applicable
3,550.L1S194.s.1.sequence,CGAAGACTGCTG,GTGCCAGCMGCCGCGGTAA,sample_105 stool,M3,550,Moving pictures of the human microbiome,Rob Knight,10.1186/gb-2011-12-5-r50,ERP021896,...,4.439943,12.012602,Not applicable,Not applicable,Not applicable,Not applicable,Not applicable,Not applicable,Not applicable,Not applicable
4,550.L1S20.s.1.sequence,ACGGTGAGTGTC,GTGCCAGCMGCCGCGGTAA,sample_112 stool,F4,550,Moving pictures of the human microbiome,Rob Knight,10.1186/gb-2011-12-5-r50,ERP021896,...,3.327601,11.758069,Not applicable,Not applicable,Not applicable,Not applicable,Not applicable,Not applicable,Not applicable,Not applicable
5,550.L1S26.s.1.sequence,ACAGCAGTGGTC,GTGCCAGCMGCCGCGGTAA,sample_175 stool,F4,550,Moving pictures of the human microbiome,Rob Knight,10.1186/gb-2011-12-5-r50,ERP021896,...,3.272025,10.390166,Not applicable,Not applicable,Not applicable,Not applicable,Not applicable,Not applicable,Not applicable,Not applicable
6,550.L1S264.s.1.sequence,CTGTATCGTATG,GTGCCAGCMGCCGCGGTAA,sample_180 stool,M3,550,Moving pictures of the human microbiome,Rob Knight,10.1186/gb-2011-12-5-r50,ERP021896,...,5.732025,19.540809,Not applicable,Not applicable,Not applicable,Not applicable,Not applicable,Not applicable,Not applicable,Not applicable
7,550.L1S273.s.1.sequence,CCTAGTACTGAT,GTGCCAGCMGCCGCGGTAA,sample_189 stool,M3,550,Moving pictures of the human microbiome,Rob Knight,10.1186/gb-2011-12-5-r50,ERP021896,...,4.191533,14.265016,Not applicable,Not applicable,Not applicable,Not applicable,Not applicable,Not applicable,Not applicable,Not applicable
8,550.L2S103.s.2.sequence,AGCACGAGCCTA,GTGCCAGCMGCCGCGGTAA,sample_380 sebum,F4,550,Moving pictures of the human microbiome,Rob Knight,10.1186/gb-2011-12-5-r50,ERP021896,...,7.570207,52.371958,Not applicable,Not applicable,Not applicable,Not applicable,Not applicable,Not applicable,Not applicable,Not applicable
9,550.L2S144.s.2.sequence,AGTACGCTCGAG,GTGCCAGCMGCCGCGGTAA,sample_425 sebum,F4,550,Moving pictures of the human microbiome,Rob Knight,10.1186/gb-2011-12-5-r50,ERP021896,...,6.181649,38.653723,Not applicable,Not applicable,Not applicable,Not applicable,Not applicable,Not applicable,Not applicable,Not applicable


In [45]:
# Keep only the mapping rows whose #SampleID occurs in BOTH distance matrices
# (150-bp and 90-bp).
# The `_90_100` suffix is kept as-is because later cells refer to this name.

mapped_ids = subset_2k_map['#SampleID']
in_both_tables = mapped_ids.isin(samples_150) & mapped_ids.isin(samples_90)
subset_2k_map_90_100 = subset_2k_map.loc[in_both_tables]

In [46]:
# Pick 200 random samples (shared by both tables) and write their IDs to file,
# one ID per line.
# random_state is fixed so that re-running the notebook selects the same
# samples; without it every run produces a different subset and therefore
# different filtered matrices, PCoA coordinates and Procrustes results.
# header=False keeps the file a plain ID list regardless of the pandas
# version's default for Series.to_csv.

subset_2k_map_90_100.sample(n=200, random_state=42)['#SampleID'].to_csv(
    './200_samples.txt', index=False, header=False)

In [47]:
# filter distance matrices by 

!filter_distance_matrix.py -i ./emp_150_gg_1k_unweighted_unifrac.txt \
-o ./emp_150_gg_1k_unweighted_unifrac.200.txt \
--sample_id_fp ./200_samples.txt

!filter_distance_matrix.py -i ./emp_90_gg_1k_unweighted_unifrac.txt \
-o ./emp_90_gg_1k_unweighted_unifrac.200.txt \
--sample_id_fp ./200_samples.txt

In [48]:
# compute PC

!principal_coordinates.py -i ./emp_150_gg_1k_unweighted_unifrac.200.txt -o ./emp_150_gg_1k_unweighted_unifrac.200.txt.pc
!principal_coordinates.py -i ./emp_90_gg_1k_unweighted_unifrac.200.txt -o ./emp_90_gg_1k_unweighted_unifrac.200.txt.pc



In [50]:
# compute procrustes

!transform_coordinate_matrices.py -i emp_90_gg_1k_unweighted_unifrac.200.txt.pc,emp_150_gg_1k_unweighted_unifrac.200.txt.pc  \
-r 999 \
-o procrustes_results/

In [51]:
# make emperor plot of procrustes

!make_emperor.py -c -i procrustes_results/ \
-o procrustes_results/plots/ \
-m ./emp_qiime_mapping_subset_2k_20170606.tsv
