In [1]:
import pandas as pd

In [2]:
%load_ext rpy2.ipython

In [3]:
%%R
library(phyloseq)
library(plyr); library(dplyr)
library(data.table)


Attaching package: ‘dplyr’

The following objects are masked from ‘package:plyr’:

    arrange, count, desc, failwith, id, mutate, rename, summarise,
    summarize

The following object is masked from ‘package:stats’:

    filter

The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union

data.table 1.9.4  For help type: ?data.table
*** NB: by=.EACHI is now explicit. See README to restore previous behaviour.

Attaching package: ‘data.table’

The following objects are masked from ‘package:dplyr’:

    between, last



In [4]:
mappers = !find /var/seq_data/priming_exp/RawData/ -name seq?_mapper.csv -print

In [5]:
# Display the captured list of mapper file paths.
mappers

['/var/seq_data/priming_exp/RawData/Seq5/seq5_mapper.csv',
 '/var/seq_data/priming_exp/RawData/Seq2/seq2_mapper.csv',
 '/var/seq_data/priming_exp/RawData/Seq1/seq1_mapper.csv',
 '/var/seq_data/priming_exp/RawData/Seq3/seq3_mapper.csv',
 '/var/seq_data/priming_exp/RawData/Seq4/seq4_mapper.csv']

In [6]:
# Columns selected from each mapper file before the files are stacked.
columns_to_keep = [
    "Sample", "FractionNum",
    "Bulk", "Control", "CC", "100", "700", "H2O",
    "Day", "Density",
]

In [7]:
# Read every mapper file, keep only the selected metadata columns, and stack the
# pieces with a single concat call. Concatenating inside the loop copied the
# whole accumulated frame on every iteration.
frames = []
for mapper in mappers:
    df = pd.read_csv(mapper)
    df_to_merge = df[columns_to_keep]
    frames.append(df_to_merge)

# As before, df_all stays None when no mapper files were found
# (pd.concat raises on an empty list, so guard for that case).
df_all = pd.concat(frames) if frames else None

In [8]:
# Drop rows that are exact duplicates across all kept columns.
# Rebinding (rather than inplace=True) avoids mutating a frame that another
# name may still reference and keeps the cell free of in-place edits.
df_all = df_all.drop_duplicates()

In [9]:
# (rows, columns) of the combined metadata at this point in the notebook.
df_all.shape

(468, 10)

In [10]:
# Render every density as a fixed four-decimal string (e.g. 1.733 -> "1.7330").
# Note: the column holds strings afterwards, so this cell is not re-runnable as-is.
df_all["Density"] = df_all["Density"].apply(
    lambda density: "%.4f" % density
)

In [11]:
# Preview the first rows of the combined metadata.
df_all.head()

Unnamed: 0,Sample,FractionNum,Bulk,Control,CC,100,700,H2O,Day,Density
0,12C.000.28.03.07,7,0,1,1,0,0,0,28,1.7646
1,12C.000.28.03.08,8,0,1,1,0,0,0,28,1.7614
2,12C.000.28.03.09,9,0,1,1,0,0,0,28,1.7537
3,12C.000.28.03.10,10,0,1,1,0,0,0,28,1.7483
4,12C.000.28.03.11,11,0,1,1,0,0,0,28,1.7417


In [12]:
# Flask numbers (fourth dot-separated field of a sample ID) that replicate()
# maps to "control_rep" and to "label_rep_<flask>" respectively.
control_rep_flask_number = "03"
label_rep_flask_numbers = ["06", "07"]

In [13]:
def parseID(ID):
    """Return the fourth dot-separated field of a sample ID.

    For an ID such as "12C.000.28.03.07" this is "03". Raises IndexError
    when the ID has fewer than four fields.
    """
    fields = ID.split(".")
    return fields[3]

def replicate(ID):
    """Return the replicate label for a sample ID, or "" if it has none.

    Only IDs that start with "13C.100.28" or "12C.100.28" and do not end in
    "NA" are considered. For those, the fourth dot-separated field (the flask
    ID) decides the label:

    * a flask listed in ``label_rep_flask_numbers`` -> "label_rep_<flask>"
    * the flask equal to ``control_rep_flask_number`` -> "control_rep"
    * anything else -> ""

    IDs with fewer than four fields yield "" instead of raising IndexError;
    the original indexed the split result before applying any guard.
    """
    # Cheap prefix/suffix guards first, so unrelated IDs never need parsing.
    if not ID.startswith(("13C.100.28", "12C.100.28")):
        return ""
    if ID.endswith("NA"):
        return ""

    fields = ID.split(".")
    if len(fields) < 4:
        # Malformed ID: no flask field to inspect, so it is not a replicate.
        return ""

    flaskID = fields[3]
    if flaskID in label_rep_flask_numbers:
        return "label_rep_" + flaskID
    if flaskID == control_rep_flask_number:
        return "control_rep"
    return ""

In [14]:
# Tag every sample with its replicate label ("" when replicate() finds none).
df_all["rep"] = df_all["Sample"].apply(replicate)

In [15]:
# Text version of the Control flag: truthy -> "control", falsy -> "label".
# NOTE(review): a missing (NaN) Control value is truthy in Python and would be
# labelled "control" — confirm that is intended.
# The column name's spelling ("contolVlabel") is kept because it is written to
# the output files under that name.
df_all["contolVlabel"] = df_all["Control"].apply(
    lambda is_control: "control" if is_control else "label"
)

In [16]:
# Treatment code = first two dot-separated fields of the sample ID glued
# together, e.g. "12C.000.28.03.07" -> "12C000".
df_all["Treatment"] = df_all["Sample"].apply(
    lambda sample_id: sample_id.split(".")[0] + sample_id.split(".")[1]
)

In [17]:
# Write the combined metadata to disk; the pandas index is not written.
df_all.to_csv(
    "/var/seq_data/priming_exp/data/sample_data_all.csv",
    index=False,
)

In [18]:
# Spot-check the first 25 lines of the combined metadata CSV.
!head -n 25 /var/seq_data/priming_exp/data/sample_data_all.csv

Sample,FractionNum,Bulk,Control,CC,100,700,H2O,Day,Density,rep,contolVlabel,Treatment
12C.000.28.03.07,7.0,0.0,1.0,1.0,0.0,0.0,0.0,28.0,1.7646,,control,12C000
12C.000.28.03.08,8.0,0.0,1.0,1.0,0.0,0.0,0.0,28.0,1.7614,,control,12C000
12C.000.28.03.09,9.0,0.0,1.0,1.0,0.0,0.0,0.0,28.0,1.7537,,control,12C000
12C.000.28.03.10,10.0,0.0,1.0,1.0,0.0,0.0,0.0,28.0,1.7483,,control,12C000
12C.000.28.03.11,11.0,0.0,1.0,1.0,0.0,0.0,0.0,28.0,1.7417,,control,12C000
12C.000.28.03.12,12.0,0.0,1.0,1.0,0.0,0.0,0.0,28.0,1.7373,,control,12C000
12C.000.28.03.13,13.0,0.0,1.0,1.0,0.0,0.0,0.0,28.0,1.7330,,control,12C000
12C.000.28.03.14,14.0,0.0,1.0,1.0,0.0,0.0,0.0,28.0,1.7275,,control,12C000
12C.000.28.03.15,15.0,0.0,1.0,1.0,0.0,0.0,0.0,28.0,1.7220,,control,12C000
12C.000.28.03.16,16.0,0.0,1.0,1.0,0.0,0.0,0.0,28.0,1.7188,,control,12C000
12C.000.28.03.17,17.0,0.0,1.0,1.0,0.0,0.0,0.0,28.0,1.7155,,control,12C000
12C.000.28.03.18,18.0,0.0,1.0,1.0,0.0,0.0,0.0,28.0,1.7111,,control,12C000
12C.000.28.03.19

### Filter out mock community 

In [19]:
%%R
# Load the combined sample metadata CSV into R.
# read.csv's default check.names turns the "100"/"700" headers into X100/X700.
df_all <- read.csv(file = "/var/seq_data/priming_exp/data/sample_data_all.csv")

In [20]:
%%R
# Keep every row whose Sample is not the exact ID "0MC.000.00.00.00".
df_all_nomock <- filter(df_all, Sample != "0MC.000.00.00.00")

In [21]:
%%R
# Preview the first rows of the filtered metadata.
head(df_all_nomock)

            Sample FractionNum Bulk Control CC X100 X700 H2O Day Density rep
1 12C.000.28.03.07           7    0       1  1    0    0   0  28  1.7646    
2 12C.000.28.03.08           8    0       1  1    0    0   0  28  1.7614    
3 12C.000.28.03.09           9    0       1  1    0    0   0  28  1.7537    
4 12C.000.28.03.10          10    0       1  1    0    0   0  28  1.7483    
5 12C.000.28.03.11          11    0       1  1    0    0   0  28  1.7417    
6 12C.000.28.03.12          12    0       1  1    0    0   0  28  1.7373    
  contolVlabel Treatment
1      control    12C000
2      control    12C000
3      control    12C000
4      control    12C000
5      control    12C000
6      control    12C000


In [22]:
%%R
# Save the filtered metadata as a comma-separated file without row names.
write.table(
    x = df_all_nomock,
    file = "/var/seq_data/priming_exp/data/allsample_metadata_nomock.csv",
    sep = ",",
    row.names = FALSE
)

#### Note: This command removed only the duplicate mock-community rows; one mock-community row still remains.

#### The phyloseq package does not like this mapper in .csv form, so we transform it to a tab-delimited .txt file and remove the "phantom" column that R adds.

In [23]:
%%R 
# Re-read the filtered metadata and write it back out tab-separated.
sampledata <- read.csv(file = "/var/seq_data/priming_exp/data/allsample_metadata_nomock.csv")

# Remove a column named X if one is present; this is a no-op otherwise.
sampledata$X <- NULL

# NOTE(review): row.names is left at its default (TRUE), so the output gains a
# leading row-name column and a header line one field shorter than the data
# rows — confirm downstream readers expect that layout.
write.table(
    x = sampledata,
    file = "/var/seq_data/priming_exp/data/allsample_metadata_nomock.txt",
    sep = "\t"
)

In [24]:
# Inspect the first 10 lines of the comma-separated file (note: this is the
# .csv, not the tab-separated .txt).
!head -n 10 /var/seq_data/priming_exp/data/allsample_metadata_nomock.csv

"Sample","FractionNum","Bulk","Control","CC","X100","X700","H2O","Day","Density","rep","contolVlabel","Treatment"
"12C.000.28.03.07",7,0,1,1,0,0,0,28,1.7646,"","control","12C000"
"12C.000.28.03.08",8,0,1,1,0,0,0,28,1.7614,"","control","12C000"
"12C.000.28.03.09",9,0,1,1,0,0,0,28,1.7537,"","control","12C000"
"12C.000.28.03.10",10,0,1,1,0,0,0,28,1.7483,"","control","12C000"
"12C.000.28.03.11",11,0,1,1,0,0,0,28,1.7417,"","control","12C000"
"12C.000.28.03.12",12,0,1,1,0,0,0,28,1.7373,"","control","12C000"
"12C.000.28.03.13",13,0,1,1,0,0,0,28,1.733,"","control","12C000"
"12C.000.28.03.14",14,0,1,1,0,0,0,28,1.7275,"","control","12C000"
"12C.000.28.03.15",15,0,1,1,0,0,0,28,1.722,"","control","12C000"
