In [0]:
import pandas as pd
import rpy2.robjects as robjects
%matplotlib inline
%load_ext rpy2.ipython

In [0]:
cd ~/g/projects/black_spruce/

In [0]:
# Path, relative to the working directory, of the tab-separated count table
# that is loaded into count_df.
counts = "seqclean/all_ests.fa.clean_output/contig_member.counts"

In [0]:
# Read the tab-delimited count table; its first column becomes the row index.
count_df = pd.read_csv(counts, delimiter="\t", index_col=0)
# Show the first five rows as a sanity check.
count_df.head(n=5)

In [0]:
# Add two pooled columns: C is the sum of the P32C and P40C counts,
# N is the sum of the P32N and P40N counts.
count_df = count_df.assign(
    C=count_df["P32C"] + count_df["P40C"],
    N=count_df["P32N"] + count_df["P40N"],
)

In [0]:
# Preview the first rows again to check the pooled C and N columns.
count_df.head()

In [0]:
# Recursively list every directory below the current working directory.
!find . -type d

In [0]:
# Directories that are scanned for 'scf' entries (paths relative to the
# working directory, spelled exactly as they are passed to `ls`).
raw_dirs = [
    "im_bscp32C",
    "im_bscp32N",
    "BSCP40C",
    "BSCP40N/",
]

In [0]:
for d in raw_dirs:
    scf = !ls $d | grep 'scf'
    print d, len(scf)
    

In [0]:
# Count the lines containing ">" (FASTA header lines) in all_ests.fa,
# presumably the EST set before cleaning.
!grep -c ">" seqclean/all_ests.fa

In [0]:
# Count the lines containing ">" (FASTA header lines) in all_ests.fa.clean,
# presumably the EST set after cleaning.
!grep -c ">" seqclean/all_ests.fa.clean

## Count singletons in unigenes

In [0]:
# Tally, per sample, the entries of the contig_member file that list exactly
# one EST (singletons).
#
# Each line is tab-separated: the first field is skipped (presumably the
# contig/unigene id) and every remaining field is treated as a member EST.
# The sample label of an EST is the part of its id before the first ".".
#
# The file is opened in a `with` block so the handle is always closed, and
# the per-line debug print was removed because it dumped the whole file into
# the cell output.
singleton_counts = {}
with open("seqclean/all_ests.fa.clean_output/contig_member") as contig_members:
    for line in contig_members:
        ests = line.strip().split("\t")[1:]
        if len(ests) != 1:
            # Not a singleton (or a blank line): nothing to count.
            continue
        sample = ests[0].split(".")[0]
        singleton_counts[sample] = singleton_counts.get(sample, 0) + 1
singleton_counts

In [0]:
from Bio import SeqIO

## Get read length distribution

In [0]:
# Map each sample label (the part of the record id before the first ".")
# to the list of sequence lengths of its records in the cleaned FASTA file.
read_lengths = {}
for record in SeqIO.parse("seqclean/all_ests.fa.clean", "fasta"):
    sample_name = record.id.split(".")[0]
    read_lengths.setdefault(sample_name, []).append(len(record))

In [0]:
# Print summary statistics of the read lengths per sample, export each
# sample's lengths to the R global environment under the sample's name, and
# pool the lengths into two groups: c_lens for samples whose label contains
# "C", n_lens for all others.
#
# The rpy2 module is imported at the top of the notebook as `robjects`; the
# previous `ro` alias was never defined, so this cell raised a NameError on a
# fresh kernel.
c_lens = []
n_lens = []
for sample, lengths in read_lengths.items():
    print("%s %d %s" % (sample, len(lengths), pd.Series(lengths).describe()))
    robjects.globalenv[sample] = lengths
    # NOTE(review): substring test -- a label such as "BSCP40N" would also
    # match "C"; confirm the labels look like "P32C" / "P32N".
    if "C" in sample:
        c_lens.extend(lengths)
    else:
        n_lens.extend(lengths)

In [0]:
# Copy the Python lists c_lens and n_lens into the embedded R session under
# the same names.
%R -i c_lens -i n_lens

In [0]:
%%R
# Two-sample t-test comparing the mean of c_lens with the mean of n_lens
# (R's default here is the Welch test, as the recorded output shows).
t.test(c_lens, n_lens)

```
Welch Two Sample t-test

data:  c_lens and n_lens
t = 1.1567, df = 5633.269, p-value = 0.2474
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 -3.321819 12.883833
sample estimates:
mean of x mean of y 
 474.4742  469.6932 
```

In [0]:
%%R
# NOTE(review): prcomp() takes one data matrix as its first argument; here
# n_lens lands in the second positional parameter (retx), so this is not a
# PCA over both vectors. Looks like a leftover experiment -- confirm or remove.
prcomp(c_lens, n_lens)

In [0]:
%%R
# List the objects currently defined in the R global environment.
ls()

In [0]:
def get_tissue(row):
    """Classify a row by which pooled counts are positive.

    Parameters
    ----------
    row : object with ``C`` and ``N`` attributes
        Typically one row of ``count_df`` as passed by ``DataFrame.apply``.

    Returns
    -------
    str or None
        ``"Both"`` when ``C`` and ``N`` are both > 0, ``"Cambium"`` when only
        ``C`` is > 0, ``"Needle"`` when only ``N`` is > 0, and ``None`` when
        neither is.
    """
    has_c = row.C > 0
    has_n = row.N > 0
    if has_c and has_n:
        return "Both"
    if has_c:
        return "Cambium"
    if has_n:
        return "Needle"
    return None

# Label every row with the tissue(s) in which it has a positive pooled count.
count_df['tissue'] = count_df.apply(get_tissue, axis=1)
# Total pooled count per row. A vectorised column sum gives the same values
# as the former row-wise apply without calling a Python lambda per row.
count_df['total'] = count_df['C'] + count_df['N']

In [0]:
# Copy the count_df DataFrame into the embedded R session under the same name.
%R -i count_df

In [0]:
%%R
# Preview the first rows of count_df as seen from R.
head(count_df)

In [0]:
%%R
# PCA over the first four columns of count_df, with each variable centred
# and scaled to unit variance.
len_pca <- prcomp(count_df[, 1:4], center = TRUE, scale. = TRUE)
print(len_pca)
# Line plot of the variance of each principal component.
plot(len_pca, type = "l")
print(summary(len_pca))

In [0]:
%%R
library(ggbiplot)

In [0]:
%%R
# Biplot of the PCA result. Columns 7 and 8 of count_df are selected by
# position -- presumably the tissue label and the total count; verify if the
# column layout of count_df ever changes.
g <- ggbiplot(
    len_pca,
    obs.scale = 1,
    var.scale = 1,
    ellipse = TRUE,
    circle = TRUE,
    groups = count_df[, 7],
    size = count_df[, 8]
)
# Draw the plot in the notebook.
print(g)
# Draw it a second time into a PDF device to save it as count_pca.pdf.
pdf("count_pca.pdf")
print(g)
dev.off()

In [0]:
ls -lrt