In [1]:
import pandas as pd

In [2]:
import os

In [3]:
import sys

In [4]:
%load_ext autoreload
%autoreload 2

---------------------------

## Config

In [5]:
# Root directory of the project data. Defaults to the original host path; set
# the RNA_SILENCING_DATA_DIR environment variable to run the notebook elsewhere.
project_data_dir = os.environ.get('RNA_SILENCING_DATA_DIR', '/gcm-lfs1/pablo/data/rna_silencing')

In [6]:
# Root directory of the project code. Defaults to the original host path; set
# the RNA_SILENCING_PROJECT_DIR environment variable to run the notebook elsewhere.
project_dir = os.environ.get('RNA_SILENCING_PROJECT_DIR', '/home/pmonteagudo/workspace/silencing_project')

In [7]:
# Put the project's `pyRNAdeg` directory on `sys.path` (only if it is not
# already there, so re-running the cell does not add duplicates). This must
# happen before the import below.
scripts_dir = os.path.join(project_dir, 'pyRNAdeg')
if scripts_dir not in sys.path:
    sys.path.append(scripts_dir)

# NOTE(review): `Util` is presumably a module located in `scripts_dir` — confirm.
import Util

- Result **directories**

In [8]:
# Input directory, located under the project data directory.
in_dir = os.path.join(project_data_dir, 'results/xp_data/RNA')

In [9]:
# Output directory is the same as the input directory. The commented line is
# an alternative location under the project directory.
out_dir = in_dir
#out_dir = os.path.join(project_dir, 'results/Ratios')

------------------------------

- Investigate **Heterochromatic genes**

In [10]:
import viz_strands ## get deg1, deg2 and non_degraded

In [11]:
## Heterochromatic gene groups. `deg1`, `deg2` and `non_degraded` are read from
## the `viz_strands` module; `deg3` is defined here. The `old_*` lists are
## literal two-gene lists that are not referenced again in the cells shown here.

## centromeric genes: `deg1`
old_deg1 = ['dh1', 'dg1']
deg1 = viz_strands.deg1

## subtelomeric genes: `deg2`
old_deg2 = ['SPAC212.11', 'SPAC212.10']
deg2 = viz_strands.deg2

## Mating type region (MTR) genes: `deg3`
deg3 = ['MAT2', 'MAT3', 'MAT1']

## rest of Heterochromatic genes, including mat: `non_degraded`
non_degraded = viz_strands.non_degraded

In [12]:
# Combined gene collections:
#   all_htc_genes: deg1 + deg2 + non_degraded
#   htc_genes:     deg1 + deg2 + deg3
# NOTE(review): `+` is assumed to be list concatenation, i.e. `deg1`, `deg2`
# and `non_degraded` are assumed to be lists — confirm in `viz_strands`.
all_htc_genes = deg1 + deg2 + non_degraded
htc_genes = deg1 + deg2 + deg3

- Import dataframe containing **annotation of all samples**

In [13]:
# Path of the sample annotation table inside the project data directory.
# The bare name on the last line displays the resolved path.
sample_annotation_file = os.path.join(project_data_dir, 'seq_data', 'sample_annotation.csv')
sample_annotation_file

'/gcm-lfs1/pablo/data/rna_silencing/seq_data/sample_annotation.csv'

In [14]:
# Annotation columns to load, in the order they should appear in the dataframe.
select_cols = [
    'sample_id',
    'pipeline_type',
    'seq_category',
    'seq_type',
    'mutant_id',
    'mutant',
    'replicate',
    'batch',
    'trimmed',
    'halic_local_dir',
]

In [15]:
# The annotation file is read as tab-separated even though it is named `.csv`.
# `usecols` restricts which columns are parsed; indexing the result with
# `[select_cols]` then arranges the columns in the order of `select_cols`.
all_samples_df = pd.read_csv(sample_annotation_file, sep="\t", usecols=select_cols)[select_cols]
#all_samples_df.head()

In [16]:
all_samples_df.shape

(151, 10)

- **Ignore S5-RIP samples** for now

In [17]:
## Drop every row whose `seq_type` is 'S5-RIP' (ignored for now), then show
## the remaining number of rows and columns.
all_samples_df = all_samples_df.loc[all_samples_df['seq_type'].ne('S5-RIP')]
all_samples_df.shape

(145, 10)

- **Select RNA Datasets**: | S2-RIP | S5-RIP | pA-RNA | total-RNA |

In [18]:
# Choose the samples used in this notebook: rows with pipeline_type 'RNA' by
# default, or rows with pipeline_type 'simulated-data' (relabelled as 'RNA')
# when `simulated_data` is True.
simulated_data = False
#simulated_data = True

# `.copy()` makes `datasets_df` an independent dataframe instead of a slice of
# `all_samples_df`. Columns are assigned to it here and in later cells; on a
# slice that raises SettingWithCopyWarning and the write may not stick.
if not simulated_data:
    datasets_df = all_samples_df[all_samples_df['pipeline_type'] == 'RNA'].copy()
else:
    datasets_df = all_samples_df[all_samples_df['pipeline_type'] == 'simulated-data'].copy()
    datasets_df['pipeline_type'] = 'RNA'

datasets_df.head()

Unnamed: 0,sample_id,pipeline_type,seq_category,seq_type,mutant_id,mutant,replicate,batch,trimmed,halic_local_dir
30,1022_pA-RNA_1,RNA,pA-RNA,pA-RNA,1022,mot2d,1,ccr4-not,False,Revision/Ccr4-Not_mutants/1022_mot2D/RNA/
31,1023_pA-RNA_2,RNA,pA-RNA,pA-RNA,1023,mot2d,2,ccr4-not,False,Revision/Ccr4-Not_mutants/1022_mot2D/RNA/
32,1168_pA-RNA_1,RNA,pA-RNA,pA-RNA,1168,,1,manuscript,False,
33,1168_pA-RNA_2,RNA,pA-RNA,pA-RNA,1168,,2,manuscript,False,
34,260_pA-RNA_1,RNA,pA-RNA,pA-RNA,260,,1,revision,False,Revision/Sequencing_Revision/RNA/pA_RNA/


In [19]:
datasets_df.shape

(77, 10)

----------------

# Merge replicates: **RNA Samples**

------------

In general, before applying this step, take special care to determine which replicate samples can safely be merged together.

A **correlation analysis** between replicates is necessary as in:
- `RNAdeg/Notebooks/PreProcess/Process_Replicates_ChIP.ipynb`
- `RNAdeg/Notebooks/PreProcess/Process_Replicates_RNA.ipynb`

In [33]:
import RepTools as rpc

- **Replicate samples** grouped by `Mutant` type and `seq_type`:

In [21]:
# Replicate-group key: "<mutant_id>_<seq_type>".
# `assign` returns a new dataframe, so the column is not written into a slice
# of `all_samples_df` (which would raise SettingWithCopyWarning).
datasets_df = datasets_df.assign(
    sub_sample=datasets_df['mutant_id'] + '_' + datasets_df['seq_type']
)

In [22]:
# Map each replicate group (`sub_sample` key) to the list of its sample ids.
sub_samples = dict(datasets_df.groupby('sub_sample')['sample_id'].apply(list))
#sub_samples

# Legacy, hand-written replicate grouping that uses the old sample names.
# It is kept for reference under its own name: binding it to `sub_samples`
# would replace the dict above with a list and break the later
# `sub_samples.values()` call.
old_sub_samples = [## 1022 - ?
                   ## 1023 - ?
                   ## 1168
                   ['1168_S2RIP_2', '1168_S2RIP'],
                   ['1168_pA_2', '1168_pA'],
                   ## 260 - ?
                   ## 301
                   ['301_S2RIP', '301_S2RIP_2', '301_S2RIP_3'],
                   ['301_RNA_pA', '301_RNA_pA_2', '301_RNA_pA_3'],
                   ## 302
                   ['302_S2RIP', '302_S2RIP_2', '302_S2RIP_3'],
                   ['302_RNA_pA', '302_RNA_pA_2'],
                   ## 324/283
                   ['324_S2RIP', '324_S2RIP_2', '324_S2RIP_3'],
                   ['324_RNA_pA', '324_RNA_pA_2', '324_RNA_pA_3', '283_RNA_pA_4'],
                   ## 491
                   ['491_S2RIP', '491_S2RIP_2', '491_S2RIP_3'],
                   ['491_RNA_pA', '491_RNA_pA_2'],
                   ## 504
                   ['504S2RIP_1', '504S2RIP_2'],
                   ['504_RNA_pA_1', '504_RNA_pA_2'],
                   ## 510/591
                   ['510_S2RIP', '591_S2RIP_2'],
                   ['510_RNA_pA', '510_RNA_pA_2'],
                   ## 523 - ?
                   ## 524 - ?
                   ## 530
                   ['530S2RIP_1', '530S2RIP_2'],
                   ['530_RNA_pA_1', '530_RNA_pA_2'],
                   ## 544 - ?
                   ## 638
                   ['638S2RIP_1','638S2RIP_2'],
                   ['638_RNA_pA_1', '638_RNA_pA_2'],
                   ## 80
                   ['80_S2RIP','80S2RIP_1', '80S2RIP_2'],
                   ['80_RNA_pA', '80pARNA_2'],
                   ## 63/65 - WT
                   ['63_S2PRIP', '63_S2RIP_2'],
                   ['63_RNA_pA_3', '63_RNA_pA_4', '63_RNA_pA', '65_RNA_pA']
                  ]

**Total number of samples**:

In [23]:
import itertools

In [24]:
# Total number of sample ids across all replicate groups.
sum(len(sample_ids) for sample_ids in sub_samples.values())

77

**Number of replicate groups** (one per `mutant_id` × `seq_type` combination):

In [25]:
len(sub_samples)

38

## **Data**: S2-RIP, pA-RNA and total-RNA

Merge and store as `.csv` files:

- **Average TPM-normalized expression tables** (tpm_df) for replicates:
    - `rna_merged_tpm.csv`

- Import **RNA gene expression Matrix**: `rna_pombe_tpm_matrix.csv`

In [27]:
# Load the expression matrix from the input directory. The file is read as
# tab-separated even though it is named `.csv`. `head()` previews the first rows.
tpm_df = pd.read_csv(os.path.join(in_dir, 'rna_pombe_tpm_matrix.csv'), sep='\t')
tpm_df.head()

Unnamed: 0,gene_id,seqid,type,start,end,strand,gene_name,cds_length,utr_length,intron_length,...,80_S2-RIP_1,80_S2-RIP_2,80_S2-RIP_3,WT_S2-RIP_1,WT_S2-RIP_2,260_total-RNA_1,510_total-RNA_1,638_total-RNA_1,80_total-RNA_1,WT_total-RNA_1
0,FP565355_TR_box_3800..3820,mating_type_region,TR_box,3800,3820,+,,,,,...,,,,,,,,,,
1,FP565355_region_1..2120,mating_type_region,region,1,2120,+,,,,,...,3.91455,4.180249,1.822724,4.156766,1.162622,2.249401,6.054902,9.715733,6.556595,3.383456
2,FP565355_region_15417..15473,mating_type_region,region,15417,15473,+,,,,,...,,,,,,,,,,
3,FP565355_region_15474..15608,mating_type_region,region,15474,15608,+,,,,,...,,5.967763,4.089074,34.814198,6.085825,,,19.071625,,
4,FP565355_region_15609..16735,mating_type_region,region,15609,16735,+,,,,,...,12.518224,20.016098,26.939992,2.085145,,1.586756,21.439771,7.995868,42.139843,44.552336


In [28]:
tpm_df.shape

(7020, 90)

**Create `length` column**

In [29]:
# Duplicate the `gene_length` column under the name `length`.
# NOTE(review): presumably `rpc.repli_merge` expects a `length` column (one
# appears in its output below) — confirm against RepTools.
tpm_df['length'] = tpm_df['gene_length']

- **Merge RNA replicates** into an average **gene expression Matrix** per Mutant type: `rna_merged_tpm.csv`

In [30]:
# Merge the replicate columns of `tpm_df` according to the `sub_samples` groups.
# NOTE(review): `repli_merge` presumably also writes the merged table to
# `out_dir/out_file` — confirm against RepTools.
merged_tpm_df = rpc.repli_merge(
    tpm_df,
    sub_samples,
    out_dir=out_dir,
    out_file='rna_merged_tpm.csv',
)

In [31]:
merged_tpm_df.head()

Unnamed: 0,gene_id,gene_name,length,type,category,bio_type,1022_S2-RIP,1022_pA-RNA,1023_pA-RNA,1168_S2-RIP,...,591_S2-RIP,638_S2-RIP,638_pA-RNA,638_total-RNA,80_S2-RIP,80_pA-RNA,80_total-RNA,WT_S2-RIP,WT_pA-RNA,WT_total-RNA
0,FP565355_TR_box_3800..3820,,21.0,TR_box,repeat,TR_box,,,,,...,,,,,,,,,,
1,FP565355_region_1..2120,,2120.0,region,repeat,region,14.563027,6.299062,10.15861,3.644505,...,4.606917,2.902372,2.59804,9.715733,3.305841,1.69716,6.556595,2.659694,0.687899,3.383456
2,FP565355_region_15417..15473,,57.0,region,repeat,region,,,,,...,,,,,,,,,,
3,FP565355_region_15474..15608,,135.0,region,repeat,region,15.884988,5.206242,1.504979,10.40586,...,,7.545657,6.424812,19.071625,5.028418,1.432481,,20.450011,3.806464,
4,FP565355_region_15609..16735,,1127.0,region,repeat,region,21.995583,67.561037,39.931361,10.0128,...,17.332144,4.563064,6.617984,7.995868,19.824772,35.014741,42.139843,2.085145,8.159744,44.552336


In [32]:
merged_tpm_df.shape

(7020, 44)