In [1]:
import pandas as pd

In [2]:
import os

In [3]:
import sys

In [4]:
%load_ext autoreload
%autoreload 2

---------------------------

## Config

In [5]:
# Root directory of the project data. The original location stays the default;
# set the RNA_SILENCING_DATA_DIR environment variable to run on another machine.
project_data_dir = os.environ.get('RNA_SILENCING_DATA_DIR', '/gcm-lfs1/pablo/data/rna_silencing')

In [6]:
# Root directory of the project code checkout. The original location stays the
# default; set the SILENCING_PROJECT_DIR environment variable to override it.
project_dir = os.environ.get('SILENCING_PROJECT_DIR', '/home/pmonteagudo/workspace/silencing_project')

In [7]:
# Directory added to the module search path below; presumably where the
# project's helper modules (e.g. `Util`) live — TODO confirm.
scripts_dir = os.path.join(project_dir, 'pyRNAdeg')
# Append only once, so re-running this cell does not duplicate the entry.
if scripts_dir not in sys.path: 
    sys.path.append(scripts_dir)

import Util

- Result **directories**

In [8]:
# Input directory: the ChIP TPM matrix is read from here further down.
in_dir = os.path.join(project_data_dir, 'results/xp_data/ChIP')

In [9]:
# Output directory handed to `rpc.repli_merge`; currently the same as the input directory.
out_dir = in_dir
# Alternative output location (disabled):
#out_dir = os.path.join(project_dir, 'results/Ratios')

------------------------------

- Investigate **Heterochromatic genes**

In [10]:
import viz_strands ## get deg1, deg2 and non_degraded

In [11]:
## centromeric genes: `deg1`
old_deg1 = ['dh1', 'dg1']  # presumably an earlier hard-coded list; not used in the cells shown here
deg1 = viz_strands.deg1

## subtelomeric genes: `deg2`
old_deg2 = ['SPAC212.11', 'SPAC212.10']  # presumably an earlier hard-coded list; not used in the cells shown here
deg2 = viz_strands.deg2

## mating type region (MTR) genes: `deg3`
deg3 = ['MAT2', 'MAT3', 'MAT1']

## rest of heterochromatic genes: `non_degraded`
## (the original note says this includes the mat genes — TODO confirm against viz_strands)
non_degraded = viz_strands.non_degraded

In [12]:
# Centromeric + subtelomeric + the remaining (`non_degraded`) heterochromatic genes.
all_htc_genes = deg1 + deg2 + non_degraded
# Centromeric + subtelomeric + the explicitly listed mating-type genes (`deg3`).
htc_genes = deg1 + deg2 + deg3

- Import dataframe containing **annotation of all samples**

In [13]:
# Path of the table that annotates all samples.
sample_annotation_file = os.path.join(project_data_dir, 'seq_data', 'sample_annotation.csv')
sample_annotation_file

'/gcm-lfs1/pablo/data/rna_silencing/seq_data/sample_annotation.csv'

In [14]:
# Annotation columns to load, in the order they should appear in the frame.
select_cols = [
    'sample_id',
    'pipeline_type',
    'seq_category',
    'seq_type',
    'mutant_id',
    'mutant',
    'replicate',
    'batch',
    'trimmed',
    'halic_local_dir',
]

In [15]:
# The annotation table is tab-separated despite its `.csv` extension.
annotation_table = pd.read_csv(sample_annotation_file, sep="\t", usecols=select_cols)
# `usecols` keeps the file's column order; re-index to follow `select_cols`.
all_samples_df = annotation_table[select_cols]
#all_samples_df.head()

In [16]:
all_samples_df.shape

(151, 10)

- **Ignore S5-ChIP samples** for now

In [17]:
# Flag the S5-ChIP rows, then keep everything else.
is_s5_chip = all_samples_df['seq_type'] == 'S5-ChIP'
all_samples_df = all_samples_df[~is_s5_chip]
all_samples_df.shape

(147, 10)

- Select **ChIP Datasets**:

In [18]:
# Toggle between the real ChIP samples and the simulated data set.
simulated_data = False
#simulated_data = True

# Keep only the rows of the selected pipeline. `.copy()` turns the filtered
# result into an independent frame, so the column assignment below does not
# write into a slice of the full annotation table (SettingWithCopyWarning).
if not simulated_data:
    all_samples_df = all_samples_df[all_samples_df['pipeline_type'] == 'ChIP'].copy()
else:
    all_samples_df = all_samples_df[all_samples_df['pipeline_type'] == 'simulated-data'].copy()
    # Relabel the simulated samples so they are handled like ChIP samples downstream.
    all_samples_df['pipeline_type'] = 'ChIP'

# `seq_type` values excluded from the data sets to merge.
input_types = ['S2-ChIP-OIN', 'S2-ChIP-INPUT', 'simulated-data', 'H3K9me2']

## Distinguish between ChIP and INPUT samples
# Copied as well, because a later cell adds a column to `datasets_df`.
datasets_df = all_samples_df[~all_samples_df['seq_type'].isin(input_types)].copy()
datasets_df.head()

Unnamed: 0,sample_id,pipeline_type,seq_category,seq_type,mutant_id,mutant,replicate,batch,trimmed,halic_local_dir
64,1022_S2-ChIP_1,ChIP,S2-ChIP,S2-ChIP,1022,mot2d,1,ccr4-not,False,Revision/Ccr4-Not_mutants/1022_mot2D/S2ChIP/
65,1022_S2-ChIP_2,ChIP,S2-ChIP,S2-ChIP,1022,mot2d,2,ccr4-not,False,Revision/Ccr4-Not_mutants/1022_mot2D/S2ChIP/
66,1168_S2-ChIP_1,ChIP,S2-ChIP,S2-ChIP,1168,,1,manuscript,False,CHIP/
67,1168_S2-ChIP_2,ChIP,S2-ChIP,S2-ChIP,1168,,2,manuscript,False,fastq/ChIP_replicates_510_1168/
68,260_S2-ChIP_1,ChIP,S2-ChIP,S2-ChIP,260,,1,revision,False,Revision/Sequencing_Revision/ChIP/


In [19]:
datasets_df.shape

(32, 10)

--------------

# Merge replicates: **ChIP Samples**

------------

In general, before merging samples, special care must be taken to determine which replicate samples can be merged together.

A **correlation analysis** between replicates is necessary as in:
- `RNAdeg/Notebooks/PreProcess/Process_Replicates_ChIP.ipynb`
- `RNAdeg/Notebooks/PreProcess/Process_Replicates_RNA.ipynb`

In [20]:
import RepTools as rpc

- **Replicate samples** grouped by `mutant_id` and `seq_type`:

In [21]:
# Label every sample with its replicate group, "<mutant_id>_<seq_type>".
# `assign` returns a new frame instead of writing into `datasets_df`, which is
# a filtered slice of the annotation table (avoids SettingWithCopyWarning).
datasets_df = datasets_df.assign(
    sub_sample=datasets_df['mutant_id'] + '_' + datasets_df['seq_type']
)

In [22]:
# Map each replicate group label to the list of sample ids it contains.
sub_samples = datasets_df.groupby('sub_sample')['sample_id'].agg(list).to_dict()
#sub_samples

sub_samples =  [## 1022 - ?
                ## 1168
                ['1168_S2ChIP', '1168_S2ChIP_1'],
                ## 260 - ?
                ## 301
                ['301_S2ChIP', '301_S2_ChIP'],
                ## 302
                ['302_S2ChIP', '302_S2_ChIP'],
                ## 324
                ['324_S2ChIP', '324_S2_ChIP'],
                ## 491
                ['491_S2ChIP', '491_S2_ChIP'],
                ## 504
                ['504S2ChIP_1', '504S2ChIP_2'],
                ## 510 - ?
                ## 523 - ?
                ## 524 - ?
                ## 530
                ['530ChIP_1', '530S2ChIP_2'],
                ## 544 - ?
                ## 591
                ['591_S2PChIP'],
                ## 638
                ['638_S2_ChIP', '638ChIP_1'],
                ## 80
                ['80_S2_ChIP', '80S2ChIP_1', '80_S2ChIP'], ## '80_S2ChIP_2' duplicated
                ## 63/65 - WT
                ['63_S2ChIPp']] 


**Total number of samples**:

In [23]:
import itertools

In [24]:
# Total number of sample ids across all replicate groups.
sum(len(sample_ids) for sample_ids in sub_samples.values())

32

**Number of replicate groups** (one per `mutant_id` and `seq_type` combination):

In [25]:
len(sub_samples)

17

## **Data**: Pol II ChIP occupancy (S2-ChIP)

Merge and store as `.csv` files:

- **Average TPM-normalized expression tables** (tpm_df) for replicates:
    - `chip_merged_tpm.csv`

- Import **ChIP gene expression Matrix**: `chip_pombe_tpm_matrix.csv`

In [26]:
# The TPM matrix is tab-separated despite its `.csv` extension.
tpm_matrix_file = os.path.join(in_dir, 'chip_pombe_tpm_matrix.csv')
tpm_df = pd.read_csv(tpm_matrix_file, sep='\t')
tpm_df.head()

Unnamed: 0,gene_id,seqid,type,start,end,strand,gene_name,cds_length,utr_length,intron_length,...,530_S2-ChIP_2,544_S2-ChIP_1,544_S2-ChIP_2,591_S2-ChIP_1,638_S2-ChIP_1,638_S2-ChIP_2,80_S2-ChIP_1,80_S2-ChIP_2,80_S2-ChIP_3,WT_S2-ChIP_1
0,FP565355_TR_box_3800..3820,mating_type_region,TR_box,3800,3820,+,,,,,...,,,,,,,,,,
1,FP565355_region_1..2120,mating_type_region,region,1,2120,+,,,,,...,114.561515,18.646708,30.418583,30.669509,54.330113,47.140139,60.233923,42.679556,50.339587,88.807335
2,FP565355_region_15417..15473,mating_type_region,region,15417,15473,+,,,,,...,136.859243,15.882291,43.513763,9.721794,23.271005,,40.417409,,26.41664,71.287978
3,FP565355_region_15474..15608,mating_type_region,region,15474,15608,+,,,,,...,134.831699,15.646997,64.844039,45.152333,94.980176,76.360722,63.384762,20.433753,69.710578,67.723579
4,FP565355_region_15609..16735,mating_type_region,region,15609,16735,+,,,,,...,157.81923,33.20202,28.998576,36.877258,42.174825,45.735126,72.714448,63.884942,62.795189,83.82821


In [27]:
tpm_df.shape

(7020, 45)

**Create `length` column**

In [28]:
# Duplicate `gene_length` under the name `length`
# (presumably the column name `rpc.repli_merge` expects — TODO confirm).
tpm_df = tpm_df.assign(length=tpm_df['gene_length'])

- **Merge ChIP replicates** into an average **gene expression Matrix** per Mutant type: `chip_merged_tpm.csv`

In [29]:
# Merge the replicate columns of each group in `sub_samples`. Judging by the
# keyword names, the result is presumably also written to
# `<out_dir>/chip_merged_tpm.csv` — confirm in RepTools.
merged_tpm_df = rpc.repli_merge(
    tpm_df,
    sub_samples,
    out_dir=out_dir,
    out_file='chip_merged_tpm.csv',
)
merged_tpm_df.head()

Unnamed: 0,gene_id,gene_name,length,type,category,bio_type,1022_S2-ChIP,1168_S2-ChIP,260_S2-ChIP,301_S2-ChIP,...,504_S2-ChIP,510_S2-ChIP,523_S2-ChIP,524_S2-ChIP,530_S2-ChIP,544_S2-ChIP,591_S2-ChIP,638_S2-ChIP,80_S2-ChIP,WT_S2-ChIP
0,FP565355_TR_box_3800..3820,,21.0,TR_box,repeat,TR_box,,,,,...,,,,,,,,,,
1,FP565355_region_1..2120,,2120.0,region,repeat,region,46.684445,41.868946,174.149807,45.88462,...,127.003512,213.972848,143.567061,186.757502,107.811047,24.532646,30.669509,50.735126,51.084355,88.807335
2,FP565355_region_15417..15473,,57.0,region,repeat,region,21.504131,15.238462,127.27033,14.604968,...,56.165597,81.462795,42.456228,69.322041,93.826542,29.698027,9.721794,23.271005,33.417024,71.287978
3,FP565355_region_15474..15608,,135.0,region,repeat,region,44.007667,32.97226,263.883917,44.468274,...,228.761631,187.851812,180.638551,180.298925,133.899344,40.245518,45.152333,85.670449,51.176364,67.723579
4,FP565355_region_15609..16735,,1127.0,region,repeat,region,58.905947,20.352371,142.991579,54.704439,...,67.918749,151.335309,123.387055,145.572595,132.858354,31.100298,36.877258,43.954976,66.46486,83.82821


In [30]:
merged_tpm_df.shape

(7020, 23)

In [31]:
#merged_tpm_df[merged_tpm_df['gene_id'].isin(non_degraded)][[xx for xx in merged_tpm_df.columns if ('80' in xx) | (xx == 'gene_id')]]