In [1]:
import os
import sys

import pandas as pd

------------------------------

In [4]:
# Folder that is appended to sys.path below so the RepTools and Util imports
# can be resolved (assumes both modules live here - confirm).
rdg_dir = '/data/parastou/RNAdeg/scripts/'

In [8]:
# Register the scripts folder on the import path; the membership test keeps
# re-running this cell from adding duplicate entries.
if rdg_dir not in sys.path:
    sys.path.append(rdg_dir)

In [9]:
import RepTools as rpc

In [10]:
from Util import to_log2_tpm

---

In [5]:
# Folder the ChIP gene-count and TPM matrices are read from; it is also passed
# as out_dir to the replicate-merge calls further down.
source_dir = '/data/parastou/RNAdeg/results/RipChip/xp_data/'

In [6]:
# NOTE(review): rip_dir is not referenced by any of the cells visible in this
# notebook section - confirm it is still needed.
rip_dir = '/data/parastou/RNAdeg/results/RipRna/xp_data/'

In [7]:
# Root results folder; the 'MA' and 'CORR' sub-folders are created inside it.
out_dir = '/data/parastou/RNAdeg/results/RipChip/'

------------

## Workflow

- Calculate log2(TPM + 1) for the TPM values of all samples.
- Produce an MA plot, a correlation plot, and a Pearson correlation score for every replicate pair.
- Group replicates whose correlation score is higher than 0.85 (85%).
- Merge each replicate group by taking the mean of its TPM values.
- Remove 'ncRNA_gene', 'pseudogene', 'rRNA_gene', 'snRNA_gene', 'snoRNA_gene', and 'tRNA_gene' entries from merged results.

--------------------------

### 1 - Prepare output folders

In [11]:
# Sub-folders of out_dir named 'MA' and 'CORR'. out_corr is handed to the
# correlation checks and is where 'correlations.csv' is read back from.
out_ma = os.path.join(out_dir, 'MA')
out_corr = os.path.join(out_dir, 'CORR')

# Create the folders in pure Python instead of shelling out to `mkdir`:
# makedirs also creates a missing parent folder, copes with paths containing
# spaces, works on any OS, and exist_ok=True makes the cell safe to re-run.
for plot_dir in (out_ma, out_corr):
    os.makedirs(plot_dir, exist_ok=True)

### 2 - Load raw and TPM-normalized gene expression tables

In [11]:
# TPM-normalized expression matrix: tab-separated, '#' marks comment text.
tpm_df = pd.read_csv(
    os.path.join(source_dir, 'chip_pombe_tpm_matrix.csv'),
    sep='\t',
    comment='#',
)

In [12]:
# Raw gene-count matrix: tab-separated, '#' marks comment text.
gx_df = pd.read_csv(
    os.path.join(source_dir, 'chip_pombe_gene_count_matrix.csv'),
    sep='\t',
    comment='#',
)

In [13]:
# Transform the TPM table with the project helper imported from Util.
# NOTE(review): the workflow notes describe this step as log2(TPM + 1); the
# helper itself is not shown in this notebook - confirm.
log2_tpm_df = to_log2_tpm(tpm_df)

----

### 3 - Produce correlation plots and Pearson correlation scores for sample pairs

In [18]:
# Run the RepTools correlation checks on the log2 TPM table, with out_corr as
# the output folder.
# NOTE(review): 'correlations.csv' is later read from out_corr, so this call
# presumably writes it - confirm in RepTools.
rpc.run_corr_checks(log2_tpm_df, out_dir=out_corr)

-----------------

### 4 - Select replicates to merge based on their Pearson correlation scores

In [20]:
from RepTools import report_corr

In [30]:
# Candidate replicate groups whose pairwise correlation scores are reported
# in the loop below. The 1168 pair is listed so that every group merged in
# step 6 is also covered by this check (it was missing from the list although
# the recorded output reports it).
sub_samples = [
    ['638_S2_ChIP', '638ChIP_1'],
    ['1168_S2ChIP', '1168_S2ChIP_1'],
    ['63_S2ChIPp'],
    ['302_S2ChIP', '302_S2_ChIP'],
    ['301_S2_ChIP', '301_S2ChIP'],
    ['324_S2ChIP', '324_S2_ChIP'],
    ['80_S2_ChIP', '80_S2Ph_ChIP', '80S2ChIP_1', '80_S2ChIP_2', '80_S2ChIP'],
    ['504S2ChIP_1', '504S2ChIP_2'],
    ['591_S2PChIP'],
    ['491_S2ChIP', '491_S2_ChIP'],
    ['530ChIP_1', '530S2ChIP_2'],
]

In [23]:
# Read the tab-separated table of pairwise correlation scores from out_corr.
corr_df = pd.read_csv(
    os.path.join(out_corr, 'correlations.csv'),
    sep='\t',
)

In [25]:
# For each candidate replicate group, print the group followed by the
# correlation rows that report_corr returns for it.
for replicate_group in sub_samples:
    group_corr = report_corr(corr_df, replicate_group)
    print(replicate_group, group_corr, sep='\n')

['638_S2_ChIP', '638ChIP_1']
     Sample1      Sample2  Correlation  P-value
0  638ChIP_1  638_S2_ChIP     0.948376      0.0
['1168_S2ChIP', '1168_S2ChIP_1']
         Sample1      Sample2  Correlation  P-value
0  1168_S2ChIP_1  1168_S2ChIP     0.883966      0.0
['63_S2ChIP_2', '63_S2ChIPp']
Empty DataFrame
Columns: [Sample1, Sample2, Correlation, P-value]
Index: []
['302_S2ChIP', '302_S2_ChIP']
      Sample1      Sample2  Correlation  P-value
0  302_S2ChIP  302_S2_ChIP     0.962185      0.0
['301_S2_ChIP', '301_S2ChIP']
      Sample1      Sample2  Correlation  P-value
0  301_S2ChIP  301_S2_ChIP     0.973536      0.0
['324_S2ChIP', '324_S2_ChIP']
      Sample1      Sample2  Correlation  P-value
0  324_S2ChIP  324_S2_ChIP     0.949192      0.0
['80_S2_ChIP', '80_S2Ph_ChIP', '80S2ChIP_1', '80_S2ChIP_2', '80_S2ChIP']
        Sample1       Sample2  Correlation  P-value
0   80_S2ChIP_2     80_S2ChIP     0.960285      0.0
1   80_S2ChIP_2  80_S2Ph_ChIP     0.891178      0.0
2   80_S2ChIP_2    

### 6 - Merge replicates.

In [14]:
# Final replicate groups to merge, one group per line. The order must stay
# aligned with the new column names passed alongside this list to the merge
# calls below. Relative to the candidate list in step 4, '80_S2Ph_ChIP' is no
# longer part of the '80' group.
sub_samples = [
    ['638_S2_ChIP', '638ChIP_1'],
    ['1168_S2ChIP', '1168_S2ChIP_1'],
    ['63_S2ChIPp'],
    ['302_S2ChIP', '302_S2_ChIP'],
    ['301_S2_ChIP', '301_S2ChIP'],
    ['324_S2ChIP', '324_S2_ChIP'],
    ['80_S2_ChIP', '80S2ChIP_1', '80_S2ChIP_2', '80_S2ChIP'],
    ['504S2ChIP_1', '504S2ChIP_2'],
    ['591_S2PChIP'],
    ['491_S2ChIP', '491_S2_ChIP'],
    ['530ChIP_1', '530S2ChIP_2'],
]

In [15]:
# New column label for each merged replicate group, in the same order as the
# groups in sub_samples. Labels reuse the numeric prefix of the sample names,
# except the '63' group, which is labelled 'WT_ChIP'.
# The second label previously read '1160_ChIP' although its samples are
# '1168_S2ChIP*'; it is corrected to '1168_ChIP'.
# NOTE(review): any downstream code that selects the old '1160_ChIP' column
# must be updated accordingly.
col_names = [
    '638_ChIP', '1168_ChIP', 'WT_ChIP', '302_ChIP', '301_ChIP', '324_ChIP',
    '80_ChIP', '504_ChIP', '591_ChIP', '491_ChIP', '530_ChIP',
]

In [16]:
# Merge the raw gene-count columns of each replicate group into one column
# per entry of col_names, with source_dir / the given file name as output.
# NOTE(review): how replicates are combined is decided inside
# RepTools.repli_merge, which is not shown here - confirm it suits raw counts.
merged_df = rpc.repli_merge(
    gx_df,
    sub_samples,
    new_cols=col_names,
    out_dir=source_dir,
    out_file='chip_pombe_gene_count_merged.csv',
)

### 7 - Average TPM-normalized expression tables for replicates.

In [17]:
# Same replicate merge as for the raw counts, applied to the TPM table, with
# source_dir / the given file name as output.
merged_tpm_df = rpc.repli_merge(
    tpm_df,
    sub_samples,
    new_cols=col_names,
    out_dir=source_dir,
    out_file='chip_pombe_tpm_merged.csv',
)

------