In [1]:
import os
import sys

import pandas as pd

------------------------------

In [4]:
# Folder holding the project-local Python modules imported further down.
rdg_dir = "/data/parastou/RNAdeg/pyRNAdeg/"

In [7]:
# Put the project module folder on the import path, but only once, so that
# re-running this cell does not pile up duplicate entries.
if sys.path.count(rdg_dir) == 0:
    sys.path.append(rdg_dir)

In [8]:
import RepTools as rpc

In [9]:
from Util import to_log2_tpm

----

In [5]:
# Folder the input expression tables are read from.
source_dir = "/data/parastou/RNAdeg/results/RipRna/xp_data/"

In [6]:
# Folder under which the MA and CORR output sub-folders are created.
out_dir = "/data/parastou/RNAdeg/results/RipRna/xp_data/"

------------

## Workflow

- Calculate log2(TPM + 1) for the TPM counts of all samples.
- Produce an MA plot, a correlation plot, and a Pearson correlation score for all replicate pairs.
- Group replicates with correlation scores higher than 0.85.
- Merge replicate groups by taking the mean of the TPM values.
- Remove 'ncRNA_gene', 'pseudogene', 'rRNA_gene', 'snRNA_gene', 'snoRNA_gene', and 'tRNA_gene' entries from merged results.

--------------------------

### 1 - Prepare output folders

In [10]:
# Output folders for the MA plots and the correlation plots/scores.
out_ma = os.path.join(out_dir, 'MA')
out_corr = os.path.join(out_dir, 'CORR')

# os.makedirs(..., exist_ok=True) replaces the `!mkdir $var` shell escapes:
# it is a no-op when the folder already exists, also creates missing parent
# folders, raises on a real failure instead of only printing a shell error,
# and is unaffected by spaces or shell metacharacters in the path.
os.makedirs(out_ma, exist_ok=True)
os.makedirs(out_corr, exist_ok=True)

### 2 - Load raw and tpm-normalized gene expression tables

In [10]:
# TPM matrix: tab-separated despite the .csv extension; lines starting
# with '#' are skipped as comments.
tpm_df = pd.read_csv(
    os.path.join(source_dir, "pombe_tpm_matrix.csv"),
    sep="\t",
    comment="#",
)

In [11]:
# Gene count matrix: tab-separated despite the .csv extension; lines
# starting with '#' are skipped as comments.
gx_df = pd.read_csv(
    os.path.join(source_dir, "pombe_gene_count_matrix.csv"),
    sep="\t",
    comment="#",
)

Log2-transform the TPM counts.

In [13]:
# Run the project helper `to_log2_tpm` (from Util) on the TPM table.
# NOTE(review): per the workflow notes this is presumably log2(tpm + 1) —
# confirm against the helper's implementation.
log2_tpm_df = to_log2_tpm(tpm_df)

------

### 3 - Produce correlation plots and Pearson r correlation scores for sample pairs.

In [15]:
# Run the RepTools correlation checks on the log2 TPM table, with the CORR
# folder as output directory.
# NOTE(review): presumably this also writes the correlations table that is
# read back later — confirm the file name and location in RepTools.
rpc.run_corr_checks(log2_tpm_df, out_dir=out_corr)

### 4 - Produce MA plots for sample pairs.

In [16]:
# Run the RepTools MA checks on the log2 TPM table, with the MA folder as
# output directory.
rpc.run_ma_checks(log2_tpm_df, out_dir=out_ma)



-----------------

### 5 - Investigate correlation scores and select replicates to merge.

In [10]:
from RepTools import report_corr

In [11]:
# Candidate replicate groups: one inner list of sample names per group.
# The name `sub_samples` is assigned again in the merge step with a reduced
# selection, so this list only reflects the candidates under inspection.
sub_samples = [
    ["1113_S2RIP"],
    ["1113_p"],
    ["1168_S2RIP_2", "1167_S2RIP_2", "1167_S2RIP", "1168_S2RIP"],
    ["1168_pA_2", "1168_p", "1167_pA_2", "1167_p"],
    ["301_S2RIP_2", "301_S2RIP_3", "301_S2RIP", "301S2RIP_1"],
    ["301_RNA_pA_2", "301_RNA_pA_3", "301_RNA_p"],
    ["302_S2RIP_2", "302_S2RIP_3", "302_S2RIP", "302S2RIP_1"],
    ["302_RNA_pA_2", "302_RNA_p"],
    ["324_S2RIP_2", "324_S2RIP_3", "324_S2RIP", "324S2RIP_1"],
    ["324_RNA_pA_2", "324_RNA_pA_3", "324_RNA_p", "283_RNA_pA_4"],
    ["491_S2RIP_2", "491_S2RIP_3", "491_S2RIP", "491S2RIP_1"],
    ["491_RNA_pA_2", "491_RNA_p"],
    ["504S2RIP_1", "504S2RIP_2"],
    ["504_RNA_pA_1", "504_RNA_pA_2"],
    ["510_S2RIP", "591_S2RIP_2"],
    ["510_RNA_pA_2", "510_RNA_p"],
    ["530S2RIP_1", "530S2RIP_2"],
    ["530_RNA_pA_1", "530_RNA_pA_2"],
    ["638S2RIP_1", "638S2RIP_2"],
    ["638_RNA_pA_1", "638_RNA_pA_2"],
    ["80_S2RIP", "80S2RIP_1", "80S2RIP_2"],
    ["80_RNA_p", "80pARNA_2"],
    ["63_S2Ph_RIP", "63_S2PRIP", "63_S2RIP_2"],
    ["63_RNA_pA_3", "63_RNA_pA_4", "63", "65"],
]

In [12]:
# Load the tab-separated table of pairwise correlation scores.
# NOTE(review): this absolute path points at .../RipRna/CORR/, whereas the
# CORR output folder built from out_dir resolves to .../RipRna/xp_data/CORR/
# — confirm this is the file produced by the correlation checks and not a
# stale copy.
corr_df = pd.read_csv(
    "/data/parastou/RNAdeg/results/RipRna/CORR/correlations.csv",
    sep="\t",
)

In [40]:
# Show the pairwise correlation rows for one candidate group; the sample
# names are spelled out literally rather than taken from `sub_samples`.
report_corr(corr_df, ['63_RNA_pA_3', '63_RNA_pA_4', '63', '65'])

Unnamed: 0,Sample1,Sample2,Correlation,P-value
0,63_RNA_pA_3,63_RNA_pA_4,0.98531,0.0
1,63_RNA_pA_3,63,0.952595,0.0
2,63_RNA_pA_3,65,0.956224,0.0
3,63_RNA_pA_4,63,0.927823,0.0
4,63_RNA_pA_4,65,0.926151,0.0
5,63,65,0.978269,0.0


### 6 - Merge replicates.

In [14]:
# Replicate groups selected for merging: one inner list per merged column.
# There are 22 groups, matching the 22 entries of `col_names`.
# NOTE(review): '591_S2RIP_2' sits in the 510 RIP group and '283_RNA_pA_4'
# in the 324 RNA group — confirm these are the intended sample names and
# not typos.
sub_samples = [
    ["1168_S2RIP_2", "1168_S2RIP"],
    ["1168_pA_2", "1168_p"],
    ["301_S2RIP_2", "301_S2RIP_3", "301_S2RIP"],
    ["301_RNA_pA_2", "301_RNA_pA_3", "301_RNA_p"],
    ["302_S2RIP_2", "302_S2RIP_3", "302_S2RIP"],
    ["302_RNA_pA_2", "302_RNA_p"],
    ["324_S2RIP_2", "324_S2RIP_3", "324_S2RIP"],
    ["324_RNA_pA_2", "324_RNA_pA_3", "324_RNA_p", "283_RNA_pA_4"],
    ["491_S2RIP_2", "491_S2RIP_3", "491_S2RIP"],
    ["491_RNA_pA_2", "491_RNA_p"],
    ["504S2RIP_1", "504S2RIP_2"],
    ["504_RNA_pA_1", "504_RNA_pA_2"],
    ["510_S2RIP", "591_S2RIP_2"],
    ["510_RNA_pA_2", "510_RNA_p"],
    ["530S2RIP_1", "530S2RIP_2"],
    ["530_RNA_pA_1", "530_RNA_pA_2"],
    ["638S2RIP_1", "638S2RIP_2"],
    ["638_RNA_pA_1", "638_RNA_pA_2"],
    ["80_S2RIP", "80S2RIP_1", "80S2RIP_2"],
    ["80_RNA_p", "80pARNA_2"],
    ["63_S2Ph_RIP", "63_S2PRIP", "63_S2RIP_2"],
    ["63_RNA_pA_3", "63_RNA_pA_4", "63", "65"],
]

In [15]:
# Names for the merged columns: for every label a '<label>_RIP' entry
# followed by a '<label>_RNA' entry (22 names in total).
# NOTE(review): the first label is '1160' while the first two merge groups
# contain 1168_* samples — confirm the label is intended.
col_names = [
    label + suffix
    for label in ("1160", "301", "302", "324", "491", "504",
                  "510", "530", "638", "80", "WT")
    for suffix in ("_RIP", "_RNA")
]

In [16]:
# Merge the replicate groups of the gene count table into one column per
# group, named by `col_names`. No out_file is given here, so the output
# file name is whatever repli_merge uses by default.
merged_df = rpc.repli_merge(
    gx_df,
    sub_samples,
    new_cols=col_names,
    out_dir=source_dir,
)

### 7 - Average TPM-normalized expression tables for replicates.

In [17]:
# Merge the replicate groups of the TPM table with the same groups and
# column names, passing 'merged_tpm.csv' as the output file name.
merged_tpm_df = rpc.repli_merge(
    tpm_df,
    sub_samples,
    new_cols=col_names,
    out_dir=source_dir,
    out_file="merged_tpm.csv",
)