In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import itertools

------------------------------

## Config

In [3]:
import sys

In [4]:
# Make the project's own modules importable by putting the project root on sys.path.
# NOTE(review): this is an absolute, user-specific path; the notebook only runs on a
# machine where this directory exists.
project_dir = '/home/pmonteagudo/workspace/silencing_project'
# Guard against appending the same entry again when the cell is re-run.
if project_dir not in sys.path: 
    sys.path.append(project_dir)
# Star import: names used below without a visible import (e.g. `os`, `pd`, `np`,
# `rna_dir`, `project_data_dir`, `ignore_datasets`, `inv_mut_dict`) presumably come
# from here -- TODO confirm against config_analysis.
from config_analysis import *

In [5]:
import RepTools as rpc    
from Util import to_log2_tpm

- Result **directories**

In [6]:
# Directory the expression matrices are read from.
# `rna_dir` is not defined in this notebook (presumably provided by config_analysis).
#in_dir = os.path.join(project_data_dir, 'results/xp_data/RNA')
in_dir = rna_dir
in_dir

'/gcm-lfs1/pablo/data/rna_silencing/results/xp_data/RNA'

In [7]:
# Directory all results of this notebook are written to (same location as `in_dir`).
# `rna_dir` is not defined in this notebook (presumably provided by config_analysis).
#out_dir = os.path.join(project_data_dir, 'results/xp_data/RNA')
out_dir = rna_dir
out_dir

'/gcm-lfs1/pablo/data/rna_silencing/results/xp_data/RNA'

------------

# Process Replicates: **RNA-seq**

------------

## Workflow

- Calculate **log2(tpm + 1)** of all TPM counts of all samples.
- Get grouping of **replicate samples**.
- Calculate **MA-plot**, **correlation plot**, and **pearson-correlation score** for all replicate pairs.
- Group replicates with correlation scores higher than 85%.
- Merge replicate groups by taking the mean of their TPM values.
- Filter for **Heterochromatic** and **Protein coding** (mRNA) genes, **remove entries** from merged results:
    - `ncRNA_gene`
    - `pseudogene`
    - `rRNA_gene`
    - `snRNA_gene`
    - `snoRNA_gene`
    - `tRNA_gene` 

--------------------------

In [8]:
# Output directory for the MA-plots.
out_ma = os.path.join(out_dir, 'MA')
# Create it from Python rather than shelling out to `mkdir`: no dependency on a
# POSIX shell, no problems with spaces in the path, and `exist_ok=True` makes the
# cell safe to re-run.
os.makedirs(out_ma, exist_ok=True)

In [9]:
# Output directory for the correlation plots and the correlation table.
out_corr = os.path.join(out_dir, 'CORR')
# Create it from Python rather than shelling out to `mkdir`: no dependency on a
# POSIX shell, no problems with spaces in the path, and `exist_ok=True` makes the
# cell safe to re-run.
os.makedirs(out_corr, exist_ok=True)

-------------

- Import dataframe containing **annotation of all samples**

In [10]:
# Path of the per-file sample annotation table (read as tab-separated text below).
# `project_data_dir` is not defined in this notebook (presumably from config_analysis).
#sample_annotation_file = os.path.join(project_data_dir, 'seq_data', 'sample_annotation.csv')
sample_annotation_file = os.path.join(project_data_dir, 'seq_data', 'file_annotation.csv')
sample_annotation_file

'/gcm-lfs1/pablo/data/rna_silencing/seq_data/file_annotation.csv'

In [11]:
# Annotation columns to load, in the order they should appear in the DataFrame.
# `mutant_name` is renamed to `mutant` right after loading.
#select_cols = ['sample_id', 'pipeline_type', 'seq_category', 'seq_type',  'mutant_id', 'mutant', 'replicate', 'batch', 'trimmed', 'halic_local_dir']
select_cols = ['sample_id', 'pipeline_type', 'seq_category', 'seq_type',  'mutant_id', 'mutant_name', 'replicate', 'trimmed']

In [12]:
# Load only the annotation columns of interest from the tab-separated file.
# `usecols` does not control column order, so index by `select_cols` afterwards
# to get the requested order, then expose `mutant_name` under the name `mutant`.
annotation_raw = pd.read_csv(sample_annotation_file, sep="\t", usecols=select_cols)
all_samples_df = annotation_raw[select_cols].rename(columns={'mutant_name': 'mutant'})

In [13]:
all_samples_df.shape

(157, 8)

- **Ignore specific samples** 

In [14]:
#ignore_datasets.append('WT_S2-RIP_2') # try to ignore for Gene cloud plots

In [15]:
#ignore_datasets

In [16]:
# Drop every sample whose id is listed in `ignore_datasets`.
is_ignored = all_samples_df['sample_id'].isin(ignore_datasets)
all_samples_df = all_samples_df.loc[~is_ignored]
all_samples_df.shape

(156, 8)

- **Ignore S2-RIP/S5-RIP samples** for now

In [17]:
#all_samples_df = all_samples_df[all_samples_df['seq_type'] != 'S2-RIP']
#all_samples_df = all_samples_df[all_samples_df['seq_type'] != 'S5-RIP']
#all_samples_df.shape

- Select **RNA Datasets**: | S2-RIP | S5-RIP | pA-RNA | total-RNA |

In [18]:
# Keep only the samples processed with the RNA pipeline.
# Take an explicit copy: a `sub_sample` column is assigned to this frame further
# down, and assigning to a slice of `all_samples_df` triggers pandas'
# SettingWithCopyWarning.
is_rna = all_samples_df['pipeline_type'] == 'RNA'
datasets_df = all_samples_df.loc[is_rna].copy()
datasets_df.head()

Unnamed: 0,sample_id,pipeline_type,seq_category,seq_type,mutant_id,mutant,replicate,trimmed
4,1022_S2-RIP_2,RNA,S2-RIP,S2-RIP,1022,mot2d,2,False
5,1022_S2-RIP_3,RNA,S2-RIP,S2-RIP,1022,mot2d,3,False
6,1022_S2-RIP_4,RNA,S2-RIP,S2-RIP,1022,mot2d,4,False
7,1022_S2-RIP_5,RNA,S2-RIP,S2-RIP,1022,mot2d,5,False
8,1022_pA-RNA_1,RNA,pA-RNA,pA-RNA,1022,mot2d,1,False


In [19]:
datasets_df.shape

(80, 8)

- Get `samples` **columns**

In [20]:
# Sample ids double as the column names of the expression matrices.
sample_cols = datasets_df.loc[:, 'sample_id'].to_list()
len(sample_cols)

80

-------------

# **1.** Load raw and TPM-normalized gene expression tables

-------------

- Import **RNA** (raw) **gene counts Matrix**: `rna_pombe_gene_count_matrix.csv`

In [83]:
# Path of the raw gene count matrix.
# NOTE(review): within this notebook the path is only referenced from commented-out
# code; the raw counts are not loaded.
gx_df_file = os.path.join(in_dir, 'rna_pombe_gene_count_matrix.csv')
gx_df_file

'/gcm-lfs1/pablo/data/rna_silencing/results/xp_data/RNA/rna_pombe_gene_count_matrix.csv'

In [84]:
#gx_df = pd.read_csv(gx_df_file, sep='\t', comment='#')
#gx_df.head()

In [37]:
#gx_df.shape

- Import **RNA** (TPM-normed) **gene expression Matrix**: `rna_pombe_tpm_matrix.csv`

In [21]:
# Path of the TPM-normalized gene expression matrix (all genes) inside `in_dir`.
#tpm_df_file ='/gcm-lfs1/pablo/data/rna_silencing/old_results/xp_data/RNA/rna_pombe_tpm_matrix.csv'
tpm_df_file = os.path.join(in_dir, 'rna_pombe_tpm_matrix.csv') # with all genes
tpm_df_file

'/gcm-lfs1/pablo/data/rna_silencing/results/xp_data/RNA/rna_pombe_tpm_matrix.csv'

In [22]:
# Read the tab-separated TPM matrix; text following a '#' is treated as a comment.
# Missing values in the sample columns stand for zero counts.
tpm_raw = pd.read_csv(tpm_df_file, sep='\t', comment='#')

# Drop the columns of samples listed in `ignore_datasets`.
is_ignored_col = tpm_raw.columns.isin(ignore_datasets)
tpm_df = tpm_raw.loc[:, ~is_ignored_col]
# Reference value for 80_total-RNA_2 in the first row:
#   with NH-norm: 0.22139, w/o NH-norm: 0.833835
tpm_df.head()

Unnamed: 0,gene_id,seqid,type,start,end,strand,gene_name,cds_length,utr_length,intron_length,...,80_total-RNA_2,WT_S2-RIP_1,WT_S2-RIP_2,WT_S5-RIP_2,WT_pA-RNA_1,WT_pA-RNA_2,WT_pA-RNA_3,WT_pA-RNA_4,WT_total-RNA_1,WT_total-RNA_2
0,FP565355_region_1..2120,mating_type_region,region,1,2120,+,,,,,...,0.833835,3.903828,0.998344,,0.265064,1.146108,0.769286,0.695822,3.368858,0.153509
1,FP565355_region_15417..15473,mating_type_region,region,15417,15473,+,,,,,...,,,,,,,,,,
2,FP565355_region_15474..15608,mating_type_region,region,15474,15608,+,,,,,...,,50.138467,8.013835,55.013025,2.735617,4.92855,8.775227,8.378187,,
3,FP565355_region_15609..16735,mating_type_region,region,15609,16735,+,,,,,...,12.975573,1.999957,,,0.436481,34.364524,0.077785,0.167097,45.304578,6.488126
4,FP565355_region_16736..16794,mating_type_region,region,16736,16794,+,,,,,...,,,,,,,,,,


In [23]:
tpm_df.shape

(7021, 93)

In [24]:
# # check genes from specific regions in TE of wt scatterplot
# regions_df = tpm_df[tpm_df['gene_id'].isin(['SPCC1494.11c','SPCC1183.10'])]
# regions_df.loc[:, regions_df.columns.str.contains('gene|WT_S2')]

In [25]:
# check if bam was filtered correctly
#tpm_df[tpm_df['gene_id'].str.contains('SPRRNA', na=False)]
# residual reads because some rRNA features overlap with other genomic features
#tpm_df[tpm_df['gene_id'].str.contains('SPRRNA.07|SPRRNA.15|SPRRNA.29|SPRRNA.35', na=False)]

In [26]:
#tpm_df[tpm_df['gene_id'].str.contains('dg|dh')]

- Update `samples` **columns** for samples present in `tpm_df`

In [27]:
# Keep only the sample ids that actually exist as columns in `tpm_df`
# (annotated samples without an expression column are dropped from the list).
sample_cols = tpm_df.columns.intersection(sample_cols).tolist()
len(sample_cols)
#sample_cols

80

- Check **TPM normalization**: the expression values of each sample should add up to $10^6$

In [28]:
# TPM sanity check: every sample column must sum to one million
# (within numpy's default floating-point tolerance).
column_totals = tpm_df[sample_cols].sum()
assert np.allclose(column_totals, 10**6)

#### <font color='red'> Deal with **NAs**: *Drop* or *fill with zeros?* </font>
- <font color='red'> `NaN's` originated from zero counts: **fill with zeros** </font>

In [29]:
# Missing TPM values originate from zero counts, so replace them with 0 -- but only
# in the sample columns. A blanket `fillna(0)` would also overwrite missing values
# in annotation columns such as `gene_name`; dropping rows with NAs is not wanted either.
zero_fill = dict.fromkeys(sample_cols, 0)
tpm_df = tpm_df.fillna(value=zero_fill)

tpm_df.head()

Unnamed: 0,gene_id,seqid,type,start,end,strand,gene_name,cds_length,utr_length,intron_length,...,80_total-RNA_2,WT_S2-RIP_1,WT_S2-RIP_2,WT_S5-RIP_2,WT_pA-RNA_1,WT_pA-RNA_2,WT_pA-RNA_3,WT_pA-RNA_4,WT_total-RNA_1,WT_total-RNA_2
0,FP565355_region_1..2120,mating_type_region,region,1,2120,+,,,,,...,0.833835,3.903828,0.998344,0.0,0.265064,1.146108,0.769286,0.695822,3.368858,0.153509
1,FP565355_region_15417..15473,mating_type_region,region,15417,15473,+,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,FP565355_region_15474..15608,mating_type_region,region,15474,15608,+,,,,,...,0.0,50.138467,8.013835,55.013025,2.735617,4.92855,8.775227,8.378187,0.0,0.0
3,FP565355_region_15609..16735,mating_type_region,region,15609,16735,+,,,,,...,12.975573,1.999957,0.0,0.0,0.436481,34.364524,0.077785,0.167097,45.304578,6.488126
4,FP565355_region_16736..16794,mating_type_region,region,16736,16794,+,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [30]:
tpm_df.shape

(7021, 93)

In [31]:
# Sanity check: after the zero-fill no sample column may contain a missing value.
missing_per_sample = tpm_df[sample_cols].isna().sum()
assert missing_per_sample.sum() == 0

- **Log-transform** (TPM-normed)  **gene expression Matrix** - will be used to compute correlations between replicates

In [32]:
# Log-transformed values are what the replicate correlations are computed on.
# => compute log2(x + 1) so that zero TPM values stay finite (log2(0) is undefined)
# NOTE(review): the "+1" relies on `to_log2_tpm` using shift=1 by default, as the
# original inline comment states -- confirm in Util.to_log2_tpm.
#log2_tpm_df = to_log2_tpm(tpm_df)
log2_tpm_df = to_log2_tpm(tpm_df, gene_id_col='gene_id') # default shift=1, avoids taking the log of zero
#log2_tpm_df = to_log2_tpm(tpm_df, gene_id_col='gene_id', shift=0) # default shift=1

log2_tpm_df.head()

Unnamed: 0,gene_id,seqid,type,start,end,strand,gene_name,cds_length,utr_length,intron_length,...,80_total-RNA_2,WT_S2-RIP_1,WT_S2-RIP_2,WT_S5-RIP_2,WT_pA-RNA_1,WT_pA-RNA_2,WT_pA-RNA_3,WT_pA-RNA_4,WT_total-RNA_1,WT_total-RNA_2
0,FP565355_region_1..2120,mating_type_region,region,1,2120,+,,,,,...,0.874864,2.293908,0.998805,0.0,0.33921,1.101723,0.823167,0.761985,2.127256,0.20603
1,FP565355_region_15417..15473,mating_type_region,region,15417,15473,+,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,FP565355_region_15474..15608,mating_type_region,region,15474,15608,+,,,,,...,0.0,5.676337,3.172141,5.80769,1.901347,2.567679,3.28913,3.229309,0.0,0.0
3,FP565355_region_15609..16735,mating_type_region,region,15609,16735,+,,,,,...,3.804836,1.584942,0.0,0.0,0.522539,5.144231,0.108069,0.222925,5.533083,2.904605
4,FP565355_region_16736..16794,mating_type_region,region,16736,16794,+,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [33]:
log2_tpm_df.shape

(7021, 93)

In [34]:
#log2_tpm_df.loc[:, log2_tpm_df.columns.str.contains("pA-")]

In [35]:
# Sanity check: the log transform must not have introduced missing values
# in any sample column.
all_present = log2_tpm_df[sample_cols].notna().all().all()
assert all_present

----

# **2.** Get groupings of **replicate samples**

----

In [38]:
#sample_cols = [ss for ss in log2_tpm_df.columns if ('RNA' in ss) | ('RIP' in ss)]
#len(sample_cols)

- **Replicate samples** grouped by `mutant` and `seq_type` type:

    - <font color='red'> **Attention!** This is a bit tricky because the `mutant_id -> mutant` map is ambiguous (several ids share one mutant name): </font>
        - **510/591** -> `caf1d`
        - **1022/1023** -> `mot2d`
        - **523/524** -> `unknown`
     
     <font color='red'> First group using `mutant` then use `inv_mut_dict` which removes ambiguity to get `mutant_id` again. </font>

In [39]:
# Build the replicate-group label "<mutant_id>_<seq_type>".
# Several mutant ids share one mutant name, so the name is mapped back through
# `inv_mut_dict` to obtain a single id per mutant.
canonical_mutant_id = datasets_df['mutant'].map(inv_mut_dict)

# A mutant missing from `inv_mut_dict` maps to NaN; its samples would then vanish
# from the groupby on `sub_sample` without any error. Fail loudly instead.
unmapped_mutants = datasets_df.loc[canonical_mutant_id.isna(), 'mutant'].unique()
assert len(unmapped_mutants) == 0, \
    'mutants missing from inv_mut_dict: {}'.format(list(unmapped_mutants))

# `assign` returns a new frame, so no column is written into a slice of
# `all_samples_df` (avoids SettingWithCopyWarning).
datasets_df = datasets_df.assign(
    sub_sample=canonical_mutant_id + '_' + datasets_df['seq_type']
)

In [40]:
# Restrict the annotation to samples whose id is listed in `sample_cols`.
has_expression = datasets_df['sample_id'].isin(sample_cols)
datasets_df = datasets_df.loc[has_expression]

In [41]:
# Map every replicate group ("<mutant_id>_<seq_type>") to the list of its sample ids.
sub_samples = {
    group_name: sample_ids.tolist()
    for group_name, sample_ids in datasets_df.groupby('sub_sample')['sample_id']
}
sub_samples

{'1022_S2-RIP': ['1022_S2-RIP_2',
  '1022_S2-RIP_3',
  '1022_S2-RIP_4',
  '1022_S2-RIP_5'],
 '1022_pA-RNA': ['1022_pA-RNA_1', '1023_pA-RNA_2'],
 '1168_S2-RIP': ['1168_S2-RIP_1', '1168_S2-RIP_2'],
 '1168_pA-RNA': ['1168_pA-RNA_1', '1168_pA-RNA_2'],
 '301_S2-RIP': ['301_S2-RIP_1', '301_S2-RIP_2', '301_S2-RIP_3'],
 '301_pA-RNA': ['301_pA-RNA_1', '301_pA-RNA_2', '301_pA-RNA_3'],
 '302_S2-RIP': ['302_S2-RIP_1', '302_S2-RIP_2', '302_S2-RIP_3'],
 '302_pA-RNA': ['302_pA-RNA_1', '302_pA-RNA_2'],
 '324_S2-RIP': ['324_S2-RIP_1', '324_S2-RIP_2', '324_S2-RIP_3'],
 '324_pA-RNA': ['324_pA-RNA_1',
  '324_pA-RNA_2',
  '324_pA-RNA_3',
  '324_pA-RNA_4'],
 '491_S2-RIP': ['491_S2-RIP_1', '491_S2-RIP_2', '491_S2-RIP_3'],
 '491_pA-RNA': ['491_pA-RNA_1', '491_pA-RNA_2'],
 '504_S2-RIP': ['504_S2-RIP_1', '504_S2-RIP_2'],
 '504_pA-RNA': ['504_pA-RNA_1', '504_pA-RNA_2'],
 '510_S2-RIP': ['510_S2-RIP_1', '591_S2-RIP_1'],
 '510_pA-RNA': ['510_pA-RNA_1', '510_pA-RNA_2'],
 '510_total-RNA': ['510_total-RNA_1', '510_tot

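<font color='green'> **Looks good: almost every group has at least two replicates.** </font> Note that `WT_S5-RIP` appears to contain a single sample (`WT_S5-RIP_2`), so no replicate correlation can be computed for it.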

**Total number of samples**:

In [42]:
# Total number of samples across all replicate groups.
sum(len(sample_ids) for sample_ids in sub_samples.values())

80

**Number of mutants** (subsamples):

In [43]:
len(sub_samples)

33

------

# **3.** Produce **correlation-plots** and **pearson-r correlation scores**

------

- Run **correlation checks**  for all **pair-wise sample combinations**:

In [44]:
# Pairwise replicate correlations on the log2-transformed TPM values, restricted to
# the pairs inside each group of `sub_samples`; results go to `out_corr`.
# NOTE(review): a later cell reads `correlations.csv` back from `out_corr`, so this
# call presumably writes that file in addition to returning the table -- confirm in RepTools.
#corr_df = rpc.run_corr_checks(log2_tpm_df, out_dir=out_corr)
corr_df = rpc.run_corr_checks(log2_tpm_df, samples=sub_samples, out_dir=out_corr)
#corr_df.head()
corr_df

Unnamed: 0,Sample1,Sample2,Correlation,P-value
0,1022_S2-RIP_2,1022_S2-RIP_3,0.899038,0.0
1,1022_S2-RIP_2,1022_S2-RIP_4,0.907435,0.0
2,1022_S2-RIP_2,1022_S2-RIP_5,0.853185,0.0
3,1022_S2-RIP_3,1022_S2-RIP_4,0.959262,0.0
4,1022_S2-RIP_3,1022_S2-RIP_5,0.895938,0.0
...,...,...,...,...
61,WT_pA-RNA_1,WT_pA-RNA_4,0.943558,0.0
62,WT_pA-RNA_2,WT_pA-RNA_3,0.958599,0.0
63,WT_pA-RNA_2,WT_pA-RNA_4,0.934394,0.0
64,WT_pA-RNA_3,WT_pA-RNA_4,0.988562,0.0


In [45]:
corr_df.shape

(66, 4)

* Check for low correlation values (more checks below in Section 5)

In [46]:
# Quick look at clearly poor replicate pairs. The 0.80 cut-off used here is lower
# than the 0.85 grouping threshold named in the Workflow section.
corr_df[corr_df['Correlation'] < 0.80]

Unnamed: 0,Sample1,Sample2,Correlation,P-value
65,WT_total-RNA_1,WT_total-RNA_2,0.776538,0.0


In [47]:
#corr_df[corr_df['Sample2'].str.contains('total-RNA')]

-----------------

# **4.** Produce **MA-plots**

-----------------


An **MA-plot** is an application of a Bland–Altman plot for visual **representation of genomic data**. 

The plot visualizes the differences between measurements taken in two samples, by transforming the data onto:
* **M (log ratio)** scale
* **A (mean average)** scale

then plotting these values. 

- Run **MA checks**  for all **pair-wise sample combinations**:

In [48]:
# MA checks for the replicate pairs inside each group of `sub_samples`, computed on
# the log2-transformed TPM values; `out_ma` is passed as the output directory.
#rpc.run_ma_checks(log2_tpm_df, out_dir=out_ma)
rpc.run_ma_checks(log2_tpm_df, samples=sub_samples, out_dir=out_ma)

-----------------

# **5.** Investigate  **pearson-r correlation scores**

-----------------

Select replicates to be merged based on their **pearson-r correlation scores**

In [49]:
from RepTools import report_corr

- Import **correlation checks** for all **pair-wise sample combinations**: `correlations.csv`

In [50]:
# Re-load the correlation table from disk, replacing the in-memory `corr_df`.
# NOTE(review): assumes `rpc.run_corr_checks` wrote `correlations.csv` (tab-separated)
# into `out_corr` -- confirm in RepTools.
corr_df = pd.read_csv(os.path.join(out_corr, 'correlations.csv'), sep='\t')
corr_df.head()

Unnamed: 0,Sample1,Sample2,Correlation,P-value
0,1022_S2-RIP_2,1022_S2-RIP_3,0.899038,0.0
1,1022_S2-RIP_2,1022_S2-RIP_4,0.907435,0.0
2,1022_S2-RIP_2,1022_S2-RIP_5,0.853185,0.0
3,1022_S2-RIP_3,1022_S2-RIP_4,0.959262,0.0
4,1022_S2-RIP_3,1022_S2-RIP_5,0.895938,0.0


In [51]:
corr_df.shape

(66, 4)

- Check for near-perfect **correlation scores** (`Correlation > 0.99`), which may indicate **duplicated** samples:

In [52]:
# Replicate pairs with a correlation above 0.99 -- candidates for duplicated samples.
corr_df[corr_df['Correlation'] > 0.99]

Unnamed: 0,Sample1,Sample2,Correlation,P-value
33,504_pA-RNA_1,504_pA-RNA_2,0.995487,0.0
43,523_pA-RNA_1,524_pA-RNA_1,0.995453,0.0
45,530_pA-RNA_1,530_pA-RNA_2,0.993761,0.0


- Check for low **correlation scores** (`Correlation < 0.80`):

In [53]:
# Replicate pairs with a correlation below 0.80. This cut-off is lower than the
# 0.85 grouping threshold named in the Workflow section.
corr_df[corr_df['Correlation'] < 0.80]

Unnamed: 0,Sample1,Sample2,Correlation,P-value
65,WT_total-RNA_1,WT_total-RNA_2,0.776538,0.0


- Check **correlation scores** grouped by `Mutant` type and `seq_type`:

In [54]:
#rpc.report_corr(corr_df, sub_samples['63_pA-RNA'])

In [55]:
# for s in sub_samples:
#     a = report_corr(corr_df, s)
#     print('-'*80)
#     print('Mutant samples group:', s)
#     print(a)
#     print('-'*80, '\n')

-----------------

# **6.** Merge replicates

-----------------

Merge and store as `.csv` files:

- **Average TPM-normalized expression tables** (tpm_df) for replicates:
    - `rna_merged_tpm.csv`

- <font color='red'> Add **`length` column** </font>

In [56]:
# Expose `gene_length` under the additional column name `length`.
tpm_df = tpm_df.assign(length=tpm_df['gene_length'])

- **Merge RNA replicates** into an average **gene expression Matrix** per Mutant type: `rna_merged_tpm.csv`

In [57]:
out_dir

'/gcm-lfs1/pablo/data/rna_silencing/results/xp_data/RNA'

#### <font color='red'> Deal with **NAs**: *Drop* or *fill with zeros?* </font>

In [58]:
# Merge the replicates of every group in `sub_samples` into one column per group.
# NOTE(review): per the Workflow section the merge takes the mean TPM over the
# replicates, and the result is presumably also written to
# `<out_dir>/rna_merged_tpm.csv` -- confirm in RepTools.repli_merge.
merged_tpm_df = rpc.repli_merge(
    tpm_df,
    sub_samples,
    out_dir = out_dir, 
    out_file = 'rna_merged_tpm.csv'
)

In [59]:
merged_tpm_df.head()

Unnamed: 0,gene_id,gene_name,length,type,category,bio_type,1022_S2-RIP,1022_pA-RNA,1168_S2-RIP,1168_pA-RNA,...,638_S2-RIP,638_pA-RNA,638_total-RNA,80_S2-RIP,80_pA-RNA,80_total-RNA,WT_S2-RIP,WT_S5-RIP,WT_pA-RNA,WT_total-RNA
0,FP565355_region_1..2120,,2120,region,repeat,region,13.047263,8.610194,1.551187,0.812477,...,2.628246,2.906437,5.585458,2.94159,1.800442,3.257867,2.451086,0.0,0.71907,1.761184
1,FP565355_region_15417..15473,,57,region,repeat,region,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,FP565355_region_15474..15608,,135,region,repeat,region,4.362461,5.435505,6.791771,1.182042,...,5.225105,10.515522,6.788815,4.632683,1.165227,0.0,29.076151,55.013025,6.204396,0.0
3,FP565355_region_15609..16735,,1127,region,repeat,region,20.35727,57.937164,8.811443,83.772135,...,4.224988,6.463253,4.565533,18.242957,37.932886,26.830832,0.999979,0.0,8.761472,25.896352
4,FP565355_region_16736..16794,,59,region,repeat,region,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [60]:
merged_tpm_df.shape

(7021, 39)

------

# **7.** Filter for **Heterochromatic** and **Protein coding** (mRNA) genes

------


**Remove entries** from merged results:
- `ncRNA_gene`
- (some) `pseudogene` 
- `rRNA_gene`
- `snRNA_gene`
- `snoRNA_gene`
- `tRNA_gene` 

Filter **merged_tpm_df**: 
- Keep only **Heterochromatic** and **protein coding genes** (mRNA) 

In [61]:
# Keep protein-coding genes (bio_type 'mRNA') and heterochromatic repeats
# (category 'repeat'); every other feature is dropped.
is_protein_coding = merged_tpm_df['bio_type'] == 'mRNA'
is_repeat = merged_tpm_df['category'] == 'repeat'
merged_tpm_df = merged_tpm_df.loc[is_protein_coding | is_repeat]

In [62]:
merged_tpm_df.head()

Unnamed: 0,gene_id,gene_name,length,type,category,bio_type,1022_S2-RIP,1022_pA-RNA,1168_S2-RIP,1168_pA-RNA,...,638_S2-RIP,638_pA-RNA,638_total-RNA,80_S2-RIP,80_pA-RNA,80_total-RNA,WT_S2-RIP,WT_S5-RIP,WT_pA-RNA,WT_total-RNA
0,FP565355_region_1..2120,,2120,region,repeat,region,13.047263,8.610194,1.551187,0.812477,...,2.628246,2.906437,5.585458,2.94159,1.800442,3.257867,2.451086,0.0,0.71907,1.761184
1,FP565355_region_15417..15473,,57,region,repeat,region,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,FP565355_region_15474..15608,,135,region,repeat,region,4.362461,5.435505,6.791771,1.182042,...,5.225105,10.515522,6.788815,4.632683,1.165227,0.0,29.076151,55.013025,6.204396,0.0
3,FP565355_region_15609..16735,,1127,region,repeat,region,20.35727,57.937164,8.811443,83.772135,...,4.224988,6.463253,4.565533,18.242957,37.932886,26.830832,0.999979,0.0,8.761472,25.896352
4,FP565355_region_16736..16794,,59,region,repeat,region,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [63]:
merged_tpm_df.shape

(5159, 39)

#### <font color='red'> Deal with **NAs**: *Drop* or *fill with zeros?* </font>

In [64]:
# Deal with NAs: Drop or fill with zeros?
# => it should already have been taken care of! 

# (NOT USED) in most plots we get rid of nan's, here we set them to zero
#merged_tpm_df = merged_tpm_df.fillna(0) # undesired off-target effects to other columns (e.g. `gene_name`)
#merged_tpm_df = merged_tpm_df.fillna({kk:0 for kk in sample_cols})

# (NOT USED) drop row (axis - 0) if it finds ANY `na`, becareful when adding new columns!
#merged_tpm_df = merged_tpm_df.dropna(subset = sample_cols) 
#merged_tpm_df = merged_tpm_df.dropna(0)

- Store a copy of **merged_tpm_filtered_df**: `rna_merged_filtered_tpm.csv`

In [65]:
# Store the filtered, merged TPM table as a tab-separated file.
# `index=False` is the documented boolean for omitting the row index; the previous
# `index=None` only worked because None happens to be falsy.
filtered_tpm_path = os.path.join(out_dir, 'rna_merged_filtered_tpm.csv')
merged_tpm_df.to_csv(filtered_tpm_path, sep='\t', index=False)

------

# **8.** Check **repeat genes** correlations by replicates 

------

Here we go back to the Data Frame used **before merging** replicates: `tpm_df` and filter for **Heterochromatic genes** (repeats)

- Keep only **Heterochromatic genes** (repeats)

In [66]:
# Un-merged (per-replicate) TPM values, restricted to heterochromatic repeats.
is_repeat_row = tpm_df['category'] == 'repeat'
tpm_rep_df = tpm_df.loc[is_repeat_row]
tpm_rep_df.head()

Unnamed: 0,gene_id,seqid,type,start,end,strand,gene_name,cds_length,utr_length,intron_length,...,WT_S2-RIP_1,WT_S2-RIP_2,WT_S5-RIP_2,WT_pA-RNA_1,WT_pA-RNA_2,WT_pA-RNA_3,WT_pA-RNA_4,WT_total-RNA_1,WT_total-RNA_2,length
0,FP565355_region_1..2120,mating_type_region,region,1,2120,+,,,,,...,3.903828,0.998344,0.0,0.265064,1.146108,0.769286,0.695822,3.368858,0.153509,2120
1,FP565355_region_15417..15473,mating_type_region,region,15417,15473,+,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,57
2,FP565355_region_15474..15608,mating_type_region,region,15474,15608,+,,,,,...,50.138467,8.013835,55.013025,2.735617,4.92855,8.775227,8.378187,0.0,0.0,135
3,FP565355_region_15609..16735,mating_type_region,region,15609,16735,+,,,,,...,1.999957,0.0,0.0,0.436481,34.364524,0.077785,0.167097,45.304578,6.488126,1127
4,FP565355_region_16736..16794,mating_type_region,region,16736,16794,+,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,59


In [67]:
tpm_rep_df.shape

(39, 94)

- Select columns of interest

In [68]:
# Annotation columns to keep for the repeat analysis.
# NOTE: this rebinds `select_cols`, which earlier held the sample-annotation columns.
#select_cols = ['gene-id', 'gene-name', 'type', 'category', 'bio_type']
select_cols = ['gene_id', 'gene_name', 'type', 'category', 'bio_type']

In [69]:
# Add the sample columns after the annotation columns.
# Only names not yet present are appended, so re-running this cell on its own does
# not duplicate the sample columns (a plain `extend` would, and the column
# selection that follows would then return every sample column twice).
select_cols = select_cols + [col for col in sample_cols if col not in select_cols]

In [70]:
# Reduce the repeat table to the annotation columns plus the sample columns.
tpm_rep_df = tpm_rep_df[select_cols]
tpm_rep_df.shape

(39, 85)

- **Log-transform** (tpm-normed)  **gene expression Matrix**:

In [71]:
# Log-transform the repeat-only TPM table with the same helper call used for the
# full matrix earlier in the notebook.
#log2_tpm_df = to_log2_tpm(tpm_df)
tpm_rep_df_l2 = to_log2_tpm(tpm_rep_df, gene_id_col='gene_id') # default shift=1
tpm_rep_df_l2.head()

Unnamed: 0,gene_id,gene_name,type,category,bio_type,1022_S2-RIP_2,1022_S2-RIP_3,1022_S2-RIP_4,1022_S2-RIP_5,1022_pA-RNA_1,...,80_total-RNA_2,WT_S2-RIP_1,WT_S2-RIP_2,WT_S5-RIP_2,WT_pA-RNA_1,WT_pA-RNA_2,WT_pA-RNA_3,WT_pA-RNA_4,WT_total-RNA_1,WT_total-RNA_2
0,FP565355_region_1..2120,,region,repeat,region,3.829264,3.793063,3.919725,3.698085,2.91175,...,0.874864,2.293908,0.998805,0.0,0.33921,1.101723,0.823167,0.761985,2.127256,0.20603
1,FP565355_region_15417..15473,,region,repeat,region,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,FP565355_region_15474..15608,,region,repeat,region,0.0,4.205537,0.0,0.0,3.235441,...,0.0,5.676337,3.172141,5.80769,1.901347,2.567679,3.28913,3.229309,0.0,0.0
3,FP565355_region_15609..16735,,region,repeat,region,4.268906,4.814858,4.815248,3.300297,6.200266,...,3.804836,1.584942,0.0,0.0,0.522539,5.144231,0.108069,0.222925,5.533083,2.904605
4,FP565355_region_16736..16794,,region,repeat,region,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


- Run **correlation checks**  for all **pair-wise sample combinations**:

In [72]:
# Output directory for the heterochromatin (repeat-only) correlation results.
out_htc_corr = os.path.join(out_dir, 'htc_CORR')
# Create it from Python rather than shelling out to `mkdir`: no dependency on a
# POSIX shell, no problems with spaces in the path, and `exist_ok=True` makes the
# cell safe to re-run.
os.makedirs(out_htc_corr, exist_ok=True)

In [73]:
# Pairwise replicate correlations computed on the repeat features only.
# NOTE(review): the next cell reads `repeats.correlations.csv` from `out_htc_corr`,
# so `prefix` is presumably prepended to the output file names -- confirm in RepTools.
rpc.run_corr_checks(tpm_rep_df_l2, samples = sub_samples, out_dir = out_htc_corr, prefix = 'repeats.')

Unnamed: 0,Sample1,Sample2,Correlation,P-value
0,1022_S2-RIP_2,1022_S2-RIP_3,0.851285,6.569509e-12
1,1022_S2-RIP_2,1022_S2-RIP_4,0.934655,3.359376e-18
2,1022_S2-RIP_2,1022_S2-RIP_5,0.745979,5.029418e-08
3,1022_S2-RIP_3,1022_S2-RIP_4,0.917315,2.251776e-16
4,1022_S2-RIP_3,1022_S2-RIP_5,0.723603,1.941904e-07
...,...,...,...,...
61,WT_pA-RNA_1,WT_pA-RNA_4,0.942236,3.661138e-19
62,WT_pA-RNA_2,WT_pA-RNA_3,0.880080,1.581129e-13
63,WT_pA-RNA_2,WT_pA-RNA_4,0.829692,6.651693e-11
64,WT_pA-RNA_3,WT_pA-RNA_4,0.941505,4.591424e-19


- Import the repeat-only **correlation checks** for all **pair-wise sample combinations**: `repeats.correlations.csv`

In [74]:
# Load the repeat-only correlation table (tab-separated) from `out_htc_corr`.
# NOTE(review): assumes the preceding `run_corr_checks` call wrote this file -- confirm.
rep_corr_df = pd.read_csv(os.path.join(out_htc_corr, 'repeats.correlations.csv'), sep='\t')
rep_corr_df.head()

Unnamed: 0,Sample1,Sample2,Correlation,P-value
0,1022_S2-RIP_2,1022_S2-RIP_3,0.851285,6.569509e-12
1,1022_S2-RIP_2,1022_S2-RIP_4,0.934655,3.359376e-18
2,1022_S2-RIP_2,1022_S2-RIP_5,0.745979,5.029418e-08
3,1022_S2-RIP_3,1022_S2-RIP_4,0.917315,2.251776e-16
4,1022_S2-RIP_3,1022_S2-RIP_5,0.723603,1.941904e-07


In [75]:
rep_corr_df.shape

(66, 4)

- Select **highly correlated** samples: `correlation > 0.85`

In [76]:
# Replicate pairs whose repeat-level correlation exceeds 0.85.
is_high_corr = rep_corr_df['Correlation'].gt(0.85)
high_rep_corr_df = rep_corr_df.loc[is_high_corr]

In [77]:
high_rep_corr_df.head()

Unnamed: 0,Sample1,Sample2,Correlation,P-value
0,1022_S2-RIP_2,1022_S2-RIP_3,0.851285,6.569509e-12
1,1022_S2-RIP_2,1022_S2-RIP_4,0.934655,3.359376e-18
3,1022_S2-RIP_3,1022_S2-RIP_4,0.917315,2.251776e-16
6,1022_pA-RNA_1,1023_pA-RNA_2,0.92459,4.3633400000000004e-17
7,1168_S2-RIP_1,1168_S2-RIP_2,0.869405,6.971057e-13


In [78]:
# Store the highly correlated replicate pairs as a tab-separated file.
# `index=False` is the documented boolean for omitting the row index; the previous
# `index=None` only worked because None happens to be falsy.
high_corr_path = os.path.join(out_htc_corr, 'high_corr_reps.csv')
high_rep_corr_df.to_csv(high_corr_path, sep='\t', index=False)

- Check for **low correlation scores** (`Correlation < 0.85`):

In [79]:
# Replicate pairs whose repeat-level correlation is below 0.85.
is_low_corr = rep_corr_df['Correlation'].lt(0.85)
low_rep_corr_df = rep_corr_df.loc[is_low_corr]

In [80]:
# Show the poorly correlated pairs, worst first (ascending is the default order).
low_rep_corr_df.sort_values("Correlation")

Unnamed: 0,Sample1,Sample2,Correlation,P-value
36,510_total-RNA_1,510_total-RNA_2,0.459953,0.003218913
65,WT_total-RNA_1,WT_total-RNA_2,0.583771,9.575741e-05
41,523_S2-RIP_2,524_S2-RIP_1,0.719455,2.459475e-07
4,1022_S2-RIP_3,1022_S2-RIP_5,0.723603,1.941904e-07
2,1022_S2-RIP_2,1022_S2-RIP_5,0.745979,5.029418e-08
10,301_S2-RIP_1,301_S2-RIP_3,0.783698,3.648201e-09
39,523_S2-RIP_1,524_S2-RIP_1,0.784714,3.375341e-09
5,1022_S2-RIP_4,1022_S2-RIP_5,0.797564,1.216086e-09
44,530_S2-RIP_1,530_S2-RIP_2,0.804117,7.022865e-10
40,523_S2-RIP_2,523_S2-RIP_3,0.828387,7.570631e-11


- Run **correlation checks**  for all **pair-wise MERGED sample combinations**:

In [81]:
#out_labelled_corr = os.path.join(out_dir, 'labelled_CORR')
#if not os.path.isdir(out_labelled_corr):    
#    !mkdir -p $out_labelled_corr

In [82]:
# Labeled scatter plots for samples
#rpc.labeled_corr_plots(tpm_rep_df_l2, out_dir = out_labelled_corr, prefix = 'repeats.')