In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import itertools

------------------------------

## Config

In [3]:
import sys

In [4]:
# Put the project checkout on `sys.path` (once) so that `config_analysis`
# can be imported from this notebook.
project_dir = '/home/pmonteagudo/workspace/silencing_project'
if project_dir not in sys.path: 
    sys.path.append(project_dir)
# NOTE(review): names used further down without a visible import or definition
# (`os`, `pd`, `np`, `chip_dir`, `project_data_dir`, `ignore_datasets`,
# `inv_mut_dict`) presumably come from this star import -- confirm against
# `config_analysis`.
from config_analysis import *

In [5]:
import RepTools as rpc    
from Util import to_log2_tpm

<font color='red'> **Attention!** For `H3K9me2` samples it is possible to use directories other than the default (e.g. `ChIP/h3k9me2`). </font>

In [6]:
h3k9me2_analysis=False
#h3k9me2_analysis=True

In [7]:
# Sub-directory holding the H3K9me2 results.
# Exactly one alternative is active; the others are kept for reference.
#h3k9me2_dir = 'h3k9me2'  # use longest 3 mRNA genes
#h3k9me2_dir = 'strict_mean_h3k9me2'
h3k9me2_dir = 'strict_median_h3k9me2'
#h3k9me2_dir = 'strict_q98_h3k9me2'
#h3k9me2_dir = 'strict_q95_h3k9me2'
#h3k9me2_dir = 'wo_ip_subtraction_h3k9me2'

- Result **directories**

In [8]:
# Input directory: `chip_dir` by default, or its H3K9me2 sub-directory
# when `h3k9me2_analysis` is switched on.
in_dir = os.path.join(chip_dir, h3k9me2_dir) if h3k9me2_analysis else chip_dir
in_dir

'/gcm-lfs1/pablo/data/rna_silencing/results/xp_data/ChIP'

In [9]:
# Output directory: `chip_dir` by default, or its H3K9me2 sub-directory
# when `h3k9me2_analysis` is switched on.
out_dir = os.path.join(chip_dir, h3k9me2_dir) if h3k9me2_analysis else chip_dir
out_dir

'/gcm-lfs1/pablo/data/rna_silencing/results/xp_data/ChIP'

------

# Process Replicates: **ChIP-seq**

------------

## Workflow

- Calculate **log2(tpm + 1)** of all TPM counts of all samples.
    - <font color='red'> (INPUT Subtraction) Take care of **Negative counts** --> **Transform to zeros** </font>
- Get grouping of **replicate samples**.
- Calculate **MA-plot**, **correlation-plot**, and **pearson-correlation score** for all replicate pairs.
- Group replicates with correlation scores higher than 85%.
- Merge replicates groups by taking the mean of TPM values.
- Filter for **Heterochromatic** and **Protein coding** (mRNA) genes, **remove entries** from merged results:
    - `ncRNA_gene`
    - `pseudogene`
    - `rRNA_gene`
    - `snRNA_gene`
    - `snoRNA_gene`
    - `tRNA_gene` 

--------------------------

In [10]:
# Directory for the MA-plots. `os.makedirs(..., exist_ok=True)` creates it
# (including parents) when missing and is a no-op when it already exists;
# unlike `!mkdir -p $out_ma` it does not pass the path through a shell.
out_ma = os.path.join(out_dir, 'MA')
os.makedirs(out_ma, exist_ok=True)

In [11]:
# Directory for the correlation plots/tables. `os.makedirs(..., exist_ok=True)`
# creates it (including parents) when missing and is a no-op otherwise;
# unlike `!mkdir -p $out_corr` it does not pass the path through a shell.
out_corr = os.path.join(out_dir, 'CORR')
os.makedirs(out_corr, exist_ok=True)

In [12]:
# Directory for the per-mutant negative-gene figures.
# `os.makedirs(..., exist_ok=True)` creates it (including parents) when missing
# and is a no-op otherwise; unlike `!mkdir -p $out_negative_genes` it does not
# pass the path through a shell.
out_negative_genes = os.path.join(out_dir, 'negative_genes')
os.makedirs(out_negative_genes, exist_ok=True)

-------------

- Import dataframe containing **annotation of all samples**

In [13]:
#sample_annotation_file = os.path.join(project_data_dir, 'seq_data', 'sample_annotation.csv')
sample_annotation_file = os.path.join(project_data_dir, 'seq_data', 'file_annotation.csv')
sample_annotation_file

'/gcm-lfs1/pablo/data/rna_silencing/seq_data/file_annotation.csv'

In [14]:
#select_cols = ['sample_id', 'pipeline_type', 'seq_category', 'seq_type',  'mutant_id', 'mutant', 'replicate', 'batch', 'trimmed', 'halic_local_dir']
select_cols = ['sample_id', 'pipeline_type', 'seq_category', 'seq_type',  'mutant_id', 'mutant_name', 'replicate', 'trimmed']

In [15]:
# Read only the selected annotation columns (tab-separated file), put them in
# the order given by `select_cols`, and expose `mutant_name` as `mutant`.
all_samples_df = (
    pd.read_csv(sample_annotation_file, sep="\t", usecols=select_cols)
    .loc[:, select_cols]
    .rename(columns={'mutant_name': 'mutant'})
)

In [16]:
all_samples_df.shape

(157, 8)

- **Ignore specific samples** 

In [17]:
# ignore_datasets.append('WT_S2-ChIP_2') # try to ignore for Gene cloud plots, it seems to be a duplicate of WT_S2-ChIP_1 correlation is too close to 1
# ignore_datasets.append('WT_S2-ChIP_1') # try to ignore for Gene cloud plots, it seems like it's not a great sample! Block-like read alignments!

In [18]:
# Remove the samples listed in `ignore_datasets`.
is_ignored = all_samples_df['sample_id'].isin(ignore_datasets)
all_samples_df = all_samples_df[~is_ignored]
all_samples_df.shape

(156, 8)

- **Ignore S2-ChIP/S5-ChIP samples** for now

In [19]:
#all_samples_df = all_samples_df[all_samples_df['seq_type'] != 'S2-ChIP']
#all_samples_df = all_samples_df[all_samples_df['seq_type'] != 'S5-ChIP']
#all_samples_df.shape

- Select **ChIP Datasets**: | S2-ChIP | S5-ChIP | H3K9me2 |

In [20]:
# Keep only the samples processed by the ChIP pipeline.
all_samples_df = all_samples_df[all_samples_df['pipeline_type'] == 'ChIP']

# `seq_type` values that mark INPUT samples (as opposed to ChIP datasets)
#input_types = ['S2-ChIP-OIN', 'S2-ChIP-INPUT', 'simulated-data', 'H3K9me2']
input_types = ['S2-ChIP-OIN', 'S2-ChIP-INPUT', 'simulated-data']

# Filter out INPUT samples. An explicit copy is taken so that adding columns to
# `datasets_df` later does not operate on a slice of `all_samples_df`
# (which triggers pandas' SettingWithCopyWarning).
datasets_df = all_samples_df[~all_samples_df['seq_type'].isin(input_types)].copy()
datasets_df.head()

Unnamed: 0,sample_id,pipeline_type,seq_category,seq_type,mutant_id,mutant,replicate,trimmed
2,1022_S2-ChIP_1,ChIP,S2-ChIP,S2-ChIP,1022,mot2d,1,False
3,1022_S2-ChIP_2,ChIP,S2-ChIP,S2-ChIP,1022,mot2d,2,False
11,1168_S2-ChIP_1,ChIP,S2-ChIP,S2-ChIP,1168,caf1d*ccr4d*,1,False
12,1168_S2-ChIP_2,ChIP,S2-ChIP,S2-ChIP,1168,caf1d*ccr4d*,2,False
17,301_H3K9me2_1,ChIP,H3K9me2,H3K9me2,301,swi6d,1,False


In [21]:
datasets_df.shape

(42, 8)

- Get `samples` **columns**

In [22]:
sample_cols = datasets_df['sample_id'].tolist()
len(sample_cols)
#sample_cols

42

-------------

# **1.** Load raw and TPM-normalized gene expression tables

-------------

- Import **ChIP** (raw) **gene counts Matrix**: `chip_pombe_gene_count_matrix.csv`

In [23]:
gx_df_file = os.path.join(in_dir, 'chip_pombe_gene_count_matrix.csv')
gx_df_file

'/gcm-lfs1/pablo/data/rna_silencing/results/xp_data/ChIP/chip_pombe_gene_count_matrix.csv'

In [24]:
#gx_df = pd.read_csv(gx_df_file, sep='\t', comment='#')
#gx_df.head()

In [25]:
#gx_df.shape

- Import **ChIP** (TPM-normed) **gene expression Matrix**: `chip_pombe_tpm_matrix.csv`

In [26]:
#tpm_df_file = '/gcm-lfs1/pablo/data/rna_silencing/results_with_nh-norm/xp_data/ChIP/chip_pombe_tpm_matrix.csv'
tpm_df_file = os.path.join(in_dir, 'chip_pombe_tpm_matrix.csv') # with all genes
tpm_df_file

'/gcm-lfs1/pablo/data/rna_silencing/results/xp_data/ChIP/chip_pombe_tpm_matrix.csv'

In [27]:
# Load the TPM matrix (tab-separated; lines starting with '#' are skipped).
# Author's note: NAs here represent zero counts, but there AREN'T any
# => every 'gene' contains at least some noise (which has been subtracted, Negative counts)
#tpm_df = pd.read_csv(gx_df_file, sep='\t', comment='#')
tpm_df = pd.read_csv(tpm_df_file, sep='\t', comment='#')

# Drop the columns of the samples listed in `ignore_datasets`.
kept_columns = [cc for cc in tpm_df.columns if cc not in ignore_datasets]
tpm_df = tpm_df[kept_columns]
tpm_df.head()   # with NH-norm: 77.15 (638_S2-ChIP_2)
                # w/o NH-norm: 31.11 (638_S2-ChIP_2) / 296.21 (301_H3K9me2_1)

Unnamed: 0,gene_id,seqid,type,start,end,strand,gene_name,cds_length,utr_length,intron_length,...,638_S2-ChIP_2,80_S2-ChIP_1,80_S2-ChIP_2,80_S2-ChIP_3,80_S5-ChIP_1,80_S5-ChIP_2,WT_S2-ChIP_1,WT_S2-ChIP_3,WT_S5-ChIP_1,WT_S5-ChIP_2
0,FP565355_region_1..2120,mating_type_region,region,1,2120,+,,,,,...,25.655295,41.523328,7.936841,11.483831,13.021492,21.544192,66.004963,40.94797,23.136412,5.636927
1,FP565355_region_15417..15473,mating_type_region,region,15417,15473,+,,,,,...,,,,,,,371.637325,42.548842,,
2,FP565355_region_15474..15608,mating_type_region,region,15474,15608,+,,,,,...,90.02418,60.192701,0.0,39.503547,21.339404,49.4594,45.515078,60.91411,67.196362,43.735495
3,FP565355_region_15609..16735,mating_type_region,region,15609,16735,+,,,,,...,37.160607,61.052081,39.888992,32.052439,42.637502,61.665463,72.944537,52.32955,45.728274,0.0
4,FP565355_region_16736..16794,mating_type_region,region,16736,16794,+,,,,,...,156.233253,119.45936,34.907401,117.350779,101.019563,174.010572,,102.117222,382.549419,78.852255


In [28]:
tpm_df.shape

(7021, 46)

In [29]:
# tpm_df = tpm_df[~tpm_df['gene_id'].str.contains('centromer')]
# #tpm_df[tpm_df['gene_id'].str.contains('centromer')]

In [30]:
# # check genes from specific regions in TE of wt scatterplot
# regions_df = tpm_df[tpm_df['gene_id'].isin(['SPCC1494.11c','SPCC1183.10'])]
# regions_df.loc[:, regions_df.columns.str.contains('gene|WT_S2')]

In [31]:
# check if bam was filtered correctly
#tpm_df[tpm_df['gene_id'].str.contains('SPRRNA', na=False)]
# residual reads because some rRNA features overlap with other genomic features
#tpm_df[tpm_df['gene_id'].str.contains('SPRRNA.07|SPRRNA.15|SPRRNA.29|SPRRNA.35', na=False)]

In [32]:
#tpm_df[tpm_df['gene_id'].str.contains('dg|dh')]

In [33]:
#tpm_df[tpm_df['gene_id'].isin(non_degraded)][[xx for xx in tpm_df.columns if ('80' in xx) | (xx == 'gene_id')]]

- Update `samples` **columns** for samples present in `tpm_df`

In [34]:
# Restrict `sample_cols` to the sample ids that are also columns of `tpm_df`
# (in the recorded run this reduces the list from 42 to 33 entries).
sample_cols = tpm_df.columns.intersection(sample_cols).tolist()
len(sample_cols)
#sample_cols

33

- Check **TPM normalization**: for each sample, the expression values should add up to $10^6$

In [35]:
# Every sample column must sum to one million (within numpy's default tolerance).
#tpm_df[sample_cols].describe()
assert np.allclose(tpm_df[sample_cols].sum(), 10**6)

-------------

### Remove <font color='red'> Negative Counts due to INPUT subtraction </font>

* Check how many genes are negative per sample - determine **"bad" samples**

In [36]:
# Samples in which more than 5k genes are negative are considered bad.
negative_genes_per_sample = (tpm_df[sample_cols] < 0).sum()
exclude_samples = [
    sample_id
    for sample_id, n_negative in negative_genes_per_sample.items()
    if n_negative > 5 * 10 ** 3
]
exclude_samples

[]

In [37]:
# exclude_samples = []
# #exclude_samples = ['638_H3K9me2_1']
# exclude_samples

In [38]:
if len(exclude_samples) > 0:
    # The analysis can proceed, but be aware of the issue: some samples will be excluded.
    # Deliberately a message rather than an exception, so the notebook keeps running.
    #raise ValueError
    print("WARNING: {} sample(s) exceed the negative-gene threshold and will be excluded: {}".format(
        len(exclude_samples), exclude_samples))

In [39]:
# Remove the excluded samples from the list of sample columns.
excluded = set(exclude_samples)
sample_cols = [ss for ss in sample_cols if ss not in excluded]
len(sample_cols)

33

- Inspect **Negative Genes**, genes that are negative at least once across all samples

In [40]:
# Drop the columns of the excluded ("bad") samples. `errors='ignore'` keeps the
# cell re-runnable: on a second run the columns are already gone and a plain
# `drop` would raise a KeyError.
tpm_df = tpm_df.drop(columns=exclude_samples, errors='ignore')
# Genes with a negative value in at least one of the remaining samples.
negative_genes_df = tpm_df[(tpm_df[sample_cols] < 0).any(axis=1)]

<font color='red'> **Attention!** If no negative genes appear here, be suspicious that there was no INPUT subtraction! </font>

In [41]:
# Number of negative genes for each (seqid, category, bio_type) combination.
group_keys = ['seqid', 'category', 'bio_type']
negative_genes_df[['gene_id'] + group_keys].groupby(group_keys).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,gene_id
seqid,category,bio_type,Unnamed: 3_level_1


- Check **Negative genes** for **repeat features**

In [42]:
negative_htc = negative_genes_df[negative_genes_df['category'] == 'repeat']
negative_htc

Unnamed: 0,gene_id,seqid,type,start,end,strand,gene_name,cds_length,utr_length,intron_length,...,638_S2-ChIP_2,80_S2-ChIP_1,80_S2-ChIP_2,80_S2-ChIP_3,80_S5-ChIP_1,80_S5-ChIP_2,WT_S2-ChIP_1,WT_S2-ChIP_3,WT_S5-ChIP_1,WT_S5-ChIP_2


- Check **Negative genes** per **sample**

In [43]:
# Keep only the negative entries (all other cells become NaN), reshape to long
# format (`variable` = sample id, `value` = negative TPM) and drop the NaNs.
negative_mask = negative_genes_df[sample_cols] < 0
negative_genes_sample_df = (
    negative_genes_df[negative_mask][sample_cols]
    .melt()
    .dropna()
)
# Split the sample id on '_': first token -> `mutant`, last token -> `replicate`.
sample_id_parts = negative_genes_sample_df['variable'].str.split('_')
negative_genes_sample_df['mutant'] = sample_id_parts.str[0]
negative_genes_sample_df['replicate'] = sample_id_parts.str[-1]

In [44]:
#negative_genes_sample_df

In [45]:
#negative_genes_sample_df[['gene_id', 'type', 'seqid', 'bio_type' ]].groupby(['seqid', 'type', 'bio_type']).count()
#negative_genes_sample_df.groupby('variable').mean()

In [46]:
import seaborn as sns

In [47]:
import matplotlib.pyplot as plt

In [48]:
#g = sns.FacetGrid(negative_genes_sample_df, row="mutant", col="replicate")
##g = g.map(plt.hist, "value", bins=100)
#g = g.map(sns.kdeplot, "value")

In [49]:
# One PDF per mutant: density of the negative values, one panel per replicate.
for mutant_id in negative_genes_sample_df['mutant'].unique():

    gg = negative_genes_sample_df[negative_genes_sample_df['mutant'] == mutant_id]
    g = sns.FacetGrid(gg, row="mutant", col="replicate")
    #g = g.map(plt.hist, "value", bins=100)
    g = g.map(sns.kdeplot, "value")

    negative_genes_fig = os.path.join(out_negative_genes, mutant_id + '.pdf')
    g.savefig(negative_genes_fig)
    # Release the figure once it is on disk; otherwise every iteration leaves
    # an open matplotlib figure behind and they pile up in memory.
    plt.close(g.fig)

- Convert **Negative Genes** to 0 counts

In [50]:
def make_positive(col):
    """Return a copy of `col` in which every negative entry is replaced by 0.

    Parameters
    ----------
    col : pandas.Series or numpy.ndarray
        Numeric values. NaN entries are left untouched (`NaN < 0` is False).

    Returns
    -------
    Same type as `col`
        A new object; the input is not modified. Mutating the object handed in
        by `DataFrame.apply` is unsafe, so the replacement is done on a copy.
    """
    positive = col.copy()
    # Alternatives tried by the author: a small value (0.01) or NaN.
    # Zero shouldn't be an issue due to default shift=1 in `log2_tpm_df`.
    positive[positive < 0] = 0
    return positive

In [51]:
# Replace the negative values of every sample column by zero.
tpm_df[sample_cols] = tpm_df[sample_cols].apply(make_positive)
tpm_df.head()

Unnamed: 0,gene_id,seqid,type,start,end,strand,gene_name,cds_length,utr_length,intron_length,...,638_S2-ChIP_2,80_S2-ChIP_1,80_S2-ChIP_2,80_S2-ChIP_3,80_S5-ChIP_1,80_S5-ChIP_2,WT_S2-ChIP_1,WT_S2-ChIP_3,WT_S5-ChIP_1,WT_S5-ChIP_2
0,FP565355_region_1..2120,mating_type_region,region,1,2120,+,,,,,...,25.655295,41.523328,7.936841,11.483831,13.021492,21.544192,66.004963,40.94797,23.136412,5.636927
1,FP565355_region_15417..15473,mating_type_region,region,15417,15473,+,,,,,...,,,,,,,371.637325,42.548842,,
2,FP565355_region_15474..15608,mating_type_region,region,15474,15608,+,,,,,...,90.02418,60.192701,0.0,39.503547,21.339404,49.4594,45.515078,60.91411,67.196362,43.735495
3,FP565355_region_15609..16735,mating_type_region,region,15609,16735,+,,,,,...,37.160607,61.052081,39.888992,32.052439,42.637502,61.665463,72.944537,52.32955,45.728274,0.0
4,FP565355_region_16736..16794,mating_type_region,region,16736,16794,+,,,,,...,156.233253,119.45936,34.907401,117.350779,101.019563,174.010572,,102.117222,382.549419,78.852255


In [52]:
tpm_df.shape

(7021, 46)

-------------

#### <font color='red'> Deal with **NAs**: *Drop* or *fill with zeros?* </font>
- <font color='red'> `NaN's` originated from zero counts: **fill with zeros** </font>
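- <font color='red'> In **ChIP**, INPUT subtraction is expected to leave no `NaN's` (negative counts were transformed to zeros); the `tpm_df.head()` output above nevertheless shows missing values in some sample columns, and these are filled with zeros below </font>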


In [53]:
# NAs in the sample columns originate from zero counts: fill them with zeros.
# Only the sample columns are filled; a blanket `tpm_df.fillna(0)` has
# undesired off-target effects on other columns (e.g. `gene_name`).
zero_fill = dict.fromkeys(sample_cols, 0)
tpm_df = tpm_df.fillna(zero_fill)

# (NOT USED) alternative: drop every row with an NA in a sample column;
# be careful when adding new columns!
#tpm_df = tpm_df.dropna(subset = sample_cols)

tpm_df.head()

Unnamed: 0,gene_id,seqid,type,start,end,strand,gene_name,cds_length,utr_length,intron_length,...,638_S2-ChIP_2,80_S2-ChIP_1,80_S2-ChIP_2,80_S2-ChIP_3,80_S5-ChIP_1,80_S5-ChIP_2,WT_S2-ChIP_1,WT_S2-ChIP_3,WT_S5-ChIP_1,WT_S5-ChIP_2
0,FP565355_region_1..2120,mating_type_region,region,1,2120,+,,,,,...,25.655295,41.523328,7.936841,11.483831,13.021492,21.544192,66.004963,40.94797,23.136412,5.636927
1,FP565355_region_15417..15473,mating_type_region,region,15417,15473,+,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,371.637325,42.548842,0.0,0.0
2,FP565355_region_15474..15608,mating_type_region,region,15474,15608,+,,,,,...,90.02418,60.192701,0.0,39.503547,21.339404,49.4594,45.515078,60.91411,67.196362,43.735495
3,FP565355_region_15609..16735,mating_type_region,region,15609,16735,+,,,,,...,37.160607,61.052081,39.888992,32.052439,42.637502,61.665463,72.944537,52.32955,45.728274,0.0
4,FP565355_region_16736..16794,mating_type_region,region,16736,16794,+,,,,,...,156.233253,119.45936,34.907401,117.350779,101.019563,174.010572,0.0,102.117222,382.549419,78.852255


In [54]:
tpm_df.shape

(7021, 46)

In [55]:
# Sanity check: every value in the sample (count) columns must be non-missing.
assert tpm_df[sample_cols].notna().values.all()

- **Log-transform** (tpm-normed)  **gene expression Matrix** - will be used to compute correlations between replicates

In [56]:
# log transformed counts will be used to compute correlations
# => compute log(1+x) to avoid issues with zero counts
#log2_tpm_df = to_log2_tpm(tpm_df)
log2_tpm_df = to_log2_tpm(tpm_df, gene_id_col='gene_id') # default shift=1, no issues with division by zero
#log2_tpm_df = to_log2_tpm(tpm_df, gene_id_col='gene_id', shift=0) # default shift=1

log2_tpm_df.head()

Unnamed: 0,gene_id,seqid,type,start,end,strand,gene_name,cds_length,utr_length,intron_length,...,638_S2-ChIP_2,80_S2-ChIP_1,80_S2-ChIP_2,80_S2-ChIP_3,80_S5-ChIP_1,80_S5-ChIP_2,WT_S2-ChIP_1,WT_S2-ChIP_3,WT_S5-ChIP_1,WT_S5-ChIP_2
0,FP565355_region_1..2120,mating_type_region,region,1,2120,+,,,,,...,4.73635,5.410183,3.159765,3.641989,3.809568,4.494684,6.066196,5.390529,4.593139,2.730515
1,FP565355_region_15417..15473,mating_type_region,region,15417,15473,+,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,8.541628,5.444562,0.0,0.0
2,FP565355_region_15474..15608,mating_type_region,region,15474,15608,+,,,,,...,6.508178,5.935288,0.0,5.339976,4.481519,5.657051,5.539627,5.952196,6.091623,5.483348
3,FP565355_region_15609..16735,mating_type_region,region,15609,16735,+,,,,,...,5.254012,5.955408,5.353641,5.046685,5.447497,5.969599,6.208372,5.736863,5.546224,0.0
4,FP565355_region_16736..16794,mating_type_region,region,16736,16794,+,,,,,...,7.296763,6.912403,5.166209,6.886925,6.672702,7.451298,0.0,6.688141,8.583269,6.319261


In [57]:
log2_tpm_df.shape

(7021, 46)

In [58]:
# Sanity check: the log transform must not have introduced missing values
# in the sample columns.
assert log2_tpm_df[sample_cols].notna().values.all()

----

# **2.** Get groupings of **replicate samples**

----

In [59]:
#sample_cols = [ss for ss in log2_tpm_df.columns if 'ChIP' in ss]
#len(sample_cols)

- **Replicate samples** grouped by `mutant` and `seq_type` type:

    - <font color='red'> **Attention!** It's a bit tricky due to the ambiguity in the `mutant_id -> mutant` map </font>
        - **510/591** -> `caf1d`
        - **1022/1023** -> `mot2d`
        - **523/524** -> `unknown`

     <font color='red'> First group using `mutant` then use `inv_mut_dict` which removes ambiguity to get `mutant_id` again. </font>

In [60]:
# Replicate-group label: mutant id recovered through `inv_mut_dict`, joined to
# the sequencing type with '_'. `assign` builds a new frame instead of setting
# a column on a filtered slice (which triggers pandas' SettingWithCopyWarning).
#datasets_df['sub_sample'] = datasets_df['mutant'] + '_' + datasets_df['seq_type']
datasets_df = datasets_df.assign(
    sub_sample=datasets_df['mutant'].map(inv_mut_dict) + '_' + datasets_df['seq_type']
)

In [61]:
# only for samples present in df
datasets_df = datasets_df[datasets_df['sample_id'].isin(sample_cols)]

In [62]:
# Map each replicate group (`sub_sample`) to the list of its sample ids,
# leaving out the excluded samples.
sub_samples = (
    datasets_df
    .loc[~datasets_df['sample_id'].isin(exclude_samples)]
    .groupby('sub_sample')['sample_id']
    .apply(list)
    .to_dict()
)
sub_samples

{'1022_S2-ChIP': ['1022_S2-ChIP_1', '1022_S2-ChIP_2'],
 '1168_S2-ChIP': ['1168_S2-ChIP_1', '1168_S2-ChIP_2'],
 '301_S2-ChIP': ['301_S2-ChIP_1', '301_S2-ChIP_2'],
 '302_S2-ChIP': ['302_S2-ChIP_1', '302_S2-ChIP_2'],
 '324_S2-ChIP': ['324_S2-ChIP_1', '324_S2-ChIP_2'],
 '491_S2-ChIP': ['491_S2-ChIP_1', '491_S2-ChIP_2'],
 '504_S2-ChIP': ['504_S2-ChIP_1', '504_S2-ChIP_2'],
 '510_S2-ChIP': ['510_S2-ChIP_2', '591_S2-ChIP_1'],
 '523_S2-ChIP': ['523_S2-ChIP_1', '524_S2-ChIP_1'],
 '530_S2-ChIP': ['530_S2-ChIP_1', '530_S2-ChIP_2'],
 '544_S2-ChIP': ['544_S2-ChIP_1', '544_S2-ChIP_2'],
 '638_S2-ChIP': ['638_S2-ChIP_1', '638_S2-ChIP_2'],
 '80_S2-ChIP': ['80_S2-ChIP_1', '80_S2-ChIP_2', '80_S2-ChIP_3'],
 '80_S5-ChIP': ['80_S5-ChIP_1', '80_S5-ChIP_2'],
 'WT_S2-ChIP': ['WT_S2-ChIP_1', 'WT_S2-ChIP_3'],
 'WT_S5-ChIP': ['WT_S5-ChIP_1', 'WT_S5-ChIP_2']}

<font color='green'> **Looks good: there are at least two replicates per group.** </font>

**Total number of samples**:

In [63]:
# Total number of samples across all replicate groups.
sum(len(replicates) for replicates in sub_samples.values())

33

**Number of mutants** (subsamples):

In [64]:
len(sub_samples)

16

----

# **3.** Produce **correlation-plots** and **pearson-r correlation scores**

----

- Run **correlation checks**  for all **pair-wise sample combinations**:

In [65]:
#corr_df = rpc.run_corr_checks(log2_tpm_df, out_dir=out_corr)
corr_df = rpc.run_corr_checks(log2_tpm_df, samples=sub_samples, out_dir=out_corr)
#corr_df.head()
corr_df

Unnamed: 0,Sample1,Sample2,Correlation,P-value
0,1022_S2-ChIP_1,1022_S2-ChIP_2,0.973016,0.0
1,1168_S2-ChIP_1,1168_S2-ChIP_2,0.872532,0.0
2,301_S2-ChIP_1,301_S2-ChIP_2,0.953609,0.0
3,302_S2-ChIP_1,302_S2-ChIP_2,0.95485,0.0
4,324_S2-ChIP_1,324_S2-ChIP_2,0.949851,0.0
5,491_S2-ChIP_1,491_S2-ChIP_2,0.931224,0.0
6,504_S2-ChIP_1,504_S2-ChIP_2,0.958766,0.0
7,510_S2-ChIP_2,591_S2-ChIP_1,0.963736,0.0
8,523_S2-ChIP_1,524_S2-ChIP_1,0.879819,0.0
9,530_S2-ChIP_1,530_S2-ChIP_2,0.911706,0.0


In [66]:
corr_df.shape

(18, 4)

* Check for low correlation values (more checks below in Section 5)

In [67]:
corr_df[corr_df['Correlation'] < 0.80]

Unnamed: 0,Sample1,Sample2,Correlation,P-value


In [68]:
#corr_df[corr_df['Sample2'].str.contains('S5-ChIP')]

-----------------

# **4.** Produce **MA-plots**

-----------------

An **MA-plot** is an application of a Bland–Altman plot for visual **representation of genomic data**. 

The plot visualizes the differences between measurements taken in two samples, by transforming the data onto:
* **M (log ratio)** scale
* **A (mean average)** scale

then plotting these values. 

- Run **MA checks**  for all **pair-wise sample combinations**:

In [69]:
#rpc.run_ma_checks(log2_tpm_df, out_dir=out_ma)
rpc.run_ma_checks(log2_tpm_df, samples=sub_samples, out_dir=out_ma)

-----------------

# **5.** Investigate **pearson-r correlation scores**

-----------------

Select replicates to be merged based on their **pearson-r correlation scores**

In [70]:
from RepTools import report_corr

- Import **correlation checks** for all **pair-wise sample combinations**: `correlations.csv`

In [71]:
corr_df = pd.read_csv(os.path.join(out_corr, 'correlations.csv'), sep='\t')
corr_df.head()

Unnamed: 0,Sample1,Sample2,Correlation,P-value
0,1022_S2-ChIP_1,1022_S2-ChIP_2,0.973016,0.0
1,1168_S2-ChIP_1,1168_S2-ChIP_2,0.872532,0.0
2,301_S2-ChIP_1,301_S2-ChIP_2,0.953609,0.0
3,302_S2-ChIP_1,302_S2-ChIP_2,0.95485,0.0
4,324_S2-ChIP_1,324_S2-ChIP_2,0.949851,0.0


In [72]:
corr_df.shape

(18, 4)

- Check **correlation scores** for any `Correlation > 0.99` (near-identical samples, possible **Duplicates**):

In [73]:
corr_df[corr_df['Correlation'] > 0.99]

Unnamed: 0,Sample1,Sample2,Correlation,P-value


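- Check **correlation scores** for any `Correlation < 0.80` (the cell below flags pairs under 0.80, a looser cut-off than the 0.85 grouping threshold stated in the Workflow):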

In [74]:
corr_df[corr_df['Correlation'] < 0.80]

Unnamed: 0,Sample1,Sample2,Correlation,P-value


- Check **correlation scores** grouped by `Mutant` type and `seq_type`:

In [75]:
#rpc.report_corr(corr_df, sub_samples['63_S2-ChIP'])

In [76]:
#for s in sub_samples:
#    a = report_corr(corr_df, s)
#    print('-'*80)
#    print('Mutant samples group:', s)
#    print(a)
#    print('-'*80, '\n')

-----------------

# **6.** Merge replicates

-----------------


Merge and store as `.csv` files:

- **Average TPM-normalized expression tables** (tpm_df) for replicates:
    - `chip_merged_tpm.csv`

- <font color='red'> Add **`length` column** </font>

In [77]:
tpm_df['length'] = tpm_df['gene_length']

- **Merge ChIP replicates** into an average **gene expression Matrix** per Mutant type: `chip_merged_tpm.csv`

In [78]:
out_dir

'/gcm-lfs1/pablo/data/rna_silencing/results/xp_data/ChIP'

#### <font color='red'> Deal with **NAs**: *Drop* or *fill with zeros?* </font>

In [79]:
merged_tpm_df = rpc.repli_merge(
    tpm_df,
    sub_samples,
    out_dir = out_dir, 
    out_file = 'chip_merged_tpm.csv'
)

In [80]:
merged_tpm_df.head()

Unnamed: 0,gene_id,gene_name,length,type,category,bio_type,1022_S2-ChIP,1168_S2-ChIP,301_S2-ChIP,302_S2-ChIP,...,504_S2-ChIP,510_S2-ChIP,523_S2-ChIP,530_S2-ChIP,544_S2-ChIP,638_S2-ChIP,80_S2-ChIP,80_S5-ChIP,WT_S2-ChIP,WT_S5-ChIP
0,FP565355_region_1..2120,,2120,region,repeat,region,36.353199,31.604013,25.447595,34.155106,...,82.344549,22.183831,131.192765,75.910257,8.783881,26.188978,20.314666,17.282842,53.476467,14.38667
1,FP565355_region_15417..15473,,57,region,repeat,region,32.989946,0.0,101.717017,55.581202,...,35.620283,25.716713,30.158095,112.748997,9.204962,0.0,0.0,0.0,207.093084,0.0
2,FP565355_region_15474..15608,,135,region,repeat,region,55.543018,35.786011,42.055579,103.33055,...,284.326357,53.69815,259.546526,177.695904,47.749498,100.410479,33.232083,35.399402,53.214594,55.465929
3,FP565355_region_15609..16735,,1127,region,repeat,region,57.460638,10.993232,40.619884,31.836483,...,55.698312,29.545178,112.472071,114.430437,23.11905,31.980414,44.331171,52.151483,62.637044,22.864137
4,FP565355_region_16736..16794,,59,region,repeat,region,105.639842,26.961737,21.862377,54.908906,...,44.786774,49.165672,240.632152,150.771303,115.088074,162.738097,90.572514,137.515068,51.058611,230.700837


In [81]:
merged_tpm_df.shape

(7021, 22)

------

# **7.** Filter for **Heterochromatic** and **Protein coding** (mRNA) genes

------


**Remove entries** from merged results:
- `ncRNA_gene`
- (some) `pseudogene` 
- `rRNA_gene`
- `snRNA_gene`
- `snoRNA_gene`
- `tRNA_gene` 

Filter **merged_tpm_df**: 
- Keep only **Heterochromatic** and **protein coding genes** (mRNA) 

In [82]:
# Keep protein-coding genes (`bio_type == 'mRNA'`) and heterochromatic
# features (`category == 'repeat'`); everything else is dropped.
merged_tpm_df = merged_tpm_df.loc[
    merged_tpm_df['bio_type'].eq('mRNA') | merged_tpm_df['category'].eq('repeat')
]

In [83]:
merged_tpm_df.head()

Unnamed: 0,gene_id,gene_name,length,type,category,bio_type,1022_S2-ChIP,1168_S2-ChIP,301_S2-ChIP,302_S2-ChIP,...,504_S2-ChIP,510_S2-ChIP,523_S2-ChIP,530_S2-ChIP,544_S2-ChIP,638_S2-ChIP,80_S2-ChIP,80_S5-ChIP,WT_S2-ChIP,WT_S5-ChIP
0,FP565355_region_1..2120,,2120,region,repeat,region,36.353199,31.604013,25.447595,34.155106,...,82.344549,22.183831,131.192765,75.910257,8.783881,26.188978,20.314666,17.282842,53.476467,14.38667
1,FP565355_region_15417..15473,,57,region,repeat,region,32.989946,0.0,101.717017,55.581202,...,35.620283,25.716713,30.158095,112.748997,9.204962,0.0,0.0,0.0,207.093084,0.0
2,FP565355_region_15474..15608,,135,region,repeat,region,55.543018,35.786011,42.055579,103.33055,...,284.326357,53.69815,259.546526,177.695904,47.749498,100.410479,33.232083,35.399402,53.214594,55.465929
3,FP565355_region_15609..16735,,1127,region,repeat,region,57.460638,10.993232,40.619884,31.836483,...,55.698312,29.545178,112.472071,114.430437,23.11905,31.980414,44.331171,52.151483,62.637044,22.864137
4,FP565355_region_16736..16794,,59,region,repeat,region,105.639842,26.961737,21.862377,54.908906,...,44.786774,49.165672,240.632152,150.771303,115.088074,162.738097,90.572514,137.515068,51.058611,230.700837


In [84]:
merged_tpm_df.shape

(5159, 22)

#### <font color='red'> Deal with **NAs**: *Drop* or *fill with zeros?* </font>

In [85]:
# Deal with NAs: Drop or fill with zeros?
# => it should already have been taken care of! 

# (NOT USED) in most plots we get rid of nan's, here we set them to zero
#merged_tpm_df = merged_tpm_df.fillna(0) # undesired off-target effects to other columns (e.g. `gene_name`)
#merged_tpm_df = merged_tpm_df.fillna({kk:0 for kk in sample_cols})

# (NOT USED) drop row (axis - 0) if it finds ANY `na`, becareful when adding new columns!
#merged_tpm_df = merged_tpm_df.dropna(subset = sample_cols) 
#merged_tpm_df = merged_tpm_df.dropna(0)

- Store a copy of the filtered **merged_tpm_df**: `chip_merged_filtered_tpm.csv`

In [86]:
# Write the filtered merged table as a tab-separated file without the index column.
merged_filtered_file = os.path.join(out_dir, 'chip_merged_filtered_tpm.csv')
merged_tpm_df.to_csv(merged_filtered_file, sep='\t', index=False)

------

# **8.** Check **repeat genes** correlations by replicates 

------

Here we go back to the Data Frame used **before merging** replicates: `tpm_df` and filter for **Heterochromatic genes** (repeats)

- Keep only **Heterochromatic genes** (repeats)

In [87]:
tpm_rep_df = tpm_df[tpm_df['category'] == 'repeat']
tpm_rep_df.head()

Unnamed: 0,gene_id,seqid,type,start,end,strand,gene_name,cds_length,utr_length,intron_length,...,80_S2-ChIP_1,80_S2-ChIP_2,80_S2-ChIP_3,80_S5-ChIP_1,80_S5-ChIP_2,WT_S2-ChIP_1,WT_S2-ChIP_3,WT_S5-ChIP_1,WT_S5-ChIP_2,length
0,FP565355_region_1..2120,mating_type_region,region,1,2120,+,,,,,...,41.523328,7.936841,11.483831,13.021492,21.544192,66.004963,40.94797,23.136412,5.636927,2120
1,FP565355_region_15417..15473,mating_type_region,region,15417,15473,+,,,,,...,0.0,0.0,0.0,0.0,0.0,371.637325,42.548842,0.0,0.0,57
2,FP565355_region_15474..15608,mating_type_region,region,15474,15608,+,,,,,...,60.192701,0.0,39.503547,21.339404,49.4594,45.515078,60.91411,67.196362,43.735495,135
3,FP565355_region_15609..16735,mating_type_region,region,15609,16735,+,,,,,...,61.052081,39.888992,32.052439,42.637502,61.665463,72.944537,52.32955,45.728274,0.0,1127
4,FP565355_region_16736..16794,mating_type_region,region,16736,16794,+,,,,,...,119.45936,34.907401,117.350779,101.019563,174.010572,0.0,102.117222,382.549419,78.852255,59


In [88]:
tpm_rep_df.shape

(39, 47)

- Select columns of interest

In [89]:
#select_cols = ['gene-id', 'gene-name', 'type', 'category', 'bio_type']
select_cols = ['gene_id', 'gene_name', 'type', 'category', 'bio_type']

In [90]:
# Append the sample columns to the annotation columns. The list is rebuilt
# (rather than `extend`-ed in place) and names already present are skipped, so
# re-running this cell does not duplicate the sample columns.
select_cols = select_cols + [cc for cc in sample_cols if cc not in select_cols]

In [91]:
tpm_rep_df = tpm_rep_df[select_cols]
tpm_rep_df.shape

(39, 38)

- **Log-transform** (tpm-normed)  **gene expression Matrix**:

In [92]:
#log2_tpm_df = to_log2_tpm(tpm_df)
tpm_rep_df_l2 = to_log2_tpm(tpm_rep_df, gene_id_col='gene_id') # default shift=1
tpm_rep_df_l2.head()

Unnamed: 0,gene_id,gene_name,type,category,bio_type,1022_S2-ChIP_1,1022_S2-ChIP_2,1168_S2-ChIP_1,1168_S2-ChIP_2,301_S2-ChIP_1,...,638_S2-ChIP_2,80_S2-ChIP_1,80_S2-ChIP_2,80_S2-ChIP_3,80_S5-ChIP_1,80_S5-ChIP_2,WT_S2-ChIP_1,WT_S2-ChIP_3,WT_S5-ChIP_1,WT_S5-ChIP_2
0,FP565355_region_1..2120,,region,repeat,region,5.138924,5.302748,5.272847,4.73042,5.28434,...,4.73635,5.410183,3.159765,3.641989,3.809568,4.494684,6.066196,5.390529,4.593139,2.730515
1,FP565355_region_15417..15473,,region,repeat,region,0.0,6.065656,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,8.541628,5.444562,0.0,0.0
2,FP565355_region_15474..15608,,region,repeat,region,5.676155,5.953126,4.613577,5.617394,6.403127,...,6.508178,5.935288,0.0,5.339976,4.481519,5.657051,5.539627,5.952196,6.091623,5.483348
3,FP565355_region_15609..16735,,region,repeat,region,5.579121,6.110932,0.0,4.522713,5.825185,...,5.254012,5.955408,5.353641,5.046685,5.447497,5.969599,6.208372,5.736863,5.546224,0.0
4,FP565355_region_16736..16794,,region,repeat,region,5.887283,7.267626,5.779351,0.0,0.0,...,7.296763,6.912403,5.166209,6.886925,6.672702,7.451298,0.0,6.688141,8.583269,6.319261


- Run **correlation checks**  for all **pair-wise sample combinations**:

In [93]:
# Directory for the repeat-only correlation results.
# `os.makedirs(..., exist_ok=True)` creates it (including parents) when missing
# and is a no-op otherwise; unlike `!mkdir -p $out_htc_corr` it does not pass
# the path through a shell.
out_htc_corr = os.path.join(out_dir, 'htc_CORR')
os.makedirs(out_htc_corr, exist_ok=True)

In [94]:
rpc.run_corr_checks(tpm_rep_df_l2, samples = sub_samples, out_dir = out_htc_corr, prefix = 'repeats.')

Unnamed: 0,Sample1,Sample2,Correlation,P-value
0,1022_S2-ChIP_1,1022_S2-ChIP_2,0.825588,9.958645e-11
1,1168_S2-ChIP_1,1168_S2-ChIP_2,0.651241,7.13407e-06
2,301_S2-ChIP_1,301_S2-ChIP_2,0.625471,2.067094e-05
3,302_S2-ChIP_1,302_S2-ChIP_2,0.817912,2.061308e-10
4,324_S2-ChIP_1,324_S2-ChIP_2,0.956403,2.265821e-21
5,491_S2-ChIP_1,491_S2-ChIP_2,0.890557,3.193402e-14
6,504_S2-ChIP_1,504_S2-ChIP_2,0.793032,1.757299e-09
7,510_S2-ChIP_2,591_S2-ChIP_1,0.969343,3.744198e-24
8,523_S2-ChIP_1,524_S2-ChIP_1,0.931224,8.407465e-18
9,530_S2-ChIP_1,530_S2-ChIP_2,0.687251,1.349116e-06


- Import **correlation checks** for all **pair-wise sample combinations** (repeats only): `repeats.correlations.csv`

In [95]:
rep_corr_df = pd.read_csv(os.path.join(out_htc_corr, 'repeats.correlations.csv'), sep='\t')
rep_corr_df.head()

Unnamed: 0,Sample1,Sample2,Correlation,P-value
0,1022_S2-ChIP_1,1022_S2-ChIP_2,0.825588,9.958645e-11
1,1168_S2-ChIP_1,1168_S2-ChIP_2,0.651241,7.13407e-06
2,301_S2-ChIP_1,301_S2-ChIP_2,0.625471,2.067094e-05
3,302_S2-ChIP_1,302_S2-ChIP_2,0.817912,2.061308e-10
4,324_S2-ChIP_1,324_S2-ChIP_2,0.956403,2.265821e-21


In [96]:
rep_corr_df.shape

(18, 4)

- Select **highly correlated** samples: `correlation > 0.85`

In [97]:
# Highly correlated samples
high_rep_corr_df = rep_corr_df[rep_corr_df['Correlation'] > .85]

In [98]:
high_rep_corr_df.head()

Unnamed: 0,Sample1,Sample2,Correlation,P-value
4,324_S2-ChIP_1,324_S2-ChIP_2,0.956403,2.265821e-21
5,491_S2-ChIP_1,491_S2-ChIP_2,0.890557,3.193402e-14
7,510_S2-ChIP_2,591_S2-ChIP_1,0.969343,3.744198e-24
8,523_S2-ChIP_1,524_S2-ChIP_1,0.931224,8.407465e-18
11,638_S2-ChIP_1,638_S2-ChIP_2,0.912846,5.736764e-16


In [99]:
high_rep_corr_df.to_csv(os.path.join(out_htc_corr, 'high_corr_reps.csv'), index=None, sep='\t')

- Check **low correlation scores** any `Correlation < 0.85`:

In [100]:
# Low correlated samples
low_rep_corr_df = rep_corr_df[rep_corr_df['Correlation'] < .85]

In [101]:
low_rep_corr_df.sort_values(by = "Correlation", ascending=True)

Unnamed: 0,Sample1,Sample2,Correlation,P-value
2,301_S2-ChIP_1,301_S2-ChIP_2,0.625471,2.067094e-05
1,1168_S2-ChIP_1,1168_S2-ChIP_2,0.651241,7.13407e-06
16,WT_S2-ChIP_1,WT_S2-ChIP_3,0.658476,5.196781e-06
9,530_S2-ChIP_1,530_S2-ChIP_2,0.687251,1.349116e-06
17,WT_S5-ChIP_1,WT_S5-ChIP_2,0.715393,3.08715e-07
10,544_S2-ChIP_1,544_S2-ChIP_2,0.760585,1.927261e-08
6,504_S2-ChIP_1,504_S2-ChIP_2,0.793032,1.757299e-09
3,302_S2-ChIP_1,302_S2-ChIP_2,0.817912,2.061308e-10
0,1022_S2-ChIP_1,1022_S2-ChIP_2,0.825588,9.958645e-11


- Run **correlation checks**  for all **pair-wise MERGED sample combinations**:

In [102]:
#out_labelled_corr = os.path.join(out_dir, 'labelled_CORR')
#if not os.path.isdir(out_labelled_corr):    
#    !mkdir -p $out_labelled_corr

In [103]:
# Labeled scatter plots for samples
#rpc.labeled_corr_plots(tpm_rep_df_l2, out_dir = out_labelled_corr, prefix = 'repeats.')