In [1]:
import itertools
import time
import os
import sys

In [2]:
import HTSeq
import pandas as pd

In [3]:
import matplotlib.pyplot as plt

In [4]:
import collections

---

## Config

In [5]:
#project_data_dir = '/data/pablo/RNAdeg'
## Root directory of the project data (hard-coded absolute path).
## NOTE(review): not referenced by any of the visible cells - confirm it is still needed.
project_data_dir = '/gcm-lfs1/pablo/data/RNAdeg'

In [6]:
## Root of the code/results checkout; the helper-script folder and the htseq results folder are derived from it.
project_dir = '/home/pmonteagudo/workspace/RNAdeg'

In [7]:
## Put the project's `pyRNAdeg` folder on the import path; the membership test keeps
## re-runs of this cell from appending the same directory more than once.
scripts_dir = os.path.join(project_dir, 'pyRNAdeg')
if scripts_dir not in sys.path: 
    sys.path.append(scripts_dir)

## Presumably `Util` lives in `scripts_dir`, hence the import after the path setup.
## NOTE(review): `to_log2_tpm` is not used in the visible cells - confirm it is still needed.
from Util import to_log2_tpm

In [8]:
## heterochromatic genes - old
#old_htc_genes = ('dg1', 'dh1', 'after_tlh', 'MAT2', 'MAT3', 'MAT1')
#new_htc_genes = ('SPAC212.11', 'SPAC212.10')
## heterochromatic genes - new: maps each suffixed entry (e.g. 'dg1a', 'dg1b') to its base gene id
## NOTE(review): only referenced by the commented-out relabelling of `counts_df['gene_id']` - confirm it is still needed.
htc_genes_dict = {'dg1a': 'dg1', 'dg1b': 'dg1', 
                  'dh1a':'dh1', 'dh1b':'dh1',
                  'SPAC212.11b':'SPAC212.11', ## tlh1
                  'SPAC212.10b':'SPAC212.10'}
## alternative (inverse) layout: base gene id -> list of suffixed entries
# htc_genes_dict = {'dg1': ['dg1a', 'dg1b'],
#                   'dh1':['dh1a', 'dh1b'], 
#                   'SPAC212.11': ['SPAC212.11a', 'SPAC212.11b'],
#                   'SPAC212.10': ['SPAC212.10a', 'SPAC212.10b']}

---

In [9]:
def load_gene_counts(gene_counts_file, index_col='gene_id', gdf='pombase', as_int=True, sample_ids=None):
    """Load a tab-separated gene expression matrix and keep only the id and sample columns.

    Parameters
    ----------
    gene_counts_file : str or file-like
        Tab-separated table, passed straight to `pd.read_csv(..., sep='\\t')`.
    index_col : str
        Name of the gene identifier column as it appears in the file.
    gdf : {'pombase', 'parastou'}
        Flavour of the table. 'parastou' tables have their 'gene-id' column
        renamed to 'gene_id'; 'pombase' tables are not renamed.
    as_int : bool
        If True, cast every retained column except `index_col` to int64
        (raw counts). Use False for float matrices such as TPM.
    sample_ids : iterable of str, optional
        If given, only these sample columns (plus 'gene_id') are returned.

    Returns
    -------
    pandas.DataFrame
        The 'gene_id' column plus one column per sample, NaNs replaced by 0,
        rows sorted by 'gene_id'.

    Raises
    ------
    ValueError
        If `gdf` is not one of the supported flavours. This used to surface
        as an unbound `rename_cols` only after the file had been read.
    """
    ## Column renames needed to harmonise each table flavour
    rename_by_gdf = {'pombase': {},
                     'parastou': {'gene-id': 'gene_id'}}
    if gdf not in rename_by_gdf:
        raise ValueError("Unknown gdf '{}': expected one of {}".format(gdf, sorted(rename_by_gdf)))
    rename_cols = rename_by_gdf[gdf]

    ## Annotation columns that must never be cast to int (both naming schemes)
    not_int_cols = {index_col, 'gene_name', 'gene-name', 'seqid', 'chr', 'type',
                    'strand', 'Name', 'category', 'bio_type'}

    ## Every known column that is not a sample column
    gdf_cols = not_int_cols | {'start', 'end', 'cds_length', 'utr_length', 'gene_length', 'length'}

    ## Import Gene Counts DataFrame
    counts_df = pd.read_csv(gene_counts_file, sep='\t')

    ## Select only the id column and the sample columns
    counts_df = counts_df[[col for col in counts_df.columns if (col == index_col or col not in gdf_cols)]]

    ## Careful! - convert NaNs to zeros: means there was no count for that feature!
    counts_df = counts_df.fillna(0)

    if as_int:
        ## Convert the sample columns to int type
        counts_df = counts_df.astype({col: 'int64' for col in counts_df.columns if col not in not_int_cols})

    ## rename columns
    counts_df = counts_df.rename(columns=rename_cols)

    if sample_ids is not None:
        ## Keep only the requested samples (plus the gene id)
        counts_df = counts_df[[col for col in counts_df.columns if (col in sample_ids or col == 'gene_id')]]

    return counts_df.sort_values('gene_id')
                                                                   

# Comparison Gene Counts:  **RNA-Seq data**

In [10]:
## Run configuration: selects which matrix file is loaded for both pipelines
## ('seq_type' and 'expression' build the file name, 'batch' the htseq results sub-folder).
params_dict = {## sequencing type: ['rna', 'chip' ]
               'seq_type': 'rna',
               #'seq_type': 'chip',
               ## gene expression matrix: ['gene_count', 'tpm'] 
               #'expression': 'gene_count',
               'expression': 'tpm',
               ## batch with corresponding parameters
               'batch': 'xp_data',  ## stranded=True, remove duplicate reads mapping to same gene, count only multimapped if repeat
               #'batch': 'xp_data_by_gene_unstranded'  ## stranded=False, remove duplicate reads mapping to same gene, count only multimapped if repeat
              }

### A. Results using **my counting scripts** (`htseq`)

- **RNA**

In [11]:
#xp_data_dir = os.path.join(project_dir, 'results/RNA/xp_data')  ## stranded=True, remove duplicate reads mapping to same gene, count only multimapped if repeat
#xp_data_dir = os.path.join(project_dir, 'results/RNA/xp_data_by_gene_unstranded') ## stranded=False, remove duplicate reads mapping to same gene, count only multimapped if repeat
#xp_data_dir = os.path.join(project_dir, 'results/RNA/xp_data_parastou') ## using parastous counting script

In [12]:
## a read can map more than once to the same repeat (fix above)
#counts_file = os.path.join(xp_data_dir, 'rna_pombe_gene_count_matrix.csv')
#counts_file = os.path.join(xp_data_dir, 'rna_pombe_tpm_matrix.csv')

- **ChIP**

In [13]:
#xp_data_dir = os.path.join(project_dir, 'results/ChIP/xp_data')  ## stranded=False, remove duplicate reads mapping to same gene, count only multimapped if repeat

In [14]:
## unique reads: remove duplicate reads mapping to same gene 
#counts_file = os.path.join(xp_data_dir, 'chip_pombe_gene_count_matrix.csv')
#counts_file = os.path.join(xp_data_dir, 'chip_pombe_tpm_matrix.csv')

- Import **gene expression matrix**: `*_pombe_tpm_matrix.csv`

In [15]:
## htseq results live under results/RNA for 'rna' runs and under results/ChIP otherwise;
## the batch name selects the sub-folder (and with it the counting parameters).
xp_data_dir = os.path.join(
    project_dir,
    'results/RNA' if params_dict['seq_type'] == 'rna' else 'results/ChIP',
    params_dict['batch'],
)

In [16]:
## File name pattern: <seq_type>_pombe_<expression>_matrix.csv
counts_file = os.path.join(
    xp_data_dir,
    ''.join([params_dict['seq_type'], '_pombe_', params_dict['expression'], '_matrix.csv']),
)
counts_file

'/home/pmonteagudo/workspace/RNAdeg/results/RNA/xp_data/rna_pombe_tpm_matrix.csv'

In [17]:
## Raw gene counts are cast to int, TPM values must stay float.
## The matrix type is stored under 'expression'; 'batch' only names the results
## sub-folder, so testing it against 'gene_count' never selected the int branch.
counts_df = load_gene_counts(counts_file, index_col='gene_id', gdf='pombase',
                             as_int=(params_dict['expression'] == 'gene_count'))
counts_df

Unnamed: 0,gene_id,1168_pA_2,1168_pA,1168_S2RIP_2,1168_S2RIP,283_RNA_pA_4,301_RNA_pA_2,301_RNA_pA_3,301_RNA_pA,301_S2RIP_2,...,63_RNA_pA_4,63_RNA_pA,63_S2PRIP,63_S2RIP_2,65_RNA_pA,80pARNA_2,80_RNA_pA,80S2RIP_1,80S2RIP_2,80_S2RIP
0,MAT1,11.381057,7.814922,10.805414,12.630129,7.620925,11.598520,5.145428,3.110889,10.707561,...,4.500308,2.915721,5.331106,1.598258,4.452123,3.814986,3.885067,5.923055,7.324734,12.442899
1,MAT2,1.171111,0.855113,6.080577,8.528888,1.388083,2.285509,0.767484,0.561922,5.623811,...,0.726618,0.528066,1.571427,0.399731,0.692539,0.882583,0.546565,3.434110,5.692119,2.534074
2,MAT3,0.634352,0.610795,3.715908,0.000000,0.683683,1.911916,1.023312,0.216124,7.364515,...,0.363309,0.132016,2.714283,0.999327,0.875858,1.765166,0.273283,2.626084,3.533039,3.067563
3,SPAC1002.01,39.589095,24.437913,25.029207,0.000000,19.034314,12.049140,11.889934,5.124219,32.739360,...,13.300910,23.056262,35.458556,17.770263,19.468385,10.780985,18.223391,17.960619,37.811515,51.385882
4,SPAC1002.02,149.816947,85.643443,53.763644,105.639431,63.024153,37.454949,46.010437,39.012178,52.332622,...,44.345934,127.281203,98.427593,37.849051,111.122855,40.558844,64.965564,44.765892,45.869418,73.621651
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7003,dg1a,1.561425,3.350528,94.776405,106.133486,20.480158,32.548500,18.259494,5.828963,136.802935,...,3.077331,2.349271,44.733044,18.044355,3.282169,552.947451,535.926985,584.989815,693.123873,1207.166861
7004,dg1b,2.091194,2.512896,125.660774,112.631454,16.549862,17.329063,6.006762,5.186789,114.920587,...,3.785606,2.069083,58.610084,15.531850,3.771003,411.711958,265.985518,325.507356,370.787625,990.730378
7005,dh1a,1.732268,1.832385,79.723122,103.926082,5.510898,17.558860,2.116396,0.821271,60.924623,...,2.243968,1.471041,57.428504,16.988567,2.607206,49.734743,32.793906,211.904775,227.684741,821.706819
7006,dh1b,1.756666,2.687499,48.475712,46.119173,22.478664,59.049634,30.094688,10.417174,64.673831,...,4.573420,2.036826,30.999964,11.592199,4.094129,449.664095,512.313687,494.309803,523.086065,916.001045


In [18]:
## (n_genes, n_columns); the columns are 'gene_id' plus one per sample
counts_df.shape

(7008, 55)

- Make **manual entries** comparable: (add up `+` and `-` stranded htc genes)

In [19]:
#counts_df['gene_id'] = counts_df['gene_id'].apply(lambda x: htc_genes_dict[x] if x in htc_genes_dict else x)
## For RNA-seq - sum strands!
#counts_df = counts_df.groupby('gene_id').sum().reset_index()
## For ChIP-seq (or unstranded RNA-seq) - select only one!
#counts_df = counts_df.groupby('gene_id').first().reset_index()
#counts_df

- Select `sample_id`s for samples that I computed

In [20]:
## Every column of the htseq matrix except the gene identifier is a sample
sample_ids = [column for column in counts_df.columns if not column == 'gene_id']
len(sample_ids)

54

### B. Results using **Parastou's counting script**

**RNA**

In [21]:
#xp_parastou_file = '/data/parastou/RNAdeg/results/RipRna/xp_data/pombe_gene_count_matrix.csv' ## gene counts, old gff, obtained from Parastou
#xp_parastou_file = '/data/parastou/RNAdeg/results/RipRna/xp_data/pombe_tpm_matrix.csv' ## tpm, old gff and obtained from Parastou

**ChIP**

In [22]:
#xp_parastou_file = '/data/parastou/RNAdeg/results/RipChip/xp_data/chip_pombe_gene_count_matrix.csv' ## old gff, obtained from Parastou
#xp_parastou_file = '/data/parastou/RNAdeg/results/RipChip/xp_data/chip_pombe_tpm_matrix.csv' ## tpm and old gff, obtained from Parastou

- Import **gene expression matrix**: `*_pombe_tpm_matrix.csv`

In [23]:
## Folder holding Parastou's matrices: RipRna for 'rna' runs, RipChip otherwise
xp_parastou_data_dir = (
    '/data/parastou/RNAdeg/results/RipRna/xp_data'
    if params_dict['seq_type'] == 'rna'
    else '/data/parastou/RNAdeg/results/RipChip/xp_data'
)

In [24]:
xp_parastou_file = os.path.join(xp_parastou_data_dir, params_dict['seq_type'] + '_pombe_' + params_dict['expression'] + '_matrix.csv')
## The RNA matrices named in the commented-out paths of this section carry no
## '<seq_type>_' prefix (e.g. 'pombe_tpm_matrix.csv'), so fall back to the
## unprefixed name when the prefixed file does not exist.
if not os.path.isfile(xp_parastou_file):
    unprefixed_file = os.path.join(xp_parastou_data_dir, 'pombe_' + params_dict['expression'] + '_matrix.csv')
    if os.path.isfile(unprefixed_file):
        xp_parastou_file = unprefixed_file
xp_parastou_file

'/data/parastou/RNAdeg/results/RipRna/xp_data/rna_pombe_tpm_matrix.csv'

In [25]:
## Raw gene counts are cast to int, TPM values must stay float.
## The matrix type is stored under 'expression'; 'batch' only names the results
## sub-folder, so testing it against 'gene_count' never selected the int branch.
## `sample_ids` restricts Parastou's matrix to the samples present in the htseq matrix.
xp_parastou_df = load_gene_counts(xp_parastou_file, index_col='gene-id', gdf='parastou', sample_ids=sample_ids,
                                  as_int=(params_dict['expression'] == 'gene_count'))
xp_parastou_df

FileNotFoundError: [Errno 2] File b'/data/parastou/RNAdeg/results/RipRna/xp_data/rna_pombe_tpm_matrix.csv' does not exist: b'/data/parastou/RNAdeg/results/RipRna/xp_data/rna_pombe_tpm_matrix.csv'

In [None]:
## (n_genes, n_columns) of Parastou's matrix after restricting it to `sample_ids`
xp_parastou_df.shape

In [None]:
## Gene ids that appear in exactly one of the two matrices
missing_genes = set(xp_parastou_df['gene_id']) ^ set(counts_df['gene_id'])
#missing_genes

- **Merge** both counts DataFrames

In [None]:
#set(sample_ids).symmetric_difference(xp_parastou_df.columns)
#sample_ids = list(set(sample_ids).intersection(xp_parastou_df.columns))
#sample_ids

In [None]:
# Put both matrices side by side, aligned on gene_id:
#   1. concat(keys=...) adds an outer column level naming the pipeline ('htseq' / 'parastou');
#   2. swaplevel moves the sample id to the outer level, giving (sample_id, pipeline) columns;
#   3. sort_index groups the two pipelines of each sample next to each other
#      (sample ids ascending, pipeline level descending).
# Genes missing from one matrix end up as NaN in that pipeline's columns.
merged_xp_rna = (pd.concat([counts_df.set_index('gene_id'), 
                xp_parastou_df.set_index('gene_id')], 
                axis=1, 
                keys=['htseq', 'parastou'], 
                sort=True)
        .swaplevel(0, 1, axis=1)
        .sort_index(axis=1, ascending=[True, False])
        )

In [None]:
## Inspect the merged table: rows are gene ids, columns are (sample_id, pipeline) pairs
merged_xp_rna

In [None]:
# https://stackoverflow.com/questions/40225683/how-to-simply-add-a-column-level-to-a-pandas-dataframe
#merged_xp_rna.columns = pd.MultiIndex.from_tuples(map(lambda x: ( x, '_'.join(x.split('_')[:-1]) ), merged_xp_rna.columns))

In [None]:
#merged_xp_rna = pd.merge(counts_df, xp_parastou_df, on='gene_id', how='outer')
#merged_xp_rna

In [None]:
## (n_genes in either matrix, n_sample_columns over both pipelines)
merged_xp_rna.shape

### Comparison Gene Counts: `htseq_counts vs parastous_counts`

- **Visualization**: `scatter_plot`

In [None]:
import math
import numpy as np

In [None]:
def map_color(ratio, n_clusters):
    """Map a count ratio to a colour value in [0, 1] by binning its angle.

    The interval [0, pi/2] is split at `n_clusters` evenly spaced angles;
    `ratio` is placed in the bin (tan(lower), tan(upper)] it falls into and
    the bin's value from `np.linspace(0, 1, n_clusters)` is returned.

    Parameters
    ----------
    ratio : float
        Ratio to classify (the caller passes htseq / parastou).
    n_clusters : int
        Number of evenly spaced bin edges / colour values.

    Returns
    -------
    float
        Colour value of the matching bin. Ratios that match no bin
        (0, negative values, NaN) get 0. Ratios above the last finite edge,
        including `inf`, get the top colour: `tan(pi/2)` is a large finite
        float, so such ratios used to fall through every bin and were
        coloured 0 like the opposite extreme.
    """
    clust = np.linspace(0, 1, n_clusters)
    upper_bounds = np.linspace(0, math.pi/2, n_clusters)

    ## default for ratios that match no bin
    color = 0
    lower_bound = 0

    for cluster_value, upper_bound in zip(clust, upper_bounds):

        if (ratio > math.tan(lower_bound)) and (ratio <= math.tan(upper_bound)):
            color = cluster_value

        lower_bound = upper_bound

    ## everything steeper than the last edge belongs to the top cluster
    if n_clusters > 0 and ratio > math.tan(upper_bounds[-1]):
        color = clust[-1]

    return color

In [None]:
def map_color_old(ratio, n_clusters):
    """Legacy three-way colouring of a count ratio (htseq / parastou).

    Parameters
    ----------
    ratio : float
        Ratio to classify.
    n_clusters : int
        Unused; kept so the signature matches `map_color`.

    Returns
    -------
    0   if ratio <= tan(pi/6)             (Parastou's counts > htseq counts)
    0.5 if tan(pi/6) < ratio <= tan(pi/3) (Parastou's counts ~ htseq counts)
    1   otherwise                         (Parastou's counts < htseq counts)

    The previous body referenced the undefined names `lower_bound` and
    `clusters`, so it raised NameError on every call. Its loop did not
    influence the result either, so it has been removed.
    """
    ## Parastou's counts > htseqcounts
    if ratio <= math.tan(math.pi/6):
        color = 0

    ## Parastou's counts ~ htseqcounts
    elif ratio <= math.tan(math.pi/3):
        color = 1/2

    ## Parastou's counts < htseqcounts
    else:
        color = 1

    return color

In [None]:
def correlation_scatter_plot(merged_xp_rna, sample_ids=None, n_max=50000, out_dir=None):
    """Scatter-plot htseq against Parastou's values for every sample.

    Parameters
    ----------
    merged_xp_rna : pandas.DataFrame
        Genes as rows; two-level columns (sample_id, pipeline), where each
        sample must provide the 'htseq' and 'parastou' columns.
    sample_ids : iterable, optional
        Samples to plot. Defaults to every sample id on column level 0.
    n_max : number
        Genes whose value reaches `n_max` in either pipeline are left out.
        This argument is now honoured; it used to be overwritten with 50000.
    out_dir : str, optional
        If given, each figure is saved there as
        '<sample_id>_parastou_vs_htseq.pdf'.

    Returns
    -------
    collections.Counter
        For each gene, the number of samples in which its colour was <= 0.4.
    """
    deviate_genes = collections.Counter()

    if sample_ids is None:
        sample_ids = merged_xp_rna.columns.get_level_values(0).unique()

    for sample_id in sample_ids:

        print(' sample_id: {}'.format(sample_id))

        ## remove nans
        df = merged_xp_rna[sample_id].dropna()

        ## filter extreme counts; copy so the columns added below are written
        ## to an independent frame and not to a slice of `merged_xp_rna`
        df = df[(df['parastou'] < n_max) & (df['htseq'] < n_max)].copy()

        if df.empty:
            ## nothing to plot; taking the max of an empty column would raise
            print('   no genes left after filtering - skipped')
            continue

        ## Compute ratios:
        ## NOTE(review): the ratio is NaN when both values are 0 and inf when only
        ## 'parastou' is 0; check how `map_color` bins those, because every
        ## colour <= 0.4 is counted as a deviation below.
        df['ratio'] = df['htseq'] / df['parastou']
        ## add color column
        df['color'] = df['ratio'].apply(lambda x: map_color(x, 6))

        ## distribution among clusters
        df_by_type = df['color'].value_counts().sort_index()

        for gg in df[df['color'] <= 0.4].index.tolist():
            deviate_genes[gg] += 1

        ## ------------
        ## Scatter Plot
        ## ------------

        plot_lim = max(df['parastou'].max(), df['htseq'].max())
        x = np.linspace(0, plot_lim, 1000)

        ## One explicit figure per sample. pandas opens its own figure when no
        ## `ax` is passed, which left the extra `plt.figure()` empty.
        fig, ax = plt.subplots()
        df.plot.scatter(x='parastou', y='htseq', c='color',
                        colormap='viridis',
                        xlim=(0, plot_lim),
                        ylim=(0, plot_lim),
                        ax=ax)
        ax.set_title(' sample_id: {} (genes={})'.format(sample_id, df_by_type.sum()))
        ## y = x - perfect correlation
        ax.plot(x, x, '--k')  # dashed black

        if out_dir is not None:
            fig.savefig(os.path.join(out_dir, '{}_parastou_vs_htseq.pdf'.format(sample_id)))

        ## figures are left open so the notebook can render them inline

    return deviate_genes

- Scatter plots: **all Genes**

In [None]:
## Output folder for the all-genes scatter plots: one folder per expression type
plot_dir = os.path.join(
    xp_data_dir,
    'gxp_Plots' if params_dict['expression'] == 'gene_count' else 'tpm_Plots',
)

In [None]:
## Create the output folder (and any missing parents) from Python rather than through
## a shell escape; `exist_ok=True` makes the cell safe to re-run.
os.makedirs(plot_dir, exist_ok=True)
plot_dir

In [None]:
## One scatter plot per sample (saved as PDF in `plot_dir`); returns, per gene, the number of samples in which it was flagged
deviate_genes = correlation_scatter_plot(merged_xp_rna, n_max=50000, out_dir=plot_dir)

In [None]:
#df[df['color'] > 0 ]

In [None]:
#deviate_genes
## Number of genes flagged in at least one sample
len(deviate_genes)

- Genes containing big deviations across all samples (or many samples):
    - After using the equivalent `'count_mode' = 'union'` as Parastou (instead of `intersection-empty`)
    - Only **10 genes** contain big differences, most of which can be explained by differences between the `ensembl` and `Pombase` **annotation**.
    - Unexplained differences: (ChIP-seq)
        - SPNCRNA.390 (it lies in the region where the MAT locus htc_genes are defined)
        - SPNCRNA.1306

In [None]:
## Genes whose deviation between the two pipelines is still unexplained
weird_genes = ['SPNCRNA.390', 'SPNCRNA.1306'] # still not explained by differences in annotation

In [None]:
#big_deviate_genes = {k:v for k,v in deviate_genes.items() if v >= 40}
## Keep only the genes flagged in at least 20 samples
big_deviate_genes = {
    gene: n_flagged
    for gene, n_flagged in deviate_genes.items()
    if not n_flagged < 20
}
#big_deviate_genes

In [None]:
#missing_genes

In [None]:
#big_deviate_genes_df = merged_xp_rna[merged_xp_rna.index.isin(big_deviate_genes) & ~merged_xp_rna.index.isin(missing_genes)]
## Rows of the merged table for the still-unexplained genes only
## (the commented line above selects all frequently flagged genes present in both matrices instead)
big_deviate_genes_df = merged_xp_rna[merged_xp_rna.index.isin(weird_genes)]

## many deviates are due to 0 counts, we require ~ 1 count per sample
#big_deviate_genes_df[big_deviate_genes_df.sum(1) > 100] 
#big_deviate_genes_df[big_deviate_genes_df.xs('htseq', level=1, axis=1).sum(1) > 52]
#big_deviate_genes_df[big_deviate_genes_df.xs('parastou', level=1, axis=1).sum(1) > 52]
#big_deviate_genes_df[(big_deviate_genes_df.xs('htseq', level=1, axis=1).sum(1) > 100) & (big_deviate_genes_df.xs('parastou', level=1, axis=1).sum(1) > 100)]
big_deviate_genes_df

In [None]:
## Per-gene totals over all samples, one column per pipeline.
## Two bare expressions in one cell only display the last one, so both totals
## are combined into a single table.
pd.concat([big_deviate_genes_df.xs('htseq', level=1, axis=1).sum(1),
           big_deviate_genes_df.xs('parastou', level=1, axis=1).sum(1)],
          axis=1,
          keys=['htseq', 'parastou'])

## Investigate **heterochromatic genes**

In [None]:
import viz_strands ## get deg1, deg2 and non_degraded

In [None]:
## centromeric genes: `deg1`
## (`old_*` hold the base gene ids written out here; the current lists come from `viz_strands`)
old_deg1 = ['dh1', 'dg1']
deg1 = viz_strands.deg1

## subtelomeric genes: `deg2`
old_deg2 = ['SPAC212.11', 'SPAC212.10']
deg2 = viz_strands.deg2

## Mating type region (MTR) genes: `deg3`
deg3 = ['MAT2', 'MAT3', 'MAT1']

## rest of Heterochromatic genes: `non_degraded`
## (the commented list below includes the MAT genes; presumably `viz_strands.non_degraded` matches it - TODO confirm)
# non_degraded = ['SPAC212.09c', 'SPNCRNA.70', 'SPAC212.08c', 'SPAC212.07c', 'SPAC212.12', 'SPAC212.06c',
#                 'SPAC212.04c', 'SPAC212.03', 'SPAC212.02', 'SPAC212.01c', 'SPAC977.01', 'SPAC977.18',
#                 'SPAC977.02', 'SPAC977.03', 'SPAC977.04', 'SPAC212.05c', 'MAT2', 'MAT3', 'MAT1']
non_degraded = viz_strands.non_degraded

In [None]:
## Old naming scheme: the manually entered genes first, then the same list extended by the non-degraded genes
old_htc_genes = old_deg1 + old_deg2 + deg3
old_all_htc_genes = old_htc_genes + non_degraded

In [None]:
## Current naming scheme (entries taken from `viz_strands`):
## `all_htc_genes` = deg1 + deg2 + non-degraded genes, `htc_genes` = deg1 + deg2 + MAT genes
## assumes the `viz_strands` attributes are lists, so `+` concatenates - TODO confirm
all_htc_genes = deg1 + deg2 + non_degraded
htc_genes = deg1 + deg2 + deg3

- Show counts for genes of interest: **manual entries Heterochromatic Genes (UNSTRANDED)**

In [None]:
#merged_xp_rna[(merged_xp_rna.index.isin(old_htc_genes)) | (merged_xp_rna.index.isin(new_htc_genes))]
## Rows of the merged table whose gene id is one of the old-scheme heterochromatic genes
merged_xp_rna[merged_xp_rna.index.isin(old_htc_genes)]

- Show counts for genes of interest: **manual entries Heterochromatic Genes (STRANDED)**

In [None]:
## Rows of the merged table whose gene id is one of the current-scheme heterochromatic genes
merged_xp_rna[merged_xp_rna.index.isin(htc_genes)]

- Show counts for genes of interest: **all Heterochromatic Genes**

In [None]:
#[xx for xx in merged_xp_rna.columns if '80' in xx[0]]

In [None]:
#htc_df = merged_xp_rna[merged_xp_rna.index.isin(het_genes)]
#htc_df = merged_xp_rna[merged_xp_rna.index.isin(old_all_htc_genes)]
#htc_df = merged_xp_rna[merged_xp_rna.index.isin(all_htc_genes)]
## Rows: the non-degraded heterochromatic genes.
## Columns: every (sample_id, pipeline) pair whose sample id contains '80'.
## `.loc` takes the row mask and the explicit column list in one step;
## `DataFrame.xs` is meant for a single key and does not accept a list of keys.
htc_df = merged_xp_rna.loc[merged_xp_rna.index.isin(non_degraded),
                           [xx for xx in merged_xp_rna.columns if '80' in xx[0]]]
htc_df

In [None]:
## (n_selected_genes, n_selected_sample_columns)
htc_df.shape

- Scatter plots: **heterochromatic genes** (selection in `htc_df`)

In [None]:
## Output folder for the heterochromatic-gene scatter plots: one folder per expression type
plot_dir = os.path.join(
    xp_data_dir,
    'gxp_htc_plots' if params_dict['expression'] == 'gene_count' else 'tpm_htc_plots',
)

In [None]:
## Create the output folder (and any missing parents) from Python rather than through
## a shell escape; `exist_ok=True` makes the cell safe to re-run.
os.makedirs(plot_dir, exist_ok=True)
plot_dir

In [None]:
## Same per-sample scatter plots, restricted to the selection in `htc_df`; PDFs go to the current `plot_dir`
htc_deviate_genes = correlation_scatter_plot(htc_df, n_max=50000, out_dir=plot_dir)