In [11]:
import os
import sys

import pandas as pd

------------------------------

In [15]:
# Directory that is appended to sys.path before `import Util`
# (presumably where the project's Util helper module lives — confirm).
rdg_dir = '/data/parastou/RNAdeg/scripts/'

In [18]:
# Register the scripts directory on the import path exactly once, so that
# re-running this cell does not pile up duplicate entries.
if sys.path.count(rdg_dir) == 0:
    sys.path.append(rdg_dir)

In [19]:
import Util

---

In [16]:
# Directory the ChIP TPM table is read from; the tables produced by this
# notebook are also written here.
source_dir = '/data/parastou/RNAdeg/results/RipChip/xp_data/'

In [22]:
# Directory the merged TPM table 'merged_tpm.csv' is read from.
rna_dir = '/data/parastou/RNAdeg/results/RipRna/xp_data/'

--------------------------

## Transcription Efficiency workflow

- Load the TPM tables for each RIP/ChIP sample pair.
- Calculate the ratio TPM(RIP) / TPM(ChIP) for each pair.

-------------

### 1 - Load TPM-normalized gene expression tables.

In [20]:
# Read the tab-separated ChIP TPM table.
chip_df = pd.read_csv(
    os.path.join(source_dir, 'chip_pombe_tpm_merged.csv'),
    delimiter='\t',
)

In [23]:
# Read the tab-separated merged TPM table from the RipRna results directory.
rna_df = pd.read_csv(
    os.path.join(rna_dir, 'merged_tpm.csv'),
    delimiter='\t',
)

In [34]:
# Outer-join the two TPM tables on their shared annotation columns, so rows
# present in only one of the tables are kept.
tpm_df = chip_df.merge(
    rna_df,
    how='outer',
    on=['gene-id', 'gene-name', 'type', 'category', 'bio_type'],
)

In [38]:
# Keep every row whose 'type' is not 'rRNA_gene'.
tpm_df = tpm_df.loc[tpm_df['type'].ne('rRNA_gene')]

In [40]:
# (RIP column, ChIP column) name pairs, one per sample; these are handed to
# Util.ratio_table to form the RIP/ChIP ratios.
pairs = list(zip(
    ['1160_RIP', '301_RIP', '302_RIP', '324_RIP', '491_RIP', '504_RIP',
     '510_RIP', '530_RIP', '638_RIP', '80_RIP', 'WT_RIP'],
    ['1160_ChIP', '301_ChIP', '302_ChIP', '324_ChIP', '491_ChIP', '504_ChIP',
     '510_ChIP', '530_ChIP', '638_ChIP', '80_ChIP', 'WT_ChIP'],
))

In [47]:
# Relabel every column of the merged table by position.
# NOTE(review): this assignment is purely positional, so the labels are only
# correct if the merged table's columns arrive in exactly this order — confirm
# against the input CSVs before trusting the sample names.
tpm_df.columns = (
    # Annotation columns (the same five names used as merge keys).
    ['gene-id', 'gene-name', 'type', 'category', 'bio_type']
    # ChIP columns.
    + ['638_ChIP', '1160_ChIP', 'WT_ChIP', '302_ChIP', '301_ChIP', '324_ChIP',
       '80_ChIP', '504_ChIP', '510_ChIP', '491_ChIP', '530_ChIP']
    # RIP and RNA columns, interleaved per sample.
    + ['1160_RIP', '1160_RNA', '301_RIP', '301_RNA', '302_RIP', '302_RNA',
       '324_RIP', '324_RNA', '491_RIP', '491_RNA', '504_RIP', '504_RNA',
       '510_RIP', '510_RNA', '530_RIP', '530_RNA', '638_RIP', '638_RNA',
       '80_RIP', '80_RNA', 'WT_RIP', 'WT_RNA']
    # NOTE(review): six trailing columns labelled with bare sample names; the
    # same names also appear in col_names (passed as new_cols to
    # Util.ratio_table) — check where these columns come from and whether the
    # names clash.
    + ['1160', '301', '302', '324', '491', '504']
)

---

In [13]:
# Select the rows whose 'category' is 'repeat'.
rep_tpm_df = tpm_df.loc[tpm_df['category'].eq('repeat')]

In [14]:
# Write the repeat-only TPM table as a tab-separated file without the row
# index. `index` is documented as a boolean, so pass False explicitly rather
# than relying on None being falsy.
rep_tpm_df.to_csv(
    os.path.join(source_dir, 'repeats_tpm.csv'),
    sep='\t',
    index=False,
)

------

### 2 - Calculate and save the ratio (RIP/ChIP) table.

In [43]:
# Labels for the ratio columns, one per (RIP, ChIP) pair and listed in the
# same sample order as `pairs`; passed to Util.ratio_table as new_cols.
col_names = [
    '1160', '301', '302', '324', '491', '504',
    '510', '530', '638', '80', 'WT',
]

In [48]:
# Build the RIP/ChIP ratio ("transcription efficiency") table for all pairs.
# Presumably Util.ratio_table also writes the result to out_dir/file_name —
# confirm in Util.
te_table = Util.ratio_table(
    tpm_df,
    pairs,
    new_cols=col_names,
    out_dir=source_dir,
    file_name='transcription_efficiency.csv',
)

--------

In [49]:
# Log2-transform the RIP/ChIP ratio table (te_table) for later reference.
# With shift=0 nothing is added to the values before taking the log, so zero
# entries have no finite result: the recorded output of this cell reports NaNs
# for the columns that contain zeros.
l2_te_table = Util.to_log2_tpm(te_table, shift=0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  new_df[col] = new_df[col].apply(lambda x: log((x + shift), 2))


11 columns contained zero values. Their log-transformed results are NaNs


In [50]:
# Write the log2-transformed ratio table as a tab-separated file without the
# row index. `index` is documented as a boolean, so pass False explicitly
# rather than relying on None being falsy.
l2_te_table.to_csv(
    os.path.join(source_dir, 'l2_te_0shift.csv'),
    sep='\t',
    index=False,
)

----

In [20]:
# Pull the 'repeat' rows out of the ratio table; they are written to disk in
# the following cell.
rep_te_table = te_table.loc[te_table['category'].eq('repeat')]

In [21]:
# Write the repeat-only ratio table as a tab-separated file without the row
# index. `index` is documented as a boolean, so pass False explicitly rather
# than relying on None being falsy.
rep_te_table.to_csv(
    os.path.join(source_dir, 'repeats_ratios.csv'),
    sep='\t',
    index=False,
)

-------------