In [1]:
# Enable IPython's autoreload extension (mode 2) so that edits to imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

---------------------------

## Config

In [2]:
import sys

In [3]:
# Put the project root on `sys.path` (only once) so `config_analysis` can be imported.
project_dir = '/home/pmonteagudo/workspace/silencing_project'
if project_dir not in sys.path: 
    sys.path.append(project_dir)
# NOTE(review): later cells use `os`, `pd`, `np`, `raw_dir`, `data_dir`, `mut_dict`,
# `seq_category` and `pipeline_type` without defining them in this notebook, so they
# presumably all come from this star import -- confirm against `config_analysis`.
from config_analysis import *

In [4]:
# Needed to import all our code: put the project's `htseq/scripts` directory on
# `sys.path` (only once). Presumably this is where `parse_utils` lives -- confirm.
my_scripts_dir = os.path.join(project_dir, 'htseq/scripts')
if my_scripts_dir not in sys.path:
    sys.path.append(my_scripts_dir)

In [5]:
import parse_utils as psutil

---------------------------

<font color='blue'> **This Notebook parses individual `odf` raw files to generate:** </font>
- `*_valid_samples.csv` files (for each **seq_assay**) 
- `sample_annotation.csv` file

<font color='blue'> **Additionally, it extracts some information and stores it in two separate files, which are needed to run the `sample_names.ipynb` notebook:** </font>

* `trimmed_samples.tsv`
* `chip_input_map.tsv`


---------------------------

## Get **Sample Annotation** from **ODF** Documents

- Get **odf document** files:

In [6]:
# Map every sequencing assay to its spreadsheet in `raw_dir`. The assay name is the part
# of the file name before the first "_" (e.g. 'S2-ChIP_samples.ods' -> 'S2-ChIP').
#odf_docs_dir = os.path.join(project_data_dir, 'raw')
odf_docs_dir = raw_dir
# Match on the file *extension*: a substring test ('.ods' in name) would also pick up
# stray files such as LibreOffice lock files ('.~lock.<name>.ods#') or '<name>.ods.bak'.
odf_files = {
    ff.split("_")[0]: os.path.join(odf_docs_dir, ff)
    for ff in os.listdir(odf_docs_dir)
    if ff.endswith('.ods')
}
odf_files

{'INPUT': '/gcm-lfs1/pablo/data/rna_silencing/raw/INPUT_samples.ods',
 'pA-RNA': '/gcm-lfs1/pablo/data/rna_silencing/raw/pA-RNA_samples.ods',
 'S2-ChIP': '/gcm-lfs1/pablo/data/rna_silencing/raw/S2-ChIP_samples.ods',
 'S2-RIP': '/gcm-lfs1/pablo/data/rna_silencing/raw/S2-RIP_samples.ods',
 'simulated-data': '/gcm-lfs1/pablo/data/rna_silencing/raw/simulated-data_samples.ods',
 'S5-RIP': '/gcm-lfs1/pablo/data/rna_silencing/raw/S5-RIP_samples.ods',
 'S5-ChIP': '/gcm-lfs1/pablo/data/rna_silencing/raw/S5-ChIP_samples.ods',
 'H3K9me2': '/gcm-lfs1/pablo/data/rna_silencing/raw/H3K9me2_samples.ods',
 'total-RNA': '/gcm-lfs1/pablo/data/rna_silencing/raw/total-RNA_samples.ods'}

-  Import **odf document** files:

In [7]:
# Will hold one imported sample table per sequencing assay, keyed like `odf_files`.
odf_dfs = dict()

In [8]:
# Import every spreadsheet; a banner line is printed before and after each import
# so that the log of one file is visually separated from the next.
banner = "#" * 80
for assay, ods_path in odf_files.items():
    print(banner)
    odf_dfs[assay] = psutil.read_odf_doc(ods_path)
    print(banner, '\n')

################################################################################
Importing odf file /gcm-lfs1/pablo/data/rna_silencing/raw/INPUT_samples.ods ... 

Spreadsheet contains 1 sheet(s).
----------------------------------------
   Sheet name : 'Sheet1'
Size of Sheet : (rows=49, cols=19)

Done.
################################################################################ 

################################################################################
Importing odf file /gcm-lfs1/pablo/data/rna_silencing/raw/pA-RNA_samples.ods ... 

Spreadsheet contains 1 sheet(s).
----------------------------------------
   Sheet name : 'Sheet1'
Size of Sheet : (rows=41, cols=23)

Done.
################################################################################ 

################################################################################
Importing odf file /gcm-lfs1/pablo/data/rna_silencing/raw/S2-ChIP_samples.ods ... 

Spreadsheet contains 1 sheet(s).
---------------------------

## **Valid samples**
- **Process** the sample tables
- **Store** tables as .csv

In [9]:
#odf_dfs.items()

In [10]:
#odf_dfs['H3K9me2']
#odf_dfs['S5-RIP']   

In [11]:
#name_sample_csv = {'pa-rna_samples':'pA-RNA' , 'total-rna_samples':'total-RNA', 's2-chip_samples':'S2-ChIP', 's2-rip_samples':'S2-RIP', 'input_samples':'INPUT'}

- Filter each sample df for **valid_samples** only

In [12]:
for key, val in odf_dfs.items():

    print(key)
    # Drop the **last row**: it is not a real sample, it only contains the directory
    # paths used in the **odf document**.
    # NOTE: the processed table is written back into `odf_dfs` below, so this cell is
    # not idempotent -- re-running it without re-importing the odf files first would
    # drop one more row from every table.
    val = val.iloc[:-1]

    # Filter each sample df for **valid_samples** only. `.copy()` makes the result an
    # independent frame, so the column assignments below never write into a slice of
    # the imported table (which is what triggers pandas' SettingWithCopyWarning).
    val = val[val['valid_sample']].copy()

    # Mutant name looked up from its id; ids missing from `mut_dict` become "unknown".
    val['mutant'] = val['mutant_id'].map(mut_dict).fillna("unknown")

    # add `seq_category` col: names for sequencing categories used to categorize samples in directories
    val['seq_category'] = val['seq_type'].map(seq_category)

    # add `pipeline_type` col: distinguishes between 'ChIP' and 'RNA' pipe-line
    val['pipeline_type'] = val['seq_type'].map(pipeline_type)

    # One tab-separated table *without header row* per assay:
    # <data_dir>/<assay>/<assay>_valid_samples.csv
    csv_file = os.path.join(data_dir, key, key + '_valid_samples.csv')
    val.to_csv(csv_file, sep="\t", header=False, index=False)

    # Keep the processed table for the pooling step further down.
    odf_dfs[key] = val


INPUT
pA-RNA
S2-ChIP
S2-RIP
simulated-data
S5-RIP
S5-ChIP
H3K9me2
total-RNA


## **Pool samples** into one Table

- Concatenate all different sequencing sample dataframes into one:

In [13]:
# Column layout of the pooled sample-annotation table, assembled group by group.
col_order = (
    ['sample_id', 'valid_sample', 'original_id']
    + ['raw_file', 'original_raw_file']
    + ['raw_data', 'fastq', 'trimmed']
    + ['seq_type', 'mutant_id', 'replicate', 'batch']
    + ['file', 'halic_server', 'parastous_desktop', 'halic_local_dir']
    # INPUT / OIN samples associated with a ChIP sample
    + ['INPUT_1', 'INPUT_2', 'INPUT_3', 'OIN']
    + ['comments']
    # columns derived in this notebook while filtering the valid samples
    + ['mutant', 'pipeline_type', 'seq_category']
)

In [14]:
# Stack the per-assay tables into a single frame with a fresh 0..n-1 index, then keep
# only the columns listed in `col_order`, in that order.
all_samples_df = pd.concat(
    odf_dfs,
    sort=True,
    ignore_index=True,
)
all_samples_df = all_samples_df[col_order]
all_samples_df.head()

Unnamed: 0,sample_id,valid_sample,original_id,raw_file,original_raw_file,raw_data,fastq,trimmed,seq_type,mutant_id,...,parastous_desktop,halic_local_dir,INPUT_1,INPUT_2,INPUT_3,OIN,comments,mutant,pipeline_type,seq_category
0,1022_S2-ChIP-INPUT_1,True,1022_1_INPUT,1022_S2-ChIP-INPUT_1.fastq.bz2,1022_1_INPUT.fastq.bz2,True,True,False,S2-ChIP-INPUT,1022,...,/data/parastou/RNAdeg/revision/Sequencing_New/...,Revision/Sequencing_Revision/ChIP/INPUT/,,,,,(MOVED),mot2d,ChIP,INPUT
1,1022_S2-ChIP-OIN_1,True,1022_OIN,1022_S2-ChIP-OIN_1.fastq.bz2,1022_OIN.fastq.bz2,True,True,False,S2-ChIP-OIN,1022,...,/data/parastou/RNAdeg/revision/Sequencing_New/...,Revision/Sequencing_Revision/ChIP/INPUT/,,,,,(MOVED),mot2d,ChIP,INPUT
2,1168_S2-ChIP-OIN_1,True,1168_OIN,1168_S2-ChIP-OIN_1.fastq.bz2,1168_OIN.fastq.bz2,True,True,False,S2-ChIP-OIN,1168,...,/data/parastou/RNAdeg/revision/Sequencing_New/...,Revision/Sequencing_Revision/ChIP/INPUT/,,,,,(MOVED),caf1d*ccr4d*,ChIP,INPUT
3,301_S2-ChIP-INPUT_1,True,301_INPUT,301_S2-ChIP-INPUT_1.fastq.bz2,301_INPUT.tar.bz2,True,True,False,S2-ChIP-INPUT,301,...,/data/parastou/RNAdeg/revision/INPUTs/more/301...,Revision/INPUTs/,,,,,.tar.bz2 – whole directory,swi6d,ChIP,INPUT
4,301_S2-ChIP-OIN_1,True,301_OIN,301_S2-ChIP-OIN_1.fastq.bz2,301_OIN.fastq.bz2,True,True,False,S2-ChIP-OIN,301,...,/data/parastou/RNAdeg/revision/Sequencing_New/...,Revision/Sequencing_Revision/ChIP/INPUT/,,,,,(MOVED),swi6d,ChIP,INPUT


In [15]:
#all_samples_df[all_samples_df['sample_id'].str.contains('WT')]

In [32]:
#all_samples_df[all_samples_df['sample_id'].str.contains('H3K9me2')]

In [17]:
# (number of pooled valid samples, number of columns in `col_order`)
all_samples_df.shape

(154, 24)

- **Store Data Frame** containing annotation of all samples:

In [18]:
# The pooled annotation table is written directly into `data_dir`.
#sample_annotation_file = os.path.join(project_data_dir, 'seq_data', 'sample_annotation.csv')
sample_annotation_file = os.path.join(
    data_dir,
    'sample_annotation.csv',
)
sample_annotation_file

'/gcm-lfs1/pablo/data/rna_silencing/seq_data/sample_annotation.csv'

In [19]:
# Write the pooled table tab-separated, with a header row and without the index column.
all_samples_df.to_csv(
    sample_annotation_file,
    index=False,
    sep="\t",
)

In [20]:
#raw_files_df[['raw_file_name', 'data_dir', 'seq']].groupby(['data_dir', 'seq']).count()

### Summary **Samples**

- Summarize by `seq_type` and `pipeline_type`

In [21]:
# Number of samples for every (seq_type, pipeline_type) combination.
all_samples_df.groupby(['seq_type', 'pipeline_type'])[['sample_id']].count()

Unnamed: 0_level_0,Unnamed: 1_level_0,sample_id
seq_type,pipeline_type,Unnamed: 2_level_1
H3K9me2,ChIP,9
S2-ChIP,ChIP,30
S2-ChIP-INPUT,ChIP,20
S2-ChIP-OIN,ChIP,10
S2-RIP,RNA,37
S5-ChIP,ChIP,4
S5-RIP,RNA,1
pA-RNA,RNA,34
simulated-data,simulated-data,1
total-RNA,RNA,8


- Summarize by `mutant_id` and  `mutant` 

In [22]:
# Number of samples for every (mutant_id, mutant) combination.
all_samples_df.groupby(['mutant_id', 'mutant'])[['sample_id']].count()
#all_samples_df[['sample_id', 'mutant']].groupby(['mutant']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,sample_id
mutant_id,mutant,Unnamed: 2_level_1
1022,mot2d,9
1023,mot2d,1
1168,caf1d*ccr4d*,7
301,swi6d,11
302,clr3d,10
324,chp2d,12
491,mit1d,10
504,rrp6d,10
510,caf1d,10
523,unknown,6


- Summarize by `seq_type` and  `mutant` - **should have at least 2 replicates for each**

In [23]:
# Number of samples (i.e. replicates) for every (seq_type, mutant) combination.
all_samples_df.groupby(['seq_type', 'mutant'])[['sample_id']].count()

Unnamed: 0_level_0,Unnamed: 1_level_0,sample_id
seq_type,mutant,Unnamed: 2_level_1
H3K9me2,ago1d,1
H3K9me2,caf1d,1
H3K9me2,chp2d,1
H3K9me2,clr3d,1
H3K9me2,exo2d,1
...,...,...
simulated-data,fake-reads,1
total-RNA,ago1d,2
total-RNA,caf1d,2
total-RNA,clr4d,2


### Store manual data **Samples**

- Store the `trimmed` column: if `True`, the sample needs to be **trimmed** to avoid the error below

<font color='red'> **Error** - EXITING because of FATAL ERROR in reads input: short read sequence line: 1 - Need to trim reads: </font>

In [24]:
# The list of samples that need trimming is stored next to the raw spreadsheets.
trimmed_samples_file = os.path.join(
    raw_dir,
    'trimmed_samples.tsv',
)
trimmed_samples_file

'/gcm-lfs1/pablo/data/rna_silencing/raw/trimmed_samples.tsv'

In [25]:
# Select the samples flagged as `trimmed` and store their ids.
# The flag may arrive as a real boolean or as the spreadsheet text 'TRUE'. Comparing the
# column directly with the string 'TRUE' never matches a boolean True, which would
# silently produce an empty file, so the values are normalised to upper-case text first.
is_trimmed = all_samples_df['trimmed'].astype(str).str.strip().str.upper() == 'TRUE'
trimmed_samples = all_samples_df[is_trimmed]
trimmed_samples[['sample_id', 'trimmed']].to_csv(trimmed_samples_file, sep ='\t', index=False)

- Store `INPUT_*` columns: associate ChIP samples with **corresponding `INPUT`**

<font color='red'> **Note**: only `pipeline_type = ChIP` samples (excluding the `INPUT` and `simulated-data` categories) will have an associated `INPUT`; all others will contain NAs. </font>

In [26]:
## Sequencing categories that are excluded from the ChIP -> INPUT map below: the INPUT
## samples themselves, and 'ChIP'-like samples that do NOT need INPUT subtraction
## (e.g. 'simulated-data').
## 'H3K9me2' used to be listed here as well, but INPUT is now also subtracted from the
## H3K9me2 samples.
#input_types = ['INPUT', 'H3K9me2', 'simulated-data']
input_types = [
    'INPUT',
    'simulated-data',
]

In [27]:
# The ChIP -> INPUT association table is stored next to the raw spreadsheets.
chip_input_map_file = os.path.join(
    raw_dir,
    'chip_input_map.tsv',
)
chip_input_map_file

'/gcm-lfs1/pablo/data/rna_silencing/raw/chip_input_map.tsv'

In [28]:
# ChIP + H3K9me2: keep the samples of the 'ChIP' pipeline whose sequencing category is
# not listed in `input_types`, together with their associated INPUT / OIN samples.
input_cols = ['INPUT_1', 'INPUT_2', 'INPUT_3', 'OIN']
chip_input_map = all_samples_df[['sample_id', 'pipeline_type', 'seq_category'] + input_cols]
chip_input_map = chip_input_map[
    (chip_input_map['pipeline_type'] == 'ChIP')
    & (~chip_input_map['seq_category'].isin(input_types))
]

# Replace the spreadsheet's '#N/A' marker with a real NaN for downstream analysis.
# `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0.
chip_input_map = chip_input_map.replace('#N/A', np.nan)

# Sanity check: every remaining sample must be linked to at least one INPUT / OIN sample.
non_missing = chip_input_map[input_cols].dropna(how='all')
assert len(non_missing) == len(chip_input_map), \
    "some ChIP samples have no associated INPUT/OIN sample"

chip_input_map[['sample_id'] + input_cols].to_csv(chip_input_map_file, sep ='\t', index=False)

In [29]:
# Show the INPUT association of the H3K9me2 samples only.
is_h3k9me2 = chip_input_map['sample_id'].str.contains('H3K9me2')
chip_input_map[is_h3k9me2]

Unnamed: 0,sample_id,pipeline_type,seq_category,INPUT_1,INPUT_2,INPUT_3,OIN
137,WT_H3K9me2_1,ChIP,H3K9me2,WT_S2-ChIP-INPUT_1,,,
138,301_H3K9me2_1,ChIP,H3K9me2,301_S2-ChIP-INPUT_1,,,
139,302_H3K9me2_1,ChIP,H3K9me2,302_S2-ChIP-INPUT_1,,,
140,324_H3K9me2_1,ChIP,H3K9me2,324_S2-ChIP-INPUT_1,,,
141,491_H3K9me2_1,ChIP,H3K9me2,491_S2-ChIP-INPUT_1,,,
142,504_H3K9me2_1,ChIP,H3K9me2,504_S2-ChIP-INPUT_1,,,
143,510_H3K9me2_1,ChIP,H3K9me2,510_S2-ChIP-INPUT_1,,,
144,530_H3K9me2_1,ChIP,H3K9me2,530_S2-ChIP-INPUT_1,,,
145,638_H3K9me2_1,ChIP,H3K9me2,638_S2-ChIP-INPUT_1,,,


# Delete `'valid_samples' = False`

In [30]:
#raw_files_annotation_file = os.path.join(data_dir, 'file_annotation.csv')
#raw_files_df = pd.read_csv(raw_files_annotation_file, sep="\t")
#raw_files_df

In [31]:
#not_valid_samples = set(raw_files_df['sample_id']).symmetric_difference(all_samples_df['sample_id'])
#not_valid_samples