In [1]:
%load_ext autoreload
%autoreload 2

---------------------------

## Config

In [2]:
import sys

In [3]:
# `os` and `pd` are used throughout this notebook; import them explicitly so
# the notebook does not depend on the star import below leaking them into the
# namespace.
import os

import pandas as pd

# Make the project root importable so that `config_analysis` can be found.
project_dir = '/home/pmonteagudo/workspace/silencing_project'
if project_dir not in sys.path:
    sys.path.append(project_dir)
# NOTE(review): later cells use names that are not defined in this notebook
# (e.g. `data_dir`, `raw_dir`, `seq_category`, `pipeline_type`, `mut_dict`);
# presumably they all come from this star import - confirm.
from config_analysis import *

In [4]:
# Register the project's script directory on the import path (once) so that
# our own helper modules can be imported by the following cells.
my_scripts_dir = os.path.join(project_dir, 'htseq/scripts')
if sys.path.count(my_scripts_dir) == 0:
    sys.path.append(my_scripts_dir)

In [5]:
import parse_utils as psutil

---------------------------

<font color='blue'> **This Notebook generates the `file_annotation.csv` file that will be used by Snakemake!** </font>

Before running this Notebook, two more files are required:
* `trimmed_samples.tsv`
* `chip_input_map.tsv`

These files contain **"manual" information** about the data present in the `odf` raw files, which can be extracted by running the `sample_tables.ipynb` **Notebook**.

---------------------------

## Get **Sample List** from `data` **Directory**

- Get **sample information**:

<font color='red'> **Attention! Not all samples present in `seq_data` are going to be used in the analysis.** </font>
 
<font color='red'> **=> Manually add samples that need to be ignored.** </font>

In [6]:
# `not_valid` samples: files whose name starts with one of these ids are
# skipped when the data directory is scanned. This has to stay a tuple because
# it is handed directly to `str.startswith`.
ignore_files = (
    # correlation with other replicates ~0.73 < 0.8
    '491_S2-ChIP_3',
    '324_S2-ChIP_3',
    '504_S2-ChIP_3',
    '638_S2-ChIP_3',
    'WT_S2-ChIP_2',
    '544_S2-RIP_2',
    # S5-data is very noisy
    '80_S5-RIP_1',
    'WT_S5-RIP_1',
    'WT_S5-RIP_3',
)
len(ignore_files)

9

In [7]:
#data_dir = '/gcm-lfs1/pablo/data/RNAdeg/data'

### **Raw Files** (compressed `fastq` files): `.fastq.bz2`

These are `.fastq.bz2` files that have been transferred from the `Halic Server`.

Depending on the *sequencing assay*, each `sample` has been classified into one of the following categories.

The main categories are:
* **INPUT**
* **S2-ChIP**
* **S2-RIP**
* **pA-RNA**
* **total-RNA**

other less important categories include:
* **H3K9me2**
* **S5-ChIP**
* **S5-RIP**

An additional category was created (e.g. `simulated-data`) that needs to be treated slightly differently:
* **simulated-data**

**Raw Files** (compressed `fastq` files): `.fastq.bz2`

In [8]:
raw_files_list = []
ext = '.fastq.bz2'

# Walk `data_dir` recursively ('.fastq.bz2' files are inside individual
# directories) and collect the full path of every file that has the expected
# extension and whose name does not start with any id in `ignore_files`.
for root, dirs, files in os.walk(data_dir, topdown=True):
    raw_files_list.extend(
        os.path.join(root, name)
        for name in files
        if name.endswith(ext) and not name.startswith(ignore_files)
    )

In [9]:
#raw_files_list
len(raw_files_list) # 150/157

157

- Instantiate a DataFrame with: `[sample_id, raw_file_name, raw_file]`

In [10]:
# One row per raw file: full path, bare file name, and the sample id, which is
# the part of the file name before the first '.'.
raw_file_names = [os.path.basename(raw_path) for raw_path in raw_files_list]
raw_files_df = pd.DataFrame({
    'raw_file': raw_files_list,
    'raw_file_name': raw_file_names,
    'sample_id': [file_name.split('.')[0] for file_name in raw_file_names],
})

In [11]:
# Put the identifier columns first, order the rows by `sample_id` and rebuild
# a clean positional index. `drop` must be the boolean `True`: the string
# 'True' only worked because any non-empty string is truthy (so would 'False').
raw_files_df = (
    raw_files_df[['sample_id', 'raw_file_name', 'raw_file']]
    .sort_values('sample_id')
    .reset_index(drop=True)
)

In [12]:
raw_files_df.head()

Unnamed: 0,sample_id,raw_file_name,raw_file
0,1022_S2-ChIP-INPUT_1,1022_S2-ChIP-INPUT_1.fastq.bz2,/gcm-lfs1/pablo/data/rna_silencing/seq_data/IN...
1,1022_S2-ChIP-OIN_1,1022_S2-ChIP-OIN_1.fastq.bz2,/gcm-lfs1/pablo/data/rna_silencing/seq_data/IN...
2,1022_S2-ChIP_1,1022_S2-ChIP_1.fastq.bz2,/gcm-lfs1/pablo/data/rna_silencing/seq_data/S2...
3,1022_S2-ChIP_2,1022_S2-ChIP_2.fastq.bz2,/gcm-lfs1/pablo/data/rna_silencing/seq_data/S2...
4,1022_S2-RIP_2,1022_S2-RIP_2.fastq.bz2,/gcm-lfs1/pablo/data/rna_silencing/seq_data/S2...


In [13]:
raw_files_df.shape

(157, 3)

- Add `trimmed` column: if `True`, the sample needs to be **trimmed** to solve an error during pipeline execution:

<font color='red'> Error - EXITING because of FATAL ERROR in reads input: short read sequence line: 1 - Need to trim reads: </font>

In [14]:
trimmed_samples_file = os.path.join(raw_dir, 'trimmed_samples.tsv')
trimmed_samples_file

'/gcm-lfs1/pablo/data/rna_silencing/raw/trimmed_samples.tsv'

In [15]:
# Read the tab-separated table of samples that need trimming and keep only
# its `sample_id` column as a plain Python list.
trimmed_samples_df = pd.read_csv(trimmed_samples_file, sep='\t')
trimmed_sample_ids = trimmed_samples_df['sample_id'].tolist()

In [16]:
# Flag the samples listed in `trimmed_sample_ids`. `isin` is the vectorized
# membership test and already yields booleans, so no per-row lambda with a
# redundant `True if ... else False` is needed.
raw_files_df['trimmed'] = raw_files_df['sample_id'].isin(trimmed_sample_ids)

In [17]:
raw_files_df.head()

Unnamed: 0,sample_id,raw_file_name,raw_file,trimmed
0,1022_S2-ChIP-INPUT_1,1022_S2-ChIP-INPUT_1.fastq.bz2,/gcm-lfs1/pablo/data/rna_silencing/seq_data/IN...,False
1,1022_S2-ChIP-OIN_1,1022_S2-ChIP-OIN_1.fastq.bz2,/gcm-lfs1/pablo/data/rna_silencing/seq_data/IN...,False
2,1022_S2-ChIP_1,1022_S2-ChIP_1.fastq.bz2,/gcm-lfs1/pablo/data/rna_silencing/seq_data/S2...,False
3,1022_S2-ChIP_2,1022_S2-ChIP_2.fastq.bz2,/gcm-lfs1/pablo/data/rna_silencing/seq_data/S2...,False
4,1022_S2-RIP_2,1022_S2-RIP_2.fastq.bz2,/gcm-lfs1/pablo/data/rna_silencing/seq_data/S2...,False


In [18]:
raw_files_df.shape

(157, 4)

In [19]:
# Number of samples per value of the `trimmed` flag.
raw_files_df.groupby(['trimmed'])[['sample_id']].count()

Unnamed: 0_level_0,sample_id
trimmed,Unnamed: 1_level_1
False,140
True,17


- Add `seq_type` column: names of **sequencing assays** and some additional meta-data are used to categorize samples

<font color='red'> **Note:** This is the finest of the classifications by **sequencing assay** (e.g. `seq_type`). </font> 

<font color='red'> Files inside the same *sequencing assay* directory might be slightly different and we want to be able to distinguish between them: </font> <br> e.g. `INPUT` - `S2-ChIP-INPUT`/`S2-ChIP-OIN`

In [20]:
# The sequencing assay is the second '_'-separated field of `sample_id`.
raw_files_df['seq_type'] = [sample.split('_')[1] for sample in raw_files_df['sample_id']]

In [21]:
raw_files_df.head()

Unnamed: 0,sample_id,raw_file_name,raw_file,trimmed,seq_type
0,1022_S2-ChIP-INPUT_1,1022_S2-ChIP-INPUT_1.fastq.bz2,/gcm-lfs1/pablo/data/rna_silencing/seq_data/IN...,False,S2-ChIP-INPUT
1,1022_S2-ChIP-OIN_1,1022_S2-ChIP-OIN_1.fastq.bz2,/gcm-lfs1/pablo/data/rna_silencing/seq_data/IN...,False,S2-ChIP-OIN
2,1022_S2-ChIP_1,1022_S2-ChIP_1.fastq.bz2,/gcm-lfs1/pablo/data/rna_silencing/seq_data/S2...,False,S2-ChIP
3,1022_S2-ChIP_2,1022_S2-ChIP_2.fastq.bz2,/gcm-lfs1/pablo/data/rna_silencing/seq_data/S2...,False,S2-ChIP
4,1022_S2-RIP_2,1022_S2-RIP_2.fastq.bz2,/gcm-lfs1/pablo/data/rna_silencing/seq_data/S2...,False,S2-RIP


In [22]:
raw_files_df.shape

(157, 5)

- Add `seq_category` column: names for **sequencing assays** are used to categorize samples into directories

In [23]:
#raw_files_df['seq_category'] = raw_files_df['raw_file'].apply(lambda x: psutil.get_data_dir(x, data_dir))
# Translate each `seq_type` into its (coarser) category through `seq_category`.
# NOTE(review): `seq_category` is not defined in this notebook; presumably a
# mapping provided by `config_analysis` - confirm.
raw_files_df['seq_category'] = raw_files_df['seq_type'].map(seq_category)

In [24]:
raw_files_df.head()

Unnamed: 0,sample_id,raw_file_name,raw_file,trimmed,seq_type,seq_category
0,1022_S2-ChIP-INPUT_1,1022_S2-ChIP-INPUT_1.fastq.bz2,/gcm-lfs1/pablo/data/rna_silencing/seq_data/IN...,False,S2-ChIP-INPUT,INPUT
1,1022_S2-ChIP-OIN_1,1022_S2-ChIP-OIN_1.fastq.bz2,/gcm-lfs1/pablo/data/rna_silencing/seq_data/IN...,False,S2-ChIP-OIN,INPUT
2,1022_S2-ChIP_1,1022_S2-ChIP_1.fastq.bz2,/gcm-lfs1/pablo/data/rna_silencing/seq_data/S2...,False,S2-ChIP,S2-ChIP
3,1022_S2-ChIP_2,1022_S2-ChIP_2.fastq.bz2,/gcm-lfs1/pablo/data/rna_silencing/seq_data/S2...,False,S2-ChIP,S2-ChIP
4,1022_S2-RIP_2,1022_S2-RIP_2.fastq.bz2,/gcm-lfs1/pablo/data/rna_silencing/seq_data/S2...,False,S2-RIP,S2-RIP


In [25]:
raw_files_df.shape

(157, 6)

- Add `pipeline_type` column: distinguishes between `ChIP` and `RNA` **pipe-line**

In [26]:
# Translate each `seq_type` into the pipeline that processes it, through `pipeline_type`.
# NOTE(review): `pipeline_type` is not defined in this notebook; presumably a
# mapping provided by `config_analysis` - confirm.
raw_files_df['pipeline_type'] = raw_files_df['seq_type'].map(pipeline_type)

In [27]:
raw_files_df.head()

Unnamed: 0,sample_id,raw_file_name,raw_file,trimmed,seq_type,seq_category,pipeline_type
0,1022_S2-ChIP-INPUT_1,1022_S2-ChIP-INPUT_1.fastq.bz2,/gcm-lfs1/pablo/data/rna_silencing/seq_data/IN...,False,S2-ChIP-INPUT,INPUT,ChIP
1,1022_S2-ChIP-OIN_1,1022_S2-ChIP-OIN_1.fastq.bz2,/gcm-lfs1/pablo/data/rna_silencing/seq_data/IN...,False,S2-ChIP-OIN,INPUT,ChIP
2,1022_S2-ChIP_1,1022_S2-ChIP_1.fastq.bz2,/gcm-lfs1/pablo/data/rna_silencing/seq_data/S2...,False,S2-ChIP,S2-ChIP,ChIP
3,1022_S2-ChIP_2,1022_S2-ChIP_2.fastq.bz2,/gcm-lfs1/pablo/data/rna_silencing/seq_data/S2...,False,S2-ChIP,S2-ChIP,ChIP
4,1022_S2-RIP_2,1022_S2-RIP_2.fastq.bz2,/gcm-lfs1/pablo/data/rna_silencing/seq_data/S2...,False,S2-RIP,S2-RIP,RNA


In [28]:
raw_files_df.shape

(157, 7)

In [29]:
# Sanity check: list the samples whose assay could not be classified.
# `seq_type` is just a token taken from the file name, so comparing it to
# 'unknown' alone cannot detect a type that failed to map. Also report rows
# where any classification column is 'unknown' or missing, because rows with a
# missing group key are dropped by default in later `groupby` summaries.
class_cols = ['seq_type', 'seq_category', 'pipeline_type']
unclassified = (
    raw_files_df[class_cols].eq('unknown').any(axis=1)
    | raw_files_df[class_cols].isna().any(axis=1)
)
raw_files_df[unclassified]

Unnamed: 0,sample_id,raw_file_name,raw_file,trimmed,seq_type,seq_category,pipeline_type


- Add `mutant_id` to classify each sample by **mutant strain id**:

In [30]:
# The mutant strain id is the first '_'-separated field of `sample_id`.
raw_files_df['mutant_id'] = [sample.split('_')[0] for sample in raw_files_df['sample_id']]

In [31]:
raw_files_df.head()

Unnamed: 0,sample_id,raw_file_name,raw_file,trimmed,seq_type,seq_category,pipeline_type,mutant_id
0,1022_S2-ChIP-INPUT_1,1022_S2-ChIP-INPUT_1.fastq.bz2,/gcm-lfs1/pablo/data/rna_silencing/seq_data/IN...,False,S2-ChIP-INPUT,INPUT,ChIP,1022
1,1022_S2-ChIP-OIN_1,1022_S2-ChIP-OIN_1.fastq.bz2,/gcm-lfs1/pablo/data/rna_silencing/seq_data/IN...,False,S2-ChIP-OIN,INPUT,ChIP,1022
2,1022_S2-ChIP_1,1022_S2-ChIP_1.fastq.bz2,/gcm-lfs1/pablo/data/rna_silencing/seq_data/S2...,False,S2-ChIP,S2-ChIP,ChIP,1022
3,1022_S2-ChIP_2,1022_S2-ChIP_2.fastq.bz2,/gcm-lfs1/pablo/data/rna_silencing/seq_data/S2...,False,S2-ChIP,S2-ChIP,ChIP,1022
4,1022_S2-RIP_2,1022_S2-RIP_2.fastq.bz2,/gcm-lfs1/pablo/data/rna_silencing/seq_data/S2...,False,S2-RIP,S2-RIP,RNA,1022


In [32]:
raw_files_df.shape

(157, 8)

- Add `mutant_name` to classify each sample by **mutant strain name**:

In [33]:
# Look up the strain name of each `mutant_id` in `mut_dict`; ids without an
# entry are labelled "unknown".
raw_files_df['mutant_name'] = raw_files_df['mutant_id'].map(mut_dict).fillna("unknown")

In [34]:
raw_files_df.head()

Unnamed: 0,sample_id,raw_file_name,raw_file,trimmed,seq_type,seq_category,pipeline_type,mutant_id,mutant_name
0,1022_S2-ChIP-INPUT_1,1022_S2-ChIP-INPUT_1.fastq.bz2,/gcm-lfs1/pablo/data/rna_silencing/seq_data/IN...,False,S2-ChIP-INPUT,INPUT,ChIP,1022,mot2d
1,1022_S2-ChIP-OIN_1,1022_S2-ChIP-OIN_1.fastq.bz2,/gcm-lfs1/pablo/data/rna_silencing/seq_data/IN...,False,S2-ChIP-OIN,INPUT,ChIP,1022,mot2d
2,1022_S2-ChIP_1,1022_S2-ChIP_1.fastq.bz2,/gcm-lfs1/pablo/data/rna_silencing/seq_data/S2...,False,S2-ChIP,S2-ChIP,ChIP,1022,mot2d
3,1022_S2-ChIP_2,1022_S2-ChIP_2.fastq.bz2,/gcm-lfs1/pablo/data/rna_silencing/seq_data/S2...,False,S2-ChIP,S2-ChIP,ChIP,1022,mot2d
4,1022_S2-RIP_2,1022_S2-RIP_2.fastq.bz2,/gcm-lfs1/pablo/data/rna_silencing/seq_data/S2...,False,S2-RIP,S2-RIP,RNA,1022,mot2d


In [35]:
raw_files_df.shape

(157, 9)

- Add `replicate` to keep track of each **mutant replicate**:

In [36]:
# The replicate number is the last '_'-separated field of `sample_id` (kept as a string).
raw_files_df['replicate'] = [sample.split('_')[-1] for sample in raw_files_df['sample_id']]

In [37]:
raw_files_df.head()

Unnamed: 0,sample_id,raw_file_name,raw_file,trimmed,seq_type,seq_category,pipeline_type,mutant_id,mutant_name,replicate
0,1022_S2-ChIP-INPUT_1,1022_S2-ChIP-INPUT_1.fastq.bz2,/gcm-lfs1/pablo/data/rna_silencing/seq_data/IN...,False,S2-ChIP-INPUT,INPUT,ChIP,1022,mot2d,1
1,1022_S2-ChIP-OIN_1,1022_S2-ChIP-OIN_1.fastq.bz2,/gcm-lfs1/pablo/data/rna_silencing/seq_data/IN...,False,S2-ChIP-OIN,INPUT,ChIP,1022,mot2d,1
2,1022_S2-ChIP_1,1022_S2-ChIP_1.fastq.bz2,/gcm-lfs1/pablo/data/rna_silencing/seq_data/S2...,False,S2-ChIP,S2-ChIP,ChIP,1022,mot2d,1
3,1022_S2-ChIP_2,1022_S2-ChIP_2.fastq.bz2,/gcm-lfs1/pablo/data/rna_silencing/seq_data/S2...,False,S2-ChIP,S2-ChIP,ChIP,1022,mot2d,2
4,1022_S2-RIP_2,1022_S2-RIP_2.fastq.bz2,/gcm-lfs1/pablo/data/rna_silencing/seq_data/S2...,False,S2-RIP,S2-RIP,RNA,1022,mot2d,2


In [38]:
raw_files_df.shape

(157, 10)

- Add `INPUT_*` columns: associate ChIP samples with **corresponding `INPUT`**

<font color='red'> **Note**: only `seq_category = ChIP` samples will have an associated `INPUT`; all others will contain NAs. </font>

<font color='red'> Needs external annotation file: </font> `chip_input_map.tsv`

In [39]:
## Note: `input_types` refers to INPUT samples, but we also process 'ChIP'-like
## samples that do NOT need 'INPUT subtraction' in the same fashion
## (e.g. 'H3K9me2', 'simulated-data').
input_types = [
    'INPUT',
    'H3K9me2',
    'simulated-data',
]

In [40]:
chip_input_map_file = os.path.join(raw_dir, 'chip_input_map.tsv')
chip_input_map_file

'/gcm-lfs1/pablo/data/rna_silencing/raw/chip_input_map.tsv'

In [41]:
#chip_input_map_df = pd.read_csv(chip_input_map_file, sep ='\t', dtype='str')
chip_input_map_df = pd.read_csv(chip_input_map_file, sep ='\t')
chip_input_map_df.head()

Unnamed: 0,sample_id,INPUT_1,INPUT_2,INPUT_3,OIN
0,1022_S2-ChIP_1,1022_S2-ChIP-INPUT_1,,,1022_S2-ChIP-OIN_1
1,1022_S2-ChIP_2,1022_S2-ChIP-INPUT_1,,,1022_S2-ChIP-OIN_1
2,1168_S2-ChIP_1,,,,1168_S2-ChIP-OIN_1
3,1168_S2-ChIP_2,,,,1168_S2-ChIP-OIN_1
4,301_S2-ChIP_1,301_S2-ChIP-INPUT_1,,,301_S2-ChIP-OIN_1


In [42]:
#raw_files_df.dtypes

In [43]:
#chip_input_map_df.dtypes

In [44]:
n_samples = len(raw_files_df)
# Drop INPUT/OIN columns left over from a previous execution of this cell, so
# that re-running it does not produce suffixed duplicates (`*_x` / `*_y`).
input_cols = [col for col in chip_input_map_df.columns if col != 'sample_id']
raw_files_df = (
    raw_files_df
    .drop(columns=input_cols, errors='ignore')
    .merge(chip_input_map_df, how='left', on='sample_id')
)
# A left join keeps every sample; it can only gain rows when the map lists a
# `sample_id` more than once, which would duplicate samples in the annotation.
assert n_samples == len(raw_files_df), (
    "merge with chip_input_map changed the number of samples "
    "(%d before, %d after): duplicated `sample_id` in the map?"
    % (n_samples, len(raw_files_df))
)

In [45]:
raw_files_df.head()

Unnamed: 0,sample_id,raw_file_name,raw_file,trimmed,seq_type,seq_category,pipeline_type,mutant_id,mutant_name,replicate,INPUT_1,INPUT_2,INPUT_3,OIN
0,1022_S2-ChIP-INPUT_1,1022_S2-ChIP-INPUT_1.fastq.bz2,/gcm-lfs1/pablo/data/rna_silencing/seq_data/IN...,False,S2-ChIP-INPUT,INPUT,ChIP,1022,mot2d,1,,,,
1,1022_S2-ChIP-OIN_1,1022_S2-ChIP-OIN_1.fastq.bz2,/gcm-lfs1/pablo/data/rna_silencing/seq_data/IN...,False,S2-ChIP-OIN,INPUT,ChIP,1022,mot2d,1,,,,
2,1022_S2-ChIP_1,1022_S2-ChIP_1.fastq.bz2,/gcm-lfs1/pablo/data/rna_silencing/seq_data/S2...,False,S2-ChIP,S2-ChIP,ChIP,1022,mot2d,1,1022_S2-ChIP-INPUT_1,,,1022_S2-ChIP-OIN_1
3,1022_S2-ChIP_2,1022_S2-ChIP_2.fastq.bz2,/gcm-lfs1/pablo/data/rna_silencing/seq_data/S2...,False,S2-ChIP,S2-ChIP,ChIP,1022,mot2d,2,1022_S2-ChIP-INPUT_1,,,1022_S2-ChIP-OIN_1
4,1022_S2-RIP_2,1022_S2-RIP_2.fastq.bz2,/gcm-lfs1/pablo/data/rna_silencing/seq_data/S2...,False,S2-RIP,S2-RIP,RNA,1022,mot2d,2,,,,


In [46]:
raw_files_df.shape

(157, 14)

- Summarize by `pipeline_type` and `seq_category`

In [47]:
# Count samples per (`pipeline_type`, `seq_category`) pair and show the
# result in descending order of pipeline and then of sample count.
samples_by_seq_categoryc = (
    raw_files_df
    .groupby(['pipeline_type', 'seq_category'])[['sample_id']]
    .count()
)
samples_by_seq_categoryc.sort_values(by=['pipeline_type', 'sample_id'], ascending=False)

Unnamed: 0_level_0,Unnamed: 1_level_0,sample_id
pipeline_type,seq_category,Unnamed: 2_level_1
simulated-data,simulated-data,4
RNA,S2-RIP,37
RNA,pA-RNA,34
RNA,total-RNA,8
RNA,S5-RIP,1
ChIP,INPUT,30
ChIP,S2-ChIP,30
ChIP,H3K9me2,9
ChIP,S5-ChIP,4


- **Store Data Frame** containing annotation of all: `raw_files_df`

In [48]:
#raw_files_annotation_file = os.path.join(project_data_dir, 'seq_data', 'file_annotation.csv')
raw_files_annotation_file = os.path.join(data_dir, 'file_annotation.csv')
raw_files_annotation_file

'/gcm-lfs1/pablo/data/rna_silencing/seq_data/file_annotation.csv'

In [49]:
# Write the annotation table without the row index.
# NOTE: the file is TAB-separated even though its name ends in `.csv`.
raw_files_df.to_csv(
    raw_files_annotation_file,
    index=False,
    sep="\t",
)

---------------------------------

 <font color='red'> **(NOT SURE WHAT THIS IS FOR)** </font>


### **FASTQ Files**: `.fastq`

- **FASTQ Files**: `.fastq`

In [50]:
fastq_files_list = []
ext = '.fastq'

## Walk `data_dir` recursively and collect the full path of every
## uncompressed '.fastq' file (names ending in '.fastq.bz2' do not match)
## whose name does not start with any id in `ignore_files`.
for root, dirs, files in os.walk(data_dir, topdown=True):
    fastq_files_list.extend(
        os.path.join(root, name)
        for name in files
        if name.endswith(ext) and not name.startswith(ignore_files)
    )

In [51]:
# One row per FASTQ file: full path, bare file name, and the sample id, which
# is the part of the file name before the first '.'.
fastq_file_names = [os.path.basename(fastq_path) for fastq_path in fastq_files_list]
fastq_files_df = pd.DataFrame({
    'fastq_file': fastq_files_list,
    'fastq_file_name': fastq_file_names,
    'sample_id': [file_name.split('.')[0] for file_name in fastq_file_names],
})

In [52]:
# Put the identifier columns first, order the rows by `sample_id` and rebuild
# a clean positional index. `drop` must be the boolean `True`: the string
# 'True' only worked because any non-empty string is truthy (so would 'False').
fastq_files_df = (
    fastq_files_df[['sample_id', 'fastq_file_name', 'fastq_file']]
    .sort_values('sample_id')
    .reset_index(drop=True)
)

In [53]:
fastq_files_df.head()

Unnamed: 0,sample_id,fastq_file_name,fastq_file
0,1022_S2-ChIP-INPUT_1,1022_S2-ChIP-INPUT_1.fastq,/gcm-lfs1/pablo/data/rna_silencing/seq_data/IN...
1,1022_S2-ChIP-OIN_1,1022_S2-ChIP-OIN_1.fastq,/gcm-lfs1/pablo/data/rna_silencing/seq_data/IN...
2,1022_S2-ChIP_1,1022_S2-ChIP_1.fastq,/gcm-lfs1/pablo/data/rna_silencing/seq_data/S2...
3,1022_S2-ChIP_2,1022_S2-ChIP_2.fastq,/gcm-lfs1/pablo/data/rna_silencing/seq_data/S2...
4,1022_S2-RIP_2,1022_S2-RIP_2.fastq,/gcm-lfs1/pablo/data/rna_silencing/seq_data/S2...


In [54]:
fastq_files_df.shape

(174, 3)

- Add `data_dir` to see where we are storing them

In [55]:
# Derive, for every FASTQ path, the directory label returned by
# `psutil.get_data_dir` relative to `data_dir`.
# NOTE(review): `psutil` here is the project's `parse_utils` module, not the
# third-party `psutil` package.
fastq_files_df['data_dir'] = [
    psutil.get_data_dir(fastq_path, data_dir)
    for fastq_path in fastq_files_df['fastq_file']
]

In [56]:
fastq_files_df.head()

Unnamed: 0,sample_id,fastq_file_name,fastq_file,data_dir
0,1022_S2-ChIP-INPUT_1,1022_S2-ChIP-INPUT_1.fastq,/gcm-lfs1/pablo/data/rna_silencing/seq_data/IN...,INPUT
1,1022_S2-ChIP-OIN_1,1022_S2-ChIP-OIN_1.fastq,/gcm-lfs1/pablo/data/rna_silencing/seq_data/IN...,INPUT
2,1022_S2-ChIP_1,1022_S2-ChIP_1.fastq,/gcm-lfs1/pablo/data/rna_silencing/seq_data/S2...,S2-ChIP
3,1022_S2-ChIP_2,1022_S2-ChIP_2.fastq,/gcm-lfs1/pablo/data/rna_silencing/seq_data/S2...,S2-ChIP
4,1022_S2-RIP_2,1022_S2-RIP_2.fastq,/gcm-lfs1/pablo/data/rna_silencing/seq_data/S2...,S2-RIP


In [57]:
fastq_files_df.shape

(174, 4)

- Add `seq` to classify each sample by **sequencing type**:

In [58]:
# Classify every sample with the project helper `psutil.get_sequencing_type`
# (`psutil` is the local `parse_utils` module).
fastq_files_df['seq'] = [
    psutil.get_sequencing_type(sample)
    for sample in fastq_files_df['sample_id']
]

In [59]:
fastq_files_df.head()

Unnamed: 0,sample_id,fastq_file_name,fastq_file,data_dir,seq
0,1022_S2-ChIP-INPUT_1,1022_S2-ChIP-INPUT_1.fastq,/gcm-lfs1/pablo/data/rna_silencing/seq_data/IN...,INPUT,ChIP
1,1022_S2-ChIP-OIN_1,1022_S2-ChIP-OIN_1.fastq,/gcm-lfs1/pablo/data/rna_silencing/seq_data/IN...,INPUT,ChIP
2,1022_S2-ChIP_1,1022_S2-ChIP_1.fastq,/gcm-lfs1/pablo/data/rna_silencing/seq_data/S2...,S2-ChIP,ChIP
3,1022_S2-ChIP_2,1022_S2-ChIP_2.fastq,/gcm-lfs1/pablo/data/rna_silencing/seq_data/S2...,S2-ChIP,ChIP
4,1022_S2-RIP_2,1022_S2-RIP_2.fastq,/gcm-lfs1/pablo/data/rna_silencing/seq_data/S2...,S2-RIP,RIP


In [60]:
fastq_files_df.shape

(174, 5)

In [61]:
# Show the samples whose sequencing type was labelled 'unknown'.
fastq_files_df.loc[fastq_files_df['seq'].eq('unknown')]

Unnamed: 0,sample_id,fastq_file_name,fastq_file,data_dir,seq
17,301_H3K9me2_1,301_H3K9me2_1.fastq,/gcm-lfs1/pablo/data/rna_silencing/seq_data/H3...,H3K9me2,unknown
30,302_H3K9me2_1,302_H3K9me2_1.fastq,/gcm-lfs1/pablo/data/rna_silencing/seq_data/H3...,H3K9me2,unknown
41,324_H3K9me2_1,324_H3K9me2_1.fastq,/gcm-lfs1/pablo/data/rna_silencing/seq_data/H3...,H3K9me2,unknown
56,491_H3K9me2_1,491_H3K9me2_1.fastq,/gcm-lfs1/pablo/data/rna_silencing/seq_data/H3...,H3K9me2,unknown
67,504_H3K9me2_1,504_H3K9me2_1.fastq,/gcm-lfs1/pablo/data/rna_silencing/seq_data/H3...,H3K9me2,unknown
79,510_H3K9me2_1,510_H3K9me2_1.fastq,/gcm-lfs1/pablo/data/rna_silencing/seq_data/H3...,H3K9me2,unknown
99,530_H3K9me2_1,530_H3K9me2_1.fastq,/gcm-lfs1/pablo/data/rna_silencing/seq_data/H3...,H3K9me2,unknown
120,638_H3K9me2_1,638_H3K9me2_1.fastq,/gcm-lfs1/pablo/data/rna_silencing/seq_data/H3...,H3K9me2,unknown
150,WT_H3K9me2_1,WT_H3K9me2_1.fastq,/gcm-lfs1/pablo/data/rna_silencing/seq_data/H3...,H3K9me2,unknown
170,chip-fake-reads_simulated-data_1,chip-fake-reads_simulated-data_1.fastq,/gcm-lfs1/pablo/data/rna_silencing/seq_data/si...,simulated-data,unknown


- Summarize by `data_dir` and `seq`

In [62]:
# Number of FASTQ files per (`data_dir`, `seq`) pair.
fastq_files_df.groupby(['data_dir', 'seq'])[['fastq_file_name']].count()

Unnamed: 0_level_0,Unnamed: 1_level_0,fastq_file_name
data_dir,seq,Unnamed: 2_level_1
H3K9me2,unknown,9
INPUT,ChIP,30
S2-ChIP,ChIP,31
S2-RIP,RIP,42
S5-ChIP,ChIP,4
S5-RIP,RIP,1
pA-RNA,RNA,45
simulated-data,unknown,4
total-RNA,RNA,8


- Check for duplicates since I think that `RNA` and `ChIP` **directories** contain **ALL samples** once again:

In [63]:
# Mark every row whose file name occurs more than once (`keep=False` flags
# all occurrences, not only the repeats).
fastq_files_df['duplicated'] = fastq_files_df['fastq_file_name'].duplicated(keep=False)

In [64]:
# Keep only the rows flagged as duplicated file names.
fastq_files_duplicated_df = fastq_files_df.loc[fastq_files_df['duplicated']]

In [65]:
fastq_files_duplicated_df

Unnamed: 0,sample_id,fastq_file_name,fastq_file,data_dir,seq,duplicated
