In [5]:
import pandas as pd

In [6]:
import os

In [7]:
from functools import reduce

In [8]:
# Silence pandas' SettingWithCopyWarning for the whole session.
# NOTE(review): this also hides genuine chained-assignment mistakes in every later cell — confirm it is still needed.
pd.options.mode.chained_assignment = None

---------------------------

In [7]:
# Root directory of the project's data; the commented line is the alternative location.
# NOTE(review): the saved output of the `source_dir` listing cell shows a path under /data/pablo/RNAdeg,
# i.e. the stored outputs were produced with the other setting.
#project_data_dir = '/data/pablo/RNAdeg' # algbio /data
project_data_dir = '/gcm-lfs1/pablo/data/RNAdeg' # algbio /gcm-lfs1

In [8]:
# Project workspace root (absolute, user-specific path).
# NOTE(review): not referenced by the cells shown in this part of the notebook.
project_dir = '/home/pmonteagudo/workspace/RNAdeg'

In [52]:
# Name of the data batch; it is the sub-directory of `<project_data_dir>/data` that `source_dir` points to.
data_batch = 'RNA'

In [53]:
# Directory holding the sample files and the sample lists of the selected batch:
# `<project_data_dir>/data/<data_batch>`. The commented lines are alternative locations.
#source_dir = '/data/parastou/RNAdeg/data/AllRNA/'
#source_dir = os.path.join(project_data_dir, 'data/RNA')
#source_dir = os.path.join(project_data_dir, 'data/sequencing_new/RNA')
source_dir = os.path.join(project_data_dir, 'data', data_batch)

---------

In [13]:
def process_sample_names(names, col_name=None):
    """Build a sorted table of sample file names together with their prefix.

    The prefix of a file name is everything before its first "." (e.g.
    "63_RNA_pA_3.ztr.fastq" -> "63_RNA_pA_3"). File names whose prefix occurs
    more than once are ALL dropped (none of the copies is kept) and a warning
    is printed.

    Parameters
    ----------
    names : list-like of str or pandas.DataFrame
        File names, either as a plain sequence or as a DataFrame that holds
        them in the column `col_name`.
    col_name : str, optional
        Column holding (or to hold) the file names. When omitted, the only
        column of a single-column DataFrame is used; otherwise the name
        defaults to "sample_name".

    Returns
    -------
    pandas.DataFrame
        Rows sorted by `col_name`, with an added "prefix" column, restricted
        to rows whose prefix is unique. The index is reset after sorting but
        not after the duplicate filter, so it may contain gaps. The input
        DataFrame is not modified: sorting produces a new frame.
    """
    is_frame = isinstance(names, pd.DataFrame)

    ## select col_name
    if col_name is None:
        # A single-COLUMN frame is unambiguous, so use its only column.
        # (Fix: this used to test the number of rows, `shape[0]`.)
        if is_frame and names.shape[1] == 1:
            col_name = names.columns[0]
        else:
            col_name = "sample_name"

    ## get dataframe
    if is_frame:
        ## already a dataframe
        df = names
    else:
        ## create dataframe from list
        df = pd.DataFrame(data=names, columns=[col_name])

    ## sort dataframe by col_name (returns a new frame with a fresh index)
    df = df.sort_values(by=[col_name]).reset_index(drop=True)

    ## get sample's prefix: text before the first "."
    df["prefix"] = df[col_name].map(lambda x: x.split(".")[0])

    ## check for "duplicated" files (share same prefix) within the same directory.
    ## one should be careful and wonder why is that?
    n_samples = df.shape[0]
    ## keep=False flags every member of a duplicated group, so all of them are removed
    df = df[~df.duplicated(subset="prefix", keep=False)]

    if n_samples != df.shape[0]:
        print("\n{}".format("-"*99))
        print(" Warning! Duplicated files (share same prefix) within the same directory. \n Will be ignored for now!")
        print("{}\n".format("-"*99))

    return df

## 0 - Investigate Samples

---------

### Import Data

A. __Load `valid_samples.txt` file__ 

In [18]:
# Path of the list of valid samples, stored inside `source_dir`.
valid_samples_file = os.path.join(source_dir, 'valid_samples.txt')

In [20]:
# Read the header-less, one-column list of valid sample names and derive each sample's prefix.
valid_samples = process_sample_names(
    pd.read_csv(valid_samples_file, header=None, names=["valid_sample"]),
    col_name="valid_sample",
)
print("Number of valid samples (as given by Parastou): {}".format(len(valid_samples)))

Number of valid samples (as given by Parastou): 54


In [21]:
#valid_samples

B. __Load `remote_RNA_file_names.txt` file__ 

In [23]:
# Path of the list of remote file names, stored inside `source_dir`.
remote_samples_file = os.path.join(source_dir, 'remote_RNA_file_names.txt')

In [24]:
# Directory the remote listing refers to; used here to label the printed count.
remote_dir = "/data/cryohalic01/home/ag_halic/share/Conny/fastq_for_Stefan_RNAdeg"
# Read the header-less, one-column list of remote file names and derive each sample's prefix.
remote_samples = process_sample_names(
    pd.read_csv(remote_samples_file, header=None, names=["remote_sample"]),
    col_name="remote_sample",
)
print("Number of remote samples (as present in {}): {}".format(remote_dir, len(remote_samples)))

Number of remote samples (as present in /data/cryohalic01/home/ag_halic/share/Conny/fastq_for_Stefan_RNAdeg): 32


In [25]:
#remote_samples

C. __Check in `source_dir` for sample files ('.bz2', '.fastq', '.fastqsanger')__

In [27]:
# Sample files are recognised by extension; `str.endswith` accepts the whole tuple at once.
file_formats = ('.bz2', '.fastq', '.fastqsanger')
source_samples_names = [
    entry
    for entry in os.listdir(source_dir)
    if entry.endswith(file_formats)
]
#source_samples_names = pd.read_csv(source_dir + 'sample_names.txt', header=None, names = ["source_sample"])
# Files sharing a prefix are dropped here (a warning is printed when that happens).
source_samples = process_sample_names(source_samples_names, col_name="source_sample")
print("Number of sample files in `source_dir` ({}): {}".format(source_dir, len(source_samples)))


---------------------------------------------------------------------------------------------------
 Will be ignored for now!
---------------------------------------------------------------------------------------------------

Number of sample files in `source_dir` (/data/pablo/RNAdeg/data/RNA): 66


In [32]:
#source_samples

- This means there are __"duplicated" files__ (files sharing the same prefix) within the same directory. One should be careful and ask __why that is__.

In [28]:
# Sanity check: every listed file survived `process_sample_names`.
# It fails whenever files were dropped for sharing a prefix (the warning printed above).
assert len(source_samples_names) == len(source_samples)

AssertionError: 

In [29]:
# Files found in `source_dir` that were dropped because another file shares their prefix.
duplicated_files = sorted(set(source_samples_names) - set(source_samples["source_sample"]))
duplicated_files

['63_RNA_pA_3.fastq', '63_RNA_pA_3.ztr.fastq']

D. __Merge (Samples) DataFrames: [`source_dir`, `valid_samples.txt`, `remote_RNA_file_names.txt`]__

In [30]:
# Frames to merge on `prefix`, left to right: valid list, remote list, files found in `source_dir`.
data_frames = [valid_samples, remote_samples, source_samples]

In [31]:
# Fold the frames into one table with successive outer merges on `prefix`, so a prefix
# present in any source gets a row (NaN in the columns of the sources that lack it).
df_merged = reduce(
    lambda merged, frame: merged.merge(frame, on=['prefix'], how='outer'),
    data_frames,
)

In [37]:
# Sort by prefix, put the columns in a fixed order, then use `prefix` as the index.
# NOTE: not re-runnable as is — after the first run `prefix` is no longer a column.
df_merged = (
    df_merged
    .sort_values("prefix")
    .reset_index(drop=True)
    .loc[:, ["prefix", "valid_sample", "remote_sample", "source_sample"]]
    .set_index("prefix")
)

In [33]:
# Number of rows (prefixes) in the merged table.
df_merged.shape[0]

74

In [34]:
#df_merged

---------

### __Compare  `source_dir` vs `valid_samples.txt`__

In [35]:
# Keep only the two columns being compared; the index is still `prefix`.
select_cols = ["valid_sample", "source_sample"]
source_vs_valid_samples = df_merged.loc[:, select_cols]

These are samples that are either __not valid__ or __missing__

In [36]:
# Keep the rows where exactly one of the two columns is missing (XOR of the NaN masks).
source_vs_valid_samples = source_vs_valid_samples.loc[
    source_vs_valid_samples["valid_sample"].isna()
    ^ source_vs_valid_samples["source_sample"].isna()
]
source_vs_valid_samples.shape[0]

28

- __Not valid__ samples (present in `source_dir` but not in `valid_samples.txt`)

In [37]:
# Rows without a `valid_sample` entry: the file exists in `source_dir` but is not listed as valid.
not_valid_samples = source_vs_valid_samples.loc[source_vs_valid_samples["valid_sample"].isna()]
print("Number of NOT valid samples:", not_valid_samples.shape[0])

Number of NOT valid samples: 20


In [38]:
#not_valid_samples

- __Missing__ samples (present in `valid_samples.txt` but not in `source_dir`): Ideally this should be 0

In [39]:
# Rows without a `source_sample` entry: listed as valid but with no matching file in `source_dir`.
missing_samples = source_vs_valid_samples[source_vs_valid_samples.source_sample.isna()]
# Label fixed: the parenthesis was never closed and "e.g." was used where "i.e." is meant.
print("Number of missing samples (i.e. valid but not in the `source_dir`):", len(missing_samples))

Number of missing samples (e.g. valid but are not in the `source_dir`: 8


- We see that some of the `valid_sample`s (7) seem to be missing an "A" at the end.
- (1) `valid_sample` "302_S2RIP_2" is also missing, but this is the sample that we ignore because it raises an error.

In [None]:
# Show the valid samples that have no file in `source_dir`.
missing_samples

### __Compare  `source_dir` vs `remote_RNA_file_names.txt`__

In [58]:
# Keep only the two columns being compared; the index is still `prefix`.
select_cols = ["source_sample", "remote_sample"]
source_vs_remote_samples = df_merged.loc[:, select_cols]

These are samples that are either __missing from source_dir__ or __missing from remote_dir__

In [59]:
# Keep the rows where exactly one of the two columns is missing (XOR of the NaN masks).
source_vs_remote_samples = source_vs_remote_samples.loc[
    source_vs_remote_samples["source_sample"].isna()
    ^ source_vs_remote_samples["remote_sample"].isna()
]
source_vs_remote_samples.shape[0]

36

In [61]:
#source_vs_remote_samples

- __Missing from remote_dir__ samples (present in `source_dir` but not in `remote_RNA_file_names.txt`)

In [63]:
# Rows without a `remote_sample` entry: present in `source_dir` but absent from the remote listing.
missing_from_remote = source_vs_remote_samples[source_vs_remote_samples.remote_sample.isna()]
# Label fixed: it used to say "missing in the source_dir", the opposite of what is counted here.
print("Number of samples missing from the remote_dir:", len(missing_from_remote))

Number of samples missing in the source_dir: 35


In [71]:
# Prefixes of the samples that are absent from the remote listing.
missing_from_remote.index.tolist()

['1167_S5RIP_2',
 '283_RNA_pA_4',
 '301_RNA_pA_3',
 '301_S2RIP_3',
 '302_S2RIP_3',
 '324S2RIP_1',
 '324_RNA_pA_3',
 '324_S2RIP_3',
 '491S2RIP_1',
 '491_S2RIP_3',
 '504S2RIP_1',
 '504S2RIP_2',
 '504_RNA_pA_1',
 '504_RNA_pA_2',
 '530S2RIP_1',
 '530S2RIP_2',
 '530_RNA_pA_1',
 '530_RNA_pA_2',
 '591_S5RIP_1',
 '63',
 '638S2RIP_1',
 '638S2RIP_2',
 '638_RNA_pA_1',
 '638_RNA_pA_2',
 '63_RIPS5P',
 '63_RNA_pA_3',
 '63_RNA_pA_4',
 '63_S2PRIP',
 '63_S2Ph_RIP',
 '63_S2RIP_2',
 '63_S5Ph_RIP',
 '65',
 '80S2RIP_1',
 '80S2RIP_2',
 '80pARNA_2']

- __Missing from source_dir__ samples (present in `remote_RNA_file_names.txt` but not in `source_dir`)

In [66]:
# Rows without a `source_sample` entry: listed in `remote_RNA_file_names.txt` but with no file in `source_dir`.
missing_from_source = source_vs_remote_samples[source_vs_remote_samples.source_sample.isna()]
# Label fixed: it was copied from the valid-samples comparison and described the wrong check.
print("Number of samples missing from the `source_dir` (i.e. in the remote listing but not in the `source_dir`):", len(missing_from_source))

Number of missing samples (e.g. valid but are not in the `source_dir`: 1


- (1) `remote_sample` "302_S2RIP_2" is missing, but this is the sample that we ignore because it raises an error.

In [68]:
# Show the remote samples that have no file in `source_dir`.
missing_from_source

Unnamed: 0_level_0,source_sample,remote_sample
prefix,Unnamed: 1_level_1,Unnamed: 2_level_1
302_S2RIP_2,,302_S2RIP_2.fastq.bz2


---------

### Samples used in Analysis

__Errors__:
-  302_S2RIP_2.fastq
-  63_RNA_pA_3.fastq

__Ignore samples that are not valid, also ignore samples that give errors__

In [40]:
# Prefixes excluded by hand because the samples give errors.
#ignore_files = set(["302_S2RIP_2", "63_RNA_pA_3"])
ignore_files = set(["302_S2RIP_2"]) ## we already removed the file from directory
## union with the prefixes of the not-valid samples.
## Built from plain sets: `Index | other` is no longer a set union in pandas >= 2.0
## (the operator form was deprecated in favour of explicit set operations).
ignore_files = set(not_valid_samples.index).union(ignore_files)

In [41]:
# Append a "." to every prefix and sort; from here on `ignore_files` is a list, not a set.
# NOTE(review): presumably the trailing dot makes later prefix matching exact
# (e.g. "63." does not match "63_RNA_pA_3.fastq") — confirm where the list is consumed.
# Re-running this cell appends another "." to every entry.
ignore_files = sorted(name + "." for name in ignore_files)
print("Number of samples that will be ignored:", len(ignore_files))

Number of samples that will be ignored: 22


In [42]:
#ignore_files

__Samples used in Analysis are present in `source_dir` and `valid_samples.txt`__

In [43]:
# Prefixes that have both a file in `source_dir` and an entry in `valid_samples.txt`.
prefix_files = df_merged.loc[
    df_merged["source_sample"].notna() & df_merged["valid_sample"].notna()
].index.tolist()
print("Total number of samples that will be analyzed:", len(prefix_files))

Total number of samples that will be analyzed: 46


In [44]:
#prefix_files