In [1]:
import pandas as pd

In [2]:
import os

In [3]:
from functools import reduce

In [4]:
## turn off pandas' chained-assignment check (SettingWithCopyWarning) for the whole notebook
pd.options.mode.chained_assignment = None

---------------------------

In [5]:
## root directory of the project's data; `source_dir` is built from it
project_data_dir = '/data/pablo/RNAdeg'

In [6]:
## project directory (not referenced by any of the cells visible in this section)
project_dir = '/home/pmonteagudo/workspace/RNAdeg'

In [7]:
## directory that is scanned for sample files and that holds the sample-name
## lists read below; the commented-out lines are alternative locations
#source_dir = '/data/parastou/RNAdeg/data/AllRNA/'
source_dir = os.path.join(project_data_dir, 'data/ChIP')
#source_dir = os.path.join(project_data_dir, 'data/sequencing_new/ChIP')

---------

In [8]:
def process_sample_names(names, col_name=None):
    """Build a sorted table of sample file names together with their prefixes.

    The prefix of a file name is everything before its first ".". Entries
    whose prefix occurs more than once are ALL removed (not just the extra
    copies) and a warning is printed.

    Parameters
    ----------
    names : pandas.DataFrame or list-like of str
        Either a DataFrame that holds the file names in column `col_name`,
        or a plain sequence of file names.
    col_name : str, optional
        Name of the column holding the file names. When omitted, the only
        column of a single-column DataFrame is used; otherwise the column
        is called "sample_name".

    Returns
    -------
    pandas.DataFrame
        The rows sorted by `col_name`, with an added "prefix" column and
        without any row whose prefix is shared with another row. The index
        is not renumbered after the duplicated rows are removed.
    """

    ## select col_name
    if col_name is None:
        ## a single-COLUMN frame names its own column (shape[1] is the number
        ## of columns; the row count says nothing about which column to use)
        if isinstance(names, pd.DataFrame) and names.shape[1] == 1:
            col_name = names.columns[0]
        else:
            col_name = "sample_name"

    ## get dataframe
    if not isinstance(names, pd.DataFrame):

        ## create dataframe from list
        df = pd.DataFrame(data = names, columns=[col_name])

    else:

        ## already a dataframe
        df = names

    ## sort dataframe by col_name (this returns a new frame, so the "prefix"
    ## column added below never ends up in the caller's DataFrame)
    df = df.sort_values(by=[col_name]).reset_index(drop=True)

    ## get sample's prefix
    df["prefix"] = df[col_name].map(lambda x: x.split(".")[0])

    ## check for "duplicated" files (share same prefix) within the same directory.
    ## one should be careful and ask why that happens.
    n_samples = df.shape[0]
    df = df[~df.duplicated(subset="prefix", keep=False)] ## remove entries with duplicated prefix

    #assert(n_samples == df.shape[0])
    if n_samples != df.shape[0]:
        print("\n{}".format("-"*99))
        print(" Warning! Duplicated files (share same prefix) within the same directory. \n Will be ignored for now!")        
        print("{}\n".format("-"*99))

    #print("Number of files in `{}`: {}".format(col_name, df.shape[0]) )

    return df

## 0 - Investigate Samples

---------

### Import Data

A. __Load `valid_samples.txt` file__ 

In [9]:
## path of the list of valid sample file names, located inside `source_dir`
valid_samples_file = os.path.join(source_dir, 'valid_samples.txt')

In [10]:
## read the header-less list of names into column "valid_sample",
## then sort it and derive each sample's prefix
valid_samples = process_sample_names(
    pd.read_csv(valid_samples_file, header=None, names=["valid_sample"]),
    col_name="valid_sample",
)
print("Number of valid samples (as given by Parastou): {}".format(len(valid_samples)))

Number of valid samples (as given by Parastou): 22


In [11]:
#valid_samples

B. __Load `remote_ChIP_file_names.txt` file__ 

In [14]:
## path of the list of remote sample file names, located inside `source_dir`
remote_samples_file = os.path.join(source_dir, 'remote_ChIP_file_names.txt')

In [15]:
## remote location of the files; in this cell it is only used in the report line
remote_dir = "/data/cryohalic01/home/ag_halic/share/Conny/fastq_for_Stefan_RNAdeg"

## read the header-less list of names into column "remote_sample",
## then sort it and derive each sample's prefix
remote_samples = process_sample_names(
    pd.read_csv(remote_samples_file, header=None, names=["remote_sample"]),
    col_name="remote_sample",
)
print("Number of remote samples (as present in {}): {}".format(remote_dir, len(remote_samples)))

Number of remote samples (as present in /data/cryohalic01/home/ag_halic/share/Conny/fastq_for_Stefan_RNAdeg): 8


In [16]:
#remote_samples

C. __Check in `source_dir` for sample files ('.bz2', '.fastq', '.fastqsanger')__

In [17]:
## file-name endings that identify a sample file
file_formats = ('.bz2', '.fastq', '.fastqsanger')

## keep only the directory entries that end with one of those endings
source_samples_names = list(
    filter(lambda fname: fname.endswith(file_formats), os.listdir(source_dir))
)
## alternative: read the names from a file instead of listing the directory
#source_samples_names = pd.read_csv(source_dir + 'sample_names.txt', header=None, names = ["source_sample"])

source_samples = process_sample_names(source_samples_names, col_name="source_sample")
print("Number of sample files in `source_dir` ({}): {}".format(source_dir, len(source_samples)))

Number of sample files in `source_dir` (/data/pablo/RNAdeg/data/ChIP): 27


In [18]:
#source_samples

- If the check below fails, there are __"duplicated" files__ (sharing the same prefix) within the same directory. In that case one should be careful and ask __why that is__.

In [19]:
## `process_sample_names` removes every entry with a shared prefix, so equal
## counts mean that no file in `source_dir` was removed
assert len(source_samples_names) == len(source_samples)

In [20]:
## file names found in `source_dir` that did not survive `process_sample_names`
kept_file_names = set(source_samples["source_sample"])
duplicated_files = sorted(set(source_samples_names) - kept_file_names)
duplicated_files

[]

D. __Merge (Samples) DataFrames: [`source_dir`, `valid_samples.txt`, `remote_ChIP_file_names.txt`]__

In [21]:
# compile the list of dataframes you want to merge
data_frames = [valid_samples, remote_samples, source_samples]

In [22]:
def _outer_merge_on_prefix(left, right):
    """Outer-join two sample tables on their shared "prefix" column."""
    return left.merge(right, how='outer', on=['prefix'])


## fold the list pairwise, so a prefix present in any table gets a row
df_merged = reduce(_outer_merge_on_prefix, data_frames)

In [23]:
## order the rows by prefix, fix the column order, then index the table by prefix
df_merged = (
    df_merged
    .sort_values("prefix")
    .reset_index(drop=True)
    .loc[:, ["prefix", "valid_sample", "remote_sample", "source_sample"]]
    .set_index("prefix")
)

In [24]:
## number of rows (one per prefix) in the merged table
len(df_merged)

27

In [25]:
#df_merged

---------

### __Compare  `source_dir` vs `valid_samples.txt`__

In [26]:
## restrict the merged table to the two columns being compared
select_cols = ["valid_sample", "source_sample"]
source_vs_valid_samples = df_merged.loc[:, select_cols]

These are samples that are either __not valid__ or __missing__

In [27]:
## keep the rows where exactly one of the two columns is empty
valid_is_na = source_vs_valid_samples["valid_sample"].isna()
source_is_na = source_vs_valid_samples["source_sample"].isna()
source_vs_valid_samples = source_vs_valid_samples[valid_is_na ^ source_is_na]
len(source_vs_valid_samples)

5

- __Not valid__ samples (present in `source_dir` but not in `valid_samples.txt`)

In [28]:
## rows with no entry in the "valid_sample" column
has_no_valid_entry = source_vs_valid_samples["valid_sample"].isna()
not_valid_samples = source_vs_valid_samples.loc[has_no_valid_entry]
print("Number of NOT valid samples:", not_valid_samples.shape[0])

Number of NOT valid samples: 5


In [29]:
#not_valid_samples

- __Missing__ samples (present in `valid_samples.txt` but not in `source_dir`): Ideally this should be 0

In [30]:
## rows with no entry in the "source_sample" column
has_no_source_entry = source_vs_valid_samples["source_sample"].isna()
missing_samples = source_vs_valid_samples.loc[has_no_source_entry]
print("Number of missing samples (e.g. valid but are not in the `source_dir`:", missing_samples.shape[0])

Number of missing samples (e.g. valid but are not in the `source_dir`: 0


---------

### Samples used in Analysis

__Ignore samples that are not valid, also ignore samples that give errors__

In [31]:
## prefixes that are ignored in addition to the not-valid samples
ignore_files = set(["302_S2RIP_2", "63_RNA_pA_3"])
## union with the prefixes of the not-valid samples (the index of `not_valid_samples`).
## An explicit set union is used because `Index | other` is no longer a set
## operation in pandas >= 2.0 (it became an element-wise logical OR).
ignore_files = ignore_files.union(not_valid_samples.index)

In [32]:
## append "." to every ignored prefix and sort the result
## (presumably so an entry only matches "<prefix>." file names - confirm where it is used)
## NOTE: this cell is not idempotent; re-running it without re-running the
## previous cell appends one more "." to every entry
ignore_files = sorted(name + "." for name in ignore_files)
print("Number of samples that will be ignored:", len(ignore_files))

Number of samples that will be ignored: 7


In [33]:
#ignore_files

__Samples used in Analysis are present in `source_dir` and `valid_samples.txt`__

In [34]:
## a prefix is analyzed when it has both a file in `source_dir` and a valid-sample entry
in_source = df_merged["source_sample"].notna()
is_valid = df_merged["valid_sample"].notna()
prefix_files = df_merged.loc[in_source & is_valid].index.tolist()
print("Total number of samples that will be analyzed:", len(prefix_files))

Total number of samples that will be analyzed: 22


In [44]:
#prefix_files