In [1]:
import pandas as pd
from glob import glob
import os

# Helper functions

In [2]:
# for htseq file reading:

def load_count_files(file_name_map,columns = ['ID','data']):
    """Load several htseq count files into one long-format DataFrame.

    Parameters
    ----------
    file_name_map : dict
        Maps each count-file location (key) to the sample name (value) that
        should label its rows. Every file must be in the htseq output format
        (tab-separated, no header).
    columns : list of str, optional
        Column names given to the columns read from each file
        (default: ['ID','data']). The default list is never mutated here.

    Returns
    -------
    pandas.DataFrame
        One row per feature per sample, with the columns from `columns` plus
        a 'file' column holding the sample name. htseq statistics rows
        ('__no_feature', '__ambiguous', ...) are removed. If `file_name_map`
        is empty, an empty frame with just `columns` is returned.
    """
    # Summary counters htseq writes alongside the per-feature counts.
    stat_lines = ['__no_feature', '__ambiguous', '__too_low_aQual', '__not_aligned',
       '__alignment_not_unique']

    # Collect the per-sample frames and concatenate once: growing a frame with
    # pd.concat inside the loop copies all previous rows on every iteration.
    per_sample = []
    for path, sample_name in file_name_map.items():
        counts = load_count_file(path,columns = columns)
        counts['file'] = sample_name
        per_sample.append(counts)

    if not per_sample:
        # pd.concat raises on an empty list; keep the empty-input result explicit.
        return pd.DataFrame(columns=columns)

    raw_reads = pd.concat(per_sample)

    # remove statistics lines from htseq output
    is_stat_row = raw_reads.isin(stat_lines).any(axis=1)
    return raw_reads[~is_stat_row]

def load_count_file(count_file, columns = ['ID','data']):
    """Read a single htseq count file into a DataFrame.

    Parameters
    ----------
    count_file : path to a tab-separated count file created by htseq
        (the file is read as having no header row).
    columns : column names assigned to the table (default: ['ID','data']).

    Returns
    -------
    pandas.DataFrame with the counts data under the names given in `columns`.
    """
    read_options = {'sep': '\t', 'header': None, 'names': columns}
    return pd.read_csv(count_file, **read_options)


# Load the raw counts

In [5]:
# Select the experiment to process (leave exactly one line uncommented):
#Experiment_name = 'Experiment1'
#Experiment_name = 'Experiment2'
Experiment_name = 'Exponential'
#Experiment_name = 'subsample'

# All count files are read from ../../map_count_outputs/<Experiment_name>/counts/.
# os.path.join builds the path with the separator of the current OS; the trailing ''
# keeps the trailing separator so counts_file_path can still be prefixed to a file name.
counts_file_path = os.path.join('..', '..', 'map_count_outputs', Experiment_name, 'counts', '')

# Sort the matches so the load order does not depend on the file system.
count_files = sorted(glob(counts_file_path+'*.tsv'))

# Map each count file to its sample name (file name up to the first '.'),
# excluding the reversed files and the _intergenic_notstranded files.
# The exclusion is tested on the file name only, so a folder or experiment
# name that happens to contain one of the tags does not drop every file.
excluded_tags = ('reverse', '_intergenic_notstranded')
file_sample_map = {}
for count_file_location in count_files:
    count_file_name = os.path.basename(count_file_location)
    if any(tag in count_file_name for tag in excluded_tags):
        continue
    file_sample_map[count_file_location] = count_file_name.split('.')[0]

# Fail with a clear message instead of an opaque KeyError from pivot() below.
if not file_sample_map:
    raise FileNotFoundError('No usable count files (*.tsv) found in ' + counts_file_path)

# load the count files (long format: one row per feature per sample):
raw_reads_long = load_count_files(file_sample_map,['ID','data'])

# Convert to wide format (one row per ID, one column per sample) for easier reading:
raw_reads = raw_reads_long.pivot(index='ID', columns='file', values='data').reset_index()
raw_reads

file,ID,EXP_biorep1a,EXP_biorep1b,EXP_biorep1c,EXP_biorep2,EXP_biorep3
0,CDN75_RS05665,59265,80111,40478,93058,67899
1,CDN75_RS10700,24,36,21,32,27
2,CDN75_RS13515,76,66,59,54,46
3,CDN75_RS17410,286,272,125,445,176
4,ERCC-00002,1281093,1359800,1869557,1327319,1546864
...,...,...,...,...,...,...
4409,zraR,141,191,52,158,163
4410,zraS,62,144,74,87,72
4411,zupT,461,541,258,775,349
4412,zur,168,194,51,175,123


# Organize the data

In [7]:
# Organize metadata and count tables for R

# file_map holds all comparisons you want to make:
#   keys are the name of the comparison
#   values are a dict mapping each sample name to its properties for the
#   design matrix in deseq2: condition, biorep, techrep...
samples = raw_reads.columns[1:]  # every column after 'ID'

# condition name -> substring that marks its samples in the sample name
condition_tags = {
    'CASP': 'CASP',
    'Disrupted': 'Disrupted',
    'Exponential': 'EXP',
}
file_map = {
    condition: {sample: {'condition': condition} for sample in samples if tag in sample}
    for condition, tag in condition_tags.items()
}

# Output folder for this experiment; exist_ok makes re-running the cell safe.
data_fold_name ='raw_data_organized_for_R_'+ Experiment_name
os.makedirs(data_fold_name, exist_ok=True)

# For every comparison write two CSV files for R: the raw reads of its samples
# and a metadata table with the sample names under the column 'id' plus one
# column per property (here: 'condition').
# NOTE(review): the earlier comment said R expects the condition under a column
# named 'dex', but the column written here is 'condition' - confirm against the R script.
for comparison,this_file_map in file_map.items():
    print(comparison)
    data_rep = raw_reads[['ID']+list(this_file_map.keys())]

    # remove rows with NaNs
    data_rep = data_rep.dropna()

    # os.path.join uses the separator of the current OS instead of a hard-coded '\\'
    data_rep.to_csv(os.path.join(data_fold_name, comparison + '_raw_reads_for_R.csv'),index=False)

    # Build the metadata table in one step: one record per sample.
    meta_records = [{'id': sample, **properties} for sample, properties in this_file_map.items()]
    if meta_records:
        meta_data_rep = pd.DataFrame(meta_records)
    else:
        # No sample matched this comparison: keep the 'id' header in the file.
        meta_data_rep = pd.DataFrame({'id': []})

    meta_data_rep.to_csv(os.path.join(data_fold_name, comparison + '_metadata_for_R.csv'),index=False)

# Show the metadata table of the last comparison as a sanity check
meta_data_rep



CASP
Disrupted
Exponential


Unnamed: 0,id,condition
0,EXP_biorep1a,Exponential
1,EXP_biorep1b,Exponential
2,EXP_biorep1c,Exponential
3,EXP_biorep2,Exponential
4,EXP_biorep3,Exponential
