-
Notifications
You must be signed in to change notification settings - Fork 0
/
create_dataframe.py
executable file
·105 lines (93 loc) · 4.14 KB
/
create_dataframe.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
import pathlib
import yaml
import pandas as pd
import numpy as np
from glob import glob
import os
import sys
# Create the data table containing the samples, units and paths of all
# fastq files, formatted correctly. This is vital for the snakemake
# pipeline: without it, the wildcards can't be created.
# Additionally, the config options are checked.
# Parse the YAML configuration file named by the first command line
# argument; the resulting mapping is kept in the module-level name `config`.
with open(sys.argv[1]) as config_handle:
    config = yaml.load(config_handle, Loader=yaml.FullLoader)
def _split_sample_unit(file_name):
    """Split a fastq file name into ``(sample, unit)``.

    The name is split on underscores and the last field is always dropped.
    The second-to-last field is used as the unit only when it is ``'A'`` or
    ``'B'``; otherwise the unit is the empty string and that field stays
    part of the sample name.

    Raises:
        IndexError: If the file name contains no underscore.
    """
    fields = file_name.split('_')
    unit = fields[-2]
    if unit in ('A', 'B'):
        return '_'.join(fields[:-2]), unit
    return '_'.join(fields[:-1]), ''


def create_dataframe(fl, fpl, config, slice):
    """Build the table of samples, units and fastq paths.

    Three layouts are produced, depending on the config:

    * ``config['merge']['paired_End']`` set and
      ``config['general']['already_assembled']`` not set: two consecutive
      entries of ``fl``/``fpl`` form one row (``fq1`` and ``fq2``); sample
      and unit are taken from the first file name of each pair.
    * otherwise, ``config['dataset']['nanopore']`` set: one row per file
      with the columns ``sample``, ``unit`` and ``fq1``; the second-to-last
      underscore-separated field of the name is always used as the unit.
    * otherwise: one row per file, with ``fq2`` set to NaN.

    Args:
        fl: List of file names (without directory).
        fpl: List of file paths, in the same order as ``fl``.
        config: Parsed configuration mapping.
        slice: End index applied to every path (``path[:slice]``), e.g.
            ``-3`` to cut off a trailing ``.gz``; ``None`` keeps the path
            unchanged. The name shadows the builtin but is kept so that
            keyword callers are not broken.

    Returns:
        pandas.DataFrame indexed ``0..n-1`` with one row per sample/unit.

    Raises:
        ValueError: If paired-end mode is active and ``fl`` holds an odd
            number of files.
        IndexError: If a file name contains no underscore.
    """
    if config['merge']['paired_End'] and not config['general']['already_assembled']:
        if len(fl) % 2:
            raise ValueError(
                "Paired-end mode needs an even number of fastq files, "
                "got {}".format(len(fl)))
        df = pd.DataFrame(columns=['sample', 'unit', 'fq1', 'fq2'],
                          index=range(len(fl) // 2), dtype=str)
        for row, first in enumerate(range(0, len(fl), 2)):
            sample, unit = _split_sample_unit(fl[first])
            # Single-step .loc[row, column] writes straight into the frame;
            # chained df.loc[row][column] assigns to a temporary row object
            # and is lost when pandas copy-on-write is active.
            df.loc[row, 'unit'] = unit
            df.loc[row, 'sample'] = sample
            df.loc[row, 'fq1'] = fpl[first][:slice]
            df.loc[row, 'fq2'] = fpl[first + 1][:slice]
    elif config['dataset']['nanopore']:
        df = pd.DataFrame(columns=['sample', 'unit', 'fq1'],
                          index=range(len(fl)), dtype=str)
        for row, file_name in enumerate(fl):
            fields = file_name.split('_')
            unit = fields[-2]
            print(unit)
            # The last two fields are both dropped from the sample name here.
            df.loc[row, 'sample'] = '_'.join(fields[:-2])
            df.loc[row, 'fq1'] = fpl[row][:slice]
            df.loc[row, 'unit'] = unit
    else:
        df = pd.DataFrame(columns=['sample', 'unit', 'fq1', 'fq2'],
                          index=range(len(fl)), dtype=str)
        for row, file_name in enumerate(fl):
            sample, unit = _split_sample_unit(file_name)
            df.loc[row, 'unit'] = unit
            df.loc[row, 'sample'] = sample
            df.loc[row, 'fq1'] = fpl[row][:slice]
            # Single-end data has no second read file.
            df.loc[row, 'fq2'] = np.nan
    return df
if __name__ == '__main__':
    # Script entry point: validate the config, collect the input files,
    # build the units table and write it as a tab-separated file into the
    # output directory.
    general = config["general"]
    output_dir = general["output_dir"]

    # check config options
    if "-" in output_dir:
        sys.exit("Please rename output folder, do not use a dash in the folder name")
    if config["classify"]["mothur"] and config["blast"]["blast"]:
        sys.exit("Please decide whether to use blast or classification with mothur. Both config options cannot be set to TRUE")
    if general["seq_rep"] == "ASV" and config["postcluster"]["mumu"]:
        print("Postclustering with mumu is not supported for ASVs.")
        changeopt = input("To proceed and set the mumu config option to FALSE, type 'yes': ")
        if changeopt == "yes":
            print("Proceeding")
            # Changes the in-memory config only; nothing in this block
            # writes it back to the config file.
            config["postcluster"]["mumu"] = False
        else:
            sys.exit("Workflow will be aborted")

    file_list = None
    file_path_list = None
    path_slice = None
    if not general['already_assembled']:
        # The paths point into <output_dir>/demultiplexed, while the file
        # names themselves come from the *.gz files in the input folder.
        input_pattern = general['filename'].rstrip("/") + '/*.gz'
        file_path_list = [
            os.path.join(output_dir, 'demultiplexed/' + os.path.basename(name))
            for name in sorted(glob(input_pattern))
        ]
        file_list = sorted(os.path.basename(path) for path in file_path_list)
        path_slice = -3  # Remove the .gz extension from the file paths.
    if config['dataset']['nanopore']:
        # Nanopore input overrides the selection above: the R1 files are
        # used from the input folder directly, with unmodified paths.
        file_path_list = sorted(glob(os.path.join(general["filename"], '*R1.fastq.gz')))
        file_list = sorted(os.path.basename(path) for path in file_path_list)
        path_slice = None
    if file_list is None:
        # Neither branch above ran; without this guard the call below
        # would fail with a NameError on the undefined file lists.
        sys.exit("No input files selected: 'already_assembled' is TRUE but 'nanopore' is not. Please check the config options")

    # create dataframe
    df = create_dataframe(file_list, file_path_list, config, path_slice)
    print(df)
    pathlib.Path(output_dir).mkdir(parents=True, exist_ok=True)
    df.to_csv(os.path.join(output_dir, general['units']), sep='\t')