In [5]:
import os
import re
from typing import List

import numpy as np
import pandas as pd
from pandas import DataFrame

from celline.utils.config import Config

In [18]:
# Load the tab-separated run table that the cells below inspect and repair.
runtable = pd.read_csv("../../test/runs.tsv", sep="\t")

In [28]:
# SRA Run Fixer
# -------------
# Walks over every run in `runtable` that has fastq files and, per run:
#   1. assigns a lane id when the first fastq row of the run has none,
#   2. repairs read types that are missing or not one of R1/R2/I1/I2
#      (asking the user which file is the index read when there are three files),
#   3. rebuilds `dumped_filename` / `dumped_filepath` for the rows of that run.
# `runtable` is modified in place; `errors` is a snapshot taken before any fix.


def _allow_text(frame: DataFrame, column: str) -> None:
    """Cast a numeric (e.g. all-NaN float) column to object so string ids can be stored in it."""
    if pd.api.types.is_numeric_dtype(frame[column]):
        frame[column] = frame[column].astype(object)


errors = runtable[runtable["filetype"] == "fastq"]
error_srrs = errors.drop_duplicates("run_id")["run_id"].tolist()
# A valid read type is exactly R1/R2 (reads) or I1/I2 (index reads).
read_type_pattern = re.compile(r"[RI][12]")
print("***** SRA Run Fixer *****")
for position, srr_id in enumerate(error_srrs, start=1):
    run_errors = errors[errors["run_id"] == srr_id]
    run_mask = runtable["run_id"] == srr_id
    # read types are collected from fastq rows only, so they are written back to fastq rows only
    fastq_mask = run_mask & (runtable["filetype"] == "fastq")
    gse_id: str = runtable.loc[run_mask, "gse_id"].iloc[0]
    print(
        f"""--------------------------------------------
┏ [{position}/{len(error_srrs)}] Fix errors on {srr_id}.━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
┣ Refer to [https://trace.ncbi.nlm.nih.gov/Traces/index.html?view=run_browser&acc={srr_id}&display=data-access]""")

    # --- lane id -----------------------------------------------------------
    if pd.isna(run_errors["lane_id"].iloc[0]):  # type: ignore
        print("┣ ERROR in lane id")
        # Lane ids already assigned within this GSE, ignoring missing values
        # (both real NaN and the literal string "nan").
        exists_laneID: List[str] = [
            x
            for x in runtable.loc[runtable["gse_id"] == gse_id, "lane_id"].unique().tolist()
            if pd.notna(x) and x != "nan"
        ]
        # The next free lane number is "already assigned + 1". The previous code
        # reached the same number only because the unfiltered NaN was counted.
        lane_id = "{:0=3}".format(len(exists_laneID) + 1)
        _allow_text(runtable, "lane_id")
        runtable.loc[run_mask, "lane_id"] = lane_id
        print(f"┣ ┗> fixed lane_id automatically to: {lane_id}")

    # --- read type ---------------------------------------------------------
    read_types: List[str] = run_errors["read_type"].tolist()
    req_correct: bool = any(
        pd.isna(read_type)
        or read_type in ("", "nan")
        # fullmatch: the whole value must be a read type, "R1x" is not valid
        or read_type_pattern.fullmatch(str(read_type)) is None
        for read_type in read_types
    )
    if req_correct:
        print("┣ ERROR in read type.")
        if len(read_types) == 1:
            # Cannot be fixed automatically; the row is reported but left in place.
            print(
                "┣ ┗> read_type should minimally contains R1 and R2. Delete this column.")
            print(
                f"┣ ┗> {read_types}")
        elif len(read_types) == 2:
            print("┣ ┗> fixed read_type automatically to: R1 and R2")
            _allow_text(runtable, "read_type")
            runtable.loc[fastq_mask, "read_type"] = ["R1", "R2"]
        elif len(read_types) == 3:
            print("┣ ┣ Detected index file. Which one is index file?")
            targets: List[str] = [
                filename.split("/")[-1]
                for filename in runtable.loc[fastq_mask, "cloud_filepath"].tolist()
            ]
            print(
f"""
┣ ┣ ┏ 1) {targets[0]}
┣ ┣ ┣ 2) {targets[1]}
┣ ┣ ┗ 3) {targets[2]}
"""
            )
            new_read_type: List[str] = []
            while True:
                answer = input("1, 2, 3")
                try:
                    choice = int(answer, 10)
                except ValueError:
                    choice = 0  # not a number: fall through to the range message
                # Chained comparison; `choice <= 3 & choice >= 1` parsed as
                # `choice <= (3 & choice) >= 1` and let negative numbers through.
                if 1 <= choice <= 3:
                    # The chosen file is the index read ("I1", which the validation
                    # pattern accepts); the remaining two are R1 and R2 in file order.
                    new_read_type = ["R1", "R2"]
                    new_read_type.insert(choice - 1, "I1")
                    break
                print("Specify an integer value between 1 and 3.")
            _allow_text(runtable, "read_type")
            runtable.loc[fastq_mask, "read_type"] = new_read_type

    # --- dumped file names -------------------------------------------------
    # Build one name/path per row of THIS run, so R1 and R2 get distinct names
    # and rows belonging to other runs are not overwritten.
    run_rows = runtable.loc[run_mask]
    result = run_rows.iloc[0]  # kept for the inspection cell further down
    dumped_filenames: List[str] = []
    dumped_filepaths: List[str] = []
    for _, row in run_rows.iterrows():
        dumped_filename = f'{row["gsm_id"]}_S1_L{row["lane_id"]}_{row["read_type"]}_001.fastq.gz'
        dumped_filepath = f'{row["sample_name"]}/0_dumped/{row["gsm_id"]}/fastqs/rep{row["replicate"]}/{dumped_filename}'
        dumped_filenames.append(dumped_filename)
        dumped_filepaths.append(dumped_filepath)
    runtable.loc[run_mask, "dumped_filename"] = dumped_filenames
    runtable.loc[run_mask, "dumped_filepath"] = dumped_filepaths
    print(f"┗━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┛")

***** SRA Run Fixer *****
--------------------------------------------
┏ [1/1] Fix errors on SRR3879614.━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
┣ Refer to [https://trace.ncbi.nlm.nih.gov/Traces/index.html?view=run_browser&acc=SRR3879614&display=data-access]
┗━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┛


In [21]:
# Inspect `result` as left behind by the fixer loop (last processed run).
result

Unnamed: 0,dumped_filepath,cloud_filepath,dumped_filename,egress,filetype,gse_id,gsm_id,lane_id,location,raw_filename,read_type,replicate,run_id,sample_id,sample_name,spieces
0,TEST/0_dumped/GSM2230761/fastqs/rep0/GSM223076...,s3://sra-pub-src-7/SRR3879614/mouse1_lib1.R1.f...,GSM2230761_S1_LNone_None_001.fastq.gz,-,fastq,GSE84133,GSM2230761,1,AWS,mouse1_lib1.R1.fastq.gz,R1,0,SRR3879614,1,TEST,Musmusculus
1,TEST/0_dumped/GSM2230761/fastqs/rep0/GSM223076...,s3://sra-pub-src-5/SRR3879614/mouse1_lib1.R2.f...,GSM2230761_S1_LNone_None_001.fastq.gz,-,fastq,GSE84133,GSM2230761,1,AWS,mouse1_lib1.R2.fastq.gz,R2,0,SRR3879614,1,TEST,Musmusculus


In [20]:
# Inspect the run table after the fixer loop has modified it in place.
runtable

Unnamed: 0,dumped_filepath,cloud_filepath,dumped_filename,egress,filetype,gse_id,gsm_id,lane_id,location,raw_filename,read_type,replicate,run_id,sample_id,sample_name,spieces
0,"0 TEST\n1 TEST\nName: sample_name, dtype...",s3://sra-pub-src-7/SRR3879614/mouse1_lib1.R1.f...,0 GSM2230761\n1 GSM2230761\nName: gsm_id...,-,fastq,GSE84133,GSM2230761,1,AWS,mouse1_lib1.R1.fastq.gz,R1,0,SRR3879614,1,TEST,Musmusculus
1,"0 TEST\n1 TEST\nName: sample_name, dtype...",s3://sra-pub-src-5/SRR3879614/mouse1_lib1.R2.f...,0 GSM2230761\n1 GSM2230761\nName: gsm_id...,-,fastq,GSE84133,GSM2230761,1,AWS,mouse1_lib1.R2.fastq.gz,R2,0,SRR3879614,1,TEST,Musmusculus


In [75]:
# Scratch check: distinct, non-missing lane ids recorded in `errors` for the GSE
# of the last processed run (`gse_id` comes from the fixer loop).
# pd.notna is used because np.isnan raises TypeError on string ids such as "001".
exists_laneID: List[str] = [
    x for x in errors[errors["gse_id"] == gse_id]["lane_id"].unique().tolist() if pd.notna(x)]
len(exists_laneID)


0

In [57]:
# Show the current contents of `exists_laneID`.
exists_laneID


[nan, nan]