# Environment

## Imports

In [1]:
import os
import glob
import pandas as pd

## Global variables

In [2]:
qc_dir = "/lustre/scratch127/cellgen/cellgeni/aljes/reprocessing/solo_qc_dir"

## Load the data

### A list of public datasets

In [23]:
todo_df = pd.read_csv("todo_list.tsv", sep="\t", header=None, names=["dataset", "specie", "n_samples", "n_10x_samples", "samples"])
todo_df.head()

Unnamed: 0,dataset,specie,n_samples,n_10x_samples,samples
0,GSE115424,Mus musculus,4,2,"GSM4155928,GSM4155929"
1,GSE116390,Mus musculus,7,7,"GSM3937769,GSM3937770,GSM3937773,GSM3937767,GS..."
2,GSE117824,Homo sapiens,12,6,"GSM3739219,GSM3739221,GSM3739217,GSM3739222,GS..."
3,GSE118814,Mus musculus,49,2,"GSM4037331,GSM4037332"
4,GSE120409,Mus musculus,833,16,"GSM4225699,GSM4225695,GSM3398924,GSM3398925,GS..."


In [17]:
# Get a list of all samples
sample_list = list()
for i, row in todo_df.iterrows():
    for sample in row["samples"].split(","):
        sample_list.append([row.dataset, sample.strip()])

# Create a DataFrame from the sample list
sample_df = pd.DataFrame(sample_list, columns=["dataset", "sample"])
sample_df

Unnamed: 0,dataset,sample
0,GSE115424,GSM4155928
1,GSE115424,GSM4155929
2,GSE116390,GSM3937769
3,GSE116390,GSM3937770
4,GSE116390,GSM3937773
...,...,...
13072,GSE206409,GSM5257944
13073,GSE206409,GSM5257943
13074,GSE206409,GSM5257942
13075,GSE94877,GSM4007007


### Solo qc files

Get a list of all QC files

In [3]:
qc_files_list = glob.glob(f"{qc_dir}/*")
qc_files_list[:3], len(qc_files_list)

(['/lustre/scratch127/cellgen/cellgeni/aljes/reprocessing/solo_qc_dir/GSE143437.solo_qc.tsv',
  '/lustre/scratch127/cellgen/cellgeni/aljes/reprocessing/solo_qc_dir/GSE114724.solo_qc.tsv',
  '/lustre/scratch127/cellgen/cellgeni/aljes/reprocessing/solo_qc_dir/GSE132959.solo_qc.tsv'],
 2651)

Read their contents to one `dataframe`

In [4]:
df_list = list()
for filepath in qc_files_list:
    dataset_id = os.path.basename(filepath).split(".")[0]
    df = pd.read_csv(filepath, sep="\t")
    df["dataset"] = dataset_id
    df_list.append(df)

combined_df = pd.concat(df_list, ignore_index=True, axis=0)
print(f"Combined dataframe shape: {combined_df.shape}")
combined_df.head()

Combined dataframe shape: (18387, 19)


Unnamed: 0,Sample,Rd_all,Rd_in_cells,Frc_in_cells,UMI_in_cells,Cells,Med_nFeature,Good_BC,WL,Species,Paired,Strand,all_u+m,all_u,exon_u+m,exon_u,full_u+m,full_u,dataset
0,GSM4259473,228700456.0,104762310.0,0.458,10899841.0,3645.0,1135.0,0.98257,v2,Mouse,Single,Forward,0.859548,0.587572,0.695069,0.663193,0.763408,0.707288,GSE143437
1,GSM4259474,256778531.0,117821690.0,0.459,15516156.0,5702.0,1056.0,0.982054,v2,Mouse,Single,Forward,0.890066,0.601419,0.736884,0.703998,0.799736,0.741322,GSE143437
2,GSM4259475,237783790.0,66360336.0,0.279,33897683.0,8282.0,510.0,0.974452,v3,Mouse,Single,Forward,0.871597,0.461927,0.703091,0.694612,0.776523,0.756396,GSE143437
3,GSM4259476,143080183.0,89904246.0,0.628,43056446.0,6041.0,2059.0,0.982375,v2,Mouse,Single,Forward,0.825362,0.588886,0.681863,0.646748,0.746805,0.66609,GSE143437
4,GSM4259477,243114255.0,147027233.0,0.605,27721333.0,3216.0,2254.0,0.980075,v2,Mouse,Single,Forward,0.823184,0.589905,0.65322,0.619906,0.723066,0.6477,GSE143437


Save to file

In [5]:
combined_df.to_csv("solo_qc_combined.tsv", sep="\t", index=False)

### Find sample yet to be processed

In [22]:
sample_df[~sample_df["sample"].isin(combined_df.Sample.tolist())]

Unnamed: 0,dataset,sample
57,GSE129218,GSM4186893
58,GSE129218,GSM3702750
59,GSE129218,GSM3702748
60,GSE129218,GSM4186890
61,GSE129218,GSM4186892
...,...,...
13067,GSE189636,GSM5705275
13068,GSE189636,GSM5705266
13069,GSE189650,GSM5705583
13070,GSE189650,GSM5705584


In [None]:
samples_to_process = sample_df.groupby("dataset").aggregate(",".join)
samples_to_process

Unnamed: 0_level_0,sample
dataset,Unnamed: 1_level_1
GSE115424,"GSM4155928,GSM4155929"
GSE116390,"GSM3937769,GSM3937770,GSM3937773,GSM3937767,GS..."
GSE117824,"GSM3739219,GSM3739221,GSM3739217,GSM3739222,GS..."
GSE118814,"GSM4037331,GSM4037332"
GSE120409,"GSM4225699,GSM4225695,GSM3398924,GSM3398925,GS..."
...,...
GSE189576,"GSM5704139,GSM5704140"
GSE189636,"GSM5705276,GSM5705282,GSM5705278,GSM5705265,GS..."
GSE189650,"GSM5705583,GSM5705584,GSM5705585"
GSE206409,"GSM5257944,GSM5257943,GSM5257942"


In [27]:
samples_to_process.to_csv("samples_to_process.tsv", sep="\t", index=False, header=False)