# Environment

## Imports

In [1]:
import os
import glob
import pandas as pd

## Global variables

In [2]:
qc_dir = "/lustre/scratch127/cellgen/cellgeni/aljes/reprocessing/solo_qc_dir"

## Load the data

Get a list of all QC files

In [3]:
qc_files_list = glob.glob(f"{qc_dir}/*")
qc_files_list[:3], len(qc_files_list)

(['/lustre/scratch127/cellgen/cellgeni/aljes/reprocessing/solo_qc_dir/GSE143437.solo_qc.tsv',
  '/lustre/scratch127/cellgen/cellgeni/aljes/reprocessing/solo_qc_dir/GSE114724.solo_qc.tsv',
  '/lustre/scratch127/cellgen/cellgeni/aljes/reprocessing/solo_qc_dir/GSE132959.solo_qc.tsv'],
 2601)

Read their contents to one `dataframe`

In [4]:
df_list = list()
for filepath in qc_files_list:
    dataset_id = os.path.basename(filepath).split(".")[0]
    df = pd.read_csv(filepath, sep="\t")
    df["dataset"] = dataset_id
    df_list.append(df)

combined_df = pd.concat(df_list, ignore_index=True, axis=0)
combined_df.head(), combined_df.shape

(       Sample       Rd_all  Rd_in_cells  Frc_in_cells  UMI_in_cells   Cells  \
 0  GSM4259473  228700456.0  104762310.0         0.458    10899841.0  3645.0   
 1  GSM4259474  256778531.0  117821690.0         0.459    15516156.0  5702.0   
 2  GSM4259475  237783790.0   66360336.0         0.279    33897683.0  8282.0   
 3  GSM4259476  143080183.0   89904246.0         0.628    43056446.0  6041.0   
 4  GSM4259477  243114255.0  147027233.0         0.605    27721333.0  3216.0   
 
    Med_nFeature   Good_BC  WL Species  Paired   Strand   all_u+m     all_u  \
 0        1135.0  0.982570  v2   Mouse  Single  Forward  0.859548  0.587572   
 1        1056.0  0.982054  v2   Mouse  Single  Forward  0.890066  0.601419   
 2         510.0  0.974452  v3   Mouse  Single  Forward  0.871597  0.461927   
 3        2059.0  0.982375  v2   Mouse  Single  Forward  0.825362  0.588886   
 4        2254.0  0.980075  v2   Mouse  Single  Forward  0.823184  0.589905   
 
    exon_u+m    exon_u  full_u+m    full_u

# Calculate statistics

Calculate some statistics

In [10]:
specie_sample_counts = combined_df["Species"].value_counts().to_dict()
specie_cell_counts = combined_df[["Species", "Cells"]].groupby("Species").sum().to_dict()["Cells"]


print(f"Number of datasets: {combined_df['dataset'].nunique()}")
print(f"Number of samples: {combined_df.shape[0]}")
print(f"Number of cells: {combined_df['Cells'].sum()}")
print(f"Number of human samples: {specie_sample_counts['Human']}, Number of mouse samples: {specie_sample_counts['Mouse']}")
print(f"Number of human cells: {specie_cell_counts['Human']: .0f}, Number of mouse cells: {int(specie_cell_counts['Mouse']): .0f}")

Number of datasets: 2601
Number of samples: 17092
Number of cells: 110702641.0
Number of human samples: 8945, Number of mouse samples: 8147
Number of human cells:  58904046, Number of mouse cells:  51798595
