In [40]:
import copy
import json
import os

import ipdb
import numpy as np
import pandas as pd

In [41]:
# Ask the user for the path of the JSON experiment configuration.
config_file_path = input()
# Use a context manager so the file handle is closed as soon as the JSON
# has been parsed (the previous version left it open for the kernel's lifetime).
with open(config_file_path) as config_file:
    config = json.load(config_file)

FileNotFoundError: [Errno 2] No such file or directory: ''

In [None]:
# Directory that receives this run's outputs:
# <config['output_folder']>/<config['experiment_name']>.
output_folder = os.path.join(config['output_folder'], config['experiment_name'])
# exist_ok=True makes the cell idempotent and avoids the check-then-create race.
os.makedirs(output_folder, exist_ok=True)

In [None]:
def save_df_to_csv(df, output_file):
    """Write ``df`` to ``output_file`` as CSV without the index column.

    The parent directory of ``output_file`` is created when it does not
    exist yet.

    Args:
        df: Object exposing ``to_csv`` (a pandas DataFrame in this notebook).
        output_file: Destination path. May be a bare file name, in which case
            the file is written to the current working directory.
    """
    parent_dir = os.path.dirname(output_file)

    # A bare file name has an empty dirname; os.makedirs("") would raise
    # FileNotFoundError, so only create the folder when there is one to create.
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    # Write DataFrame to csv file
    df.to_csv(output_file, index=False)

In [None]:
def replicate_coverage(lib, df, on=None, on_lib=None, on_df=None):
    """Return ``len(df[col]) / len(lib[col])`` for the selected column(s).

    Args:
        lib: Mapping/DataFrame holding the reference library.
        df: Mapping/DataFrame holding the observed entries.
        on: Column name used for both ``lib`` and ``df``. Takes precedence
            over ``on_lib``/``on_df`` when given.
        on_lib: Column name in ``lib``; only used together with ``on_df``.
        on_df: Column name in ``df``; only used together with ``on_lib``.

    Returns:
        The ratio as a float, or ``None`` when no usable column selection was
        supplied (neither ``on`` nor both ``on_lib`` and ``on_df``).

    Raises:
        ZeroDivisionError: If the selected library column is empty.
    """
    # Start from None so that the "no column selected" case reaches the
    # guard below instead of raising UnboundLocalError.
    lib_len = None
    df_len = None

    if on is not None:
        lib_len = len(lib[on])
        df_len = len(df[on])
    elif on_lib is not None and on_df is not None:
        lib_len = len(lib[on_lib])
        df_len = len(df[on_df])

    if lib_len is None or df_len is None:
        return None

    return df_len / lib_len

In [None]:
def clean_raw_pairs_file(df):
    """Return a copy of ``df`` without ``id_`` and without incomplete pairs.

    The ``id_`` column is removed, then every row that has a missing value in
    ``unique_17`` or in ``barcode`` is discarded. ``df`` itself is not modified.
    """
    required_columns = ['unique_17', 'barcode']

    without_id = df.drop(columns=['id_'])
    complete_rows = without_id.dropna(axis=0, subset=required_columns)

    return complete_rows

In [None]:
def clean_raw_pairs_file_2(df):
    """Return a copy of ``df`` without ``id_`` and without rows lacking ``unique_17``.

    Unlike ``clean_raw_pairs_file``, rows with a missing ``barcode`` are kept;
    only a missing ``unique_17`` causes a row to be discarded. ``df`` itself is
    not modified.
    """
    without_id = df.drop(columns=['id_'])
    rows_with_unique_17 = without_id.dropna(axis=0, subset=['unique_17'])

    return rows_with_unique_17

In [None]:
def unique_17_count(df):
    """Count how often each ``unique_17`` value occurs in ``df``.

    Returns a DataFrame with the columns ``unique_17`` and ``count``, one row
    per distinct value, ordered from most to least frequent.
    """
    occurrences = df["unique_17"].value_counts(sort=True)
    # Name the index so that reset_index() emits it as the "unique_17" column.
    occurrences = occurrences.rename_axis("unique_17")

    return occurrences.reset_index(name="count")

In [None]:
def unique_17_and_bc_count(df):
    """Count how often each (``unique_17``, ``barcode``) pair occurs in ``df``.

    Args:
        df: DataFrame of read pairs.

    Returns:
        DataFrame with the columns ``unique_17``, ``barcode`` and ``count``,
        one row per distinct pair, ordered from most to least frequent.
    """
    pair_columns = ["unique_17", "barcode"]

    if all(column in df.columns for column in pair_columns):
        # Select the two key columns explicitly: counting over *all* columns
        # and labelling the result positionally would silently swap the labels
        # when the column order differs and would fail when extra columns
        # (e.g. "id_") are still present.
        pairs = df[pair_columns]
    else:
        # Backward-compatible path: a frame whose columns carry other names
        # is counted as-is and labelled positionally, as before.
        pairs = df

    unique_17__and_bc_counts_df = (
        pairs.value_counts(sort=True)
        .rename_axis(pair_columns)
        .reset_index(name="count")
    )

    return unique_17__and_bc_counts_df

In [None]:
def reads_per_million_norm(df, columns, scalling):
    """Rescale the given count columns so that each sums to ``scalling``.

    For every column in ``columns`` the column total of ``df`` is divided by
    ``scalling`` and the column values are divided by that factor. Columns not
    listed are copied unchanged; ``df`` itself is not modified.
    """
    result = df.copy()

    for column in columns:
        # Factor by which this column's total exceeds the target total.
        divisor = df[column].sum() / scalling
        result[column] = result[column] / divisor

    return result

In [None]:
def num_of_read_by_fastq(fastq_file):
    """Return the number of reads in a FASTQ file, assuming 4 lines per read.

    Args:
        fastq_file: Path of a plain-text FASTQ file.

    Returns:
        ``line_count // 4`` as an int; a trailing incomplete record is ignored.
    """
    with open(fastq_file, 'r') as f:
        # Stream the file instead of materialising every line in memory;
        # only the number of lines is needed.
        line_count = sum(1 for _ in f)

    # 4 lines per read
    return line_count // 4

In [43]:
def read_experimental_dfs(config):
    """Load and clean the pairs file of every technical replicate.

    Walks ``config["groups"]`` -> ``"biological_replicates"`` ->
    ``"technical_replicates"``, reads each replicate's ``"paired_file"`` CSV,
    cleans it with ``clean_raw_pairs_file`` and stores the result under the
    replicate's ``"df"`` key.

    Args:
        config: Experiment configuration dict with the nesting described above.

    Returns:
        A deep copy of ``config`` in which every technical replicate carries
        its cleaned DataFrame under ``"df"``. ``config`` itself is not modified.
    """
    # A shallow copy would share the nested replicate dicts with `config`,
    # so attaching the DataFrames below would silently mutate the caller's
    # configuration as well.
    exp_obj = copy.deepcopy(config)

    for group in exp_obj["groups"]:
        for bio_rep in group["biological_replicates"]:
            for tech_rep in bio_rep["technical_replicates"]:
                paired_file = tech_rep["paired_file"]

                df = pd.read_csv(paired_file)
                tech_rep["df"] = clean_raw_pairs_file(df)

    return exp_obj

In [None]:
# Load the reference library table from the CSV path given in the config.
# Later cells read its "unique_17" column (coverage denominator and row key
# of the output tables).
lib_csv_path = config["lib_csv_path"]
lib_df = pd.read_csv(lib_csv_path)

In [46]:
def run_experiment_coverage(exp_obj):
    """Print and log read statistics for every technical replicate.

    For each group / biological replicate / technical replicate the report
    contains the number of rows in the replicate's cleaned ``"df"``, the
    library coverage (number of distinct ``unique_17`` values in the replicate
    divided by the number of rows of ``lib_df``), the read count of the
    original FASTQ and, for the clipped / unclipped / too-long / too-short
    FASTQ files, their read counts and their share of the original reads.

    The report is printed and appended to ``coverage.txt`` inside the
    module-level ``output_folder``. Because the file is opened in append
    mode, re-running this function adds to an existing file.

    Args:
        exp_obj: Experiment object as returned by ``read_experimental_dfs``;
            each technical replicate must provide ``"name"``, ``"df"``,
            ``"fastq_file"`` and the four ``"*_fastq_file"`` subset paths.
    """
    coverage_file = os.path.join(output_folder, "coverage.txt")

    # (config key of the FASTQ subset, label used in the report)
    fastq_subsets = (
        ("clipped_fastq_file", "clipped"),
        ("unclipped_fastq_file", "unclipped"),
        ("too_long_fastq_file", "too long"),
        ("too_short_fastq_file", "too short"),
    )

    for group in exp_obj["groups"]:
        group_name = group["name"]
        for bio_rep in group["biological_replicates"]:
            bio_name = bio_rep["name"]
            for tech_rep in bio_rep["technical_replicates"]:
                tech_name = tech_rep["name"]

                unique_17_count_df = unique_17_count(tech_rep["df"])
                coverage = replicate_coverage(
                    lib_df, unique_17_count_df, on_lib="unique_17", on_df="unique_17"
                )
                fastq_reads = num_of_read_by_fastq(tech_rep["fastq_file"])

                report_lines = [
                    f"{group_name} -> {bio_name} -> {tech_name}:",
                    f"\tnumber of aligned reads: {tech_rep['df'].shape[0]} ===> coverage: {coverage}",
                    f"\tfastq number of reads: {fastq_reads}",
                ]
                for path_key, label in fastq_subsets:
                    subset_reads = num_of_read_by_fastq(tech_rep[path_key])
                    # An empty original FASTQ has no meaningful share; report
                    # nan instead of aborting the whole run with
                    # ZeroDivisionError.
                    if fastq_reads:
                        share = subset_reads / fastq_reads
                    else:
                        share = float("nan")
                    report_lines.append(
                        f"\t{label} fastq number of reads: {subset_reads}"
                        f" ===> {label} reads coverage: {share}"
                    )

                output_line = "\n".join(report_lines)

                print(output_line)
                # Append per replicate so that finished entries are on disk
                # even if a later replicate fails.
                with open(coverage_file, "a+") as out:
                    out.write(output_line + "\n")


In [45]:
# Read and clean every technical replicate's pairs file; each cleaned frame
# ends up under the replicate's "df" key of the returned object.
expr_obj = read_experimental_dfs(config)

In [47]:
# Print the per-replicate read/coverage report and append it to coverage.txt
# in the output folder.
run_experiment_coverage(expr_obj)

H7_R1 -> bio_1 -> tech_2:
	number of aligned reads: 525534 ===> coverage: 0.606
	fastq number of reads: 2359908
	clipped fastq number of reads: 531876 ===> clipped reads coverage: 0.2253799724396036
	unclipped fastq number of reads: 0 ===> unclipped reads coverage: 0.0
	too long fastq number of reads: 334810 ===> too long reads coverage: 0.14187417475596506
	too short fastq number of reads: 1493222 ===> too short reads coverage: 0.6327458528044314
H7_R1 -> bio_1 -> tech_3:
	number of aligned reads: 41911 ===> coverage: 0.34
	fastq number of reads: 1724697
	clipped fastq number of reads: 42629 ===> clipped reads coverage: 0.024716805328704114
	unclipped fastq number of reads: 0 ===> unclipped reads coverage: 0.0
	too long fastq number of reads: 239806 ===> too long reads coverage: 0.13904239411328484
	too short fastq number of reads: 1442262 ===> too short reads coverage: 0.8362408005580111
RL_R1 -> bio_1 -> tech_2:
	number of aligned reads: 1540268 ===> coverage: 0.9031666666666667
	fa

In [48]:
# "unique_17" column of the library; run_experiment_analysis uses it as the
# row key of both output tables.
all_unique_17s = lib_df["unique_17"]
# Number of library rows.
len(all_unique_17s)

6000

In [49]:
def run_experiment_analysis(expr_obj):
    """Build per-replicate ``unique_17`` count tables for the whole experiment.

    Both returned tables have one row per entry of the module-level
    ``all_unique_17s``. For every biological replicate a column
    ``<group>_<bio>`` holds the sum of its technical replicates' counts; the
    expanded table additionally holds one column ``<group>_<bio>_<tech>`` per
    technical replicate. Library entries that a replicate never saw get 0.

    Args:
        expr_obj: Experiment object whose technical replicates carry a
            cleaned DataFrame under ``"df"``.

    Returns:
        Tuple ``(expand_output_df, bio_output_df)``: the expanded table and
        the table with the per-biological-replicate sums only.
    """
    key = "unique_17"
    per_tech_table = pd.DataFrame(data={key: all_unique_17s})
    per_bio_table = pd.DataFrame(data={key: all_unique_17s})

    for group in expr_obj["groups"]:
        group_label = group["name"]

        for bio_rep in group["biological_replicates"]:
            bio_label = bio_rep["name"]
            bio_col = f"{group_label}_{bio_label}"

            # Running table for this biological replicate; the sum column
            # starts at zero and grows with every technical replicate.
            running = pd.DataFrame(
                data={key: all_unique_17s, bio_col: np.zeros(len(all_unique_17s))}
            )

            for tech_rep in bio_rep["technical_replicates"]:
                tech_label = tech_rep["name"]

                tech_counts = unique_17_count(tech_rep["df"])
                tech_col = f"{group_label}_{bio_label}_{tech_label}"
                tech_counts = tech_counts.rename(columns={"count": tech_col})

                # Left join keeps every library entry; unseen ones become 0.
                joined = pd.merge(running, tech_counts, how="left", on=key)
                running = joined.fillna(0)
                running[bio_col] = running[bio_col].add(running[tech_col])

            bio_only = running[[key, bio_col]]
            per_bio_table = per_bio_table.merge(bio_only, how="left", on=key)
            per_tech_table = per_tech_table.merge(running, how="left", on=key)

    return per_tech_table, per_bio_table

In [51]:
# expand_output_df: biological-replicate sums plus one column per technical
# replicate; bio_output_df: biological-replicate sums only.
expand_output_df, bio_output_df = run_experiment_analysis(expr_obj)

In [52]:
# Preview the first rows of the per-biological-replicate count table.
bio_output_df.head()

Unnamed: 0,unique_17,H7_R1_bio_1,RL_R1_bio_1,DS_R1_bio_1,U2_R1_bio_1,MC_CAR_R1_bio_1,NCI_R1_bio_1,HT_1080_R1_bio_1,CD4_R1_bio_1,CD8_R1_bio_1,...,MC_CAR_R2_bio_1,NCI_R2_bio_1,HT_1080_R2_bio_1,CD4_R2_bio_1,CD8_R2_bio_1,TREGS_R2_bio_1,B_CELLS_R2_bio_1,MONOCYTES_R2_bio_1,JURKAT_R2_bio_1,H1_R2_bio_1
0,ACTACCTGAAGAACCTT,0.0,1.0,0.0,0.0,3.0,2.0,1.0,0.0,0.0,...,6.0,0.0,0.0,0.0,0.0,0.0,96.0,234.0,17.0,1.0
1,GAGCTAAATGGCTGATT,53.0,0.0,0.0,0.0,0.0,45.0,0.0,0.0,46.0,...,4.0,60.0,0.0,0.0,98.0,5.0,89.0,0.0,4.0,2.0
2,GTGACCACACTTACAGT,0.0,0.0,7.0,0.0,15.0,23.0,17.0,81.0,14.0,...,35.0,23.0,15.0,78.0,57.0,45.0,176.0,19.0,42.0,45.0
3,TTGTTGGCGAGCAGTGT,0.0,0.0,9.0,0.0,9.0,4.0,0.0,0.0,0.0,...,13.0,2.0,2.0,0.0,0.0,3.0,8.0,0.0,20.0,0.0
4,CAATATCGGCGAGCTCT,1.0,68.0,40.0,0.0,61.0,63.0,38.0,22.0,98.0,...,112.0,66.0,31.0,26.0,305.0,23.0,81.0,2.0,172.0,121.0


In [53]:
# Preview the first rows of the expanded (per-technical-replicate) count table.
expand_output_df.head()

Unnamed: 0,unique_17,H7_R1_bio_1,H7_R1_bio_1_tech_2,H7_R1_bio_1_tech_3,RL_R1_bio_1,RL_R1_bio_1_tech_2,RL_R1_bio_1_tech_3,DS_R1_bio_1,DS_R1_bio_1_tech_2,DS_R1_bio_1_tech_3,...,B_CELLS_R2_bio_1_tech_3,MONOCYTES_R2_bio_1,MONOCYTES_R2_bio_1_tech_2,MONOCYTES_R2_bio_1_tech_3,JURKAT_R2_bio_1,JURKAT_R2_bio_1_tech_2,JURKAT_R2_bio_1_tech_3,H1_R2_bio_1,H1_R2_bio_1_tech_2,H1_R2_bio_1_tech_3
0,ACTACCTGAAGAACCTT,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,85.0,234.0,234.0,0.0,17.0,13.0,4.0,1.0,1.0,0.0
1,GAGCTAAATGGCTGATT,53.0,53.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,87.0,0.0,0.0,0.0,4.0,2.0,2.0,2.0,0.0,2.0
2,GTGACCACACTTACAGT,0.0,0.0,0.0,0.0,0.0,0.0,7.0,7.0,0.0,...,109.0,19.0,19.0,0.0,42.0,19.0,23.0,45.0,8.0,37.0
3,TTGTTGGCGAGCAGTGT,0.0,0.0,0.0,0.0,0.0,0.0,9.0,6.0,3.0,...,2.0,0.0,0.0,0.0,20.0,7.0,13.0,0.0,0.0,0.0
4,CAATATCGGCGAGCTCT,1.0,0.0,1.0,68.0,48.0,20.0,40.0,28.0,12.0,...,62.0,2.0,2.0,0.0,172.0,81.0,91.0,121.0,48.0,73.0


In [54]:
# Save the expanded count table as all_replicates_sum.csv in the output folder.
expand_output_df_csv = os.path.join(output_folder, "all_replicates_sum.csv")
save_df_to_csv(expand_output_df, expand_output_df_csv)

In [55]:
# Save the per-biological-replicate sums as all_bio_replicates_sum.csv in the
# output folder.
bio_output_df_csv = os.path.join(output_folder, "all_bio_replicates_sum.csv")
save_df_to_csv(bio_output_df, bio_output_df_csv)