In [1]:
import os
import glob

In [2]:
import pycisTopic
pycisTopic.__version__

'0.1.dev300+g7494158'

In [3]:
import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')

In [4]:
import pickle
import pandas as pd

In [22]:
import os
# Absolute, cluster-specific working directory of this analysis.
wdir = '/lustre1/project/stg_00002/lcb/fderop/data/20211122_hca_benchmark_libds_dar_correlation_with_final_dars/'
# Change the process-wide working directory so that the relative paths used in
# the cells below resolve against wdir.
os.chdir( wdir )

In [23]:
import glob
from collections import OrderedDict

# we read the same fragments as used in the libds downsampled analysis.

In [24]:
# Collect the fragment files and derive each sample name by stripping the
# fragment-file suffix and the directory prefix from the path.
filenames = sorted(glob.glob('libds_fragments/*.tsv.gz'))
samples = [
    path.replace(".sinto.mm.fragments.tsv.gz", "").replace("libds_fragments/", "")
    for path in filenames
]
# Map sample name -> fragment file, ordered by sample name.
fragments_dict = OrderedDict(sorted(dict(zip(samples, filenames)).items()))
fragments_dict.keys()

odict_keys(['Broad_1', 'Broad_2', 'Broad_mito_1', 'Broad_mito_2', 'CNAG_1', 'CNAG_2', 'Sanger_1', 'Sanger_2', 'Stanford_1', 'Stanford_2', 'VIB_1', 'VIB_2', 'VIB_Hydrop_1', 'VIB_Hydrop_2', 's3atac'])

# load the b cell barcodes

In [25]:
# Directory holding one barcode list per sample, named <sample>.txt.
f_bcdir = 'libds_bcell_barcodes/'

# Per sample, read the barcodes (one per line) that should be kept.
bc_passing_filters = {}
for k in fragments_dict.keys():
    # The context manager closes the handle even if reading raises.
    with open(os.path.join(f_bcdir, k + '.txt'), 'r') as bc_file:
        # Strip surrounding whitespace and skip blank lines so that an empty
        # string never ends up in the barcode list.
        bc_passing_filters[k] = [line.strip() for line in bc_file if line.strip()]

# now count within the dar union

In [26]:
# BED file with the regions in which fragments are counted (relative to the
# current working directory).
path_to_regions = 'libds_bcell_dars_union/libds_dars.everything.named.noheader.sorted.bed'
# BED file passed to pycisTopic as the blacklist.
path_to_blacklist = 'hg38_regions/hg38-blacklist.v2.bed'

In [27]:
# Output directory (under wdir) for the per-sample cisTopic pickles.
f_cto_dir = 'cistopic_objs__dars_union'
# exist_ok=True makes the cell safe to re-run and avoids the race between
# checking for the directory and creating it.
os.makedirs(os.path.join(wdir, f_cto_dir), exist_ok=True)

In [28]:
from pycisTopic.cistopic_class import create_cistopic_object_from_fragments

Create cistopic objects for each sample. If pandas crashes, increase the number of partitions. This is necessary for the largest files.

In [29]:
# Build one cisTopic object per sample from its fragment file, restricted to
# the barcodes in bc_passing_filters and the regions in path_to_regions, and
# cache it as a pickle. Samples whose pickle already exists are skipped.
for key in fragments_dict.keys():
    f_out = os.path.join(wdir, f_cto_dir, key + '__cistopic_obj_dars_union.pkl')
    if os.path.isfile(f_out):
        print(f"Skipping {key}, already exists")
        continue

    tmp_cto = create_cistopic_object_from_fragments(path_to_fragments=fragments_dict[key],
                                                    path_to_regions=path_to_regions,
                                                    path_to_blacklist=path_to_blacklist,
                                                    # metrics=metadata_bc_dict[key],
                                                    valid_bc=bc_passing_filters[key],
                                                    n_cpu=15,
                                                    partition=5,
                                                    project=key)

    # Dump to a temporary name and rename afterwards: an interrupted dump must
    # not leave a truncated file at f_out, because the isfile() guard above
    # would then skip this sample on every later run.
    f_tmp = f_out + '.tmp'
    with open(f_tmp, 'wb') as f:
        pickle.dump(tmp_cto, f)
    os.replace(f_tmp, f_out)

    print(f"Generated and saved cistopic object for {key}")

2021-11-23 16:44:50,719 cisTopic     INFO     Reading data for Broad_1


FileNotFoundError: [Errno 2] No such file or directory: 'libds_bcell_dars_union/libds_dars.everything.named.noheader.sorted.bed'

In [None]:
# load objects into dict:
# Read the cached cisTopic objects back from disk, keyed by sample name.
# pickle can execute code on load, so only files written by this notebook
# should be placed in f_cto_dir.
cistopic_obj_dict = {}
for key in bc_passing_filters:
    f_out = os.path.join(wdir, f_cto_dir, key + '__cistopic_obj_dars_union.pkl')
    with open(f_out, 'rb') as f:
        tmp_cto = pickle.load(f)
        cistopic_obj_dict[key] = tmp_cto
    print(f"Loaded {key}")

# check if everything went ok

In [None]:
# Sanity check: for every sample, print the number of rows in the object's
# cell_data next to the number of rows in the sample's barcode file.
for key, cto in cistopic_obj_dict.items():
    print(key)
    n_cells_cto = len(cto.cell_data)
    print(f'{n_cells_cto} cells in cto')
    barcode_table = pd.read_csv(f"libds_bcell_barcodes/{key}.txt", header=None)
    print(f'{len(barcode_table)} cells in txt')

so this seems quite fine

In [None]:
# Sum the fragment matrix of every sample over its columns, giving one total
# per matrix row (per region, assuming pycisTopic's regions x cells layout --
# TODO confirm).
#
# The totals are labelled with the object's region names instead of their
# position. NOTE(review): samples presumably keep different region subsets
# (the later fillna(0) points that way); positional labels would then put
# different regions in the same column -- confirm.
per_sample_counts = []
for key, cto in cistopic_obj_dict.items():
    counts = pd.DataFrame.sparse.from_spmatrix(cto.fragment_matrix.T).sum(axis=0)
    counts.index = cto.region_names
    counts.name = key
    per_sample_counts.append(counts)
    print(f'{key} done')

# A single concat aligns all samples on region name (missing regions become
# NaN) and avoids re-copying the growing frame on every iteration.
# Result: one row per sample, one column per region.
counts_df = pd.concat(per_sample_counts, axis=1).T if per_sample_counts else pd.DataFrame()

In [None]:
# to_csv does not create missing directories, so make sure the target exists.
os.makedirs('libds_counts_tsv', exist_ok=True)
# Persist the count table as TSV, keeping the row labels and the header.
counts_df.to_csv('libds_counts_tsv/fragment_counts_in_DAR_union.tsv', sep='\t', header=True, index=True)

# now plot

In [None]:
import matplotlib.pyplot as plt

In [None]:
import pandas as pd
import seaborn as sns

In [None]:
from scipy.stats import pearsonr
import matplotlib.pyplot as plt 

def corrfunc(x, y, ax=None, **kws):
    """Annotate an axes with the Pearson correlation between ``x`` and ``y``.

    Parameters
    ----------
    x, y : array-like
        The two samples to correlate; passed straight to ``pearsonr``.
    ax : axes-like, optional
        Axes to annotate. When omitted (or falsy), pyplot's current axes are used.
    **kws
        Accepted and ignored, so the function can be used as a grid-mapping
        callback that receives extra keyword arguments.

    Returns
    -------
    None
        The text ``r = <value>`` (two decimals) is drawn at axes-fraction
        position (0.1, 0.9).
    """
    pearson_r = pearsonr(x, y)[0]
    target_ax = ax if ax else plt.gca()
    label = f'r = {pearson_r:.2f}'
    target_ax.annotate(label, xy=(.1, .9), xycoords=target_ax.transAxes)

In [None]:
# Reload the count table written above; the first column holds the row labels
# and the first line the column labels.
df = pd.read_csv('libds_counts_tsv/fragment_counts_in_DAR_union.tsv', sep='\t', header=0, index_col=0)

In [None]:
# Treat missing entries as zero counts.
# NOTE(review): NaN presumably arise because the samples did not all
# contribute the same set of columns when the table was built -- confirm.
df = df.fillna(0)

In [None]:
# Pairwise scatter plots between the rows of df (transposed so that each row
# becomes one pairplot variable); corner=True draws the lower triangle only.
g = sns.pairplot(df.T, corner=True, plot_kws={'s':0.1})
# Annotate every lower-triangle panel with its Pearson r.
g.map_lower(corrfunc)
# plt.savefig writes pyplot's current figure, so keep it directly after the
# plotting calls.
plt.savefig('plts_pub/fragment_counts_in_DAR_union_corr.png', facecolor='white')

The raw-count correlations are not very informative. Let's recompute them on log-transformed counts first.

In [None]:
import numpy as np

In [None]:
# Add a pseudocount of 1 before taking the natural log: df was zero-filled,
# and log(0) is -inf, which would distort the scatter plots drawn from df_log.
df_log = np.log(df + 1)

In [None]:
df_log

In [None]:
# Same lower-triangle pairplot on the log-transformed table; no correlation
# annotation is added to this figure.
sns.pairplot(df_log.T, corner=True, plot_kws={'s':0.1})
# Saves pyplot's current figure, i.e. the pairplot created on the line above.
plt.savefig('plts_pub/fragment_counts_in_DAR_union_corr_log_reads.png', facecolor='white')

or what about log cpm

In [None]:
# Natural log of counts per million. A pseudocount of 1 is added first so that
# zero counts map to a finite value instead of -inf (which a later fillna
# would not remove). Each row is scaled by its own total.
counts_plus_one = df + 1
df_log_cpm = np.log(counts_plus_one.div(counts_plus_one.sum(axis=1) / 1000000, axis=0))

In [None]:
# Replace NaN with 0. Note that fillna only targets missing values; it leaves
# +/-inf entries untouched.
df_log_cpm = df_log_cpm.fillna(0)

In [None]:
# Lower-triangle pairplot between the rows of the log-CPM table.
g = sns.pairplot(df_log_cpm.T, corner=True, plot_kws={'s':0.1})
# Annotate every lower-triangle panel with its Pearson r.
g.map_lower(corrfunc)

# Saves pyplot's current figure at 300 dpi.
plt.savefig('plts_pub/fragment_counts_in_DAR_union_corr_log_cpm_reads.png', facecolor='white', dpi=300)

# ok, this isn't great
try with the `merged` DARs

In [None]:
# From here on, count in the DARs of the merged analysis instead of the DAR
# union. This rebinds path_to_regions for all cells below.
path_to_regions = '/lustre1/project/stg_00002/lcb/fderop/data/20211024_hca_benchmark_libds_merged/downstream_analysis/DARs/libds__libds_merged__B_cell__DARs.bed'
# Same blacklist file as before.
path_to_blacklist = 'hg38_regions/hg38-blacklist.v2.bed'

In [None]:
# Output directory (under wdir) for the cisTopic pickles counted in the merged
# DARs. This rebinds f_cto_dir for all cells below.
f_cto_dir = 'cistopic_objs___merged_dars'
# exist_ok=True makes the cell safe to re-run and avoids the race between
# checking for the directory and creating it.
os.makedirs(os.path.join(wdir, f_cto_dir), exist_ok=True)

In [None]:
from pycisTopic.cistopic_class import create_cistopic_object_from_fragments

Create cistopic objects for each sample. If pandas crashes, increase the number of partitions. This is necessary for the largest files.

In [None]:
# Build one cisTopic object per sample from its fragment file, restricted to
# the barcodes in bc_passing_filters and the regions in path_to_regions, and
# cache it as a pickle. Samples whose pickle already exists are skipped.
#
# NOTE: the file suffix still reads 'dars_union' although f_cto_dir now points
# at the merged-DAR directory. It is kept unchanged so that the loading cell
# below and any pickles already on disk keep matching.
for key in fragments_dict.keys():
    f_out = os.path.join(wdir, f_cto_dir, key + '__cistopic_obj_dars_union.pkl')
    if os.path.isfile(f_out):
        print(f"Skipping {key}, already exists")
        continue

    tmp_cto = create_cistopic_object_from_fragments(path_to_fragments=fragments_dict[key],
                                                    path_to_regions=path_to_regions,
                                                    path_to_blacklist=path_to_blacklist,
                                                    # metrics=metadata_bc_dict[key],
                                                    valid_bc=bc_passing_filters[key],
                                                    n_cpu=15,
                                                    partition=5,
                                                    project=key)

    # Dump to a temporary name and rename afterwards: an interrupted dump must
    # not leave a truncated file at f_out, because the isfile() guard above
    # would then skip this sample on every later run.
    f_tmp = f_out + '.tmp'
    with open(f_tmp, 'wb') as f:
        pickle.dump(tmp_cto, f)
    os.replace(f_tmp, f_out)

    print(f"Generated and saved cistopic object for {key}")

In [None]:
# load objects into dict:
# Read the cached cisTopic objects of the current f_cto_dir back from disk,
# keyed by sample name. pickle can execute code on load, so only files written
# by this notebook should be placed in that directory.
cistopic_obj_dict = {}
for key in bc_passing_filters:
    f_out = os.path.join(wdir, f_cto_dir, key + '__cistopic_obj_dars_union.pkl')
    with open(f_out, 'rb') as f:
        tmp_cto = pickle.load(f)
        cistopic_obj_dict[key] = tmp_cto
    print(f"Loaded {key}")

# check if everything went ok

In [None]:
# Sanity check: for every sample, print the number of rows in the object's
# cell_data next to the number of rows in the sample's barcode file.
for key, cto in cistopic_obj_dict.items():
    print(key)
    n_cells_cto = len(cto.cell_data)
    print(f'{n_cells_cto} cells in cto')
    barcode_table = pd.read_csv(f"libds_bcell_barcodes/{key}.txt", header=None)
    print(f'{len(barcode_table)} cells in txt')

so this seems quite fine

In [None]:
# Sum the fragment matrix of every sample over its columns, giving one total
# per matrix row (per region, assuming pycisTopic's regions x cells layout --
# TODO confirm).
#
# The totals are labelled with the object's region names instead of their
# position. NOTE(review): samples presumably keep different region subsets
# (the later fillna(0) points that way); positional labels would then put
# different regions in the same column -- confirm.
per_sample_counts = []
for key, cto in cistopic_obj_dict.items():
    counts = pd.DataFrame.sparse.from_spmatrix(cto.fragment_matrix.T).sum(axis=0)
    counts.index = cto.region_names
    counts.name = key
    per_sample_counts.append(counts)
    print(f'{key} done')

# A single concat aligns all samples on region name (missing regions become
# NaN) and avoids re-copying the growing frame on every iteration.
# Result: one row per sample, one column per region.
counts_df = pd.concat(per_sample_counts, axis=1).T if per_sample_counts else pd.DataFrame()

In [None]:
# to_csv does not create missing directories, so make sure the target exists.
os.makedirs('libds_counts_tsv', exist_ok=True)
# Persist the count table as TSV, keeping the row labels and the header.
counts_df.to_csv('libds_counts_tsv/fragment_counts_in_merged_DARs.tsv', sep='\t', header=True, index=True)

# now plot

In [None]:
import matplotlib.pyplot as plt

In [None]:
import pandas as pd
import seaborn as sns

In [None]:
from scipy.stats import pearsonr
import matplotlib.pyplot as plt 

def corrfunc(x, y, ax=None, **kws):
    """Annotate an axes with the Pearson correlation between ``x`` and ``y``.

    This repeats the ``corrfunc`` definition from earlier in the notebook and
    rebinds the name.

    Parameters
    ----------
    x, y : array-like
        The two samples to correlate; passed straight to ``pearsonr``.
    ax : axes-like, optional
        Axes to annotate. When omitted (or falsy), pyplot's current axes are used.
    **kws
        Accepted and ignored, so the function can be used as a grid-mapping
        callback that receives extra keyword arguments.

    Returns
    -------
    None
        The text ``r = <value>`` (two decimals) is drawn at axes-fraction
        position (0.1, 0.9).
    """
    pearson_r = pearsonr(x, y)[0]
    target_ax = ax if ax else plt.gca()
    label = f'r = {pearson_r:.2f}'
    target_ax.annotate(label, xy=(.1, .9), xycoords=target_ax.transAxes)

In [None]:
# Reload the merged-DAR count table; the first column holds the row labels and
# the first line the column labels.
df = pd.read_csv('libds_counts_tsv/fragment_counts_in_merged_DARs.tsv', sep='\t', header=0, index_col=0)

In [None]:
# Treat missing entries as zero counts.
# NOTE(review): NaN presumably arise because the samples did not all
# contribute the same set of columns when the table was built -- confirm.
df = df.fillna(0)

In [None]:
# Pairwise scatter plots between the rows of df (transposed so that each row
# becomes one pairplot variable); corner=True draws the lower triangle only.
g = sns.pairplot(df.T, corner=True, plot_kws={'s':0.1})
# Annotate every lower-triangle panel with its Pearson r.
g.map_lower(corrfunc)
# plt.savefig writes pyplot's current figure, so keep it directly after the
# plotting calls.
plt.savefig('plts_pub/fragment_counts_in_merged_dars_corr.png', facecolor='white')

The raw-count correlations are not very informative. Let's recompute them on log-transformed counts first.

In [None]:
# Reload the merged-DAR counts, zero-fill missing entries and add a pseudocount
# of 1 so that zero counts do not turn into -inf under the natural log.
df = pd.read_csv(
    'libds_counts_tsv/fragment_counts_in_merged_DARs.tsv',
    sep='\t',
    header=0,
    index_col=0,
).fillna(0) + 1
df_log = np.log(df)

In [None]:
df_log.isna().sum().sum()

In [None]:
np.isinf(df_log).sum().sum()

In [None]:
# Lower-triangle pairplot between the rows of the log-transformed table.
g = sns.pairplot(df_log.T, corner=True, plot_kws={'s':0.1})
# Annotate every lower-triangle panel with its Pearson r.
g.map_lower(corrfunc)

# Saves pyplot's current figure, i.e. the pairplot created above.
plt.savefig('plts_pub/fragment_counts_in_merged_dars_corr_log_reads.png', facecolor='white')

or what about log cpm

In [None]:
# Reload the merged-DAR counts, zero-fill missing entries and add a pseudocount
# of 1, then scale every row by its own total (in millions) and take the
# natural log.
df = pd.read_csv(
    'libds_counts_tsv/fragment_counts_in_merged_DARs.tsv',
    sep='\t',
    header=0,
    index_col=0,
).fillna(0) + 1
row_totals_in_millions = df.sum(axis=1) / 1000000
df_log_cpm = np.log(df.div(row_totals_in_millions, axis=0))

In [None]:
# Replace NaN with 0.
# NOTE(review): after the zero-fill and the +1 pseudocount applied when
# df_log_cpm was computed, this is presumably a no-op -- confirm.
df_log_cpm = df_log_cpm.fillna(0)

In [None]:
df_log_cpm.isna().sum().sum()

In [None]:
np.isinf(df_log_cpm).sum().sum()

In [None]:
# Lower-triangle pairplot between the rows of the log-CPM table.
g = sns.pairplot(df_log_cpm.T, corner=True, plot_kws={'s':0.1})
# Annotate every lower-triangle panel with its Pearson r.
g.map_lower(corrfunc)

# Saves pyplot's current figure at 300 dpi.
plt.savefig('plts_pub/fragment_counts_in_merged_dars_corr_log_cpm_reads.png', facecolor='white', dpi=300)