
### Collect all raw UMI and gene counts and merge them with the Splotch metadata

In [1]:
import os
import sys
import glob
import scanpy as sc
import numpy as np
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib as mpl
import statistics
import statsmodels.api as sm
import matplotlib.patches as mpatches
import scipy.stats
import warnings
import pickle
from itertools import chain
warnings.filterwarnings('ignore')

### Gets seq reads data, annotation and UMI counts

In [2]:
# Every entry in the aligned-counts directory is read as one tab-separated count table.
counts_path = glob.glob(os.path.join('/home/brittalotstedt/host-microbiome/data/st_data/aligned_counts', "*"))

# One inner list per input file, appended in the order the files are read.
genes_all, umis_all, names_all, xy_all = [], [], [], []

for sam in counts_path:
    path = os.path.basename(sam)
    print("Processing...", sam)
    df = pd.read_csv(sam, sep = "\t")

    # Per column: how many entries are non-zero, and the column total.
    nonzero_per_column = df.astype(bool).sum(axis = 0)
    total_per_column = df.sum(axis = 0)
    genes_all.append(nonzero_per_column.tolist())
    umis_all.append(total_per_column.tolist())

    # Tag every column with the name of the file it came from.
    names_all.append([path] * len(df.columns))

    # Column labels look like "<x>_<y>"; round both parts to whole numbers.
    rounded_labels = []
    for label in df.columns:
        parts = label.split("_")
        rounded_labels.append(str(round(float(parts[0]))) + "_" + str(round(float(parts[1]))))
    xy_all.append(rounded_labels)
   

Processing... /home/brittalotstedt/host-microbiome/data/st_data/aligned_counts/10015CN38_D2_stdata_under_tissue_IDs.txt
Processing... /home/brittalotstedt/host-microbiome/data/st_data/aligned_counts/10015CN41_C1_stdata_under_tissue_IDs.txt
Processing... /home/brittalotstedt/host-microbiome/data/st_data/aligned_counts/10015CN46_E2_stdata_under_tissue_IDs.txt
Processing... /home/brittalotstedt/host-microbiome/data/st_data/aligned_counts/10015CN44_D1_stdata_under_tissue_IDs.txt
Processing... /home/brittalotstedt/host-microbiome/data/st_data/aligned_counts/10015CN73_C1_stdata_under_tissue_IDs.txt
Processing... /home/brittalotstedt/host-microbiome/data/st_data/aligned_counts/10015CN72_E2_stdata_under_tissue_IDs.txt
Processing... /home/brittalotstedt/host-microbiome/data/st_data/aligned_counts/10015CN46_C1_stdata_under_tissue_IDs.txt
Processing... /home/brittalotstedt/host-microbiome/data/st_data/aligned_counts/10015CN72_D2_stdata_under_tissue_IDs.txt
Processing... /home/brittalotstedt/host-

In [3]:
# Collapse the per-file lists into single flat lists (file order, then column order).
umis_all_flat = list(chain.from_iterable(umis_all))
genes_all_flat = list(chain.from_iterable(genes_all))
xy_all_flat = list(chain.from_iterable(xy_all))

# Strip the count-file suffix so only the leading part of each file name remains.
names_all_flat = [
    file_name.split("_stdata_under_tissue_IDs.txt")[0]
    for file_name in chain.from_iterable(names_all)
]

## Merge UMIs and genes into a df

In [4]:
# Collect the UMI and gene totals into one table, one row per count-table column.
# Building from a dict (instead of transposing a list of lists) lets pandas infer
# numeric dtypes for the count columns rather than storing everything as object.
dfplot = pd.DataFrame({
    'UMIs': umis_all_flat,
    'Genes': genes_all_flat,
    'Name': names_all_flat,
    "x_y": xy_all_flat,
})

# QC threshold: only rows with strictly more UMIs than this are kept.
MIN_UMIS = 800

# reset_index returns a fresh frame, so nothing is mutated in place on a filtered slice.
dfplot_filtered = dfplot[dfplot['UMIs'] > MIN_UMIS].reset_index(drop = True)

## Merge with numbers of sections detected per array

In [5]:
# Parse the Splotch preparation log to get the number of tissue sections kept per sample.
# header=None: the log is plain text with no header row. Without it, read_csv would
# consume the first log line as a column name and silently drop it from the data.
summary_sections = pd.read_csv('/home/brittalotstedt/host-microbiome/data/st_data/splotch_outputs/Splotch.e29685944', sep = "\t", header = None)
summary_sections.columns = ['metadata']

log_lines = summary_sections.metadata
processing_lines = log_lines[log_lines.str.contains('Processing')]
keeping_lines = log_lines[log_lines.str.contains('Keeping')]

# Sample name = base name of the logged path with the "_stdata_adjusted.tsv" suffix removed.
samples = [os.path.basename(i).split("_stdata_adjusted.tsv")[0] for i in processing_lines]
# The count is read from character 18 of a 'Keeping' line up to " tissue sections".
# NOTE(review): the fixed offset 18 depends on the exact log prefix - confirm against the log.
sections = [int(i[18:].split(" tissue sections")[0]) for i in keeping_lines]

# Samples and counts are paired purely by position; a missing or extra line would
# shift every pairing, so fail loudly instead of producing misaligned metadata.
if len(samples) != len(sections):
    raise ValueError(
        "Splotch log has %d 'Processing' lines but %d 'Keeping' lines; "
        "cannot pair samples with section counts" % (len(samples), len(sections))
    )

secs = pd.DataFrame([samples, sections]).T
secs.columns = ['Name', 'sections']


## Merge with study design metadata (splotch)

In [6]:
# Load the tab-separated study-design metadata table.
meta = pd.read_csv('/home/brittalotstedt/host-microbiome/data/st_data/metadata.txt', sep = "\t")

# Attach the section counts, then the filtered UMI/gene counts. Both are inner
# joins on "Name", so a Name missing from either side is dropped.
meta_1 = meta.merge(secs, on = "Name", how = "inner")
meta_2 = meta_1.merge(dfplot_filtered, on = "Name", how = "inner")

# Discard rows whose 'sections' value is zero and renumber the rows.
meta_2 = meta_2[meta_2["sections"] != 0].reset_index(drop = True)

# Count the rows per Name (non-null 'sections' entries) and attach it as 'spots'.
spots = meta_2.groupby(by = "Name")["sections"].count().to_frame("spots")
meta_3 = meta_2.merge(spots, on = "Name", how = "inner")


## Merge with morphological annotations (MROIs)

In [7]:
# Read every annotation file whose name prefix matches a Name still present in meta_3.
annotations_dir = "/home/brittalotstedt/host-microbiome/data/st_data/annotations"

# Computed once up front: the set of names does not change while looping over files.
known_names = set(meta_3.Name)

anns_all = []
for ann in os.listdir(annotations_dir):

    # Skip the macOS folder-metadata file.
    if ann == ".DS_Store":
        continue

    short_name = ann.split("_annotations.txt")[0]

    # Report and skip annotation files with no matching Name in meta_3.
    if short_name not in known_names:
        print(short_name)
        continue

    df_ann = pd.read_csv(os.path.join(annotations_dir, ann), sep = "\t")
    # Round coordinates to whole numbers and build the same "<x>_<y>" key used in meta_3.
    df_ann['x'] = [str(round(float(i))) for i in df_ann['x']]
    df_ann['y'] = [str(round(float(i))) for i in df_ann['y']]
    df_ann['x_y'] = [str(i)+"_"+str(j) for i,j in zip(df_ann['x'], df_ann['y'])]

    anns_all.append(df_ann)

anns_all_flat = pd.concat(anns_all).reset_index(drop = True)

# "patch" = "<name>_<x>_<y>"; it is the join key between the metadata and the annotations.
meta_3["patch"] = [i+"_"+j for i,j in zip (meta_3["Name"], meta_3["x_y"])]
anns_all_flat["patch"] = [i+"_"+j for i,j in zip (anns_all_flat["image"], anns_all_flat["x_y"])]
meta_4 = pd.merge(meta_3, anns_all_flat, left_on="patch", right_on="patch", how = "inner")

# Both inputs carry an 'x_y' column, so the merge suffixes them; keep the left one.
meta_4 = meta_4.drop(["image", 'x_y_y'], axis = 1).rename(columns={"x_y_x": "x_y"})
meta_4["sections"] = meta_4["sections"].astype("int")

# If a patch matched more than one annotation row, keep only the first match.
meta_4 = meta_4.drop_duplicates(["patch"], keep='first')


## Merge with patches

In [8]:
# For each sub-directory of the patches folder, list the .jpg base names without the extension.
ls = [
    [os.path.basename(jpg).split(".jpg")[0] for jpg in glob.glob(os.path.join(dirs, "*.jpg"))]
    for dirs in glob.glob(os.path.join('/home/brittalotstedt/host-microbiome/data/st_data/patches', "*"))
]

ls_all_patches = list(chain(*ls))


def _rounded_patch_name(patch_name):
    """Rebuild a patch name from its first four "_" fields, rounding fields 3 and 4 to integers."""
    fields = patch_name.split("_")
    x_rounded = str(round(float(fields[2])))
    y_rounded = str(round(float(fields[3])))
    return fields[0]+"_"+fields[1]+"_"+x_rounded+"_"+y_rounded


ls_all = [_rounded_patch_name(i) for i in ls_all_patches]

# Keep only the metadata rows whose patch key also exists as an image patch.
patches_with_image = np.intersect1d(meta_4.patch, ls_all)
meta_4 = meta_4[meta_4["patch"].isin(list(patches_with_image))]

## Keep only spots that have normalized data, so that QC is restricted to those spots

In [13]:
# Load the Splotch sample-information pickle.
info_file = os.path.join('/home/brittalotstedt/host-microbiome/data/st_data/splotch_outputs', 'information.p')

# NOTE(review): unpickling can execute arbitrary code - only load files from a trusted source.
# The context manager closes the file handle, which the bare open() call left dangling.
with open(info_file, 'rb') as info_handle:
    info = pickle.load(info_handle)

metadata = info['metadata']
n_levels = info['n_levels']

# Each entry is indexed as (file path, coordinate string); build "<sample>_<coordinate>"
# keys in the same format as the "patch" column of meta_4.
lambda_patches = pd.DataFrame(
    [os.path.basename(i[0]).split("_stdata_adjusted.tsv")[0]+"_"+i[1] for i in info['filenames_and_coordinates']],
    columns = ["spotch_patches"],
)

# Inner join: keep only the patches present in both meta_4 and the pickle.
meta_5 = pd.merge(meta_4, lambda_patches, left_on="patch", right_on="spotch_patches", how = "inner")

In [14]:
# Sanity check: the merged table should have as many rows as the list of Splotch patches.
len(meta_5) == len(lambda_patches)

True

In [15]:
# Section label written as a bare string expression, so the cell just echoes it as output.
'Clean up metadata to match'

'Clean up metadata to match'

In [16]:
# Tidy the merged table in one pass:
#   - give the generic columns descriptive names,
#   - index the rows by the Splotch patch key (set_index also removes that column),
#   - drop the long "patch" key and reuse the name for the short "x_y" coordinate,
#   - cast the count columns to plain integers.
meta_5 = (
    meta_5
    .rename(columns={"Level 1": "Mouse", "Level 2": "Type", "value": "annotation"})
    .set_index("spotch_patches")
    .drop(['patch'], axis=1)
    .rename(columns={"x_y":"patch"})
    .astype({"UMIs": int, "Genes": int, "spots": int, "sections": int})
)
# Disabled step kept for reference:
#meta_5.Region = [i.split(".")[1] for i in meta_5.Region]

In [17]:
# Full annotation label -> short code. Labels missing from this table map to NaN.
abbreviate_anns_dict = {
    "peyer's patch": "PP",
    "epithelium": "E",
    "epithelium and mucosae": "EMM",
    "epithelium and muscle and submucosa": "ALL",
    "epithelium and mucosae and submucosa": "EMMSUB",
    "crypt apex": "APEX",
    "crypt base and mid": "LOWERMID",
    "crypt base": "BASE",
    "crypt mid": "MID",
    "muscle and submucosa": "MSUB",
    "mucosae and interna": "MMI",
    "externa": "ME",
    "externa and interna": "MEI",
    "interna": "MI",
    "pellet": "PE",
    "mucosa": "MU",
}

# Add the short code next to the full annotation label.
meta_5["short_annotations"] = meta_5["annotation"].map(abbreviate_anns_dict)


Write out the metadata table; this is the final metadata used in the study.

In [18]:
# Write the final table. Note that the file is tab-separated despite its .csv extension.
final_metadata_path = "/home/brittalotstedt/host-microbiome/data/st_data/Metadata_final.csv"
meta_5.to_csv(final_metadata_path, sep = "\t")