<a href="https://colab.research.google.com/github/agalvezm/ACE2_scRNAseq/blob/master/tcc/GSE130117_GSM3732362_from_gc.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# GSE130117_GSM3732362

This notebook uses the filtered gene count matrix produced by the following Google Colab notebook: https://github.com/agalvezm/ACE2_scRNAseq/blob/master/notebooks_countmatrices/GSE130117_GSM3732362.ipynb. Please run that notebook first and upload its results file to `/content` (this notebook reads it from `/content/result<sample id>`, e.g. `/content/resultGSM3732362`).

In [1]:
# Install SRA-toolkit 
!wget "http://ftp-trace.ncbi.nlm.nih.gov/sra/sdk/current/sratoolkit.current-centos_linux64.tar.gz"

! tar -xzf sratoolkit.current-centos_linux64.tar.gz

# Add to path
import os
os.environ['PATH'] += ":/content/sratoolkit.2.11.0-centos_linux64/bin"

# Configure
!vdb-config --interactive

# Import packages

import numpy as np


--2021-04-05 21:01:44--  http://ftp-trace.ncbi.nlm.nih.gov/sra/sdk/current/sratoolkit.current-centos_linux64.tar.gz
Resolving ftp-trace.ncbi.nlm.nih.gov (ftp-trace.ncbi.nlm.nih.gov)... 165.112.9.228, 165.112.9.230, 2607:f220:41f:250::230, ...
Connecting to ftp-trace.ncbi.nlm.nih.gov (ftp-trace.ncbi.nlm.nih.gov)|165.112.9.228|:80... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://ftp-trace.ncbi.nlm.nih.gov/sra/sdk/current/sratoolkit.current-centos_linux64.tar.gz [following]
--2021-04-05 21:01:44--  https://ftp-trace.ncbi.nlm.nih.gov/sra/sdk/current/sratoolkit.current-centos_linux64.tar.gz
Connecting to ftp-trace.ncbi.nlm.nih.gov (ftp-trace.ncbi.nlm.nih.gov)|165.112.9.228|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 99139357 (95M) [application/x-gzip]
Saving to: ‘sratoolkit.current-centos_linux64.tar.gz’


2021-04-05 21:01:46 (60.2 MB/s) - ‘sratoolkit.current-centos_linux64.tar.gz’ saved [99139357/99139357]

[2J[?2

In [2]:
# SRA run accessions to download and convert. Paste them from Excel as one
# whitespace-separated string; it is split into a list below.
sra_accessions_pasted = "SRR8949480"
SRAs = sra_accessions_pasted.split()

# Suffix of the read file to discard after conversion. This is usually the
# index read, which gets downloaded because technical reads are included.
# It varies between runs, so check the run on the SRA website.
SRA_to_delete = "3"

# Read-file suffixes that remain once the discarded one is removed.
all_read_suffixes = ["1", "2", "3"]
SRAs_to_keep = np.setdiff1d(all_read_suffixes, [SRA_to_delete])


In [3]:
for sra in SRAs:
  """
  Downloads SRAs and transforms to compressed fastq file. Remove index reads and original SRA
  """
  # Download SRA
  !prefetch $sra

  # Convert to fastq. Biological reads are usually considered technicals in 10x so do not skip them.
  fastq_dump_cmd = "fastq-dump -F --gzip --readids --split-files " + sra + "/" + sra +".sra"
  !$fastq_dump_cmd

  # Remove files
  rm_cmd_1 = "rm " + sra + "/" + sra + ".sra"
  rm_cmd_2 = "rm " + sra + "_" + SRA_to_delete + ".fastq.gz" 
  !$rm_cmd_1
  !$rm_cmd_2


2021-04-05T21:02:53 prefetch.2.11.0: 1) Downloading 'SRR8949480'...
2021-04-05T21:02:53 prefetch.2.11.0:  Downloading via HTTPS...
2021-04-05T21:03:33 prefetch.2.11.0:  HTTPS download succeed
2021-04-05T21:03:41 prefetch.2.11.0:  'SRR8949480' is valid
2021-04-05T21:03:41 prefetch.2.11.0: 1) 'SRR8949480' was downloaded successfully
2021-04-05T21:03:41 prefetch.2.11.0: 'SRR8949480' has 0 unresolved dependencies
Read 32645202 spots for SRR8949480/SRR8949480.sra
Written 32645202 spots for SRR8949480/SRR8949480.sra
rm: cannot remove 'SRR8949480_3.fastq.gz': No such file or directory


In [4]:
# Build the list of fastq files to feed kb: for each run, one file per kept
# read suffix, in run order and then suffix order.
fastqs = [
    sra + "_" + read + ".fastq.gz"
    for sra in SRAs
    for read in SRAs_to_keep
]

## Metadata

In [5]:
# Values describing this dataset for the analysis

# Accession id for the data
id = "GSE130117"

# Sample accession(s); sample_id is the same list under a second name
samp_id = ["GSM3732362"]
sample_id = samp_id

# Number of samples processed by this notebook
no_samples = 1

# Number of fastq files belonging to each sample.
# NOTE(review): only read by the multi-sample branch of the kb count cell; the
# fastq list displayed for this run has two files -- confirm the value.
fastqs_per_sample = [8]

# Per-sample annotations: one entry per sample, all samples share the value
database_id = no_samples * [id]
tissue = no_samples * ["blood"]
cell_type = no_samples * ["PBMCs"]
condition = no_samples * ["None"]
species = no_samples * ["human"]
technology = no_samples * ["10xv2"]
paper = no_samples * ["Muus et al 2020"]
figure = no_samples * ["Fig 1 a,b  ED Fig 1 a,b,c,d  ED Fig 2 a,b,c,d,e"]

# String variables for the kb commands (taken from the first sample)
species_kb = species[0]
technology_kb = technology[0]

# Imports and installs

In [6]:
# install and import necessary software

# Install kb (kb-python) and scanpy quietly (-q) with pip.
# NOTE(review): no versions are pinned, so the installed releases depend on
# when the notebook is run -- consider pinning for reproducibility.
!pip -q install kb-python 
!pip -q install scanpy

import re
import os

# Setup

import anndata
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
import matplotlib.patches as mpatches
import scanpy as sc
from scipy import stats

from collections import OrderedDict
from sklearn.decomposition import TruncatedSVD
from sklearn.manifold import TSNE
from sklearn.preprocessing import scale

from sklearn.cluster import KMeans
from sklearn.preprocessing import normalize
from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import NeighborhoodComponentsAnalysis
from matplotlib import cm
from matplotlib.lines import Line2D

def nd(arr):
    """Convert ``arr`` to a NumPy array and flatten it to one dimension."""
    flat = np.asarray(arr).ravel()
    return flat
def yex(ax):
    """Draw a y = x reference line on ``ax`` and give both axes the same range.

    The shared range runs from the smallest to the largest of the current
    x and y limits. The aspect ratio is set to 'equal'.

    Returns the same ``ax`` object.
    """
    current_x = ax.get_xlim()
    current_y = ax.get_ylim()
    lowest = np.min([current_x, current_y])
    highest = np.max([current_x, current_y])
    lims = [lowest, highest]

    # Diagonal from (lowest, lowest) to (highest, highest), drawn behind the data
    ax.plot(lims, lims, 'k-', alpha=0.75, zorder=0)
    ax.set_aspect('equal')

    # Apply the common range to both axes
    ax.set_xlim(lims)
    ax.set_ylim(lims)
    return ax

def trim_axs(axs, N):
    """Keep the first ``N`` axes of ``axs`` and remove the rest.

    ``axs`` is flattened through its ``.flat`` attribute; ``.remove()`` is
    called on every element from position ``N`` onwards, and the first ``N``
    elements are returned.
    """
    flat_axes = axs.flat
    surplus = flat_axes[N:]
    for extra_ax in surplus:
        extra_ax.remove()
    return flat_axes[:N]

import warnings
warnings.filterwarnings('ignore')

fsize=20

plt.rcParams.update({'font.size': fsize})
%config InlineBackend.figure_format = 'retina'

[K     |████████████████████████████████| 59.1MB 104kB/s 
[K     |████████████████████████████████| 122kB 44.6MB/s 
[K     |████████████████████████████████| 10.3MB 37.2MB/s 
[K     |████████████████████████████████| 51kB 5.6MB/s 
[K     |████████████████████████████████| 13.2MB 177kB/s 
[K     |████████████████████████████████| 81kB 8.5MB/s 
[K     |████████████████████████████████| 112kB 46.4MB/s 
[K     |████████████████████████████████| 71kB 7.9MB/s 
[K     |████████████████████████████████| 1.2MB 45.5MB/s 
[K     |████████████████████████████████| 51kB 5.7MB/s 
[?25h  Building wheel for loompy (setup.py) ... [?25l[?25hdone
  Building wheel for sinfo (setup.py) ... [?25l[?25hdone
  Building wheel for umap-learn (setup.py) ... [?25l[?25hdone
  Building wheel for numpy-groupies (setup.py) ... [?25l[?25hdone
  Building wheel for pynndescent (setup.py) ... [?25l[?25hdone


# Downloads: index

In [7]:
# Download the corresponding Kallisto index to fastq folder
!kb ref -d $species_kb -i index.idx -g t2g.txt -f1 transcriptome.fasta

[2021-04-05 22:10:01,954]    INFO Downloading files for human from https://caltech.box.com/shared/static/v1nm7lpnqz5syh8dyzdk2zs8bglncfib.gz to tmp/v1nm7lpnqz5syh8dyzdk2zs8bglncfib.gz
100% 2.23G/2.23G [01:38<00:00, 24.3MB/s]
[2021-04-05 22:11:41,530]    INFO Extracting files from tmp/v1nm7lpnqz5syh8dyzdk2zs8bglncfib.gz


# Process fastq files (modify kb command according to fastqs list)


In [8]:
# Display the fastq file names that will be passed to kb count
fastqs

['SRR8949480_1.fastq.gz', 'SRR8949480_2.fastq.gz']

In [None]:
if no_samples == 1:

  # Write the kb count command as a string with all fastqs of the list as an input
  cmd = "kb count --h5ad -i index.idx -g t2g.txt -x " + technology_kb + " -o tccoutput" + sample_id[0] + " "\
  + "--filter bustools --tcc -t 2 --overwrite " + "'" +  "' '".join(fastqs) + "'"
  
  # Execute it
  !$cmd

# If more than one sample, iterate through fastqs accordingly
else:

  # Initializa counter for fastq files
  j = 0

  # Loop over samples for analysis
  for i in range(no_samples):

    fastqs_to_analyze = fastqs[j:j + fastqs_per_sample[i]]
    # Write the kb count command as a string
    cmd = "kb count --h5ad -i ../index.idx -g ../t2g.txt -x " + technology_kb + " -o tccoutput" + sample_id[i] + " \
    --filter bustools --tcc -t 2 --overwrite " + "'" +  "' '".join(fastqs_to_analyze) + "'"

    # Execute it
    !$cmd

    # Update j to move to the next set of fastq
    j = j + fastqs_per_sample[i]






# Filtered gene count matrices, keyed by "data_" + sample id
results = {}
# TCC matrices, keyed by sample id (filled in by the packaging loop)
tcc_results = {}
for sample in sample_id:
  # The filtered gene count matrix must have been uploaded beforehand;
  # fail with an explicit message when it is missing.
  result_path = "/content/result" + sample
  if not os.path.exists(result_path):
    raise FileNotFoundError(
        "Filtered gene count matrix not found: " + result_path
        + ". Run the gene count notebook and upload its results file to /content."
    )

  # Read the filtered gene count matrix (read_h5ad replaces the legacy
  # `anndata.read` alias)
  results["data_" + sample] = anndata.read_h5ad(result_path)


for sample in sample_id:
  output = "bus_raw_" + sample + "_from_gc"
  !mkdir $output
  folder_tcc = "tccoutput" + sample

  # read tcc matrix
  tcc_results[sample] = anndata.read(folder_tcc + "/counts_unfiltered/adata.h5ad")
  # apply gene count matrix's filter
  tcc_results[sample] = tcc_results[sample][results["data_" + sample].obs.index.values]
  # transfer obs data
  tcc_results[sample].obs = results["data_" + sample].obs
  # transfer metadata
  tcc_results[sample].uns = results["data_" + sample].uns
  # write tcc matrix
  tcc_results[sample].write("/content/tcc_" + sample)
  #gzip tcc matrix
  cmd = "gzip /content/tcc_" + sample
  !$cmd
  #gzip unfiltered bus file
  cmd = "gzip " + folder_tcc + "/output.unfiltered.bus"
  !$cmd
  # move and re-name bus file with sample id
  cmd = "mv " + folder_tcc + "/output.unfiltered.bus.gz " + output + "/" + sample +".unfiltered.bus.gz"
  !$cmd
  # gzip all unfiltered counts
  cmd = "gzip " + folder_tcc + "/counts_unfiltered/*"
  !$cmd

  # move unfiltered counts for both
  cmd = "mv " + folder_tcc + "/counts_unfiltered " + output + "/tcc_unfiltered"
  !$cmd

  cmd = "mv " + folder_tcc + "/*.json " + output 
  !$cmd
  # zip all files
  cmd = "zip -r "+ output + ".zip " + output
  !$cmd


import time

# Block this cell for 60,000,000 seconds (roughly 694 days), i.e. until it is
# interrupted by hand.
# NOTE(review): presumably a keep-alive so the runtime stays up while the
# result files are downloaded -- confirm; it also means "Run all" never ends.
KEEP_ALIVE_SECONDS = 60000000
time.sleep(KEEP_ALIVE_SECONDS)

[2021-04-05 22:40:56,435]    INFO Using index index.idx to generate BUS file to tccoutputGSM3732362 from
[2021-04-05 22:40:56,436]    INFO         SRR8949480_1.fastq.gz
[2021-04-05 22:40:56,436]    INFO         SRR8949480_2.fastq.gz
[2021-04-05 22:54:10,668]    INFO Sorting BUS file tccoutputGSM3732362/output.bus to tccoutputGSM3732362/tmp/output.s.bus
[2021-04-05 22:54:20,561]    INFO Whitelist not provided
[2021-04-05 22:54:20,561]    INFO Copying pre-packaged 10XV2 whitelist to tccoutputGSM3732362
[2021-04-05 22:54:20,685]    INFO Inspecting BUS file tccoutputGSM3732362/tmp/output.s.bus
[2021-04-05 22:54:25,937]    INFO Correcting BUS records in tccoutputGSM3732362/tmp/output.s.bus to tccoutputGSM3732362/tmp/output.s.c.bus with whitelist tccoutputGSM3732362/10xv2_whitelist.txt
[2021-04-05 22:54:29,557]    INFO Sorting BUS file tccoutputGSM3732362/tmp/output.s.c.bus to tccoutputGSM3732362/output.unfiltered.bus
[2021-04-05 22:54:36,094]    INFO Generating count matrix tccoutputGSM3732

Variable names are not unique. To make them unique, call `.var_names_make_unique`.


  adding: bus_raw_GSM3732362_from_gc/ (stored 0%)
  adding: bus_raw_GSM3732362_from_gc/kb_info.json (deflated 75%)
  adding: bus_raw_GSM3732362_from_gc/run_info.json (deflated 38%)
  adding: bus_raw_GSM3732362_from_gc/inspect.json (deflated 57%)
  adding: bus_raw_GSM3732362_from_gc/GSM3732362.unfiltered.bus.gz (deflated 3%)
  adding: bus_raw_GSM3732362_from_gc/tcc_unfiltered/ (stored 0%)
  adding: bus_raw_GSM3732362_from_gc/tcc_unfiltered/adata.h5ad.gz (deflated 10%)
  adding: bus_raw_GSM3732362_from_gc/tcc_unfiltered/cells_x_tcc.ec.txt.gz (deflated 4%)
  adding: bus_raw_GSM3732362_from_gc/tcc_unfiltered/cells_x_tcc.mtx.gz (deflated 0%)
  adding: bus_raw_GSM3732362_from_gc/tcc_unfiltered/cells_x_tcc.barcodes.txt.gz (deflated 1%)
