In [1]:
!pip install -q scanpy==1.4.5

In [2]:
import scanpy as sc
import pandas as pd
import numpy as np
from pathlib import Path
from google.colab import drive

In [3]:
# Mount Google Drive into the Colab filesystem; the DATA directory used below lives under this mount point.
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
# Directory that holds the input expression matrix and receives the exported .h5ad file.
# Change this depending on where the data is uploaded.
DATA = Path("/content/drive/My Drive/ecbm_4060")

In [5]:
# Load the normalised, centered NSCLC T-cell expression matrix (tab-separated,
# one gene per row, one cell per column).
# The first two columns (gene ID, gene name) are read as the row index so that
# the body of the frame holds only numeric expression values. Reading them as
# ordinary columns makes the frame mixed-dtype, and transposing a mixed-dtype
# frame upcasts every value to `object`, which is slow and memory hungry.
nsclc_df = pd.read_csv(
    DATA / "GSE99254_NSCLC.TCell.S11769.norm.centered.txt",
    sep="\t",
    index_col=[0, 1],
)
# transpose such that the data direction conforms with scanpy defaults
# (one row per cell, one column per gene)
nsclc_df = nsclc_df.T
# index level 0 is gene ID, level 1 is gene name. using gene name since the
# melanoma dataset also uses name
nsclc_df.columns = nsclc_df.columns.get_level_values(1)
# remove duplicate genes (first occurrence is kept) and NA genes
is_first_occurrence = ~nsclc_df.columns.duplicated()
has_gene_name = ~nsclc_df.columns.isna()
nsclc_df = nsclc_df.loc[:, is_first_occurrence & has_gene_name]
nsclc_df

geneSymbol,A1BG,ADA,AKT3,ZBTB11-AS1,MED6,SNORD116-26,DDTL,NAALADL1,CDKN2B-AS1,ACOT8,...,RCE1,HNRNPDL,DMTF1,PPP4R1,CDH1,SLC12A6,PTBP3,DGCR2,CASP8AP2,SCO2
NTH10-0616A,-0.848193,6.376295,4.45608,-0.644741,-1.129267,-0.131744,-1.048671,-0.458395,-0.089446,-0.002032,...,-0.515126,-6.612899,-4.003849,-0.574295,-0.037265,3.092914,4.256724,-1.684558,-1.888656,-0.809862
NTH11-0616A,-0.848193,7.420941,-1.091124,5.601858,-0.032977,-0.131744,-1.048671,-0.458395,-0.089446,-0.852278,...,-0.515126,-0.249157,-2.724533,-0.574295,-0.037265,-2.595739,1.808263,7.048808,-1.888656,-0.809862
NTH15-0616A,-0.848193,-1.115766,-1.091124,-0.644741,-1.307854,-0.131744,-1.048671,-0.458395,-0.089446,-0.852278,...,-0.515126,1.913013,4.086526,-0.574295,-0.037265,1.996612,-0.89738,-1.684558,-1.888656,6.783347
NTH17-0616A,-0.848193,-1.115766,-1.091124,-0.644741,5.743441,-0.131744,-1.048671,0.17834,-0.089446,-0.852278,...,-0.515126,1.539415,5.522117,-0.574295,-0.037265,-2.595739,0.666022,-1.684558,-1.888656,-0.809862
NTH2-0616A,-0.848193,-1.115766,-1.091124,-0.644741,-1.979512,-0.131744,-1.048671,-0.458395,-0.089446,1.729563,...,-0.515126,2.231985,-3.227802,-0.574295,-0.037265,5.384337,-1.595747,-1.684558,-1.888656,-0.809862
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTY63-20171219,-0.468772,-1.183681,-0.357978,-0.238289,-0.788264,-0.065362,-1.017933,-0.136065,1.044818,-0.454805,...,-0.690047,-3.541746,-1.641898,-0.278881,-0.019025,-1.329944,-1.404401,-1.040901,-0.56329,-0.634206
TTY65-20171219,-0.468772,-1.183681,-0.357978,-0.238289,-0.788264,-0.065362,-1.017933,-0.136065,-0.06548,8.923864,...,-0.690047,-0.468231,-1.641898,-0.278881,-0.019025,-1.329944,-1.404401,-1.040901,-0.56329,6.672005
TTY7-20171219,-0.468772,-1.183681,-0.357978,-0.238289,-0.788264,-0.065362,-1.017933,-0.136065,-0.06548,-0.454805,...,-0.690047,5.98938,4.369252,-0.278881,-0.019025,-0.041229,-1.404401,-1.040901,-0.56329,-0.634206
TTY8-20171219,-0.468772,-1.183681,-0.357978,-0.238289,-0.788264,1.176531,-1.017933,-0.136065,-0.06548,-0.454805,...,0.551846,-2.299853,-1.641898,-0.278881,-0.019025,-1.329944,-1.404401,-1.040901,-0.56329,-0.634206


In [6]:
def _undo_log2(log2_value):
    """Return 2 raised to ``log2_value``, i.e. the value on a linear scale."""
    return 2 ** log2_value


# The scran-normalized data is in log2 space, so every entry is exponentiated.
# An element-wise applymap is not the usual pandas idiom, but it reportedly
# needed less RAM here than the alternatives that were tried.
# NOTE: this rebinds nsclc_df, so running the cell a second time exponentiates
# the values again; re-run the loading cell first if this one must be repeated.
nsclc_df = nsclc_df.applymap(_undo_log2)
nsclc_df

geneSymbol,A1BG,ADA,AKT3,ZBTB11-AS1,MED6,SNORD116-26,DDTL,NAALADL1,CDKN2B-AS1,ACOT8,...,RCE1,HNRNPDL,DMTF1,PPP4R1,CDH1,SLC12A6,PTBP3,DGCR2,CASP8AP2,SCO2
NTH10-0616A,0.555480,83.072291,21.948946,0.639608,0.457148,0.912727,0.483413,0.727796,0.939883,0.998592,...,0.699732,0.010217,0.062333,0.671614,0.974501,8.532175,19.116200,0.311098,0.270059,0.570436
NTH11-0616A,0.555480,171.366431,0.469396,48.565422,0.977401,0.912727,0.483413,0.727796,0.939883,0.553910,...,0.699732,0.841388,0.151298,0.671614,0.974501,0.165426,3.502203,132.404443,0.270059,0.570436
NTH15-0616A,0.555480,0.461446,0.469396,0.639608,0.403921,0.912727,0.483413,0.727796,0.939883,0.553910,...,0.699732,3.765947,16.988964,0.671614,0.974501,3.990617,0.536861,0.311098,0.270059,110.151604
NTH17-0616A,0.555480,0.461446,0.469396,0.639608,53.573252,0.912727,0.483413,1.131581,0.939883,0.553910,...,0.699732,2.906766,45.953936,0.671614,0.974501,0.165426,1.586692,0.311098,0.270059,0.570436
NTH2-0616A,0.555480,0.461446,0.469396,0.639608,0.253576,0.912727,0.483413,0.727796,0.939883,3.316273,...,0.699732,4.697798,0.106742,0.671614,0.974501,41.768314,0.330851,0.311098,0.270059,0.570436
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTY63-20171219,0.722579,0.440227,0.780257,0.847750,0.579040,0.955706,0.493823,0.909998,2.063106,0.729609,...,0.619834,0.085867,0.320435,0.824230,0.986900,0.397784,0.377775,0.486024,0.676757,0.644296
TTY65-20171219,0.722579,0.440227,0.780257,0.847750,0.579040,0.955706,0.493823,0.909998,0.955627,485.680492,...,0.619834,0.722850,0.320435,0.824230,0.986900,0.397784,0.377775,0.486024,0.676757,101.970266
TTY7-20171219,0.722579,0.440227,0.780257,0.847750,0.579040,0.955706,0.493823,0.909998,0.955627,0.729609,...,0.619834,63.530593,20.666926,0.824230,0.986900,0.971827,0.377775,0.486024,0.676757,0.644296
TTY8-20171219,0.722579,0.440227,0.780257,0.847750,0.579040,2.260327,0.493823,0.909998,0.955627,0.729609,...,1.465961,0.203084,0.320435,0.824230,0.986900,0.397784,0.377775,0.486024,0.676757,0.644296


In [7]:
# Wrap the expression table in an AnnData object: the row labels (cells) are
# passed as the observation annotations and the column labels (genes) as the
# variable annotations. Duplicate and NA gene names were already dropped when
# the table was loaded.
cell_annotations = nsclc_df.index.to_frame(name="cell")
gene_annotations = nsclc_df.columns.to_frame(name="gene")
nsclc_adata = sc.AnnData(X=nsclc_df, obs=cell_annotations, var=gene_annotations)

  nsclc_adata = sc.AnnData(nsclc_df,


In [8]:
# The var_names index comes out with object dtype by default; as a workaround
# it is cast to a categorical index, which was enough to move forward.
# NOTE(review): the recorded output of this cell shows AnnData warning that it
# expects .var.index to contain strings and inferred "categorical" instead.
# Object dtype is presumably just how pandas stores the gene-name strings --
# TODO confirm whether this cast is still needed, or whether astype(str) would do.
nsclc_adata.var_names = nsclc_adata.var_names.astype('category', copy=False)

AnnData expects .var.index to contain strings, but got values like:
    ['A1BG', 'ADA', 'AKT3', 'ZBTB11-AS1', 'MED6']

    Inferred to be: categorical

  names = self._prep_dim_index(names, "var")


In [9]:
# Save the AnnData object as an .h5ad file in the same directory as the input data.
h5ad_path = DATA / "GSE99254.h5ad"
nsclc_adata.write(filename=h5ad_path)