# Setup

In [1]:
import os
import scanpy as sc

import numpy as np
import pandas as pd

from numpy.random.mtrand import RandomState
from sklearn.utils import check_random_state, check_array
from sklearn.decomposition import NMF

import gensim
from gensim import corpora, models, similarities

## NMF Section

### NMF Code Chunks

### NMF Using Script Import

## LDA Section

### LDA Code Chunks

In [2]:
## Load the 10x matrix directory into an AnnData object.
## Alternative input previously used here:
##   sc.read('/ahg/regevdata/users/kgosik/data/pbmc/pbmc_test_data.h5ad')
pbmc_10x_dir = '/ahg/regevdata/users/kgosik/data/pbmc/hg19'
adata = sc.read_10x_mtx(pbmc_10x_dir)

In [3]:
# Inspect the loaded matrix; the stored output below reports a 2700 x 32738
# float32 sparse matrix in Compressed Sparse Row format.
adata.X

<2700x32738 sparse matrix of type '<class 'numpy.float32'>'
	with 2286884 stored elements in Compressed Sparse Row format>

In [4]:
## Work on a copy of adata: take its matrix (transposed for the corpus
## conversion below) and the gene names.
adata_use = adata.copy()
mat = adata_use.X.transpose()
geneids = adata_use.var_names

## Vocabulary for the model: an initial empty document followed by one
## single-token document per gene, fed to a gensim Dictionary.
id_list = geneids.tolist()
out = [[]] + [[gene] for gene in id_list]
dictionary = corpora.Dictionary(out)

## Wrap the transposed matrix as a gensim corpus.
## presumably each column of `mat` (i.e. each cell) becomes one document --
## verify against the gensim Sparse2Corpus defaults.
corpus = gensim.matutils.Sparse2Corpus(mat)
## Dense-input counterpart: gensim.matutils.Dense2Corpus(mat)
## Optional persistence (project_directory / output are not defined in this notebook):
##   corpora.MmCorpus.serialize(project_directory + '/corpus/' + output + '_corpus.mm', corpus)

In [5]:
# Settings shared by the inline LdaModel fit and the sc_topics.topics call.

# Model size and reproducibility.
num_topics = 5
random_state = 1

# Training schedule, forwarded as-is to the model.
# NOTE(review): per the gensim docs update_every=0 selects batch learning -- confirm.
chunksize = 1_000
update_every = 0

In [6]:
## Fit Latent Dirichlet Allocation (model == 'lda') on the corpus built above.
lda = models.LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=num_topics,
    random_state=random_state,
    update_every=update_every,
    chunksize=chunksize,
    passes=1,
    alpha='auto',
    per_word_topics=True,
)

## Per-document (cell) topic weights, converted to a dense array with one row
## per topic; it is transposed below so that rows are cells.
## NOTE(review): get_document_topics is called with its defaults; per the
## gensim docs, weights below the model's minimum_probability are omitted and
## would show up as 0 in the dense array -- confirm this is intended.
cell_scores = lda.get_document_topics(corpus)
cell_scores_mat = gensim.matutils.corpus2dense(cell_scores, num_terms=num_topics)

## Cell-by-topic table, indexed by the cell names from adata.obs.
cell_topics = (
    pd.DataFrame(cell_scores_mat.T)
    .assign(index=adata.obs.index.tolist())
    .set_index('index')
)

## Gene-by-topic table (every topic for every gene), indexed by gene name.
topic_scores = (
    pd.DataFrame(lda.get_topics())
    .T
    .assign(index=adata.var_names)
    .set_index('index')
)

In [7]:
# Preview the first three cells of the cell-by-topic table
# (rows: cell barcodes, columns: topic ids 0-4).
cell_topics.head(3)

Unnamed: 0_level_0,0,1,2,3,4
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
AAACATACAACCAC-1,0.17877,0.322717,0.161274,0.077828,0.259411
AAACATTGAGCTAC-1,0.138892,0.306201,0.103224,0.101763,0.349919
AAACATTGATCAGC-1,0.381413,0.233595,0.092662,0.151612,0.140718


In [8]:
# Preview the first three genes of the gene-by-topic table
# (rows: gene names, columns: topic ids 0-4).
topic_scores.head(3)

Unnamed: 0_level_0,0,1,2,3,4
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
MIR1302-10,2.390618e-07,1.415336e-07,1.362322e-07,2.817572e-07,1.024772e-07
FAM138A,2.390618e-07,1.415336e-07,1.362322e-07,2.817572e-07,1.024772e-07
OR4F5,2.390618e-07,1.415336e-07,1.362322e-07,2.817572e-07,1.024772e-07


### LDA Using Script Import

In [9]:
# Make the directory above this notebook importable so that `sc_topics`
# (a plain script, not an installed package) can be imported.
#
# The location is taken from the kernel's working directory instead of
# inspect.getfile(inspect.currentframe()): inside a notebook the frame's
# "file" is a synthetic per-cell name (newer ipykernel releases place it under
# a temp directory), so paths derived from it can point sys.path at the wrong
# place and make the import fail.
# Assumes the kernel was started in this notebook's directory and that
# sc_topics sits one level above it -- TODO confirm for other launch setups.
import os
import sys
import inspect  # no longer used in this cell; kept in case other cells rely on it

current_dir = os.getcwd()
parent_dir = os.path.dirname(current_dir)
# Keep parent_dir at the front of sys.path, but do not stack a duplicate entry
# every time this cell is re-run.
if sys.path[:1] != [parent_dir]:
    sys.path.insert(0, parent_dir)
import sc_topics

In [10]:
## Read the same 10x matrix directory again so the script-based run below
## starts from a freshly loaded AnnData object.
## Alternative input previously used here:
##   sc.read('/ahg/regevdata/users/kgosik/data/pbmc/pbmc_test_data.h5ad')
pbmc_10x_dir = '/ahg/regevdata/users/kgosik/data/pbmc/hg19'
adata = sc.read_10x_mtx(pbmc_10x_dir)

In [11]:
# Check the per-cell table before running the script: the stored output shows
# only the cell barcodes, with no annotation columns yet.
adata.obs.head(3)

AAACATACAACCAC-1
AAACATTGAGCTAC-1
AAACATTGATCAGC-1


In [12]:
## Run the same LDA configuration through the imported sc_topics script.
## The result is assigned back to `adata`, so re-running this cell feeds the
## previous result in as input.
lda_settings = dict(
    num_topics=num_topics,
    random_state=random_state,
    update_every=update_every,
    chunksize=chunksize,
    passes=1,
    alpha='auto',
    per_word_topics=True,
)
adata = sc_topics.topics(adata, model="lda", **lda_settings, copy=True)

Running LDA with  5 topics


In [13]:
# After the script run, obs carries one column per topic; in the stored output
# the first three rows match cell_topics.head(3) from the inline code above.
adata.obs.head(5)

Unnamed: 0,0,1,2,3,4
AAACATACAACCAC-1,0.17877,0.322717,0.161274,0.077828,0.259411
AAACATTGAGCTAC-1,0.138892,0.306201,0.103224,0.101763,0.349919
AAACATTGATCAGC-1,0.381413,0.233595,0.092662,0.151612,0.140718
AAACCGTGCTTCCG-1,0.037346,0.089396,0.55485,0.071451,0.246956
AAACCGTGTATGCG-1,0.429878,0.172306,0.041497,0.188836,0.167484


In [14]:
# After the script run, var holds the topic columns next to gene_ids; in the
# stored output the first three rows match topic_scores.head(3) from the
# inline code above.
adata.var.head(5)

Unnamed: 0,gene_ids,0,1,2,3,4
MIR1302-10,ENSG00000243485,2.390618e-07,1.415336e-07,1.362322e-07,2.817572e-07,1.024772e-07
FAM138A,ENSG00000237613,2.390618e-07,1.415336e-07,1.362322e-07,2.817572e-07,1.024772e-07
OR4F5,ENSG00000186092,2.390618e-07,1.415336e-07,1.362322e-07,2.817572e-07,1.024772e-07
RP11-34P13.7,ENSG00000238009,2.390618e-07,1.415336e-07,1.362322e-07,2.817572e-07,1.024772e-07
RP11-34P13.8,ENSG00000239945,2.390618e-07,1.415336e-07,1.362322e-07,2.817572e-07,1.024772e-07
