# Setup

## Load Packages

In [5]:
import os
import scanpy as sc

import numpy as np
import pandas as pd

from numpy.random.mtrand import RandomState
from sklearn.utils import check_random_state, check_array
from sklearn.decomposition import NMF

import gensim
from gensim import corpora, models, similarities

## Read Data

In [6]:
## Read Data
# Location of the 10x matrix directory.  It can be overridden by setting the
# PBMC_10X_DIR environment variable; when unset, the original cluster path is
# used so existing runs behave exactly as before.
# adata = sc.read('/ahg/regevdata/users/kgosik/data/pbmc/pbmc_test_data.h5ad')
PBMC_10X_DIR = os.environ.get('PBMC_10X_DIR', '/ahg/regevdata/users/kgosik/data/pbmc/hg19')
adata = sc.read_10x_mtx(PBMC_10X_DIR)

In [7]:
## Pull the expression matrix and gene names out of a copy of the AnnData object
adata_use = adata.copy()
geneids = adata_use.var_names
# Transposed X; this is what gets handed to Sparse2Corpus below.
# NOTE(review): presumably X is cells x genes, making `mat` genes x cells - confirm.
mat = adata_use.X.transpose()

## Vocabulary: an empty leading document followed by one single-gene document per gene
id_list = geneids.tolist()
out = [[]] + [[gene_name] for gene_name in id_list]
## gensim dictionary built from that vocabulary (used as id2word by the models)
dictionary = corpora.Dictionary(out)
## Wrap the sparse matrix as a streamed gensim corpus
corpus = gensim.matutils.Sparse2Corpus(mat)
# corpus = gensim.matutils.Dense2Corpus(mat)
## corpora.MmCorpus.serialize(project_directory + '/corpus/' + output + '_corpus.mm', corpus)

In [9]:
# Shared settings for the model cells below.
num_topics = 5       # number of components / topics to fit
random_state = 1     # seed handed to the models
update_every = 0     # forwarded as `update_every` to sc_topics.topics
chunksize = 1_000    # forwarded as `chunksize` to sc_topics.topics

## NMF Section

### NMF Code Chunks (sklearn)

In [10]:
# https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.NMF.html
nmf = NMF(
    n_components=num_topics,
    init='random',
    random_state=random_state,
)  # , alpha=decay)
# `mat` is the transposed X, so `mat.T` puts it back in its original orientation.
W = nmf.fit_transform(mat.T)
H = nmf.components_

## topic by cell/documents: rows labelled with the cell barcodes from adata.obs
cell_topics = (
    pd.DataFrame(W)
    .assign(index=adata.obs.index.tolist())
    .set_index('index')
)

## every topic by every gene: components transposed, rows labelled with gene names
topic_scores = (
    pd.DataFrame(H).T
    .assign(index=adata.var_names)
    .set_index('index')
)

In [18]:
# Preview the first three rows of the per-cell topic table (one column per topic).
cell_topics.head(3)

Unnamed: 0_level_0,0,1,2,3,4
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
AAACATACAACCAC-1,0.321293,0.030133,0.281609,0.103328,0.0
AAACATTGAGCTAC-1,0.63207,0.007168,0.640968,0.109391,0.228019
AAACATTGATCAGC-1,0.200473,0.121113,0.786772,0.119246,0.0


In [19]:
# Preview the first three rows of the per-gene topic table (one column per topic).
topic_scores.head(3)

Unnamed: 0_level_0,0,1,2,3,4
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
MIR1302-10,0.0,0.0,0.0,0.0,0.0
FAM138A,0.0,0.0,0.0,0.0,0.0
OR4F5,0.0,0.0,0.0,0.0,0.0


### NMF Using Script Import (sklearn)

In [None]:
# Solution A
import sc_topics

In [13]:
# Solution B - If the script importing the module is not in a package
# Put the parent of the working directory on sys.path so `sc_topics` can be
# imported from there.  The working directory is used rather than
# inspect.getfile(inspect.currentframe()): inside a notebook that call returns
# the kernel's synthetic filename for the cell, which is not guaranteed to sit
# next to the notebook.
# NOTE(review): assumes the kernel was started in the notebook's own directory - confirm.
import os
import sys

current_dir = os.path.abspath(os.getcwd())
parent_dir = os.path.dirname(current_dir)
if parent_dir not in sys.path:  # re-running the cell must not stack duplicate entries
    sys.path.insert(0, parent_dir)
import sc_topics

In [14]:
# Cell metadata before the topic model is run (first three rows).
adata.obs.head(3)

AAACATACAACCAC-1
AAACATTGAGCTAC-1
AAACATTGATCAGC-1


In [15]:
# Same workflow as the hand-written NMF cells, run through the sc_topics module.
# The result replaces `adata`, so later cells see the annotated object.
# NOTE(review): passes / alpha / per_word_topics mirror the LDA arguments; whether
# the "nmf" path of sc_topics.topics uses them is not visible here - confirm.
nmf_options = dict(
    model="nmf",
    num_topics=num_topics,
    random_state=random_state,
    update_every=update_every,
    chunksize=chunksize,
    passes=1,
    alpha='auto',
    per_word_topics=True,
    copy=True,
)
adata = sc_topics.topics(adata, **nmf_options)

In [16]:
# Cell metadata after sc_topics.topics: the preview shows one column per topic.
adata.obs.head(3)

Unnamed: 0,0,1,2,3,4
AAACATACAACCAC-1,0.321293,0.030133,0.281609,0.103328,0.0
AAACATTGAGCTAC-1,0.63207,0.007168,0.640968,0.109391,0.228019
AAACATTGATCAGC-1,0.200473,0.121113,0.786772,0.119246,0.0


In [17]:
# Gene metadata after sc_topics.topics: gene_ids plus one column per topic.
adata.var.head(3)

Unnamed: 0,gene_ids,0,1,2,3,4
MIR1302-10,ENSG00000243485,0.0,0.0,0.0,0.0,0.0
FAM138A,ENSG00000237613,0.0,0.0,0.0,0.0,0.0
OR4F5,ENSG00000186092,0.0,0.0,0.0,0.0,0.0


### NMF Code Chunks (gensim/online)

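When this section was last run, the cell below raised `AttributeError: module 'gensim.models' has no attribute 'nmf'`. That error means the installed gensim did not expose an `nmf` submodule on `gensim.models`: either the version predates the NMF model, or the submodule has to be imported explicitly.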

In [None]:
## https://radimrehurek.com/gensim/models/nmf.html
# Import the submodule explicitly.  Reaching it as `gensim.models.nmf` only works
# when the package has already imported that submodule, which is what the
# "module 'gensim.models' has no attribute 'nmf'" error was complaining about.
# NOTE(review): if this import itself fails, the installed gensim has no NMF model.
from gensim.models.nmf import Nmf

# Use the shared `num_topics` here: the dense conversion below is sized with the
# same value, so a different hard-coded topic count would not fit the matrix.
# `id2word` pins the vocabulary to the gene dictionary so the topic matrix has one
# entry per gene; `random_state` seeds the fit like the other model cells.
nmf = Nmf(
    corpus=corpus,
    id2word=dictionary,
    num_topics=num_topics,
    random_state=random_state,
)

## Cell Topics
cell_scores = nmf.get_document_topics(corpus)
# Rows of the dense matrix are topics, columns are cells; it is transposed below.
cell_scores_mat = gensim.matutils.corpus2dense(cell_scores, num_terms=num_topics)

## topic by cell/documents
cell_topics = (
    pd.DataFrame(cell_scores_mat.T)
    .assign(index=adata.obs.index.tolist())
    .set_index('index')
)

## every topic by every gene
topic_scores = (
    pd.DataFrame(nmf.get_topics()).T
    .assign(index=adata.var_names)
    .set_index('index')
)

### NMF Using Script Import (gensim/online)

## LDA Section

### LDA Code Chunks

In [None]:
## Read Data
# Reload the raw counts so this section starts from an unannotated object.
# The directory can be overridden with the PBMC_10X_DIR environment variable;
# when unset, the original cluster path is used.
# adata = sc.read('/ahg/regevdata/users/kgosik/data/pbmc/pbmc_test_data.h5ad')
PBMC_10X_DIR = os.environ.get('PBMC_10X_DIR', '/ahg/regevdata/users/kgosik/data/pbmc/hg19')
adata = sc.read_10x_mtx(PBMC_10X_DIR)

In [None]:
# Display the expression matrix held in adata.X.
adata.X

In [None]:
## Pull the expression matrix and gene names out of a copy of the AnnData object
adata_use = adata.copy()
geneids = adata_use.var_names
# Transposed X; this is what gets handed to Sparse2Corpus below.
# NOTE(review): presumably X is cells x genes, making `mat` genes x cells - confirm.
mat = adata_use.X.transpose()

## Vocabulary: an empty leading document followed by one single-gene document per gene
id_list = geneids.tolist()
out = [[]] + [[gene_name] for gene_name in id_list]
## gensim dictionary built from that vocabulary (used as id2word by the models)
dictionary = corpora.Dictionary(out)
## Wrap the sparse matrix as a streamed gensim corpus
corpus = gensim.matutils.Sparse2Corpus(mat)
# corpus = gensim.matutils.Dense2Corpus(mat)
## corpora.MmCorpus.serialize(project_directory + '/corpus/' + output + '_corpus.mm', corpus)

In [None]:
# Settings consumed by the LDA cells below.
num_topics = 5       # number of topics to fit
random_state = 1     # seed handed to the model
update_every = 0     # passed as LdaModel's `update_every`
chunksize = 1_000    # passed as LdaModel's `chunksize`

In [None]:
## model == 'lda'
## Latent Dirichlet Allocation ####
# Fit gensim's LDA on the gene/cell corpus using the shared settings above.
lda = models.LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=num_topics,
    random_state=random_state,
    update_every=update_every,
    chunksize=chunksize,
    passes=1,
    alpha='auto',
    per_word_topics=True,
)

## Cell Topics
cell_scores = lda.get_document_topics(corpus)
# Dense matrix with `num_topics` rows; transposed below so rows line up with cells.
cell_scores_mat = gensim.matutils.corpus2dense(cell_scores, num_terms=num_topics)

## topic by cell/documents: rows labelled with the cell barcodes from adata.obs
cell_topics = (
    pd.DataFrame(cell_scores_mat.T)
    .assign(index=adata.obs.index.tolist())
    .set_index('index')
)

## every topic by every gene: topic matrix transposed, rows labelled with gene names
topic_scores = (
    pd.DataFrame(lda.get_topics()).T
    .assign(index=adata.var_names)
    .set_index('index')
)

In [None]:
# Preview the first three rows of the per-cell LDA topic table.
cell_topics.head(3)

In [None]:
# Preview the first three rows of the per-gene LDA topic table.
topic_scores.head(3)

### LDA Using Script Import

In [None]:
# Solution A
import sc_topics

In [None]:
# Solution B - If the script importing the module is not in a package
# Put the parent of the working directory on sys.path so `sc_topics` can be
# imported from there.  The working directory is used rather than
# inspect.getfile(inspect.currentframe()): inside a notebook that call returns
# the kernel's synthetic filename for the cell, which is not guaranteed to sit
# next to the notebook.
# NOTE(review): assumes the kernel was started in the notebook's own directory - confirm.
import os
import sys

current_dir = os.path.abspath(os.getcwd())
parent_dir = os.path.dirname(current_dir)
if parent_dir not in sys.path:  # re-running the cell must not stack duplicate entries
    sys.path.insert(0, parent_dir)
import sc_topics

In [None]:
## Read Data
# Reload the raw counts so sc_topics.topics receives an unannotated object.
# The directory can be overridden with the PBMC_10X_DIR environment variable;
# when unset, the original cluster path is used.
# adata = sc.read('/ahg/regevdata/users/kgosik/data/pbmc/pbmc_test_data.h5ad')
PBMC_10X_DIR = os.environ.get('PBMC_10X_DIR', '/ahg/regevdata/users/kgosik/data/pbmc/hg19')
adata = sc.read_10x_mtx(PBMC_10X_DIR)

In [None]:
# Cell metadata before the LDA workflow is run (first three rows).
adata.obs.head(3)

In [None]:
# Same LDA workflow as the hand-written cells, run through the sc_topics module.
# The result replaces `adata`, so the preview cells that follow show the output.
lda_options = dict(
    model="lda",
    num_topics=num_topics,
    random_state=random_state,
    update_every=update_every,
    chunksize=chunksize,
    passes=1,
    alpha='auto',
    per_word_topics=True,
    copy=True,
)
adata = sc_topics.topics(adata, **lda_options)

In [None]:
# Cell metadata after sc_topics.topics(model="lda") (first three rows).
# NOTE(review): presumably gains one column per topic, as in the NMF run - confirm.
adata.obs.head(3)

In [None]:
# Gene metadata after sc_topics.topics(model="lda") (first three rows).
# NOTE(review): presumably gains one column per topic, as in the NMF run - confirm.
adata.var.head(3)