In [17]:
# Import required modules
# Python plotting library
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
# Numerical python library (pronounced "num-pie")
import numpy as np
# Dataframes in Python
import pandas as pd
# Statistical plotting library we'll use
import seaborn as sns
# This is necessary to show the plotted figures inside the notebook -- "inline" with the notebook cells
%matplotlib inline
## Add more when seeing new modules in other notebooks!

## I. Top expressed genes 

In [18]:
# Define GetTopGene(): rank the genes of each cell by expression level and
# collect the union of every cell's top expressed genes.
def GetTopGene(mtx, top_num = 200):
    """
    Pick the top expressed genes of every cell and return the union of those picks.

    Usage: gene_list = GetTopGene(mtx, top_num)
    Args:
        mtx: Gene by cell count matrix (pandas DataFrame: one row per gene, one column per cell).
        top_num: Number of top expressed genes taken from each cell. Default = 200
    Returns:
        List of unique gene labels (values of mtx.index). Genes appear in the order
        they are first selected: cells are visited in column order and, within a
        cell, genes are listed from highest to lowest expression. The order is
        therefore the same on every run for the same input.
    """
    top_num = int(top_num)
    # A dict is used as an insertion-ordered set: de-duplicating through set()
    # would return the genes in hash order, which changes between kernel sessions.
    selected = {}
    for _, cell_counts in mtx.items():
        # Only this cell's column has to be sorted to rank its genes; the rest
        # of the matrix is irrelevant for the ranking.
        ranked_genes = cell_counts.sort_values(ascending=False).index
        selected.update(dict.fromkeys(ranked_genes[:top_num]))
    return list(selected)

In [27]:
# Define a function that takes care of the overall process, from reading in the file
# to saving the matrix with only the top expressed genes.
def TopGene(inFile, compression=False, index=0, top_num = 200):
    """
    Read in a filtered matrix, run GetTopGene() on it, and save the matrix
    with only the top expressed genes to a new file.

    Usage: TopGene(inFile)
    Args:
        inFile: Path (string) of the comma-separated file containing a filtered
            cell-by-gene or gene-by-cell matrix. The first column is used as the row index.
        compression: boolean. If True, the file is read as gzip-compressed.
        index: 0 or 1. Use 0 when the row indexes are genes, 1 when they are not
            (the matrix is then transposed right after reading).
        top_num: Number of top expressed genes taken from each cell. Default = 200
    Returns:
        None. The top-gene matrix is written, transposed, as CSV to
        "<inFile without a trailing '.mtx'>_top.mtx", and progress is printed.
    """
    # Convert input file name to output file name.
    # Only a literal ".mtx" suffix is removed: str.strip(".mtx") would treat the
    # argument as a set of characters and could also eat leading/trailing
    # '.', 'm', 't' or 'x' characters of the name itself (e.g. "matrix.mtx" -> "atri").
    suffix = ".mtx"
    base = inFile[:-len(suffix)] if inFile.endswith(suffix) else inFile
    out = base + "_top.mtx"

    # Read in the DGE data sheet
    print("Reading in %s" % inFile)
    read_kwargs = {"sep": ",", "index_col": 0}
    if compression:
        read_kwargs["compression"] = "gzip"
    expression = pd.read_table(inFile, **read_kwargs)

    # GetTopGene() expects genes as rows; flip the matrix when they are columns.
    if index==1:
        expression = expression.T

    # Get top genes with GetTopGene(), honouring the caller's top_num
    topgenes = GetTopGene(expression, top_num=top_num)
    # How many genes are included in this gene list?
    print("%s genes included in topgene list" % (len(topgenes)))

    # Extract matrix with topgenes
    topgene_exp = expression.loc[topgenes]
    print("The shape of the top gene matrix is:", topgene_exp.shape)

    # Save top gene matrix to file (transposed, so genes end up as columns)
    topgene_exp.T.to_csv(out)
    print("Topgene matrix is saved as %s" % out)

In [28]:
# Run TopGene() on the filtered mtx files.
# index=1: the rows of this file are not genes, so TopGene() transposes the matrix after reading.
# top_num=100 is pinned explicitly: the output recorded below was produced with
# the top 100 genes per cell, not with the documented default of 200.
TopGene('Test_filtered.mtx', index=1, top_num=100)

Reading in Test_filtered.mtx
1456 genes included in topgene list
The shape of the top gene matrix is: (1456, 135)
Topgene matrix is saved as Test_filtered_top.mtx


In [29]:
# Same run on the "_lr" filtered matrix. index=1 because its rows are not genes;
# top_num=100 is pinned explicitly because the output recorded below was produced
# with the top 100 genes per cell, not with the documented default of 200.
TopGene('Test_filtered_lr.mtx', index=1, top_num=100)

Reading in Test_filtered_lr.mtx
260 genes included in topgene list
The shape of the top gene matrix is: (260, 135)
Topgene matrix is saved as Test_filtered_lr_top.mtx


## II. Highly variable genes