## This notebook walks through a common pipeline for calculating a developmental index from an expression matrix containing normalized expression data (e.g. RPKM, FPKM, or TPM). The input dataframe should be formatted so that every column corresponds to an individual sample/cell and every row corresponds to a gene.
---

Step (1) - Import the necessary packages and data. Clean data as necessary.

---

In [1]:
import pandas as pd
import numpy as np

import developmental_index as dvp

# Location of the normalized (TPM) expression matrix.
# NOTE(review): this is a machine-specific absolute path; point it at your own
# copy of the file before running the notebook.
expression_csv = 'C:\\Users\\Ben\\Dropbox\\bilbo_lab_spr2020\\microglia-seq_website\\microglia-seq\\mdi_w_rpkm\\GSE99622_hanamsagar2017_tpm_unmelted_v2.csv'

# The file carries five header rows, so the columns load as a 5-level MultiIndex.
df = pd.read_csv(expression_csv, header = [0, 1, 2, 3, 4])

# The gene-name column is the one whose bottom header level reads 'gene'.
is_gene_column = df.columns.get_level_values(4) == 'gene'
if is_gene_column.sum() != 1:
    raise ValueError(
        "expected exactly one column labelled 'gene' in header level 4, found "
        + str(int(is_gene_column.sum()))
    )
genes = df.iloc[:, is_gene_column].values.flatten()

# Drop the gene column using the same mask that located it (rather than assuming
# it is the first column), then index the rows by gene name. No in-place
# mutation, so re-running this cell always starts from the freshly read frame.
df = df.iloc[:, ~is_gene_column].set_index(genes)

---

Step (2) - Scale the expression values to between 0 and 1, so that all genes contribute to the index equally

---

In [2]:
# Rescale the expression values with the developmental_index helper. Per the
# Step 2 description above, the result should lie between 0 and 1 so every gene
# contributes equally (the helper's implementation is not shown in this
# notebook -- confirm against the package).
# Note: `df` is overwritten, so the unscaled matrix is no longer available
# after this cell runs.
df = dvp.scale_expression(df)

--- 

Step (3) - Drop any rows (genes) that do not have detectable expression in any of the samples

---

In [3]:
# Remove genes via the developmental_index helper. Per the Step 3 description
# above, these are the rows with no detectable expression in any sample (the
# helper's exact criterion is not shown in this notebook -- confirm against the
# package).
# Note: `df` is overwritten with the filtered frame.
df = dvp.drop_unexpressed_genes(df)

---

Step (4) - Extract the columns (samples) corresponding to all of the 'young' and 'old' samples, so that they can be compared against one another to determine whether there is a significant difference in expression from young to old

---

In [10]:
# Slice each age group once from the top column level ...
grouped_p60 = df['P60']
grouped_e18 = df['E18']

# ... then split each age group by the next column level (sex).
males_p60 = grouped_p60['Male']
females_p60 = grouped_p60['Female']

males_e18 = grouped_e18['Male']
females_e18 = grouped_e18['Female']

---

Step (4.5) - Use the `identify_significant_genes` function to identify the genes that are regulated by development. Here you need to specify the young and old columns, which we defined in Step 4 above

---

In [21]:
# Compare the young (E18) male samples against the old (P60) male samples.
male_df = dvp.identify_significant_genes(df, males_e18, males_p60)

# Keep only genes passing the p < 0.05 cutoff, and only the summary columns.
male_is_significant = male_df['pvals'] < 0.05
male_significant = male_df[male_is_significant]
male_index = male_significant[['pvals', 'direction', 'sig', 'logdiff']]

# Save the male index to disk.
# NOTE(review): machine-specific absolute output directory.
male_index_path = 'C:\\Users\\Ben\\Dropbox\\bilbo_lab_spr2020\\microglia-seq_website\\microglia-seq\\lps_index\\' + 'male_lps_index.csv'
male_index.to_csv(male_index_path)

male_index

Unnamed: 0,pvals,direction,sig,logdiff
,,,,
,,,,
,,,,
,,,,
Mfsd2a,0.004746,DOWN,4.070597,-1.871197
Tmem101,0.001749,DOWN,4.902030,-1.418217
Commd9,0.025456,DOWN,2.828646,-0.761157
Got1l1,0.014832,DOWN,3.211249,-2.798335
Atxn3,0.028935,UP,-2.739587,1.117906
...,...,...,...,...


In [22]:
# Compare the young (E18) female samples against the old (P60) female samples.
female_df = dvp.identify_significant_genes(df, females_e18, females_p60)

# Keep only genes passing the p < 0.05 cutoff, and only the summary columns.
female_is_significant = female_df['pvals'] < 0.05
female_significant = female_df[female_is_significant]
female_index = female_significant[['pvals', 'direction', 'sig', 'logdiff']]

# Save the female index to disk.
# NOTE(review): machine-specific absolute output directory.
female_index_path = 'C:\\Users\\Ben\\Dropbox\\bilbo_lab_spr2020\\microglia-seq_website\\microglia-seq\\lps_index\\' + 'female_lps_index.csv'
female_index.to_csv(female_index_path)

female_index

Unnamed: 0,pvals,direction,sig,logdiff
,,,,
,,,,
,,,,
,,,,
Ipo11,3.079665e-02,DOWN,2.616937,-0.477996
Mfsd2a,6.833469e-04,DOWN,5.353047,-2.621614
Fam101b,3.493722e-02,DOWN,2.535823,-1.163587
Glt8d2,1.735805e-02,DOWN,2.989089,-1.487459
Dapk1,3.861795e-05,DOWN,8.137531,-1.768404
...,...,...,...,...


In [23]:
# Compare all young (E18) samples against all old (P60) samples, sexes pooled.
combined_df = dvp.identify_significant_genes(df, grouped_e18, grouped_p60)

# Keep only genes passing the p < 0.05 cutoff, and only the summary columns.
combined_is_significant = combined_df['pvals'] < 0.05
combined_significant = combined_df[combined_is_significant]
combined_index = combined_significant[['pvals', 'direction', 'sig', 'logdiff']]

# Save the combined index to disk.
# NOTE(review): machine-specific absolute output directory.
combined_index_path = 'C:\\Users\\Ben\\Dropbox\\bilbo_lab_spr2020\\microglia-seq_website\\microglia-seq\\lps_index\\' + 'combined_lps_index.csv'
combined_index.to_csv(combined_index_path)

combined_index

Unnamed: 0,pvals,direction,sig,logdiff
,,,,
,,,,
,,,,
,,,,
Snx7,4.604725e-03,DOWN,3.260669,-1.157729
Mfsd2a,2.714318e-06,DOWN,6.870630,-2.253949
Glt8d2,2.035058e-02,DOWN,2.558500,-1.669142
Dapk1,1.734203e-03,DOWN,3.711443,-1.222572
Tmem101,1.309934e-03,DOWN,3.840672,-0.986118
...,...,...,...,...


In [24]:
# Row labels (gene names) of each significant-gene table, in one unpacking.
male_genes, female_genes, combined_genes = (
    male_index.index,
    female_index.index,
    combined_index.index,
)

In [30]:
# Count the genes common to each pair of significant-gene lists first ...
n_male_female = len(np.intersect1d(male_genes, female_genes))
n_male_combined = len(np.intersect1d(male_genes, combined_genes))
n_female_combined = len(np.intersect1d(female_genes, combined_genes))

# ... then report them.
print(n_male_female, 'out of', len(male_genes), '&', len(female_genes), 'genes are shared between males and females')
print(n_male_combined, 'genes are shared between males and combined')
print(n_female_combined, 'genes are shared between females and combined')

3997 out of 6062 & 7735 genes are shared between males and females
5658 genes are shared between males and combined
6869 genes are shared between females and combined


In [70]:
# Fraction of the genes in `df` that are significant in males, then in females.
# NOTE(review): the saved output of this cell does not match the gene counts
# printed by the overlap cell above it (different execution counts) -- re-run
# the notebook top to bottom from a fresh kernel before trusting these numbers.
n_total_genes = len(df)
for gene_list in (male_genes, female_genes):
    print(len(gene_list) / n_total_genes)

0.25233008692009634
0.08016546235207875


In [73]:
# Tally the distinct 'direction' labels among the significant female genes;
# return_counts=True adds a second array with the number of genes per label.
np.unique(female_index['direction'], return_counts = True)

(array(['DOWN', 'UP'], dtype=object), array([674, 857], dtype=int64))