## This notebook walks through a common pipeline for calculating a developmental index from an expression matrix containing normalized expression data (e.g., RPKM, FPKM, or TPM). The input dataframe should be formatted such that every column corresponds to an individual sample/cell, and every row corresponds to a gene.
---

Step (1) - Import the necessary packages and data. Clean data as necessary.

---

In [33]:
import pandas as pd
import numpy as np

import developmental_index as dvp

# Load the TPM table. Five header rows are read (header=[0..4]), so the
# columns form a five-level MultiIndex.
df = pd.read_csv(
    'C:\\Users\\Ben\\OneDrive\\bilbo_lab\\microglia-seq_website\\microglia-seq\\mdi_w_rpkm\\GSE99622_hanamsagar2017_tpm_unmelted_v2.csv',
    header=[0, 1, 2, 3, 4],
)

# Pull the gene identifiers out of the column whose deepest header level is
# labelled 'gene', flattened to a 1-D array.
is_gene_column = df.columns.get_level_values(4) == 'gene'
genes = df.iloc[:, is_gene_column].values.flatten()

# Index the rows by gene, then discard the first column
# (presumably the same gene column, now redundant -- confirm against the CSV).
df = df.set_index(genes).drop(df.columns[0], axis=1)

---

Step (2) - Scale the expression values to between 0 and 1, so that all genes contribute to the index equally

---

In [14]:
# Mean expression of every sample (column) across all genes.
# The earlier loop iterated over the columns but indexed df.columns[4] on
# every pass, so each sample was assigned the mean of the fifth column.
# A column-wise mean gives each sample its own value (NaNs are skipped, as
# with Series.mean()).
avg_gene_expression = df.mean(axis=0).values

# Transposing and resetting the index turns the column header levels into
# ordinary columns; keep the first four of them as the per-sample labels.
# .copy() makes new_df an independent frame before a column is added to it.
sample_info = df.T.reset_index()
new_df = sample_info[sample_info.columns[0:4]].copy()
new_df['avg_gene_expression'] = avg_gene_expression
new_df.to_csv('TPM_avg_gene_expression.csv')

In [35]:
# Rescale expression via the project helper; per the step description above,
# values end up between 0 and 1 so every gene contributes equally.
# df is rebound to the result, so re-running this cell feeds the
# already-scaled frame back into scale_expression.
df = dvp.scale_expression(df)

In [36]:
# Display the scaled frame for a visual sanity check.
# NOTE(review): some genes show NaN in every sample after scaling --
# presumably genes with no expression range to scale; confirm against
# dvp.scale_expression.
df

Unnamed: 0_level_0,E18,E18,E18,E18,P14,P14,P14,P14,P14,P14,...,P60 + LPS,P60 + LPS,P60 + LPS,P60 + LPS,P60 + LPS,P60,P60,P60,P60,P60
Unnamed: 0_level_1,Female,Male,Female,Male,Female,Female,Female,Female,Female,Female,...,Female,Male,Male,Male,Male,Male,Male,Male,Male,Female
Unnamed: 0_level_2,SAL,SAL,SAL,SAL,SAL,SAL,SAL,SAL,SAL,SAL,...,LPS,LPS,LPS,LPS,LPS,SAL,SAL,SAL,SAL,SAL
Unnamed: 0_level_3,F_E18 1,M_E18 1,F_E18 3,M_E18 4,F_P14 1,F_P14 2,F_P14 3,F_P14 4,F_P14 5,F_P14 6,...,F_P60_LPS 3,M_P60_LPS 3,M_P60_LPS 4,M_P60_LPS 5,M_P60_LPS 6,M_P60_Sal 1,M_P60_Sal 2,M_P60_Sal 3,M_P60_Sal 4,F_P60_Sal 6
Unnamed: 0_level_4,Unnamed: 1_level_4,Unnamed: 2_level_4,Unnamed: 3_level_4,Unnamed: 4_level_4,Unnamed: 5_level_4,Unnamed: 6_level_4,Unnamed: 7_level_4,Unnamed: 8_level_4,Unnamed: 9_level_4,Unnamed: 10_level_4,...,Unnamed: 51_level_4,Unnamed: 52_level_4,Unnamed: 53_level_4,Unnamed: 54_level_4,Unnamed: 55_level_4,Unnamed: 56_level_4,Unnamed: 57_level_4,Unnamed: 58_level_4,Unnamed: 59_level_4,Unnamed: 60_level_4
Zfp85-rs1,0.307701,0.947800,0.444803,0.000000,0.225971,0.195132,0.043254,0.165641,0.178076,0.000000,...,0.625977,0.477451,0.097689,0.104165,0.806224,0.246272,0.577001,0.470818,0.717573,0.531543
Snx7,0.588140,0.368395,0.568197,1.000000,0.181462,0.219422,0.346333,0.241872,0.676114,0.452090,...,0.122795,0.154882,0.282375,0.196553,0.160842,0.294345,0.094570,0.218336,0.350578,0.467373
1700034O15Rik,,,,,,,,,,,...,,,,,,,,,,
Dnajc18,0.797903,0.653250,0.657849,0.000000,0.111909,0.408285,0.629233,0.236698,0.488088,0.484321,...,0.431513,0.262827,0.199129,0.218737,0.239025,0.592543,0.385785,0.979729,0.861057,0.033983
Lce1i,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Cdkn2aip,0.747043,0.552209,0.538767,0.000000,0.155256,0.379097,0.334960,0.341667,0.249145,0.338652,...,0.640671,0.409028,0.205182,0.376656,0.283741,0.372469,0.684693,0.481230,0.369939,0.246787
Phf20,0.852279,0.625225,0.638040,0.000000,0.496946,0.497212,0.461477,0.410191,0.361023,0.230698,...,0.652753,0.815504,0.409657,0.394104,0.788039,0.723134,0.650915,0.550652,0.617484,0.235235
Sox1,0.596204,0.660740,0.109442,0.042407,0.020850,0.021605,0.031927,0.003057,0.012323,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.003895,0.000000
Mettl2,0.732807,0.692819,0.577084,0.000000,0.255052,0.332333,0.308777,0.342049,0.053049,0.123743,...,0.551175,0.114495,0.212880,0.097181,0.106913,0.380824,0.254098,0.261927,0.400648,0.035146


In [37]:
# Mean scaled expression of every sample (column) across all genes.
# The earlier loop iterated over the columns but indexed df.columns[4] on
# every pass, so each sample was assigned the mean of the fifth column.
# A column-wise mean gives each sample its own value (NaNs are skipped, as
# with Series.mean()).
avg_gene_expression = df.mean(axis=0).values

# Transposing and resetting the index turns the column header levels into
# ordinary columns; keep the first four of them as the per-sample labels.
# .copy() makes new_df an independent frame before a column is added to it.
sample_info = df.T.reset_index()
new_df = sample_info[sample_info.columns[0:4]].copy()
new_df['avg_gene_expression'] = avg_gene_expression
new_df.to_csv('scaled_avg_gene_expression.csv')

--- 

Step (3) - Drop any rows (genes) that do not have detectable expression in any of the samples

---

In [8]:
# Rebind df to the output of the project helper; per the step description
# above, this removes genes (rows) with no detectable expression in any sample.
df = dvp.drop_unexpressed_genes(df)

---

Step (4) - Extract the columns (samples) corresponding to the 'young' and 'old' groups so that they can be compared against one another to determine whether expression differs significantly between them. In this notebook the two groups being compared are the P60 (saline) and P60 + LPS samples.

---

In [3]:
# Select each treatment group by the top level of the column header ...
lps_grouped = df['P60 + LPS']
sal_grouped = df['P60']

# ... then split each group by sex using the next header level.
lps_males = lps_grouped['Male']
lps_females = lps_grouped['Female']
sal_males = sal_grouped['Male']
sal_females = sal_grouped['Female']

---

Step (4.5) - Use the `identify_significant_genes` function to identify the genes that are regulated by development. You need to pass it the 'young' and 'old' columns, which were defined in Step 4 above.

---

In [5]:
# male_df = dvp.identify_significant_genes(df, sal_males, lps_males)
# female_df = dvp.identify_significant_genes(df, sal_females, lps_females)
# combined_df = dvp.identify_significant_genes(df, sal_grouped, lps_grouped)

KeyboardInterrupt: 

In [26]:
# male_index = male_df[male_df['pvals'] < 0.05][['pvals', 'direction', 'sig', 'logdiff']]
# female_index = female_df[female_df['pvals'] < 0.05][['pvals', 'direction', 'sig', 'logdiff']]
# combined_index = combined_df[combined_df['pvals'] < 0.05][['pvals', 'direction', 'sig', 'logdiff']]

In [27]:
# male_index.to_csv('C:\\Users\\Ben\\Dropbox\\bilbo_lab_spr2020\\microglia-seq_website\\microglia-seq\\lps_index\\' + 'male_lps_index.csv')
# female_index.to_csv('C:\\Users\\Ben\\Dropbox\\bilbo_lab_spr2020\\microglia-seq_website\\microglia-seq\\lps_index\\' + 'female_lps_index.csv')
# combined_index.to_csv('C:\\Users\\Ben\\Dropbox\\bilbo_lab_spr2020\\microglia-seq_website\\microglia-seq\\lps_index\\' + 'combined_lps_index.csv')

In [21]:
# Compare saline vs. LPS male samples gene by gene using the project helper.
male_df = dvp.identify_significant_genes(df, sal_males, lps_males)

# Keep only genes with p < 0.05, and only the summary columns.
# NOTE(review): the threshold is applied to 'pvals' as returned by the helper;
# whether those are corrected for multiple testing is not visible here.
male_is_significant = male_df['pvals'] < 0.05
male_index = male_df.loc[male_is_significant, ['pvals', 'direction', 'sig', 'logdiff']]

# Save the male index next to the other LPS index files, then display it.
male_index_path = 'C:\\Users\\Ben\\OneDrive - Duke University\\bilbo_lab\\microglia-seq_website\\microglia-seq\\lps_index\\' + 'male_lps_index.csv'
male_index.to_csv(male_index_path)
male_index

Unnamed: 0,pvals,direction,sig,logdiff
,,,,
,,,,
Dnajc18,0.004442,DOWN,3.652720,-1.124817
Mfsd2a,0.003535,DOWN,3.791432,-2.068529
Fam101b,0.000558,DOWN,4.974899,-2.635221
Atg2b,0.017823,DOWN,2.830948,-0.249499
Dapk1,0.003158,DOWN,3.860416,-0.790312
...,...,...,...,...
Tst,0.002944,DOWN,3.903451,-2.716878
Cobl,0.033743,DOWN,2.458851,-1.014983


In [22]:
# Compare saline vs. LPS female samples gene by gene using the project helper.
female_df = dvp.identify_significant_genes(df, sal_females, lps_females)

# Keep only genes with p < 0.05, and only the summary columns.
# NOTE(review): the threshold is applied to 'pvals' as returned by the helper;
# whether those are corrected for multiple testing is not visible here.
female_is_significant = female_df['pvals'] < 0.05
female_index = female_df.loc[female_is_significant, ['pvals', 'direction', 'sig', 'logdiff']]

# Save the female index next to the other LPS index files, then display it.
female_index_path = 'C:\\Users\\Ben\\OneDrive - Duke University\\bilbo_lab\\microglia-seq_website\\microglia-seq\\lps_index\\' + 'female_lps_index.csv'
female_index.to_csv(female_index_path)
female_index

Unnamed: 0,pvals,direction,sig,logdiff
,,,,
,,,,
,,,,
,,,,
Mfsd2a,0.029810,DOWN,2.494232,-2.905083
1600029D21Rik,0.040799,UP,-2.316916,1.990368
Creld2,0.025239,DOWN,2.587752,-0.842949
Spcs3,0.021521,DOWN,2.677054,-0.551456
Fahd2a,0.010836,UP,-3.060809,0.737627
...,...,...,...,...


In [23]:
# Compare all saline vs. all LPS samples (both sexes pooled) gene by gene.
combined_df = dvp.identify_significant_genes(df, sal_grouped, lps_grouped)

# Keep only genes with p < 0.05, and only the summary columns.
# NOTE(review): the threshold is applied to 'pvals' as returned by the helper;
# whether those are corrected for multiple testing is not visible here.
combined_is_significant = combined_df['pvals'] < 0.05
combined_index = combined_df.loc[combined_is_significant, ['pvals', 'direction', 'sig', 'logdiff']]

# Save the combined index next to the other LPS index files, then display it.
combined_index_path = 'C:\\Users\\Ben\\OneDrive - Duke University\\bilbo_lab\\microglia-seq_website\\microglia-seq\\lps_index\\' + 'combined_lps_index.csv'
combined_index.to_csv(combined_index_path)
combined_index

Unnamed: 0,pvals,direction,sig,logdiff
,,,,
,,,,
,,,,
,,,,
Dnajc18,0.023227,DOWN,2.431688,-0.710954
Ipo11,0.020011,DOWN,2.499612,-0.503932
Mfsd2a,0.000406,DOWN,4.131305,-2.271911
Fam101b,0.000508,DOWN,4.040869,-1.751723
1600029D21Rik,0.000157,UP,-4.513321,2.294836
...,...,...,...,...


In [58]:
# The row labels of each filtered index table are the significant genes.
male_genes, female_genes, combined_genes = (
    male_index.index,
    female_index.index,
    combined_index.index,
)

In [65]:
# Report how many significant genes each pair of gene sets has in common.
overlap_reports = (
    (male_genes, female_genes, 'genes are shared between males and females'),
    (male_genes, combined_genes, 'genes are shared between males and combined'),
    (female_genes, combined_genes, 'genes are shared between females and combined'),
)
for first_set, second_set, message in overlap_reports:
    print(len(np.intersect1d(first_set, second_set)), message)

709 genes are shared between males and females
2478 genes are shared between males and combined
1145 genes are shared between females and combined


In [70]:
# Fraction of the genes in df that are significant, males first, then females.
total_genes = len(df)
for gene_set in (male_genes, female_genes):
    print(len(gene_set) / total_genes)

0.25233008692009634
0.08016546235207875


In [73]:
# Count how many significant female genes fall under each 'direction' label;
# returns (unique labels, count per label).
np.unique(female_index['direction'], return_counts = True)

(array(['DOWN', 'UP'], dtype=object), array([674, 857], dtype=int64))