## _The goal of this code is for providing generalizable, useful, working functions intented to aid in the creation of developmental indices on any gene expression data (preferably a dataset that has already been adjusted to appropriate values i.e. RPKM/FPKM/TPM)_

#### 1) This first step is to scale all of the expression (whatever level it may be in - as long as it is normalized) to values between 0 and 1 to reduce bias between genes

In [1]:
## importing the necessary packages
import numpy as np
import pandas as pd
import scipy.stats as stats

In [21]:
df = pd.read_csv('C:\\Users\\Ben\\Dropbox\\bilbo_lab_spr2020\\microglia-seq_website\\microglia-seq\\mdi_w_rpkm\\GSE99622_hanamsagar2017_tpm_unmelted_v2.csv', header = [0, 1, 2, 3, 4])

In [22]:
genes = df.iloc[:, df.columns.get_level_values(4) == 'gene'].values.flatten()

In [23]:
genes

array(['Zfp85-rs1', 'Snx7', '1700034O15Rik', ..., 'Sox1', 'Mettl2', 'Ttl'],
      dtype=object)

In [24]:
df.set_index(genes, inplace = True)
df.drop(df.columns[0], axis = 1, inplace = True)

In [25]:
### (expression value of the gene in a sample—minimum expression for the gene across all samples)/(maximum expression for the gene across all
### samples—minimum expression for the gene across all samples),
### scale all values so that they add equal weight to the index

def scale_expression(df):    
    
    ## creating a copy of the input dataframe to scale the expression on
    scaled_expression_df = df.copy()
    
    ## iterating through the rows of the dataframe to scale expression of each gene (by row)
    for row in range(len(scaled_expression_df)):
        scaled_expression_df.iloc[row] = (scaled_expression_df.iloc[row] - np.min(scaled_expression_df.iloc[row])) / np.max(scaled_expression_df.iloc[row] - np.min(scaled_expression_df.iloc[row]))        
    
    return scaled_expression_df

In [26]:
%%time

df = scale_expression(df)

Wall time: 20.7 s


In [27]:
df

Unnamed: 0_level_0,E18,E18,E18,E18,P14,P14,P14,P14,P14,P14,...,P60 + LPS,P60 + LPS,P60 + LPS,P60 + LPS,P60 + LPS,P60,P60,P60,P60,P60
Unnamed: 0_level_1,Female,Male,Female,Male,Female,Female,Female,Female,Female,Female,...,Female,Male,Male,Male,Male,Male,Male,Male,Male,Female
Unnamed: 0_level_2,SAL,SAL,SAL,SAL,SAL,SAL,SAL,SAL,SAL,SAL,...,LPS,LPS,LPS,LPS,LPS,SAL,SAL,SAL,SAL,SAL
Unnamed: 0_level_3,F_E18 1,M_E18 1,F_E18 3,M_E18 4,F_P14 1,F_P14 2,F_P14 3,F_P14 4,F_P14 5,F_P14 6,...,F_P60_LPS 3,M_P60_LPS 3,M_P60_LPS 4,M_P60_LPS 5,M_P60_LPS 6,M_P60_Sal 1,M_P60_Sal 2,M_P60_Sal 3,M_P60_Sal 4,F_P60_Sal 6
Unnamed: 0_level_4,Unnamed: 1_level_4,Unnamed: 2_level_4,Unnamed: 3_level_4,Unnamed: 4_level_4,Unnamed: 5_level_4,Unnamed: 6_level_4,Unnamed: 7_level_4,Unnamed: 8_level_4,Unnamed: 9_level_4,Unnamed: 10_level_4,...,Unnamed: 51_level_4,Unnamed: 52_level_4,Unnamed: 53_level_4,Unnamed: 54_level_4,Unnamed: 55_level_4,Unnamed: 56_level_4,Unnamed: 57_level_4,Unnamed: 58_level_4,Unnamed: 59_level_4,Unnamed: 60_level_4
Zfp85-rs1,0.307701,0.947800,0.444803,0.000000,0.225971,0.195132,0.043254,0.165641,0.178076,0.000000,...,0.625977,0.477451,0.097689,0.104165,0.806224,0.246272,0.577001,0.470818,0.717573,0.531543
Snx7,0.588140,0.368395,0.568197,1.000000,0.181462,0.219422,0.346333,0.241872,0.676114,0.452090,...,0.122795,0.154882,0.282375,0.196553,0.160842,0.294345,0.094570,0.218336,0.350578,0.467373
1700034O15Rik,,,,,,,,,,,...,,,,,,,,,,
Dnajc18,0.797903,0.653250,0.657849,0.000000,0.111909,0.408285,0.629233,0.236698,0.488088,0.484321,...,0.431513,0.262827,0.199129,0.218737,0.239025,0.592543,0.385785,0.979729,0.861057,0.033983
Lce1i,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Cdkn2aip,0.747043,0.552209,0.538767,0.000000,0.155256,0.379097,0.334960,0.341667,0.249145,0.338652,...,0.640671,0.409028,0.205182,0.376656,0.283741,0.372469,0.684693,0.481230,0.369939,0.246787
Phf20,0.852279,0.625225,0.638040,0.000000,0.496946,0.497212,0.461477,0.410191,0.361023,0.230698,...,0.652753,0.815504,0.409657,0.394104,0.788039,0.723134,0.650915,0.550652,0.617484,0.235235
Sox1,0.596204,0.660740,0.109442,0.042407,0.020850,0.021605,0.031927,0.003057,0.012323,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.003895,0.000000
Mettl2,0.732807,0.692819,0.577084,0.000000,0.255052,0.332333,0.308777,0.342049,0.053049,0.123743,...,0.551175,0.114495,0.212880,0.097181,0.106913,0.380824,0.254098,0.261927,0.400648,0.035146


In [28]:
### just a quick function for cleaning data and making sure you don't have and unexpressed genes

def drop_unexpressed_genes(df):

    df.dropna(inplace = True)
    
    return df

In [29]:
df = drop_unexpressed_genes(df)

#### 2) This next step is to identify which genes make up the index. To do this, you must give it the scaled dataframe and column labels for the two ages you would like to create the index off of

In [30]:
def identify_significant_genes(df, young, old):

    for gene, series in young.iterrows():
        if np.mean(young.loc[gene]) == 0:
            young.drop(gene, inplace = True)

    for gene, series in old.iterrows():
        if np.mean(old.loc[gene]) == 0:
            old.drop(gene, inplace = True)

    pvals = np.zeros(shape = len(df))
    sig = np.zeros(shape = len(df))
    logdiff = np.zeros(shape = len(df))
    row = -1

    for gene, series in df.iterrows():
        row = row + 1
        if gene in young.index:
            if gene in old.index:
                pvals[row] = stats.ttest_ind(young.loc[gene], old.loc[gene])[1]
                sig[row] = stats.ttest_ind(young.loc[gene], old.loc[gene])[0]
                logdiff[row] = np.log2(np.mean(old.loc[gene])/np.mean(young.loc[gene]))
            else:
                pvals[row] = np.NAN
                sig[row] = np.NAN
                logdiff[row] = np.NAN
        else:
            pvals[row] = np.NAN
            sig[row] = np.NAN
            logdiff[row] = np.NAN


    df['pvals'] = pvals
    df['sig'] = sig
    df['logdiff'] = logdiff

    direction = [0] * len(df)
    for row in range(len(df)):
        if df['pvals'][row] < 0.05:
            if df['logdiff'][row] > 0:
                direction[row] = 'UP'
            else:
                direction[row] = 'DOWN'
        else:
            direction[row] = 'N/A'
    df['direction'] = direction


    return df

In [31]:
## these lines are different based on how you have indexind the samples in your dataframe. You may need to adjust accordingly
young = df['E18']
old = df['P60']
old = old.iloc[:, old.columns.get_level_values(1)=='SAL']

In [32]:
%%time

df = identify_significant_genes(df, young, old)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


Wall time: 1min 21s


#### 4) This step is meant to drop the genes in the dataframe that were not developmentally regulated. This will just make it easier to move forward with the analysis

In [33]:
df

Unnamed: 0_level_0,E18,E18,E18,E18,P14,P14,P14,P14,P14,P14,...,P60 + LPS,P60,P60,P60,P60,P60,pvals,sig,logdiff,direction
Unnamed: 0_level_1,Female,Male,Female,Male,Female,Female,Female,Female,Female,Female,...,Male,Male,Male,Male,Male,Female,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Unnamed: 0_level_2,SAL,SAL,SAL,SAL,SAL,SAL,SAL,SAL,SAL,SAL,...,LPS,SAL,SAL,SAL,SAL,SAL,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
Unnamed: 0_level_3,F_E18 1,M_E18 1,F_E18 3,M_E18 4,F_P14 1,F_P14 2,F_P14 3,F_P14 4,F_P14 5,F_P14 6,...,M_P60_LPS 6,M_P60_Sal 1,M_P60_Sal 2,M_P60_Sal 3,M_P60_Sal 4,F_P60_Sal 6,Unnamed: 18_level_3,Unnamed: 19_level_3,Unnamed: 20_level_3,Unnamed: 21_level_3
Unnamed: 0_level_4,Unnamed: 1_level_4,Unnamed: 2_level_4,Unnamed: 3_level_4,Unnamed: 4_level_4,Unnamed: 5_level_4,Unnamed: 6_level_4,Unnamed: 7_level_4,Unnamed: 8_level_4,Unnamed: 9_level_4,Unnamed: 10_level_4,...,Unnamed: 55_level_4,Unnamed: 56_level_4,Unnamed: 57_level_4,Unnamed: 58_level_4,Unnamed: 59_level_4,Unnamed: 60_level_4,Unnamed: 18_level_4,Unnamed: 19_level_4,Unnamed: 20_level_4,Unnamed: 21_level_4
Zfp85-rs1,0.307701,0.947800,0.444803,0.000000,0.225971,0.195132,0.043254,0.165641,0.178076,0.000000,...,0.806224,0.246272,0.577001,0.470818,0.717573,0.531543,0.089201,-1.802642,0.693172,
Snx7,0.588140,0.368395,0.568197,1.000000,0.181462,0.219422,0.346333,0.241872,0.676114,0.452090,...,0.160842,0.294345,0.094570,0.218336,0.350578,0.467373,0.004605,3.260669,-1.157729,DOWN
Dnajc18,0.797903,0.653250,0.657849,0.000000,0.111909,0.408285,0.629233,0.236698,0.488088,0.484321,...,0.239025,0.592543,0.385785,0.979729,0.861057,0.033983,0.959522,0.051506,-0.016202,
Ipo11,0.853112,0.442486,0.719860,0.000000,0.208482,0.398846,0.636584,0.382300,0.502430,0.789239,...,0.427983,0.644599,0.334307,0.572597,0.475738,0.543234,0.670546,0.432877,-0.103336,
Mfsd2a,0.712312,0.576530,0.319268,0.210441,0.216490,0.414484,0.639350,0.250279,0.272967,0.327651,...,0.016050,0.085388,0.056526,0.221666,0.143270,0.048499,0.000003,6.870630,-2.253949,DOWN
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Cdkn2aip,0.747043,0.552209,0.538767,0.000000,0.155256,0.379097,0.334960,0.341667,0.249145,0.338652,...,0.283741,0.372469,0.684693,0.481230,0.369939,0.246787,0.988700,-0.014373,0.004244,
Phf20,0.852279,0.625225,0.638040,0.000000,0.496946,0.497212,0.461477,0.410191,0.361023,0.230698,...,0.788039,0.723134,0.650915,0.550652,0.617484,0.235235,0.576374,-0.569638,0.139166,
Sox1,0.596204,0.660740,0.109442,0.042407,0.020850,0.021605,0.031927,0.003057,0.012323,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.003895,0.000000,0.000793,4.072045,-6.567069,DOWN
Mettl2,0.732807,0.692819,0.577084,0.000000,0.255052,0.332333,0.308777,0.342049,0.053049,0.123743,...,0.106913,0.380824,0.254098,0.261927,0.400648,0.035146,0.011804,2.819747,-1.048549,DOWN


In [34]:
def remove_unsignificant_rows(df):
    for gene, series in df.iterrows():
        if df['direction'][gene] == 'N/A':
            df.drop(gene, inplace = True)  
    df.reset_index(inplace=True)
    df.rename(columns = {'index' : 'gene'}, inplace = True)
    return df

In [35]:
%%time

df2 = remove_unsignificant_rows(df)

Wall time: 1min 54s


In [41]:
df2.rename(columns = {'index' : 'gene'}, inplace = True)

#### 5) This step allows you to extract all of the information for the index genes into a standalone dataframe

In [47]:
def extract_regulated_genes(df):
    
    down_genes = df['gene'][df['direction'] == 'DOWN'].to_list()
    up_genes = df['gene'][df['direction'] == 'UP'].to_list()
    regulated_genes = pd.DataFrame((dict([ (k,pd.Series(v)) for k,v in {'gene' : df['gene'], 
                                                                        'direction' : df['direction'], 
                                                                        'valence' : df['logdiff']}.items() ])))
    
    regulated_genes = regulated_genes.sort_values(by = 'valence', ascending = False).set_index('gene')
    
    return regulated_genes

In [48]:
regulated_genes = extract_regulated_genes(df2)

#### 6) This is the function that actually generates the index, and automatically applies it to each sample in the dataset (each column) 

In [74]:
def generate_index(df, sample_cols):
    
    ## initialize the dataframe with the sample columns and create an index per sample array
    ## also, create an int counter
    samples = sample_cols
    index_per_sample = [0] * len(samples)
    i = -1
    
    ## iterate through the sampels and calculate the index given the index genes
    for sample in samples:
        i = i + 1
        index_per_sample[i] = np.mean(df[sample][df['direction'] == 'UP']) / np.mean(df[sample][df['direction'] == 'DOWN'])
        
    ## create the output dataframe by stitching this index per sample array to the corresponding sample names    
    final_df = pd.DataFrame([index_per_sample], columns = samples)
        
    return final_df

In [82]:
index = generate_index(df2, sample_cols = df2.columns[1:-4])

#### 7) In this final function, we are scaling all of the index values to be between 0 and 1.

In [83]:
def scale_index(df):

    ## scale data to between 0 and 1
    final_df_scaled = pd.DataFrame(df.iloc[0] - np.min(df.iloc[0])) / (np.max(df.iloc[0] - np.min(df.iloc[0])))
    final_df_scaled.reset_index(inplace = True)
    
    return final_df_scaled

In [84]:
index = scale_index(index)

In [85]:
index

Unnamed: 0,level_0,level_1,level_2,level_3,level_4,0
0,E18,Female,SAL,F_E18 1,Unnamed: 1_level_4,0.074341
1,E18,Male,SAL,M_E18 1,Unnamed: 2_level_4,0.039307
2,E18,Female,SAL,F_E18 3,Unnamed: 3_level_4,0.069697
3,E18,Male,SAL,M_E18 4,Unnamed: 4_level_4,0.0
4,P14,Female,SAL,F_P14 1,Unnamed: 5_level_4,0.362481
5,P14,Female,SAL,F_P14 2,Unnamed: 6_level_4,0.356923
6,P14,Female,SAL,F_P14 3,Unnamed: 7_level_4,0.290847
7,P14,Female,SAL,F_P14 4,Unnamed: 8_level_4,0.382989
8,P14,Female,SAL,F_P14 5,Unnamed: 9_level_4,0.248252
9,P14,Female,SAL,F_P14 6,Unnamed: 10_level_4,0.336164
