## _The goal of this code is to provide generalizable, useful, working functions intended to aid in the creation of developmental indices on any gene expression data (preferably a dataset that has already been adjusted to appropriate values, i.e. RPKM/FPKM/TPM)_

#### 1) This first step is to scale all of the expression (whatever level it may be in - as long as it is normalized) to values between 0 and 1 to reduce bias between genes

In [None]:
### scaled value = (expression of the gene in a sample - minimum expression of the gene across all samples)
###                / (maximum expression of the gene across all samples - minimum expression of the gene across all samples)
### every gene is scaled to the range [0, 1] so that all genes add equal weight to the index

def scale_expression(df):
    """Min-max scale every row (gene) of an expression dataframe.

    Each value becomes ``(value - row_min) / (row_max - row_min)``, so every
    gene spans 0 to 1 across the samples.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per gene, one numeric column per sample.

    Returns
    -------
    pandas.DataFrame
        A new dataframe with the same index and columns holding the scaled
        values. The input dataframe is not modified.

    Notes
    -----
    A row whose values are all identical has a range of 0, so its scaled
    values are NaN (0 / 0).
    """

    ## per-gene minimum and range across all samples (NaNs are skipped by pandas)
    row_min = df.min(axis=1)
    row_range = df.max(axis=1) - row_min

    ## axis=0 aligns the per-gene values with the row labels, so the whole
    ## frame is scaled at once; the result is float even for integer input
    return df.sub(row_min, axis=0).div(row_range, axis=0)

#### 2) This next step is to identify which genes make up the index. To do this, you must give it the scaled dataframe and column labels for the two ages you would like to create the index off of

In [None]:
def identify_significant_genes(df, young_label='E18', old_label='P60',
                               old_treatment='SAL', alpha=0.05):
    """Flag the genes whose expression differs between two ages.

    For every row (gene) an independent two-sample t-test compares the
    young samples with the old samples, and the log2 ratio of the old mean
    to the young mean is computed. Four columns are then added to ``df``:

    * ``pvals``     - p-value of the t-test
    * ``sig``       - t statistic of the t-test
    * ``logdiff``   - log2(mean(old) / mean(young))
    * ``direction`` - 'UP' when pvals < alpha and logdiff > 0, 'DOWN' when
      pvals < alpha otherwise, and 'N/A' when pvals is not below alpha

    Parameters
    ----------
    df : pandas.DataFrame
        One row per gene. The columns must be selectable by ``young_label``
        and ``old_label`` (e.g. the first level of a column MultiIndex).
        You may need to adjust the labels to how your samples are indexed.
    young_label, old_label : label, optional
        Column keys of the young and old sample groups.
    old_treatment : label or None, optional
        When not None, only the old-group columns whose level-1 label
        (counted after selecting ``old_label``) equals this value are used.
        Pass None to use every old-group column.
    alpha : float, optional
        Significance threshold applied to the p-values.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, modified in place with the four new columns.

    Notes
    -----
    A young-group mean of 0 makes ``logdiff`` infinite (or NaN when the old
    mean is 0 as well), because it is the log2 of a ratio of means.
    """

    ## imported here because the file has no module-level imports
    import numpy as np
    import scipy.stats as stats

    ## select the two sample groups before any result column is added
    young = df[young_label]
    old = df[old_label]
    if old_treatment is not None:
        old = old.iloc[:, old.columns.get_level_values(1) == old_treatment]

    ## one vectorised t-test across all genes (axis=1 -> one test per row)
    t_stat, pvals = stats.ttest_ind(young.to_numpy(dtype=float),
                                    old.to_numpy(dtype=float), axis=1)
    t_stat = np.asarray(t_stat, dtype=float)
    pvals = np.asarray(pvals, dtype=float)

    ## .to_numpy() keeps the assignment positional, whatever the row labels are
    logdiff = np.log2((old.mean(axis=1) / young.mean(axis=1)).to_numpy(dtype=float))

    df['pvals'] = pvals
    df['sig'] = t_stat
    df['logdiff'] = logdiff

    ## NaN p-values compare False against alpha, so those genes become 'N/A'
    df['direction'] = np.where(pvals < alpha,
                               np.where(logdiff > 0, 'UP', 'DOWN'),
                               'N/A')

    return df

#### 4) This step is meant to drop the genes in the dataframe that were not developmentally regulated. This will just make it easier to move forward with the analysis

In [1]:
def remove_unsignificant_rows(df):
    """Drop, in place, every row whose 'direction' is 'N/A'.

    The current index is first moved into a regular column, the rows
    labelled 'N/A' are removed, and the remaining rows are renumbered
    from 0. The same (modified) dataframe object is returned.
    """
    ## move the existing index into a column; rows are now labelled 0..n-1
    df.reset_index(inplace=True)

    ## collect the labels of every non-regulated row and drop them together
    not_regulated = df.index[(df['direction'] == 'N/A').to_numpy()]
    df.drop(not_regulated, inplace=True)

    ## renumber the surviving rows
    df.reset_index(drop=True, inplace=True)
    return df

#### 5) This step allows you to extract all of the information for the index genes into a standalone dataframe

In [3]:
def extract_regulated_genes(df):
    """Collect the index genes into a standalone dataframe.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'gene', 'direction' and 'logdiff'.

    Returns
    -------
    pandas.DataFrame
        Indexed by 'gene', with the columns 'direction' and 'valence'
        (the 'logdiff' values), sorted by 'valence' from highest to lowest.
    """

    ## imported here because the file has no module-level imports
    import pandas as pd

    ## read from the dataframe that was passed in, not from a notebook global
    regulated_genes = pd.DataFrame({'gene': df['gene'],
                                    'direction': df['direction'],
                                    'valence': df['logdiff']})

    return regulated_genes.sort_values(by='valence', ascending=False).set_index('gene')

In [None]:
## call extract_regulated_genes on `df` and keep the result as `regulated_genes`
## NOTE(review): `df` is not assigned in this cell -- presumably it is the dataframe
## produced by the previous steps; confirm before running
regulated_genes = extract_regulated_genes(df)

## write the result to a CSV file; replace the placeholder with a real output path
regulated_genes.to_csv('YOUR DESTINATION HERE')

#### 6) This is the function that actually generates the index, and automatically applies it to each sample in the dataset (each column) 

In [2]:
def generate_index(df):
    """Compute the developmental index of every sample.

    For each sample column the index is the mean expression of the 'UP'
    genes divided by the mean expression of the 'DOWN' genes.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'direction' column. The sample columns are taken to
        be ``df.columns[1:-4]``, i.e. everything except the first column
        and the last four columns.

    Returns
    -------
    pandas.DataFrame
        A single-row dataframe with one column per sample holding that
        sample's index value.

    Notes
    -----
    When there are no 'UP' or no 'DOWN' genes the corresponding mean is NaN,
    and so is the index.
    """

    ## imported here because the file has no module-level imports
    import pandas as pd

    ## every column except the first one and the last four is a sample
    samples = df.columns[1:-4]

    ## the gene masks do not depend on the sample, so build them once
    up_genes = df['direction'] == 'UP'
    down_genes = df['direction'] == 'DOWN'

    ## iterate through the samples and calculate the index given the index genes
    index_per_sample = [
        df[sample][up_genes].mean() / df[sample][down_genes].mean()
        for sample in samples
    ]

    ## wrap the values in a list so they form ONE row with a column per sample;
    ## passing the flat list would be read as n rows x 1 column and clash with
    ## the n column labels
    final_df = pd.DataFrame([index_per_sample], columns=samples)

    return final_df

#### 7) In this final function, we are scaling all of the index values to be between 0 and 1.

In [None]:
def scale_index(df):
    """Min-max scale the index values in the first row of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Only the first row is read; it holds one index value per column.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df``. The column labels of ``df`` are moved
        into the leading column(s) and the last column holds the scaled
        value ``(value - min) / (max - min)``.

    Notes
    -----
    When all values in the row are identical the range is 0 and the scaled
    values are NaN.
    """

    ## imported here because the file has no module-level imports
    import pandas as pd

    ## scale data to between 0 and 1
    raw_index = df.iloc[0]
    shifted = raw_index - raw_index.min()
    final_df_scaled = pd.DataFrame(shifted / shifted.max())

    ## turn the sample labels into regular columns
    final_df_scaled.reset_index(inplace=True)

    ## hand the result back so the caller can relabel and save it
    return final_df_scaled

In [None]:
## this is purely aesthetic, but you will want to manually relabel all of the columns of the scaled DF
## NOTE(review): `final_df_scaled` is not assigned at this level anywhere in the visible code --
## presumably it should hold the output of scale_index(...); confirm before running
## NOTE(review): five labels are assigned, so the scaled DF must have exactly five columns --
## adjust the list to your own column layout
final_df_scaled.columns = ['age', 'sex', 'tx', 'sample_name', 'index']

## write the relabelled table to a CSV file; replace the placeholder with a real output path
final_df_scaled.to_csv('YOUR DESTINATION HERE')