# Figure 7: gene expression statstical analysis

The following statistical tests will be used compare the gene expression data for significance:
1. MannU Whitney Analysis

In [1]:
#----------------------------------------------------------
# Import Libraries
#----------------------------------------------------------
import pandas as pd
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.formula.api import ols
from statsmodels.stats.anova import anova_lm
import statsmodels.stats.multicomp
sns.set()

## Download gene data

Download metafile data with the group description

In [2]:
metafile_location = "https://www.dropbox.com/s/37toe3vct1pipn5/groups.csv?dl=1"
metafile = pd.read_csv(metafile_location, index_col=0)
metafile['group']=metafile.diet+' '+metafile.feeding
metafile.head()

Unnamed: 0_level_0,diet,feeding,group
Rat,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,control,ad lib,control ad lib
2,control,ad lib,control ad lib
3,control,ad lib,control ad lib
4,control,ad lib,control ad lib
5,control,ad lib,control ad lib


Get the data using dropbox link. Extract the gene names using column heading and save them in **gene list**

In [3]:
gene_data_location = "https://www.dropbox.com/s/rxjk6bjw2fpa3m1/qPCR_normalized_gapdph.csv?dl=1"
gene_data = pd.read_csv(gene_data_location, index_col=0)

# Rearrange so that Oxtr is the last gene column
col_list = list(gene_data)
col_list[11], col_list[8] = col_list[8], col_list[11]
gene_data = gene_data.loc[:,col_list]


gene_list = gene_data.columns.unique()
# Remove outliers, that is the measurments which is larger than 7
for c in gene_list:
    # Find index of an outlier and replace it with NAN
    out_ind = gene_data[c][gene_data[c]>=7].index
    gene_data[c][out_ind ]=np.NaN
# Add experimental group as a column to the gene dataset
ids_in_gene_data = gene_data.index
gene_data['group'] = metafile.group.loc[ids_in_gene_data]
gene_data['diet'] = metafile.diet.loc[ids_in_gene_data]
gene_data['feeding_schedule'] = metafile.feeding.loc[ids_in_gene_data]



gene_data.head()

Unnamed: 0,CART,NPY,AgRP,GLP1R,Npy2R,Ghsr,Insr,Lepr,MC4R,Cckar,Cckbr,Oxtr,group,diet,feeding_schedule
2,1.36934,0.74876,2.161361,1.297956,1.560879,4.597757,1.468209,1.883721,0.820854,1.990703,1.074096,1.61184,control ad lib,control,ad lib
6,1.669095,0.670232,1.381644,2.179846,3.612036,2.853085,1.652634,2.870864,4.686374,2.87124,4.289009,1.493576,control ad lib,control,ad lib
7,1.430304,1.504685,1.66385,2.070428,3.752371,2.621707,1.534294,2.275886,1.668385,4.892259,2.609463,2.664512,control ad lib,control,ad lib
3,0.450856,1.787492,0.688038,0.864752,1.325854,0.425616,1.204578,0.918813,3.339488,2.791819,3.417703,1.214725,control ad lib,control,ad lib
4,1.267757,0.936041,0.497659,1.092165,0.968283,1.218026,1.067142,1.467878,1.494192,4.658525,2.171984,0.946401,control ad lib,control,ad lib


### Compare significance for each gene and each experimental condition

#### Separate dataframes by experimental group

In [4]:
feeding = metafile.feeding.unique()
diet   = metafile.diet.unique()
group_dict={}
for x in diet:
        for y in feeding:
            group = str(x)+' '+str(y)
            ids = metafile[(metafile.diet==x) & (metafile.feeding==y)].index
            genes_by_group = gene_data.loc[ids]
            genes_by_group.dropna(inplace = True)
            group_dict[group]=genes_by_group

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  


For each gene, compare expression between 4 experimental groups. Save p-values and separate into significant and non-significant dataframes

## Mann-U Whitney Analysis

In [5]:
groups = metafile.group.unique()
column_names = ["gene", "group1", "group2", "p_value"]
df = pd.DataFrame(columns = column_names)
i=0
for c in gene_list:
    
    for x in groups:
        group1 = group_dict[x][c]
        
        for y in groups:
            if(y!= x):
                group2 = group_dict[y][c]
                u_statistic, pVal = stats.mannwhitneyu(group1, group2)
                df.loc[i]=[c, x, y, pVal]
                i+=1
df.to_csv('all_genes_p_values_Mann_Whitney.csv')
# Sepatate significantly expressed genes (p<0.05) and  Remove repeated comparisons
significant_genes = df[df.p_value<0.05].drop_duplicates(subset='p_value').reset_index(drop=True)
# Save significant genes to .csv file
significant_genes.to_csv('significant_genes_p_values_Mann_Whitney.csv')
significant_genes


Unnamed: 0,gene,group1,group2,p_value
0,NPY,control restriction,HFHS restriction,0.026892
1,GLP1R,control restriction,HFHS ad lib,0.013405
2,Ghsr,control ad lib,control restriction,0.036819
3,Insr,control ad lib,control restriction,0.048349
4,Insr,control ad lib,HFHS restriction,0.00621
5,Insr,HFHS ad lib,HFHS restriction,0.046348
6,Lepr,control ad lib,control restriction,0.010725
