# Loading in and processing data from Mark Alter's Matlab file

In [155]:
import numpy as np
import pandas as pd
import scipy.stats as stats
import scipy.io as scp

In [156]:
### loading in the mat file provided by Dr. Alter

mgla_rpkm = scp.loadmat('E:\\DATA\\microglia_sequencing\\mdi\\BLBO_NxtGen_Analysis_paper_Matlab\\MGLA_IDX_STRUCT_DATAUNIQUE.mat')

In [157]:
### extracting data from the very messy mat table

data = mgla_rpkm['MGLA_IDX_struct_dataunique'][0][0][0]
probes = mgla_rpkm['MGLA_IDX_struct_dataunique'][0][0][1]
data_unique = mgla_rpkm['MGLA_IDX_struct_dataunique'][0][0][9]
probes_unique = mgla_rpkm['MGLA_IDX_struct_dataunique'][0][0][10]
probes_unique = probes_unique[0]
headers = mgla_rpkm['MGLA_IDX_struct_dataunique'][0][0][3]

In [158]:
### extracting and creating index information based on the 'header' column in the original mat file

age = [0] * 60
samples = [0] * 60
tx = [0] * 60
sex = [0] * 60

for i in range(len(headers[0])):
    age[i] = headers[1][i][0][0]
    samples[i] = headers[0][i][0]
    
i = -1
for sample in samples:
    i = i + 1
    if 'LPS' in sample:
        tx[i] = 'LPS'
    else:
        tx[i] = 'SAL'

i = -1
for sample in samples:
    i = i + 1
    if 'M' in sample:
        sex[i] = 'M'
    else:
        sex[i] = 'F'

In [159]:
### log normalizing all of the data in the matrix

mgla_devel_data_unique = np.log2(data_unique)
mgla_devel_data = np.log2(data)

In [160]:
### correcting the probes (gene names) so that they can be set as the index of the dataframe

corrected_probes = [0] * len(probes)
corrected_probes_unique = [0] * len(probes_unique)

for i in range(len(probes)):
    corrected_probes[i] = probes[i][0][0]
for i in range(len(probes_unique)):
    corrected_probes_unique[i] = probes_unique[i][0]

In [161]:
## creating new dataframes with all relevant labels as indices

df_unique = pd.DataFrame(mgla_devel_data_unique, columns = [age, tx, sex, samples])
df_unique.index = corrected_probes_unique

df = pd.DataFrame(mgla_devel_data, columns = [age, tx, sex, samples])
df.index = corrected_probes

In [162]:
## need to normalize every column against itself

## for col in df:
    ## df[col] = df[col] / df[col].mean()
    
    
## EXCLUDING this for now, doesn't seem to be used

In [163]:
### (expression value of the gene in a sample—minimum expression for the gene across all samples)/(maximum expression for the gene across all
### samples—minimum expression for the gene across all samples),
### scale all values so that they add equal weight to the index

def scale_expression(df):
    scaled_expression_df = df.copy()

    for row in range(len(scaled_expression_df)):
        scaled_expression_df.iloc[row] = (scaled_expression_df.iloc[row] - np.min(scaled_expression_df.iloc[row])) / np.max(scaled_expression_df.iloc[row] - np.min(scaled_expression_df.iloc[row]))
        
        
    return scaled_expression_df

In [164]:
%%time

scaled_df = scale_expression(df)

Wall time: 12 s


In [165]:
scaled_df

Unnamed: 0_level_0,18,18,18,18,35,35,35,35,35,35,...,81,81,81,81,81,81,81,81,81,81
Unnamed: 0_level_1,SAL,SAL,SAL,SAL,SAL,SAL,SAL,SAL,SAL,SAL,...,LPS,LPS,LPS,LPS,LPS,SAL,SAL,SAL,SAL,SAL
Unnamed: 0_level_2,F,F,F,F,F,F,F,F,F,F,...,M,M,M,M,M,M,M,M,M,M
Unnamed: 0_level_3,F_E18 1,F_E18 2,F_E18 3,F_E18 4,F_P14 1,F_P14 2,F_P14 3,F_P14 4,F_P14 5,F_P14 6,...,M_P60_LPS 2,M_P60_LPS 3,M_P60_LPS 4,M_P60_LPS 5,M_P60_LPS 6,M_P60_Sal 1,M_P60_Sal 2,M_P60_Sal 3,M_P60_Sal 4,M_P60_Sal 5
1-Mar,0.509899,0.542985,0.518782,0.000000,0.480139,0.687550,0.680888,0.643093,0.564216,0.610673,...,0.884215,0.968856,0.837516,0.850742,0.941732,0.817120,0.848177,0.771375,0.802613,0.917529
2-Mar,0.693426,0.844876,0.785941,1.000000,0.543396,0.543396,0.785941,0.844876,0.667074,0.667074,...,0.319562,0.422438,0.422438,0.543396,0.319562,0.506494,0.639124,0.639124,0.667074,0.806435
5-Mar,0.627555,0.508398,0.727812,0.696124,0.361520,0.550656,0.272961,0.439205,0.529879,0.414353,...,0.131568,0.679665,0.645406,0.570774,0.529879,0.439205,0.000000,0.046958,0.361520,0.609188
5-Mar,0.476296,0.754911,0.648986,0.609710,0.425373,0.172691,0.310581,0.568052,0.425373,0.370371,...,0.245089,0.786877,0.648986,0.476296,0.721385,0.754911,0.370371,0.000000,0.476296,0.609710
5-Mar,0.707897,0.707897,0.844690,0.812370,0.255142,0.501703,0.313887,0.076967,0.486143,0.561412,...,0.386745,0.963586,0.855213,0.720115,0.421098,0.767411,0.000000,0.421098,0.470312,0.546848
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Zyg11b,0.960081,0.908043,1.000000,0.000000,0.728491,0.811680,0.826767,0.747536,0.799123,0.846847,...,0.903335,0.855110,0.832623,0.786096,0.852378,0.898562,0.871072,0.914989,0.868462,0.832623
Zyx,0.700168,0.706865,0.703533,0.285593,0.982704,0.791586,0.741518,1.000000,0.764681,0.726201,...,0.706865,0.620563,0.806767,0.946886,0.653113,0.408540,0.756178,0.347859,0.589291,0.000000
Zzef1,0.725515,0.552026,0.636691,0.000000,0.766332,0.766332,0.680116,0.813408,0.743514,0.725515,...,1.000000,0.808440,0.737603,0.798316,0.787933,0.813408,0.793158,0.803409,0.636691,0.315900
Zzz3,0.750000,0.780846,0.883124,0.000000,0.869512,0.891921,0.845617,0.835598,0.904727,0.891921,...,0.803753,1.000000,0.840643,0.855366,0.900509,0.908897,0.786710,0.792481,0.786710,0.798161


In [166]:
def identify_significant_genes(df):
    
    e18_all= df[18]
    p60_all = df[81]['SAL']

    pvals = np.zeros(shape = len(df))
    sig = np.zeros(shape = len(df))
    logdiff = np.zeros(shape = len(df))

    for row in range(len(df)):
        pvals[row] = stats.ttest_ind(e18_all.iloc[row], p60_all.iloc[row])[1]
        sig[row] = stats.ttest_ind(e18_all.iloc[row], p60_all.iloc[row])[0]
        logdiff[row] = np.log2(np.mean(p60_all.iloc[row])/np.mean(e18_all.iloc[row]))
    df['pvals'] = pvals
    df['sig'] = sig
    df['logdiff'] = logdiff

    direction = [0] * len(df)
    for row in range(len(df)):
        if df['pvals'][row] < 0.05:
            if df['logdiff'][row] > 0:
                direction[row] = 'UP'
            else:
                direction[row] = 'DOWN'
        else:
            direction[row] = 'N/A'
    df['direction'] = direction
    

    return df

In [167]:
%%time

new_df = identify_significant_genes(scaled_df)

Wall time: 45.1 s


In [168]:
new_df

Unnamed: 0_level_0,18,18,18,18,35,35,35,35,35,35,...,81,81,81,81,81,81,pvals,sig,logdiff,direction
Unnamed: 0_level_1,SAL,SAL,SAL,SAL,SAL,SAL,SAL,SAL,SAL,SAL,...,LPS,SAL,SAL,SAL,SAL,SAL,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Unnamed: 0_level_2,F,F,F,F,F,F,F,F,F,F,...,M,M,M,M,M,M,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
Unnamed: 0_level_3,F_E18 1,F_E18 2,F_E18 3,F_E18 4,F_P14 1,F_P14 2,F_P14 3,F_P14 4,F_P14 5,F_P14 6,...,M_P60_LPS 6,M_P60_Sal 1,M_P60_Sal 2,M_P60_Sal 3,M_P60_Sal 4,M_P60_Sal 5,Unnamed: 18_level_3,Unnamed: 19_level_3,Unnamed: 20_level_3,Unnamed: 21_level_3
1-Mar,0.509899,0.542985,0.518782,0.000000,0.480139,0.687550,0.680888,0.643093,0.564216,0.610673,...,0.941732,0.817120,0.848177,0.771375,0.802613,0.917529,0.000001,-7.297913,1.016963,UP
2-Mar,0.693426,0.844876,0.785941,1.000000,0.543396,0.543396,0.785941,0.844876,0.667074,0.667074,...,0.319562,0.506494,0.639124,0.639124,0.667074,0.806435,0.000290,4.539842,-0.524563,DOWN
5-Mar,0.627555,0.508398,0.727812,0.696124,0.361520,0.550656,0.272961,0.439205,0.529879,0.414353,...,0.529879,0.439205,0.000000,0.046958,0.361520,0.609188,0.006317,3.113616,-0.716321,DOWN
5-Mar,0.476296,0.754911,0.648986,0.609710,0.425373,0.172691,0.310581,0.568052,0.425373,0.370371,...,0.721385,0.754911,0.370371,0.000000,0.476296,0.609710,0.505582,0.680126,-0.187489,
5-Mar,0.707897,0.707897,0.844690,0.812370,0.255142,0.501703,0.313887,0.076967,0.486143,0.561412,...,0.421098,0.767411,0.000000,0.421098,0.470312,0.546848,0.000092,5.081352,-0.972344,DOWN
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Zyg11b,0.960081,0.908043,1.000000,0.000000,0.728491,0.811680,0.826767,0.747536,0.799123,0.846847,...,0.852378,0.898562,0.871072,0.914989,0.868462,0.832623,0.258158,-1.169963,0.203813,
Zyx,0.700168,0.706865,0.703533,0.285593,0.982704,0.791586,0.741518,1.000000,0.764681,0.726201,...,0.653113,0.408540,0.756178,0.347859,0.589291,0.000000,0.077982,1.875668,-0.432397,
Zzef1,0.725515,0.552026,0.636691,0.000000,0.766332,0.766332,0.680116,0.813408,0.743514,0.725515,...,0.787933,0.813408,0.793158,0.803409,0.636691,0.315900,0.022372,-2.512414,0.546326,UP
Zzz3,0.750000,0.780846,0.883124,0.000000,0.869512,0.891921,0.845617,0.835598,0.904727,0.891921,...,0.900509,0.908897,0.786710,0.792481,0.786710,0.798161,0.083359,-1.839584,0.311750,


In [169]:
def remove_unsignificant_rows(df):
    df.reset_index(inplace = True)
    for row in range(len(df)):
        if df['direction'][row] == 'N/A':
            df.drop(row, inplace = True)  
    df.reset_index(drop=True, inplace=True)
    return df

In [170]:
%%time

df_cleaned = remove_unsignificant_rows(new_df)

Wall time: 44.4 s


In [171]:
np.unique(df_cleaned['direction'], return_counts = True)

(array(['DOWN', 'UP'], dtype=object), array([4152, 2836], dtype=int64))

In [172]:
down_genes = df_cleaned['index'][df_cleaned['direction'] == 'DOWN'].to_list()
up_genes = df_cleaned['index'][df_cleaned['direction'] == 'UP'].to_list()

In [173]:
z = np.unique(down_genes, return_counts = True)

In [174]:
def generate_index(df):
    
    samples = df.columns[1:-4]
    
    index_per_sample = [0] * len(samples)
    i = -1
    
    for sample in samples:
        i = i + 1
        index_per_sample[i] = np.mean(df[sample][df['direction'] == 'UP']) / np.mean(df[sample][df['direction'] == 'DOWN'])
        
    return index_per_sample

In [175]:
final_df = pd.DataFrame([generate_index(df_cleaned)], columns = df_cleaned.columns[1:-4])
final_df

Unnamed: 0_level_0,18,18,18,18,35,35,35,35,35,35,...,81,81,81,81,81,81,81,81,81,81
Unnamed: 0_level_1,SAL,SAL,SAL,SAL,SAL,SAL,SAL,SAL,SAL,SAL,...,LPS,LPS,LPS,LPS,LPS,SAL,SAL,SAL,SAL,SAL
Unnamed: 0_level_2,F,F,F,F,F,F,F,F,F,F,...,M,M,M,M,M,M,M,M,M,M
Unnamed: 0_level_3,F_E18 1,F_E18 2,F_E18 3,F_E18 4,F_P14 1,F_P14 2,F_P14 3,F_P14 4,F_P14 5,F_P14 6,...,M_P60_LPS 2,M_P60_LPS 3,M_P60_LPS 4,M_P60_LPS 5,M_P60_LPS 6,M_P60_Sal 1,M_P60_Sal 2,M_P60_Sal 3,M_P60_Sal 4,M_P60_Sal 5
0,0.646832,0.501804,0.640278,0.192485,1.295004,1.294625,1.209596,1.297525,1.062103,1.251147,...,1.774084,2.25097,1.717807,1.725162,1.935537,1.675991,1.774628,1.471903,1.531806,1.601751


In [176]:
## scale data to between 0 and 1
final_df_scaled = pd.DataFrame(final_df.iloc[0] - np.min(final_df.iloc[0])) / (np.max(final_df.iloc[0] - np.min(final_df.iloc[0])))
final_df_scaled.reset_index(inplace = True)

In [179]:
final_df_scaled.columns = ['age', 'tx', 'sex', 'sample_name', 'index']

In [180]:
final_df_scaled.to_csv('C:\\Users\\Ben\\Dropbox\\bilbo_lab_spr2020\\microglia-seq_website\\microglia-seq\\mdi\\all_data_index.csv')