# Loading in and processing data from Mark Alter's Matlab file

In [1]:
import numpy as np
import pandas as pd
import scipy.stats as stats
import scipy.io as scp

In [2]:
### Load the MATLAB struct file provided by Dr. Alter

# NOTE(review): absolute, machine-specific path — the notebook only runs where this drive layout exists
mat_file_path = 'E:\\DATA\\microglia_sequencing\\mdi\\BLBO_NxtGen_Analysis_paper_Matlab\\MGLA_IDX_STRUCT_DATAUNIQUE.mat'
mgla_rpkm = scp.loadmat(mat_file_path)

In [3]:
### Pull the individual fields out of the (very messy) nested mat table

# The struct record sits behind two [0] indexes; unwrap it once and reuse it
mat_struct = mgla_rpkm['MGLA_IDX_struct_dataunique'][0][0]

data, probes, headers = mat_struct[0], mat_struct[1], mat_struct[3]
data_unique = mat_struct[9]
# Field 10 carries one extra outer dimension, stripped here
probes_unique = mat_struct[10][0]

In [4]:
### Build the per-sample labels (age, treatment, sex, sample name) from the
### 'header' field of the original mat file

# Size everything from the header itself instead of assuming exactly 60 samples
n_samples = len(headers[0])

samples = [headers[0][i][0] for i in range(n_samples)]
raw_ages = [headers[1][i][0][0] for i in range(n_samples)]

### rename ages to useful strings; codes without a label are left unchanged
AGE_LABELS = {18: 'E18', 25: 'P4', 35: 'P14', 81: 'P60'}
age = [
    # equality test (not a dict lookup) so the raw value's type does not matter
    next((label for code, label in AGE_LABELS.items() if raw_age == code), raw_age)
    for raw_age in raw_ages
]

# Treatment: a sample whose name contains 'LPS' is LPS, everything else is SAL
tx = ['LPS' if 'LPS' in sample else 'SAL' for sample in samples]

# Sex: a sample whose name contains 'M' is M, everything else is F
sex = ['M' if 'M' in sample else 'F' for sample in samples]

In [5]:
### log2-transform every value in both expression matrices

mgla_devel_data_unique, mgla_devel_data = (
    np.log2(matrix) for matrix in (data_unique, data)
)

In [6]:
### Unwrap the probes (gene names) so that they can be used as the dataframe index

# Each entry of `probes` is nested two levels deep, each entry of `probes_unique` one level
corrected_probes = [probe[0][0] for probe in probes]
corrected_probes_unique = [probe[0] for probe in probes_unique]

In [7]:
## Build the dataframes: gene names as the row index and a four-level column
## label (age, treatment, sex, sample name) for every sample

column_levels = [age, tx, sex, samples]

df_unique = pd.DataFrame(
    mgla_devel_data_unique,
    index = corrected_probes_unique,
    columns = column_levels,
)

df = pd.DataFrame(mgla_devel_data, index = corrected_probes, columns = column_levels)

In [8]:
## Per-column normalization (each column divided by its own mean).
## EXCLUDED for now: it doesn't seem to be used downstream.

## for col in df:
    ## df[col] = df[col] / df[col].mean()

In [9]:
### Min-max scale each gene across samples:
###   (expression value of the gene in a sample - minimum expression for the gene across all samples)
###   / (maximum expression for the gene across all samples - minimum expression for the gene across all samples)
### This scales all values so that every gene adds equal weight to the index.

def scale_expression(df):
    """Min-max scale every row of ``df`` to the range [0, 1].

    Each value becomes ``(value - row_min) / (row_max - row_min)``, so the
    smallest value in a row maps to 0 and the largest to 1.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric frame with one row per gene and one column per sample.
        It is not modified.

    Returns
    -------
    pandas.DataFrame
        New frame with the same index and columns as ``df``. A row whose
        values are all equal has a zero range and comes back as NaN.
    """
    row_min = df.min(axis=1)
    row_range = df.max(axis=1) - row_min

    # Whole-frame arithmetic aligned on the row index replaces the per-row
    # .iloc assignment loop; the resulting values are the same.
    return df.sub(row_min, axis=0).div(row_range, axis=0)

# LITERALLY THE ONLY CELL I CHANGED

In [10]:
%%time

# Min-max scale every gene of the unique-probe matrix
scaled_df = scale_expression(df_unique)

Wall time: 2.91 s


In [11]:
# Display the scaled expression matrix
scaled_df

Unnamed: 0_level_0,E18,E18,E18,E18,P14,P14,P14,P14,P14,P14,...,P60,P60,P60,P60,P60,P60,P60,P60,P60,P60
Unnamed: 0_level_1,SAL,SAL,SAL,SAL,SAL,SAL,SAL,SAL,SAL,SAL,...,LPS,LPS,LPS,LPS,LPS,SAL,SAL,SAL,SAL,SAL
Unnamed: 0_level_2,F,F,F,F,F,F,F,F,F,F,...,M,M,M,M,M,M,M,M,M,M
Unnamed: 0_level_3,F_E18 1,F_E18 2,F_E18 3,F_E18 4,F_P14 1,F_P14 2,F_P14 3,F_P14 4,F_P14 5,F_P14 6,...,M_P60_LPS 2,M_P60_LPS 3,M_P60_LPS 4,M_P60_LPS 5,M_P60_LPS 6,M_P60_Sal 1,M_P60_Sal 2,M_P60_Sal 3,M_P60_Sal 4,M_P60_Sal 5
11-sep,0.978608,0.904136,0.810014,0.195134,0.371335,0.339284,0.502844,0.360920,0.410601,0.524856,...,0.225276,0.360920,0.596966,0.195134,0.195134,0.292369,0.253406,0.145454,0.179216,0.000000
2-mar,0.693426,0.844876,0.785941,1.000000,0.543396,0.543396,0.785941,0.844876,0.667074,0.667074,...,0.319562,0.422438,0.422438,0.543396,0.319562,0.506494,0.639124,0.639124,0.667074,0.806435
2-sep,0.744408,0.600041,0.645738,0.480035,0.309445,0.538756,0.614726,0.547960,0.555774,0.757948,...,0.000000,0.549267,0.285192,0.080841,0.267456,0.329721,0.256643,0.221508,0.401898,0.815006
6-mar,0.952675,0.901132,1.000000,0.000000,0.778327,0.802376,0.828689,0.719291,0.711469,0.742141,...,0.753236,0.805731,0.760517,0.657877,0.564321,0.828689,0.802376,0.856928,0.742141,0.533027
6-sep,0.867687,0.997259,0.838877,0.569122,0.621099,0.484831,0.569122,0.583126,0.521897,0.583126,...,0.389584,0.389584,0.000000,0.237336,0.324489,0.583126,0.441561,0.596412,0.504027,0.416874
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
zswim1,0.897712,0.698555,0.627474,0.758654,0.627474,0.627474,0.428317,0.540476,0.968794,0.758654,...,0.000000,0.758654,0.000000,0.270238,0.540476,0.698555,0.540476,0.698555,0.540476,0.270238
zw10,0.901151,0.736808,0.942246,0.653809,0.415409,0.374314,0.374314,0.374314,0.526837,0.623999,...,0.491479,0.526837,0.454395,0.560623,0.592971,0.787497,0.710136,0.653809,0.653809,0.736808
zwint,0.878675,0.980440,0.792424,1.000000,0.578322,0.733345,0.401169,0.520281,0.504763,0.667161,...,0.223361,0.472349,0.655314,0.564375,0.319408,0.381838,0.073559,0.455398,0.361804,0.690117
zyx,0.700168,0.706865,0.703533,0.285593,0.982704,0.791586,0.741518,1.000000,0.764681,0.726201,...,0.706865,0.620563,0.806767,0.946886,0.653113,0.408540,0.756178,0.347859,0.589291,0.000000


In [12]:
def identify_significant_genes(df):
    """Flag genes whose expression differs between E18 and saline-treated P60 samples.

    For every row an independent two-sample t-test (``scipy.stats.ttest_ind``)
    compares the ``'E18'`` columns with the ``'P60'`` / ``'SAL'`` columns.

    Note that ``df`` is modified in place (four columns are appended) and the
    same object is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per gene. The column labels must support ``df['E18']`` and
        ``df['P60']['SAL']``.

    Returns
    -------
    pandas.DataFrame
        ``df`` with these columns added:

        * ``pvals``     - p-value of the t-test
        * ``sig``       - t statistic of the same test
        * ``logdiff``   - ``log2(mean(P60 SAL) / mean(E18))``
        * ``direction`` - ``'UP'`` when ``pvals < 0.05`` and ``logdiff > 0``,
          ``'DOWN'`` when ``pvals < 0.05`` otherwise, and ``'N/A'`` when the
          p-value is not below 0.05 (including NaN p-values)
    """
    e18_all = df['E18']
    p60_all = df['P60']['SAL']

    # One call with axis=1 tests every row at once, instead of running the
    # test twice per row to fetch the statistic and the p-value separately.
    t_stat, p_val = stats.ttest_ind(e18_all.to_numpy(), p60_all.to_numpy(), axis=1)
    log_ratio = np.log2(
        p60_all.mean(axis=1).to_numpy() / e18_all.mean(axis=1).to_numpy()
    )

    df['pvals'] = p_val
    df['sig'] = t_stat
    df['logdiff'] = log_ratio

    # Comparisons against NaN are False, so undefined p-values end up 'N/A'
    # and an undefined logdiff on a significant row ends up 'DOWN'.
    df['direction'] = np.where(
        p_val < 0.05,
        np.where(log_ratio > 0, 'UP', 'DOWN'),
        'N/A',
    )

    return df

In [13]:
%%time

# Adds the pvals / sig / logdiff / direction columns. The function writes them
# into scaled_df itself and returns it, so new_df and scaled_df are the same frame.
new_df = identify_significant_genes(scaled_df)

Wall time: 9.17 s


In [14]:
# Display the matrix together with the appended test-result columns
new_df

Unnamed: 0_level_0,E18,E18,E18,E18,P14,P14,P14,P14,P14,P14,...,P60,P60,P60,P60,P60,P60,pvals,sig,logdiff,direction
Unnamed: 0_level_1,SAL,SAL,SAL,SAL,SAL,SAL,SAL,SAL,SAL,SAL,...,LPS,SAL,SAL,SAL,SAL,SAL,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Unnamed: 0_level_2,F,F,F,F,F,F,F,F,F,F,...,M,M,M,M,M,M,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
Unnamed: 0_level_3,F_E18 1,F_E18 2,F_E18 3,F_E18 4,F_P14 1,F_P14 2,F_P14 3,F_P14 4,F_P14 5,F_P14 6,...,M_P60_LPS 6,M_P60_Sal 1,M_P60_Sal 2,M_P60_Sal 3,M_P60_Sal 4,M_P60_Sal 5,Unnamed: 18_level_3,Unnamed: 19_level_3,Unnamed: 20_level_3,Unnamed: 21_level_3
11-sep,0.978608,0.904136,0.810014,0.195134,0.371335,0.339284,0.502844,0.360920,0.410601,0.524856,...,0.195134,0.292369,0.253406,0.145454,0.179216,0.000000,1.274814e-06,7.284858,-2.456187,DOWN
2-mar,0.693426,0.844876,0.785941,1.000000,0.543396,0.543396,0.785941,0.844876,0.667074,0.667074,...,0.319562,0.506494,0.639124,0.639124,0.667074,0.806435,2.899801e-04,4.539842,-0.524563,DOWN
2-sep,0.744408,0.600041,0.645738,0.480035,0.309445,0.538756,0.614726,0.547960,0.555774,0.757948,...,0.267456,0.329721,0.256643,0.221508,0.401898,0.815006,1.891136e-02,2.594048,-0.525897,DOWN
6-mar,0.952675,0.901132,1.000000,0.000000,0.778327,0.802376,0.828689,0.719291,0.711469,0.742141,...,0.564321,0.828689,0.802376,0.856928,0.742141,0.533027,9.302331e-01,-0.088858,0.017394,
6-sep,0.867687,0.997259,0.838877,0.569122,0.621099,0.484831,0.569122,0.583126,0.521897,0.583126,...,0.324489,0.583126,0.441561,0.596412,0.504027,0.416874,2.682084e-05,5.685525,-0.684771,DOWN
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
zswim1,0.897712,0.698555,0.627474,0.758654,0.627474,0.627474,0.428317,0.540476,0.968794,0.758654,...,0.540476,0.698555,0.540476,0.698555,0.540476,0.270238,4.630963e-02,2.149232,-0.382143,DOWN
zw10,0.901151,0.736808,0.942246,0.653809,0.415409,0.374314,0.374314,0.374314,0.526837,0.623999,...,0.592971,0.787497,0.710136,0.653809,0.653809,0.736808,2.414654e-01,1.213674,-0.150302,
zwint,0.878675,0.980440,0.792424,1.000000,0.578322,0.733345,0.401169,0.520281,0.504763,0.667161,...,0.319408,0.381838,0.073559,0.455398,0.361804,0.690117,5.362963e-08,9.177731,-1.352568,DOWN
zyx,0.700168,0.706865,0.703533,0.285593,0.982704,0.791586,0.741518,1.000000,0.764681,0.726201,...,0.653113,0.408540,0.756178,0.347859,0.589291,0.000000,7.798187e-02,1.875668,-0.432397,


In [15]:
def remove_unsignificant_rows(df):
    """Drop every row whose ``'direction'`` is ``'N/A'`` and renumber the rest.

    ``df`` is modified in place and the same object is returned:

    1. the current index is moved into a regular column (pandas names it
       ``'index'`` when the index has no name),
    2. all rows marked ``'N/A'`` are removed,
    3. the index is reset to ``0..n-1``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``'direction'`` column.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, now holding only the significant rows.
    """
    df.reset_index(inplace = True)

    not_significant = (df['direction'] == 'N/A').to_numpy().ravel()
    # Remove all unwanted rows with one drop call; dropping them one at a time
    # inside a loop rebuilds the frame for every removed row.
    df.drop(index = df.index[not_significant], inplace = True)

    df.reset_index(drop = True, inplace = True)
    return df

In [16]:
%%time

# Drops the rows marked 'N/A'. new_df is modified in place and returned,
# so df_cleaned and new_df are the same frame.
df_cleaned = remove_unsignificant_rows(new_df)

Wall time: 7.49 s


In [17]:
# Count how many of the remaining genes are labelled DOWN vs UP
np.unique(df_cleaned['direction'], return_counts = True)

(array(['DOWN', 'UP'], dtype=object), array([1258,  617], dtype=int64))

In [18]:
# Names of the significant genes, split by direction of change
cleaned_gene_names = df_cleaned['index']
cleaned_directions = df_cleaned['direction']

down_genes = cleaned_gene_names[cleaned_directions == 'DOWN'].to_list()
up_genes = cleaned_gene_names[cleaned_directions == 'UP'].to_list()

In [19]:
# Look up Csf1r among the significant genes. The gene names shown in the
# displayed tables are lowercase (e.g. 'zyx', 'zwint'), so an exact match on
# 'Csf1r' can come back empty purely because of letter case; compare
# case-insensitively instead.
df_cleaned[df_cleaned['index'].str.lower() == 'Csf1r'.lower()]

Unnamed: 0_level_0,index,E18,E18,E18,E18,P14,P14,P14,P14,P14,...,P60,P60,P60,P60,P60,P60,pvals,sig,logdiff,direction
Unnamed: 0_level_1,Unnamed: 1_level_1,SAL,SAL,SAL,SAL,SAL,SAL,SAL,SAL,SAL,...,LPS,SAL,SAL,SAL,SAL,SAL,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Unnamed: 0_level_2,Unnamed: 1_level_2,F,F,F,F,F,F,F,F,F,...,M,M,M,M,M,M,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
Unnamed: 0_level_3,Unnamed: 1_level_3,F_E18 1,F_E18 2,F_E18 3,F_E18 4,F_P14 1,F_P14 2,F_P14 3,F_P14 4,F_P14 5,...,M_P60_LPS 6,M_P60_Sal 1,M_P60_Sal 2,M_P60_Sal 3,M_P60_Sal 4,M_P60_Sal 5,Unnamed: 18_level_3,Unnamed: 19_level_3,Unnamed: 20_level_3,Unnamed: 21_level_3


In [20]:
def generate_index(df):
    """Compute one index value per sample column.

    The sample columns are ``df.columns[1:-4]``, i.e. every column except the
    first one and the last four. For each of them the value is the mean over
    the rows labelled ``'UP'`` divided by the mean over the rows labelled
    ``'DOWN'`` in ``df['direction']``.

    Returns a list with one value per sample column, in column order.
    """
    up_rows = df['direction'] == 'UP'
    down_rows = df['direction'] == 'DOWN'

    return [
        np.mean(df[column][up_rows]) / np.mean(df[column][down_rows])
        for column in df.columns[1:-4]
    ]

In [21]:
# One-row frame holding the index value of every sample, labelled with the
# sample columns (everything except the first and the last four columns)
sample_columns = df_cleaned.columns[1:-4]
index_values = generate_index(df_cleaned)

final_df = pd.DataFrame([index_values], columns = sample_columns)
final_df

Unnamed: 0_level_0,E18,E18,E18,E18,P14,P14,P14,P14,P14,P14,...,P60,P60,P60,P60,P60,P60,P60,P60,P60,P60
Unnamed: 0_level_1,SAL,SAL,SAL,SAL,SAL,SAL,SAL,SAL,SAL,SAL,...,LPS,LPS,LPS,LPS,LPS,SAL,SAL,SAL,SAL,SAL
Unnamed: 0_level_2,F,F,F,F,F,F,F,F,F,F,...,M,M,M,M,M,M,M,M,M,M
Unnamed: 0_level_3,F_E18 1,F_E18 2,F_E18 3,F_E18 4,F_P14 1,F_P14 2,F_P14 3,F_P14 4,F_P14 5,F_P14 6,...,M_P60_LPS 2,M_P60_LPS 3,M_P60_LPS 4,M_P60_LPS 5,M_P60_LPS 6,M_P60_Sal 1,M_P60_Sal 2,M_P60_Sal 3,M_P60_Sal 4,M_P60_Sal 5
0,0.648589,0.5002,0.636448,0.205458,1.316009,1.301369,1.209574,1.314075,1.064893,1.269319,...,1.815599,2.329467,1.749386,1.752557,1.978583,1.652564,1.806533,1.449062,1.517751,1.602073


In [22]:
## scale data to between 0 and 1
index_row = final_df.iloc[0]
index_shifted = index_row - np.min(index_row)

# reset_index turns the column labels of final_df into ordinary columns
final_df_scaled = (pd.DataFrame(index_shifted) / np.max(index_shifted)).reset_index()

In [23]:
# Name the columns and write the per-sample index table to disk
# NOTE(review): absolute, user-specific output path
index_csv_path = 'C:\\Users\\Ben\\Dropbox\\bilbo_lab_spr2020\\microglia-seq_website\\microglia-seq\\mdi\\unique_data_index.csv'

final_df_scaled.columns = ['age', 'tx', 'sex', 'sample_name', 'index']
final_df_scaled.to_csv(index_csv_path)

In [31]:
# Table of the significant genes with their direction and log2 difference
# ("valence"), sorted from the largest valence to the smallest
# NOTE(review): absolute, user-specific output path
gene_list_csv_path = 'C:\\Users\\Ben\\Dropbox\\bilbo_lab_spr2020\\microglia-seq_website\\microglia-seq\\mdi\\unique_data_index_gene_list.csv'

regulated_genes = pd.DataFrame({
    'gene' : df_cleaned['index'],
    'direction' : df_cleaned['direction'],
    'valence' : df_cleaned['logdiff'],
})
regulated_genes = regulated_genes.sort_values(by = 'valence', ascending = False).set_index('gene')
regulated_genes.to_csv(gene_list_csv_path)