# Cleaning microglia bulk-seq data from [Hanamsagar 2017](https://onlinelibrary.wiley.com/doi/abs/10.1002/glia.23176). 
### - Data is accessible from the NCBI GEO database [GSE99622](https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE99622)
### - Microglia transcriptional profiles from E18, P4, P14, and P60 male and female mice treated w/ LPS or saline 

In [139]:
import numpy as np
import pandas as pd

df = pd.read_csv('H:\\DATA\\microglia_sequencing\\GSE99622_hanamsagar2017.csv')

In [140]:
df

Unnamed: 0,Gene,Transcript,Best,[[B1]]_tube2_E18_1.25.14_F,[[D1]]_tube4_E18_1.27.14_F,[[F1]]_tube6_E18_3.15.14_F,[[H1]]_tube8_E18_4.11.14_F,[[D3]]_tube20_P14_2.24.14_F-4,[[E3]]_tube21_P14_2.24.14_F-5,[[F3]]_tube22_P14_2.24.14_F-6,...,[[D8]]_tube60_P60_LPS_4.9.14_M-3,[[E8]]_tube61_P60_LPS_4.9.14_M-4,[[F8]]_tube62_P60_LPS_4.9.14_M-5,[[E9]]_tube69_P60_LPS_4.10.14_M-1,[[G9]]_tube71_P60_LPS_4.10.14_M-3,[[F5]]_tube38_P60_SAL_3.27.14_M-2,[[B6]]_tube42_P60_SAL_3.31.14_M-1,[[H6]]_tube48_P60_SAL_4.3.14_M-1,[[A7]]_tube49_P60_SAL_4.3.14_M-2,[[B7]]_tube50_P60_SAL_4.3.14_M-3
0,Zfp85-rs1,NM_001001130,1,7,18,9,0,4,5,1,...,4,1,1,11,5,6,19,10,17,6
1,Scap,NM_001001144,1,133,181,132,53,98,152,141,...,26,35,39,59,36,121,207,128,110,8
2,Zfp458,NM_001001152,1,20,13,14,0,5,9,4,...,9,6,5,9,7,18,22,7,10,5
3,Fbxo41,NM_001001160,1,41,15,29,7,1,3,7,...,0,1,1,0,21,2,4,0,1,0
4,Taf9b,NM_001001176,1,38,32,37,15,30,43,64,...,18,30,18,31,1,65,41,52,55,30
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28639,4933401P06Rik,NR_045505,1,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
28640,4933405E24Rik,NR_045506,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
28641,4933412O06Rik,NR_045507,1,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
28642,4933413L06Rik,NR_045508,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [141]:
%%time

for i in range(len(df)):
    if df['Best'][i] == 0:
        df.drop(i, axis = 0, inplace = True)

Wall time: 59.9 s


In [142]:
df.drop(columns = ['Transcript', 'Best'], inplace = True)

In [143]:
cols = ['gene', 'E18_F', 'E18_F', 'E18_F', 'E18_F', 'P14_F', 'P14_F', 'P14_F', 'P14_F', 'P14_F', 'P14_F', 'P14_F', 'P14_F', 'P14_F', 'P14_F',
                 'P4_F', 'P4_F', 'P4_F', 'P4_F', 'P60_F_LPS', 'P60_F_LPS', 'P60_F_LPS', 'P60_F_LPS', 'P60_F_LPS', 'P60_F_LPS', 'P60_F_LPS', 'P60_F_LPS', 'P60_F_SAL',
                          'P60_F_SAL', 'P60_F_SAL', 'P60_F_SAL', 'P60_F_SAL', 'P60_F_SAL', 'P60_F_SAL', 'E18_M', 'E18_M', 'E18_M', 'P14_M', 'P14_M', 
                          'P14_M', 'P14_M', 'P14_M', 'P14_M', 'P14_M', 'P14_M', 'P14_M', 'P14_M', 'P4_M', 'P4_M', 'P4_M', 'P4_M',
                        'P60_M_LPS', 'P60_M_LPS', 'P60_M_LPS', 'P60_M_LPS', 'P60_M_LPS', 'P60_M_LPS', 'P60_M_LPS', 'P60_M_SAL', 'P60_M_SAL', 'P60_M_SAL', 'P60_M_SAL', 'P60_M_SAL']

In [144]:
df.columns = cols

In [145]:
melted = df.melt(id_vars = 'gene')

In [146]:
melted.set_index('gene', inplace = True)

In [147]:
import re

keys = list(df1.columns[1:]) 

age = []

for i in range(len(keys)):
    e18 = re.search('E18', keys[i])
    p4 = re.search('P4', keys[i])
    p14 = re.search('P14', keys[i])
    p60 = re.search('P60.*SAL', keys[i])
    p60_LPS = re.search('P60.*LPS', keys[i])

    if e18:
        age = np.append(age, 'E18')
    if p4:
        age = np.append(age, 'P4')
    if p14:
        age = np.append(age, 'P14')
    if p60:
        age = np.append(age, 'P60')
    if p60_LPS:
        age = np.append(age, 'P60 + LPS')
        
        
d1 = dict(zip(keys,age))
        
melted['age'] = melted['variable'].map(d1)

In [148]:
sex = []

for i in range(len(keys)):
    m = re.search('M', keys[i])
    f = re.search('F', keys[i])

    if m:
        sex = np.append(sex, 'Male')
    if f:
        sex = np.append(sex, 'Female')
        
d2 = dict(zip(keys,sex))
        
melted['sex'] = melted['variable'].map(d2)

In [149]:
melted

Unnamed: 0_level_0,variable,value,age,sex
gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Zfp85-rs1,E18_F,7,E18,Female
Scap,E18_F,133,E18,Female
Zfp458,E18_F,20,E18,Female
Fbxo41,E18_F,41,E18,Female
Taf9b,E18_F,38,E18,Female
...,...,...,...,...
4933401P06Rik,P60_M_SAL,0,P60,Male
4933405E24Rik,P60_M_SAL,0,P60,Male
4933412O06Rik,P60_M_SAL,0,P60,Male
4933413L06Rik,P60_M_SAL,0,P60,Male


In [150]:
melted.drop(columns = ['variable'], inplace = True)

In [151]:
melted.rename(columns = {'value':'expression'}, inplace = True)

In [152]:
melted

Unnamed: 0_level_0,expression,age,sex
gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Zfp85-rs1,7,E18,Female
Scap,133,E18,Female
Zfp458,20,E18,Female
Fbxo41,41,E18,Female
Taf9b,38,E18,Female
...,...,...,...
4933401P06Rik,0,P60,Male
4933405E24Rik,0,P60,Male
4933412O06Rik,0,P60,Male
4933413L06Rik,0,P60,Male


In [133]:
melted.to_csv('H:\\DATA\\microglia_sequencing\\GSE99622_hanamsagar2017_cleaned_melted.csv')