In [1]:
import os

import numpy as np
import pandas as pd

Read in csv files: the training data along with its general and specific labels.

In [2]:
# Load the two label tables and the training matrix, in that order.
general_labels, specific_labels, train = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/ref_data_labels_general.csv',
        'data/ref_data_labels_specific.csv',
        'data/ref_data.csv',
    )
)

The following code block lists the distinct general and specific labels that occur in the two label files.

In [3]:
# Distinct general cell-type labels, in sorted order.
generals = sorted(set(general_labels['x']))
print(generals)
# Distinct specific cell-type labels, in sorted order.
specifics = sorted(set(specific_labels['x']))
print(specifics)

['astrocyte', 'endothelial cell', 'microglial cell', 'neuron', 'oligodendrocyte']
['astrocyte', 'brain microvascular endothelial cell', 'excitatory neuron', 'inhibitory neuron', 'microglial cell', 'oligodendrocyte', 'oligodendrocyte precursor cell']


The following for loop adds each of the cell labels (i.e. the sequence of DNA the cell is tagged with) to a list called cols. I popped off the first column because it was just an unnamed empty column.

In [4]:
# Column names of the training data, in their original order.
cols = list(train.columns)
# Discard the first entry so that cols[i] lines up with row i of the label
# tables; the popped name is shown as this cell's output.
cols.pop(0)

'Unnamed: 0'

The function **generalRemovals** takes in **name** as a parameter and returns a DataFrame with all of the cells of type **name** removed from the training data. I used the **drop()** function to remove the columns with the cell labels stored in **removals**.

In [5]:
def generalRemovals(name: str):
    """Return the training data without the cells whose general label is ``name``.

    Row ``i`` of ``general_labels`` is matched to the training column
    ``cols[i]``, so ``cols`` must already hold the column names of ``train``
    with the leading unnamed column removed.

    Parameters
    ----------
    name : str
        General cell-type label to remove, e.g. ``'astrocyte'``.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``train`` itself is not modified. When no label equals
        ``name`` the result has the same columns as ``train``.
    """
    # One vectorised comparison instead of a per-row ``labels['x'][i]`` lookup.
    matches = (general_labels['x'] == name).to_numpy()
    # The index labels of the matching rows double as positions into ``cols``.
    removals = [cols[i] for i in general_labels.index[matches]]
    return train.drop(columns=removals)

In [6]:
# ASTROCYTE - general: training data with every astrocyte-labelled column dropped.
astrocyte_general_train = generalRemovals('astrocyte')

In [7]:
# ENDOTHELIAL CELL - general: training data with every endothelial-cell column dropped.
endothelialcell_general_train = generalRemovals('endothelial cell')

In [8]:
# MICROGLIAL CELL - general: training data with every microglial-cell column dropped.
microglialcell_general_train = generalRemovals('microglial cell')

In [9]:
# NEURON - general: training data with every neuron-labelled column dropped.
neuron_general_train = generalRemovals('neuron')

In [10]:
# OLIGODENDROCYTE - general: training data with every oligodendrocyte column dropped.
oligodendrocyte_general_train = generalRemovals('oligodendrocyte')

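This is a sanity check: the column counts printed below are compared with the number of general labels that remain after the corresponding class is removed. Each count should be exactly one higher than the matching label count, because the training data still contains the extra `Unnamed: 0` column.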

In [11]:
# Number of columns left in each reduced training frame, one per line,
# in the order astrocyte, endothelial cell, microglial cell, neuron, oligodendrocyte.
print(
    *(
        reduced.shape[1]
        for reduced in (
            astrocyte_general_train,
            endothelialcell_general_train,
            microglialcell_general_train,
            neuron_general_train,
            oligodendrocyte_general_train,
        )
    ),
    sep='\n',
)

3027
3249
3273
550
3066


The modified training data can now be exported as csv files.

In [153]:
# Output path -> reduced training frame (general labels).
general_train_exports = {
    'general_train/astrocyte_general_train.csv': astrocyte_general_train,
    'general_train/endothelialcell_general_train.csv': endothelialcell_general_train,
    'general_train/microglialcell_general_train.csv': microglialcell_general_train,
    'general_train/neuron_general_train.csv': neuron_general_train,
    'general_train/oligodendrocyte_general_train.csv': oligodendrocyte_general_train,
}
# to_csv does not create missing directories, so make sure the target exists
# before writing (a fresh checkout would otherwise fail here).
os.makedirs('general_train', exist_ok=True)
for out_path, frame in general_train_exports.items():
    frame.to_csv(out_path, index=False)

Similar to **generalRemovals**, the function **specificRemovals** takes in **name** as a parameter and returns a DataFrame with all of the cells of type **name** removed from the training data. I used the **drop()** function to remove the columns with the cell labels stored in **removals**.

In [12]:
def specificRemovals(name: str):
    """Return the training data without the cells whose specific label is ``name``.

    Row ``i`` of ``specific_labels`` is matched to the training column
    ``cols[i]``, so ``cols`` must already hold the column names of ``train``
    with the leading unnamed column removed.

    Parameters
    ----------
    name : str
        Specific cell-type label to remove, e.g. ``'excitatory neuron'``.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``train`` itself is not modified. When no label equals
        ``name`` the result has the same columns as ``train``.
    """
    # One vectorised comparison instead of a per-row ``labels['x'][i]`` lookup.
    matches = (specific_labels['x'] == name).to_numpy()
    # The index labels of the matching rows double as positions into ``cols``.
    removals = [cols[i] for i in specific_labels.index[matches]]
    return train.drop(columns=removals)

In [13]:
# ASTROCYTE - specific: training data with every astrocyte-labelled column dropped.
astrocyte_specific_train = specificRemovals('astrocyte')

In [14]:
# BRAIN MICROVASCULAR ENDOTHELIAL CELL - specific: training data with those columns dropped.
brainmicrovascularendothelialcell_specific_train = specificRemovals('brain microvascular endothelial cell')

In [15]:
# EXCITATORY NEURON - specific: training data with every excitatory-neuron column dropped.
excitatoryneuron_specific_train = specificRemovals('excitatory neuron')

In [16]:
# INHIBITORY NEURON - specific: training data with every inhibitory-neuron column dropped.
inhibitoryneuron_specific_train = specificRemovals('inhibitory neuron')

In [17]:
# MICROGLIAL CELL - specific: training data with every microglial-cell column dropped.
microglialcell_specific_train = specificRemovals('microglial cell')

In [18]:
# OLIGODENDROCYTE - specific: training data with every oligodendrocyte column dropped.
oligodendrocyte_specific_train = specificRemovals('oligodendrocyte')

In [19]:
# OLIGODENDROCYTE PRECURSOR CELL - specific: training data with those columns dropped.
oligodendrocyteprecursorcell_specific_train = specificRemovals('oligodendrocyte precursor cell')

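This is a sanity check: the column counts printed below are compared with the number of specific labels that remain after the corresponding class is removed. Each count should be exactly one higher than the matching label count, because the training data still contains the extra `Unnamed: 0` column.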

In [20]:
# Sanity check: number of columns left in each reduced training frame, one per
# line, in the same order as the specific labels were processed above.
print(
    *(
        reduced.shape[1]
        for reduced in (
            astrocyte_specific_train,
            brainmicrovascularendothelialcell_specific_train,
            excitatoryneuron_specific_train,
            inhibitoryneuron_specific_train,
            microglialcell_specific_train,
            oligodendrocyte_specific_train,
            oligodendrocyteprecursorcell_specific_train,
        )
    ),
    sep='\n',
)

3027
3249
800
3041
3273
3176
3181


The modified training data can now be exported as csv files.

In [31]:
# Output path -> reduced training frame (specific labels).
specific_train_exports = {
    'specific_train/astrocyte_specific_train.csv': astrocyte_specific_train,
    'specific_train/brainmicrovascularendothelialcell_specific_train.csv': brainmicrovascularendothelialcell_specific_train,
    'specific_train/excitatoryneuron_specific_train.csv': excitatoryneuron_specific_train,
    'specific_train/inhibitoryneuron_specific_train.csv': inhibitoryneuron_specific_train,
    'specific_train/microglialcell_specific_train.csv': microglialcell_specific_train,
    'specific_train/oligodendrocyte_specific_train.csv': oligodendrocyte_specific_train,
    'specific_train/oligodendrocyteprecursorcell_specific_train.csv': oligodendrocyteprecursorcell_specific_train,
}
# to_csv does not create missing directories, so make sure the target exists
# before writing (a fresh checkout would otherwise fail here).
os.makedirs('specific_train', exist_ok=True)
for out_path, frame in specific_train_exports.items():
    frame.to_csv(out_path, index=False)

Now that we have removed the necessary cells from the training data, we must also remove them from the associated labels. This is accomplished with the function **drop_general**, which takes **name** as a parameter and returns the general labels with all the labels of type **name** removed.

In [22]:
def drop_general(name: str):
    """Return the general labels without the rows whose label is ``name``.

    Parameters
    ----------
    name : str
        General cell-type label to remove, e.g. ``'neuron'``.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the remaining rows of ``general_labels`` with
        their original index values; ``general_labels`` is not modified.
    """
    # A single boolean mask replaces the per-row ``general['x'][i]`` loop.
    keep = general_labels['x'] != name
    return general_labels.loc[keep]

In [23]:
# One reduced label table per general class, built in the order the classes are listed.
(
    astrocyte_general_labels,
    endothelialcell_general_labels,
    microglialcell_general_labels,
    neuron_general_labels,
    oligodendrocyte_general_labels,
) = (
    drop_general(cell_type)
    for cell_type in (
        'astrocyte',
        'endothelial cell',
        'microglial cell',
        'neuron',
        'oligodendrocyte',
    )
)

We use these values in the sanity check that was mentioned earlier.

In [24]:
# Number of label rows left in each reduced general label table, one per line.
print(
    *(
        remaining.shape[0]
        for remaining in (
            astrocyte_general_labels,
            endothelialcell_general_labels,
            microglialcell_general_labels,
            neuron_general_labels,
            oligodendrocyte_general_labels,
        )
    ),
    sep='\n',
)

3026
3248
3272
549
3065


Export all the DataFrames to csv files.

In [25]:
# Output path -> reduced general label table.
# NOTE(review): the last file name is plural ("oligodendrocytes_...") unlike
# every other export in this notebook; kept as-is because readers of these
# files are not visible here - confirm before renaming.
general_label_exports = {
    'general_labels/astrocyte_general_labels.csv': astrocyte_general_labels,
    'general_labels/endothelialcell_general_labels.csv': endothelialcell_general_labels,
    'general_labels/microglialcell_general_labels.csv': microglialcell_general_labels,
    'general_labels/neuron_general_labels.csv': neuron_general_labels,
    'general_labels/oligodendrocytes_general_labels.csv': oligodendrocyte_general_labels,
}
# to_csv does not create missing directories, so make sure the target exists
# before writing (a fresh checkout would otherwise fail here).
os.makedirs('general_labels', exist_ok=True)
for out_path, frame in general_label_exports.items():
    frame.to_csv(out_path, index=False)

The function **drop_specific** does the same for the specific labels: it takes **name** as a parameter and returns the specific labels with all the labels of type **name** removed.

In [26]:
def drop_specific(name: str):
    """Return the specific labels without the rows whose label is ``name``.

    Parameters
    ----------
    name : str
        Specific cell-type label to remove, e.g. ``'inhibitory neuron'``.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the remaining rows of ``specific_labels`` with
        their original index values; ``specific_labels`` is not modified.
    """
    # A single boolean mask replaces the per-row ``specific['x'][i]`` loop.
    keep = specific_labels['x'] != name
    return specific_labels.loc[keep]

In [27]:
# One reduced label table per specific class, built in the order the classes are listed.
(
    astrocyte_specific_labels,
    brainmicrovascularendothelialcell_specific_labels,
    excitatoryneuron_specific_labels,
    inhibitoryneuron_specific_labels,
    microglialcell_specific_labels,
    oligodendrocyte_specific_labels,
    oligodendrocyteprecursorcell_specific_labels,
) = (
    drop_specific(cell_type)
    for cell_type in (
        'astrocyte',
        'brain microvascular endothelial cell',
        'excitatory neuron',
        'inhibitory neuron',
        'microglial cell',
        'oligodendrocyte',
        'oligodendrocyte precursor cell',
    )
)

We use these values in the sanity check that was mentioned earlier.

In [28]:
# Number of label rows left in each reduced specific label table, one per line.
print(
    *(
        remaining.shape[0]
        for remaining in (
            astrocyte_specific_labels,
            brainmicrovascularendothelialcell_specific_labels,
            excitatoryneuron_specific_labels,
            inhibitoryneuron_specific_labels,
            microglialcell_specific_labels,
            oligodendrocyte_specific_labels,
            oligodendrocyteprecursorcell_specific_labels,
        )
    ),
    sep='\n',
)

3026
3248
799
3040
3272
3175
3180


Export all the DataFrames to csv files.

In [30]:
# Export to csv files: output path -> reduced specific label table.
specific_label_exports = {
    'specific_labels/astrocyte_specific_labels.csv': astrocyte_specific_labels,
    'specific_labels/brainmicrovascularendothelialcell_specific_labels.csv': brainmicrovascularendothelialcell_specific_labels,
    'specific_labels/excitatoryneuron_specific_labels.csv': excitatoryneuron_specific_labels,
    'specific_labels/inhibitoryneuron_specific_labels.csv': inhibitoryneuron_specific_labels,
    'specific_labels/microglialcell_specific_labels.csv': microglialcell_specific_labels,
    'specific_labels/oligodendrocyte_specific_labels.csv': oligodendrocyte_specific_labels,
    'specific_labels/oligodendrocyteprecursorcell_specific_labels.csv': oligodendrocyteprecursorcell_specific_labels,
}
# to_csv does not create missing directories, so make sure the target exists
# before writing (a fresh checkout would otherwise fail here).
os.makedirs('specific_labels', exist_ok=True)
for out_path, frame in specific_label_exports.items():
    frame.to_csv(out_path, index=False)