In [1]:
import os 
import numpy as np
import pandas as pd
import time as tm
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV

Read in the testing data that the SVM will classify.

In [2]:
# Expression matrix the SVM will classify; the first CSV column becomes the row index.
data = pd.read_csv(
    'data/test_fulldata.csv',
    sep=',',
    index_col=0,
)

# Displayed as the cell output: (n_rows, n_columns) of the loaded matrix.
data.shape

(27998, 24803)

The function **SVM_FullData_Function** is an adaptation of a program Christy provided that returns the SVM's predictions as a csv file. I turned it into a function so I wouldn't have to rerun the whole program and change the file paths by hand for every single class.

In [31]:
def _read_label_column(label_path):
    """Read a label CSV and return its second column (the cell labels, not the barcodes)."""
    return pd.read_csv(label_path, header=0, index_col=None).iloc[:, 1]


def _fit_calibrated_svm(features, labels):
    """Fit a LinearSVC wrapped in CalibratedClassifierCV so class probabilities are available."""
    clf = CalibratedClassifierCV(LinearSVC())
    clf.fit(features, labels)
    return clf


def _predict_with_rejection(clf, features, threshold):
    """Return the most probable class per sample, or 'Unknown' when its probability is below threshold."""
    proba = clf.predict_proba(features)
    # CalibratedClassifierCV.predict is the argmax over predict_proba, so a single
    # probability pass yields both the label and its confidence (the original ran
    # inference over the whole test matrix twice).
    best = np.argmax(proba, axis=1)
    # Copy to object dtype before relabelling: a fixed-width string array would
    # silently truncate 'Unknown' to the longest class name, and a numeric array
    # would refuse the assignment altogether.
    predicted = clf.classes_[best].astype(object)
    predicted[np.max(proba, axis=1) < threshold] = 'Unknown'
    return predicted


def SVM_FullData_Function(ref_data, ref_data_labels_general, ref_data_labels_specific,
                          SVM_Pred_Labels_general, SVM_Pred_Labels_specific,
                          test_data=None, threshold=0.7):
    """Train calibrated linear SVMs on a reference set and write predictions for the test set.

    Two classifiers are trained on the same reference matrix: first with the
    specific labels, then with the general labels. Each one classifies the test
    matrix, and any cell whose highest class probability is below ``threshold``
    is reported as ``'Unknown'``. Each set of predictions is written to its own
    CSV file (one row per test cell, no index column).

    Parameters
    ----------
    ref_data : str
        Path to the reference (training) CSV; its first column is used as the index.
        Per the original notes the file is laid out as rows = genes, columns = cells,
        so it is transposed before fitting.
    ref_data_labels_general, ref_data_labels_specific : str
        Paths to the label CSVs. The second column of each file is used as the labels.
    SVM_Pred_Labels_general, SVM_Pred_Labels_specific : str
        Output CSV paths for the general and specific predictions.
    test_data : pandas.DataFrame, optional
        Matrix to classify, in the same orientation as ``ref_data``. When omitted,
        the notebook-level ``data`` frame is used, as the function always did.
    threshold : float, optional
        Minimum class probability required to keep a prediction (default 0.7).

    Returns
    -------
    None
        Results are written to the two output files; progress is printed.
    """
    # Fall back to the notebook-level frame so existing five-argument calls keep working.
    test = data if test_data is None else test_data

    train = pd.read_csv(ref_data, index_col=0, sep=',')

    # scikit-learn expects (n_samples, n_features), hence the transpose; log1p is
    # applied to both matrices so they share the same scale.
    train = np.log1p(train.T)
    test = np.log1p(test.T)

    # --- Specific labels ---
    clf = _fit_calibrated_svm(train, _read_label_column(ref_data_labels_specific))
    print('Training done')

    specific_predictions = _predict_with_rejection(clf, test, threshold)
    print('Classification done')
    print(tm.time())

    pd.DataFrame(specific_predictions).to_csv(SVM_Pred_Labels_specific, index=False)

    # --- General labels (same procedure, different label file) ---
    clf = _fit_calibrated_svm(train, _read_label_column(ref_data_labels_general))
    print('Training done')
    print(tm.time())

    general_predictions = _predict_with_rejection(clf, test, threshold)
    pd.DataFrame(general_predictions).to_csv(SVM_Pred_Labels_general, index=False)

    print('done')



Each of the following cells passes in the training data and test data that have the specific classes removed, and outputs the SVM's predictions as csv files. For classes that are only general or only specific (not both), the function still writes a second output file that is not needed; I gave that dummy file a name starting with **DELETE**, for example **DELETEclassname_SVM_Pred_Labels_general.csv**, so I knew to delete it.

In [16]:
# ASTROCYTE - general and specific
# Both label levels are real here, so both output files are kept.
SVM_FullData_Function(
    ref_data='general_train/astrocyte_general_train.csv',
    ref_data_labels_general='general_labels/astrocyte_general_labels.csv',
    ref_data_labels_specific='specific_labels/astrocyte_specific_labels.csv',
    SVM_Pred_Labels_general='SVM_output/astrocyte_SVM_Pred_Labels_general.csv',
    SVM_Pred_Labels_specific='SVM_output/astrocyte_SVM_Pred_Labels_specific.csv',
)

Training done
Classification done
1678418317.5503428
Training done
1678418324.7567801
done


In [18]:
# BRAIN MICROVASCULAR ENDOTHELIAL CELL - specific
# The specific label file is passed for both label arguments; the "general"
# output is the throwaway file with the DELETE prefix.
SVM_FullData_Function(
    ref_data='specific_train/brainmicrovascularendothelialcell_specific_train.csv',
    ref_data_labels_general='specific_labels/brainmicrovascularendothelialcell_specific_labels.csv',
    ref_data_labels_specific='specific_labels/brainmicrovascularendothelialcell_specific_labels.csv',
    SVM_Pred_Labels_general='SVM_output/DELETEbrainmicrovascularendothelialcell_SVM_Pred_Labels_general.csv',
    SVM_Pred_Labels_specific='SVM_output/brainmicrovascularendothelialcell_SVM_Pred_Labels_specific.csv',
)

Training done
Classification done
1678418883.66164
Training done
1678418891.6141179
done


In [19]:
# ENDOTHELIAL CELL - general
# The general label file is passed for both label arguments; the "specific"
# output is the throwaway file with the DELETE prefix.
SVM_FullData_Function(
    ref_data='general_train/endothelialcell_general_train.csv',
    ref_data_labels_general='general_labels/endothelialcell_general_labels.csv',
    ref_data_labels_specific='general_labels/endothelialcell_general_labels.csv',
    SVM_Pred_Labels_general='SVM_output/endothelialcell_SVM_Pred_Labels_general.csv',
    SVM_Pred_Labels_specific='SVM_output/DELETEendothelialcell_SVM_Pred_Labels_specific.csv',
)

Training done
Classification done
1678419046.4458501
Training done
1678419053.99188
done


In [20]:
# EXCITATORY NEURON - specific
# The specific label file is passed for both label arguments; the "general"
# output is the throwaway file with the DELETE prefix.
SVM_FullData_Function(
    ref_data='specific_train/excitatoryneuron_specific_train.csv',
    ref_data_labels_general='specific_labels/excitatoryneuron_specific_labels.csv',
    ref_data_labels_specific='specific_labels/excitatoryneuron_specific_labels.csv',
    SVM_Pred_Labels_general='SVM_output/DELETEexcitatoryneuron_SVM_Pred_Labels_general.csv',
    SVM_Pred_Labels_specific='SVM_output/excitatoryneuron_SVM_Pred_Labels_specific.csv',
)

Training done
Classification done
1678419252.763104
Training done
1678419255.612958
done


In [22]:
# INHIBITORY NEURON - specific
# The specific label file is passed for both label arguments; the "general"
# output is the throwaway file with the DELETE prefix.
SVM_FullData_Function(
    ref_data='specific_train/inhibitoryneuron_specific_train.csv',
    ref_data_labels_general='specific_labels/inhibitoryneuron_specific_labels.csv',
    ref_data_labels_specific='specific_labels/inhibitoryneuron_specific_labels.csv',
    SVM_Pred_Labels_general='SVM_output/DELETEinhibitoryneuron_SVM_Pred_Labels_general.csv',
    SVM_Pred_Labels_specific='SVM_output/inhibitoryneuron_SVM_Pred_Labels_specific.csv',
)

Training done
Classification done
1678419518.5677838
Training done
1678419526.63181
done


In [23]:
# MICROGLIAL CELL - general and specific
# Both label levels are real here, so both output files are kept.
SVM_FullData_Function(
    ref_data='general_train/microglialcell_general_train.csv',
    ref_data_labels_general='general_labels/microglialcell_general_labels.csv',
    ref_data_labels_specific='specific_labels/microglialcell_specific_labels.csv',
    SVM_Pred_Labels_general='SVM_output/microglialcell_SVM_Pred_Labels_general.csv',
    SVM_Pred_Labels_specific='SVM_output/microglialcell_SVM_Pred_Labels_specific.csv',
)

Training done
Classification done
1678419707.076803
Training done
1678419715.42853
done


In [24]:
# NEURON - general
# The general label file is passed for both label arguments; the "specific"
# output is the throwaway file with the DELETE prefix.
SVM_FullData_Function(
    ref_data='general_train/neuron_general_train.csv',
    ref_data_labels_general='general_labels/neuron_general_labels.csv',
    ref_data_labels_specific='general_labels/neuron_general_labels.csv',
    SVM_Pred_Labels_general='SVM_output/neuron_SVM_Pred_Labels_general.csv',
    SVM_Pred_Labels_specific='SVM_output/DELETEneuron_SVM_Pred_Labels_specific.csv',
)

Training done
Classification done
1678420693.4461591
Training done
1678420695.847661
done


In [34]:
# OLIGODENDROCYTE - general
# The general label file is passed for both label arguments; the "specific"
# output is the throwaway file with the DELETE prefix.
# NOTE(review): the label file name is spelled "oligodendrocytes" (with an "s"),
# unlike the train/output names - confirm this matches the file on disk.
SVM_FullData_Function(
    ref_data='general_train/oligodendrocyte_general_train.csv',
    ref_data_labels_general='general_labels/oligodendrocytes_general_labels.csv',
    ref_data_labels_specific='general_labels/oligodendrocytes_general_labels.csv',
    SVM_Pred_Labels_general='SVM_output/oligodendrocyte_SVM_Pred_Labels_general.csv',
    SVM_Pred_Labels_specific='SVM_output/DELETEoligodendrocyte_SVM_Pred_Labels_specific.csv',
)

Training done
Classification done
1678500635.059972
Training done
1678500642.9794948
done


In [35]:
# OLIGODENDROCYTE - specific
# The specific label file is passed for both label arguments; the "general"
# output is the throwaway file with the DELETE prefix.
SVM_FullData_Function(
    ref_data='specific_train/oligodendrocyte_specific_train.csv',
    ref_data_labels_general='specific_labels/oligodendrocyte_specific_labels.csv',
    ref_data_labels_specific='specific_labels/oligodendrocyte_specific_labels.csv',
    SVM_Pred_Labels_general='SVM_output/DELETEoligodendrocyte_SVM_Pred_Labels_general.csv',
    SVM_Pred_Labels_specific='SVM_output/oligodendrocyte_SVM_Pred_Labels_specific.csv',
)

Training done
Classification done
1678500788.8971012
Training done
1678500796.8018408
done


In [36]:
# OLIGODENDROCYTE PRECURSOR CELL - specific
# The specific label file is passed for both label arguments; the "general"
# output is the throwaway file with the DELETE prefix.
SVM_FullData_Function(
    ref_data='specific_train/oligodendrocyteprecursorcell_specific_train.csv',
    ref_data_labels_general='specific_labels/oligodendrocyteprecursorcell_specific_labels.csv',
    ref_data_labels_specific='specific_labels/oligodendrocyteprecursorcell_specific_labels.csv',
    SVM_Pred_Labels_general='SVM_output/DELETEoligodendrocyteprecursorcell_SVM_Pred_Labels_general.csv',
    SVM_Pred_Labels_specific='SVM_output/oligodendrocyteprecursorcell_SVM_Pred_Labels_specific.csv',
)

Training done
Classification done
1678501824.5963252
Training done
1678501831.694295
done
