# Tumor Classification Using One-vs-one Classification

The code below aims to classify 5 different classes of tumors based on cleaned tabular data with 800+ features. As there are very few samples for each class, here we adopted one-vs-one logistic regression with votings to do the classification.

### Import Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, LeaveOneOut
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.feature_selection import RFECV
from sklearn.externals import joblib
import os
os.chdir('./data')
print(os.getcwd())

/Users/andrewchen/Desktop/Tumor Classification (ML)/multiclass_classification/data




### Read data and split into X and Y

In [2]:
data = pd.read_excel('Features of IA subtypes.xlsx')
labels = ['Lepidic', 'Papillary', 'Acinar', 'Micropapillary', 'Solid']
X = data.drop('Unnamed: 0', axis = 1)
X_unlabeled = X.drop('IA_type', 1)
Y = data['IA_type']
X.to_csv('X.csv', header = True, index = False)
Y.to_csv('Y.csv', header = True, index = False)
X.shape

(136, 831)

### Train a Logistic Regression Classifier for each pair of classes

In [3]:
# Create a list of class labels
list = [1, 2, 3, 4, 5]
# Create a list of tuples for binary classifier. e.g. (1, 2)
class_pair_combo = [(x, y) for x in list for y in list if x != y]
# Initiate random seed
random_seed = 0
# Initiate matrices for OVO result aggregation
weight_matrix = []
# Split the data into training and testing sets
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.1, random_state = random_seed)
X_train.to_csv('X_train.csv', header = True, index = False)
X_test.to_csv('X_test.csv', header = True, index = False)
Y_train.to_csv('Y_train.csv', header = True, index = False)
Y_test.to_csv('Y_test.csv', header = True, index = False)

# Train different binary logistic regression classifiers
for element in class_pair_combo:
    # Update random seed to increate randomness
    random_seed += 1
    
    # Find the class labels from the tuples
    first_class = element[0]
    second_class = element[1]
    
    # Set the name for the binary classification model
    model_name = 'model_' + str(element[0]) + '_' + str(element[1]) + '.pkl'
    
    # Print out two classes
    print("Training class 1: %d" % first_class)
    print("Training class 2: %d" % second_class)
    
    # Filter the training sets to keep only the samples of the two classes
    X_train_filtered = X_train.loc[X_train['IA_type'].isin(element)]
    X_train_filtered_unlabeled = X_train_filtered.drop('IA_type', 1)
    X_train_filtered_unlabeled = StandardScaler().fit_transform(X_train_filtered_unlabeled)
    X_train_filtered_unlabeled = pd.DataFrame(data = X_train_filtered_unlabeled)
    Y_train_filtered = X_train_filtered['IA_type']
    
    X_test_filtered = X_test.loc[X_test['IA_type'].isin(element)]
    X_test_filtered_unlabeled = X_test_filtered.drop('IA_type', 1)
    X_test_filtered_unlabeled = StandardScaler().fit_transform(X_test_filtered_unlabeled)
    X_test_filtered_unlabeled = pd.DataFrame(data = X_test_filtered_unlabeled)
    Y_test_filtered = X_test_filtered['IA_type']

    # Remove outlier for X_train_filtered_unlabeled
    Q1 = X_train_filtered_unlabeled.quantile(q = 0.25, axis = 0, numeric_only = False)
    Q3 = X_train_filtered_unlabeled.quantile(q = 0.75, axis = 0, numeric_only = False)
    lowerBound = Q1 - 1.5 * (Q3 - Q1)
    upperBound = Q3 + 1.5 * (Q3 - Q1)
    for index, row in X_train_filtered_unlabeled.iterrows():
        for col in range(len(X_train_filtered_unlabeled.columns)):
            if row[col] < lowerBound[col]:
                row[col] = np.nan
            elif row[col] > upperBound[col]:
                row[col] = np.nan
    Q2 = X_train_filtered_unlabeled.quantile(q = 0.50, axis = 0, numeric_only = False)
    for index, row in X_train_filtered_unlabeled.iterrows():
        for col in range(len(X_train_filtered_unlabeled.columns)):
            if pd.isna(row[col]):
                row[col] = Q2[col]
                
    # Remove outlier for X_test_filtered_unlabeled
    Q1 = X_test_filtered_unlabeled.quantile(q = 0.25, axis = 0, numeric_only = False)
    Q3 = X_test_filtered_unlabeled.quantile(q = 0.75, axis = 0, numeric_only = False)
    lowerBound = Q1 - 1.5 * (Q3 - Q1)
    upperBound = Q3 + 1.5 * (Q3 - Q1)
    for index, row in X_test_filtered_unlabeled.iterrows():
        for col in range(len(X_test_filtered_unlabeled.columns)):
            if row[col] < lowerBound[col]:
                row[col] = np.nan
            elif row[col] > upperBound[col]:
                row[col] = np.nan
    Q2 = X_test_filtered_unlabeled.quantile(q = 0.50, axis = 0, numeric_only = False)
    for index, row in X_test_filtered_unlabeled.iterrows():
        for col in range(len(X_test_filtered_unlabeled.columns)):
            if pd.isna(row[col]):
                row[col] = Q2[col]
    
    loo = LeaveOneOut()
    feature_rank = np.zeros(X_train_filtered_unlabeled.shape[1])

    for train_index, test_index in loo.split(X_train_filtered_unlabeled):
        x_train_loo, x_test_loo, y_train_loo, y_test_loo = X_train_filtered_unlabeled.iloc[train_index], X_train_filtered_unlabeled.iloc[test_index], Y_train_filtered.iloc[train_index], Y_train_filtered.iloc[test_index]

        # Create the RFE object and compute a cross-validated score.
        svc = SVC(kernel = "linear")
        # The "accuracy" scoring is proportional to the number of correct classifications
        rfecv = RFECV(estimator = svc, step = 1, cv = 3, scoring = 'accuracy').fit(x_train_loo, y_train_loo)
        optimal_feature_num = rfecv.n_features_
        feature_ranking = rfecv.ranking_
        feature_ranking_filtered = np.where(feature_ranking <= 1, feature_ranking, 0)
        feature_rank = np.add(feature_rank, feature_ranking_filtered)
#         print("Optimal number of features : %d" % optimal_feature_num)
#         print("Feature Ranking            : ", feature_rank)

    #keep_col_index = np.where(feature_rank > np.percentile(feature_rank, 95))[0]
    keep_col_index = np.where(feature_rank > X_train_filtered_unlabeled.shape[0] * 0.5)[0]
    print("Columns kept: ", keep_col_index)
    
    column_kept_file_name = 'col_kept_' + str(element[0]) + '_' + str(element[1]) + '_df.csv'
    keep_col_index_df = pd.DataFrame(data = keep_col_index)
    keep_col_index_df.to_csv(column_kept_file_name, header = False, index = False)

    # Train logistic regression
    lr = LogisticRegression(random_state = 0, solver = 'lbfgs')
    
    X_train_filtered_unlabeled_FS = X_train_filtered_unlabeled[X_train_filtered_unlabeled.columns[keep_col_index]]
    X_test_filtered_unlabeled_FS = X_test_filtered_unlabeled[X_test_filtered_unlabeled.columns[keep_col_index]]

    lr.fit(X_train_filtered_unlabeled_FS, Y_train_filtered)
    print("Score:", lr.score(X_test_filtered_unlabeled_FS, Y_test_filtered))
    
    joblib.dump(lr, model_name)
    
    weight_matrix.append(len(X_train_filtered))
    
weight_matrix = (np.array(weight_matrix)).reshape(5, 4)
weight_matrix_df = pd.DataFrame(data = weight_matrix)
weight_matrix_df.to_csv('weight_matrix.csv', header = False, index = False)

Training class 1: 1
Training class 2: 2
Columns kept:  [  6  20  42  44  48  49  56  58  92 101 202 249 253 269 339 340 345 395
 450 451 460 633 692 702 724 812 820]
Score: 0.8571428571428571
Training class 1: 1
Training class 2: 3


  sqr = np.multiply(arr, arr, out=arr)


Columns kept:  [ 23  38  43  68 175 203 459 619 634 733]
Score: 0.45454545454545453
Training class 1: 1
Training class 2: 4
Columns kept:  [115 765]
Score: 0.8
Training class 1: 1
Training class 2: 5
Columns kept:  [115]
Score: 0.8333333333333334
Training class 1: 2
Training class 2: 1
Columns kept:  [  6  20  42  44  48  49  56  58  92 101 202 249 253 269 339 340 345 395
 450 451 460 633 692 702 724 812 820]
Score: 0.8571428571428571
Training class 1: 2
Training class 2: 3


  sqr = np.multiply(arr, arr, out=arr)


Columns kept:  [722]
Score: 0.75
Training class 1: 2
Training class 2: 4
Columns kept:  [188 819]
Score: 0.5
Training class 1: 2
Training class 2: 5
Columns kept:  [668 705 724 788 827]
Score: 0.6666666666666666
Training class 1: 3
Training class 2: 1


  sqr = np.multiply(arr, arr, out=arr)


Columns kept:  [ 23  38  43  68 175 203 459 619 634 733]
Score: 0.45454545454545453
Training class 1: 3
Training class 2: 2


  sqr = np.multiply(arr, arr, out=arr)


Columns kept:  [722]
Score: 0.75
Training class 1: 3
Training class 2: 4


  sqr = np.multiply(arr, arr, out=arr)


Columns kept:  [  1   2   3   4   6   9  11  16  21  23  28  33  38  42  43  48  71  88
 100 102 111 122 126 158 184 198 199 210 212 214 247 269 270 293 294 295
 296 297 298 308 312 313 320 344 357 359 360 362 377 378 397 440 462 473
 536 570 579 585 588 601 614 616 627 647 649 679 692 714 722 733 738 740
 742 759 770 772 777 780 783 784 787 790 809 811 813 822]
Score: 1.0
Training class 1: 3
Training class 2: 5


  sqr = np.multiply(arr, arr, out=arr)


Columns kept:  [826]
Score: 0.8571428571428571
Training class 1: 4
Training class 2: 1
Columns kept:  [115 765]
Score: 0.8
Training class 1: 4
Training class 2: 2
Columns kept:  [188 819]
Score: 0.5
Training class 1: 4
Training class 2: 3


  sqr = np.multiply(arr, arr, out=arr)


Columns kept:  [  1   2   3   4   6   9  11  16  21  23  28  33  38  42  43  48  71  88
 100 102 111 122 126 158 184 198 199 210 212 214 247 269 270 293 294 295
 296 297 298 308 312 313 320 344 357 359 360 362 377 378 397 440 462 473
 536 570 579 585 588 601 614 616 627 647 649 679 692 714 722 733 738 740
 742 759 770 772 777 780 783 784 787 790 809 811 813 822]
Score: 1.0
Training class 1: 4
Training class 2: 5
Columns kept:  [ 23  38  41  43  85 114 115 154 159 166 168 305 383 404 411 439 442 469
 486 534 538 567 608 610 611 614 615 616 647 651 712 713 721 722 738 741
 746 772 782 783 813 815 818 827]
Score: 1.0
Training class 1: 5
Training class 2: 1
Columns kept:  [115]
Score: 0.8333333333333334
Training class 1: 5
Training class 2: 2
Columns kept:  [668 705 724 788 827]
Score: 0.6666666666666666
Training class 1: 5
Training class 2: 3


  sqr = np.multiply(arr, arr, out=arr)


Columns kept:  [826]
Score: 0.8571428571428571
Training class 1: 5
Training class 2: 4
Columns kept:  [ 23  38  41  43  85 114 115 154 159 166 168 305 383 404 411 439 442 469
 486 534 538 567 608 610 611 614 615 616 647 651 712 713 721 722 738 741
 746 772 782 783 813 815 818 827]
Score: 1.0


### Use the weight matrix to calculate the probability matrix

To aggregate the results of the OVO classifiers, we need to obtain the probability matrix M, where M<sub>ijk</sub> = the confidence of kth testing data being the class represented by i over the class represented by j. (i, j) represents the unique pair of classes such that class i is different from class j.

In [4]:
X_train = pd.read_csv('X_train.csv')
X_test = pd.read_csv('X_test.csv')
Y_train = pd.read_csv('Y_train.csv')
Y_test = pd.read_csv('Y_test.csv')
weight_matrix = pd.read_csv('weight_matrix.csv', header = None).values
prob_matrix = []

for row_index in range(5):
    for col_index in range(5):
        if row_index != col_index:
            model_name = 'model_' + str(row_index + 1) + '_' + str(col_index + 1) + '.pkl'
            clf = joblib.load(model_name) 
            col_kept_csv = 'col_kept_' + str(row_index + 1) + '_' + str(col_index + 1) + '_df.csv'
            col_kept = pd.read_csv(col_kept_csv, header = None)
            
            X_test_filtered_unlabeled = StandardScaler().fit_transform(X_test)
            X_test_filtered_unlabeled = pd.DataFrame(data = X_test_filtered_unlabeled)
            print(X_test_filtered_unlabeled.shape)
            print(col_kept.shape)
            
            X_test_FS = X_test_filtered_unlabeled[X_test_filtered_unlabeled.columns[np.transpose(col_kept.values)[0]]]
            if (row_index < col_index):
                print(clf.predict_proba(X_test_FS))
                prob_matrix.append(clf.predict_proba(X_test_FS)[:, 0])
            else:
                prob_matrix.append(clf.predict_proba(X_test_FS)[:, 1])

prob_matrix = np.array(prob_matrix).reshape(5, 4, 14)

(14, 831)
(27, 1)
[[9.90080524e-01 9.91947617e-03]
 [6.96117085e-01 3.03882915e-01]
 [9.99779192e-01 2.20808277e-04]
 [9.99988535e-01 1.14649837e-05]
 [7.68765044e-01 2.31234956e-01]
 [2.91072158e-01 7.08927842e-01]
 [4.13719129e-01 5.86280871e-01]
 [6.47009545e-02 9.35299045e-01]
 [8.30566255e-01 1.69433745e-01]
 [9.85945993e-01 1.40540070e-02]
 [9.95888230e-01 4.11176969e-03]
 [4.95987422e-02 9.50401258e-01]
 [9.99905232e-01 9.47683658e-05]
 [9.99696398e-01 3.03602065e-04]]
(14, 831)
(10, 1)
[[8.92205694e-01 1.07794306e-01]
 [5.57684775e-02 9.44231523e-01]
 [9.39531634e-01 6.04683658e-02]
 [9.99829203e-01 1.70796712e-04]
 [2.19103839e-04 9.99780896e-01]
 [2.47127750e-02 9.75287225e-01]
 [1.02483835e-02 9.89751617e-01]
 [1.20346224e-04 9.99879654e-01]
 [5.51738186e-01 4.48261814e-01]
 [7.08426486e-01 2.91573514e-01]
 [7.49840591e-01 2.50159409e-01]
 [1.68689806e-02 9.83131019e-01]
 [9.98757323e-01 1.24267744e-03]
 [9.90866853e-01 9.13314655e-03]]
(14, 831)
(2, 1)
[[9.05422907e-01 9.45

### Aggregate the classification result with Learning Valued Preference for Classification (LVPC)

Learning Valued Preference for Classification (LVPC) E. Hüllermeier and K. Brinker. Learning valued preference structures for solving classification problems. Fuzzy Sets and Systems, 159(18):2337–2352, 2008 AND J.C. Huhn and E. Hüllermeier. FR3: A fuzzy rule learner for inducing reliable classifiers. IEEE Transactions on Fuzzy Systems, 17(1):138–149, 2009. This method consider the score matrix as a fuzzy preference relation, based in fuzzy preference modeling the original relation is decomposed in three new relations with different meanings, the strict preference, the conflict and the ignorance. A decision rule based on voting strategy is proposed to obtain the output class from them (https://sci2s.ugr.es/ovo-ova):

\begin{equation*}
Class = argmax_{i = 1,...,m} \sum_{1\leq j \neq i \leq m} P_{ijk} + \frac{1}{2}C_{ijk} + \frac{N_{i}}{N_{i} + N_{j}} I_{ijk}
\end{equation*}

where:
- N<sub>i</sub> is the number of examples from class i in the training data (an unbiased estimate of the class -- probability)
- C<sub>ijk</sub> is the degree of conflict (the degree to which both classes are supported)
- I<sub>ijk</sub> is the degree of ignorance (the degree to which none of the classes is supported) and finally,
- P<sub>ijk</sub> and P<sub>jik</sub> are respectively the strict preference for i and j. Preference, confidence and ignorance degrees are computed as follows:

$$
\begin{align}
P_{ijk} = r_{ijk} - min(r_{ijk}, r_{jik}) \\
P_{jik} = r_{jik} - min(r_{ijk}, r_{jik}) \\
C_{ijk} = min(r_{ijk}, r_{jik}) \\
I_{ijk} = 1 - min(r_{ijk}, r_{jik}) \\
\end{align}
$$

In [5]:
minimum_r = []
maximum_r = []
# Create a list of class labels
list = [1, 2, 3, 4, 5]
# Create a list of tuples for binary classifier. e.g. (1, 2)
class_pair_combo_2 = [(x, y) for x in list for y in list if x != y]
for pair in class_pair_combo_2:
    if pair[0] < pair[1]:
        small_class = pair[0]
        large_class = pair[1]
    else:
        small_class = pair[1]
        large_class = pair[0]
    temp_min = np.minimum(prob_matrix[small_class - 1][large_class - 2], prob_matrix[large_class - 1][small_class - 1])
    temp_max = np.maximum(prob_matrix[small_class - 1][large_class - 2], prob_matrix[large_class - 1][small_class - 1])
    minimum_r.append(temp_min)
    maximum_r.append(temp_max)
    
minimum_r = np.array(minimum_r).reshape(5, 4, 14)
maximum_r = np.array(maximum_r).reshape(5, 4, 14)

# Calculating P, C, and I
P = np.subtract(prob_matrix, minimum_r)
C = minimum_r
I = np.ones((5, 4, 14), dtype = float) - maximum_r
print('P:')
print(P)
print('C:')
print(C)
print('I:')
print(I)

P:
[[[0.98016105 0.39223417 0.99955838 0.99997707 0.53753009 0.
   0.         0.         0.66113251 0.97189199 0.99177646 0.
   0.99981046 0.9993928 ]
  [0.78441139 0.         0.87906327 0.99965841 0.         0.
   0.         0.         0.10347637 0.41685297 0.49968118 0.
   0.99751465 0.98173371]
  [0.81084581 0.85873908 0.84348021 0.99956716 0.0728065  0.
   0.02956777 0.71461514 0.72238959 0.9000969  0.878211   0.99929004
   0.94728314 0.89884607]
  [0.94103177 0.90642465 0.94103177 0.94103177 0.         0.
   0.06630025 0.90109626 0.77439437 0.76708027 0.94103177 0.99995194
   0.94103177 0.94103177]]

 [[0.         0.         0.         0.         0.         0.41785568
   0.17256174 0.87059809 0.         0.         0.         0.90080252
   0.         0.        ]
  [0.         0.         0.         0.         0.         0.38216946
   0.         0.0950001  0.         0.         0.         0.
   0.         0.        ]
  [0.09790159 0.         0.61313754 0.85661878 0.         0.
   0. 

In [6]:
class_size_matrix = [[49, 49, 49, 49],
                     [16, 16, 16, 16],
                     [45, 45, 45, 45],
                     [14, 14, 14, 14],
                     [12, 12, 12, 12]]
# Calculate the proportion size matrix later for LVPC
proportion_size_matrix = np.divide(class_size_matrix, weight_matrix)

In [7]:
weighted_prob = []
class_score = []
for c in range(5):
    weighted_prob.append(np.average(prob_matrix[c], axis = 0, weights = weight_matrix[c]) / np.sum(weight_matrix[c]))
    class_score.append((np.average(P[c], axis = 0) + 0.5 * np.average(C[c], axis = 0) + np.average(prob_matrix[c], axis = 0, weights = proportion_size_matrix[c])) * 4)
weighted_prob = np.transpose(np.array(weighted_prob))
class_score = np.transpose(np.array(class_score))
print(class_score)
print(weighted_prob)

[[7.4125617  1.41375277 5.00705465 1.0699921  4.71677139]
 [5.24103877 3.3630873  7.01964278 3.30441313 0.66827075]
 [7.58554446 3.34765887 4.69303949 3.30301544 0.77264241]
 [7.92167011 4.00872309 3.92056008 1.71716557 1.76603608]
 [2.96605417 1.04172592 6.79970212 2.30448345 5.44242455]
 [1.2832444  2.15957035 2.16791648 5.1599324  6.96927637]
 [2.38021799 1.90151534 7.13026951 3.73564511 2.68726404]
 [3.79515521 4.90859826 6.55310082 1.79357721 2.47675838]
 [5.90694723 3.7154132  6.16681494 1.48911918 1.8111035 ]
 [6.87570659 3.57527157 5.89877919 1.53160137 1.67082636]
 [7.19431313 4.54591079 4.8943981  1.11103588 1.74812266]
 [4.28724552 4.7297134  6.94881252 1.46247244 2.37592716]
 [7.8519773  3.91604428 5.299018   2.96329438 0.39316918]
 [7.7715917  3.39367572 3.60109626 4.35964261 0.78466068]]
[[0.00367925 0.00109213 0.00239699 0.00076615 0.00265268]
 [0.00234568 0.0023516  0.0037988  0.00162323 0.00053144]
 [0.00376353 0.00186483 0.0022142  0.00208228 0.00052253]
 [0.00391145 

In [8]:
class_size = [49, 16, 45, 14, 12]
for col in range(5):
    weighted_prob[:,col] *= class_size[col]
print(weighted_prob)

[[0.18028337 0.01747405 0.10786456 0.01072605 0.03183221]
 [0.11493816 0.03762555 0.17094614 0.02272523 0.00637731]
 [0.18441276 0.0298373  0.09963889 0.02915191 0.00627033]
 [0.19166095 0.03349725 0.08588426 0.01517941 0.02055892]
 [0.07338216 0.02009332 0.16943365 0.02731927 0.0449774 ]
 [0.03559997 0.04763801 0.0777242  0.06670924 0.0670452 ]
 [0.06381867 0.03468655 0.17418254 0.03558015 0.02807612]
 [0.08032992 0.06782223 0.16421068 0.01557824 0.01491168]
 [0.14636525 0.0360866  0.14139497 0.01421662 0.01390228]
 [0.16684865 0.03075493 0.12990069 0.01284033 0.01351638]
 [0.17304846 0.03654213 0.1123968  0.01253362 0.01597213]
 [0.08905531 0.06358556 0.17085783 0.01048687 0.01340185]
 [0.19043813 0.0327206  0.10668052 0.02065328 0.00357743]
 [0.18886467 0.03007272 0.07221332 0.04360135 0.00666791]]


In [9]:
predicted_class = []
for r in range(len(weighted_prob)):
    predicted_class.append(np.where(class_score[r] == np.amax(class_score[r]))[0][0] + 1)
predicted_class = np.transpose(predicted_class)

In [10]:
num_correct = 0
for index in range(len(predicted_class)):
    if predicted_class[index] == np.transpose(Y_test.values)[0][index]:
        num_correct += 1
print(num_correct/len(predicted_class))

0.5714285714285714
