In [1]:
import scipy.io
import os
import glob
import pandas as pd
import numpy as np
import time
from sklearn.model_selection import train_test_split
from sklearn import datasets
from sklearn import svm
from sklearn import metrics
from sklearn.preprocessing import MinMaxScaler
from sklearnex import patch_sklearn
import matplotlib.pyplot as plt
import seaborn as sns
patch_sklearn()

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


In [2]:
def read_mat_file(filename):
    mat = scipy.io.loadmat(filename)
    # do something with the contents of the MATLAB file
    return mat

In [3]:
def read_mat_files(filenames):
    data = []
    for filename in filenames:
        data.append(read_mat_file(filename))
    return data

In [94]:
# benign_data_labels = ["Q1", "Q2", "Q4", "Q5", "Q6", "Q7", "Q9", "Q10", "Q11", "Q13", "Q14", "Q16", "Q20"
#                     , "Q21", "Q27", "Q34", "Q35", "Q37", "Q38", "Q40", "Q42", "Q43", "Q46", "Q49", "Q50"
#                     , "Q52", "Q54", "Q56", "Q59", "Q62", "Q63", "Q65", "Q66", "Q67", "Q69"]
# benign_data_labels = ["Q1", "Q2", "Q7", "Q9", "Q10", "Q11", "Q13", "Q14", "Q16", "Q20"
#                     , "Q21", "Q27", "Q34", "Q35", "Q37", "Q38", "Q40", "Q42", "Q43", "Q46", "Q49", "Q50"
#                     , "Q52", "Q54", "Q56", "Q59", "Q62", "Q63", "Q65", "Q66", "Q67", "Q69"]
# scc_data_labels = ["Q3", "Q8", "Q12", "Q15", "Q17", "Q22", "Q24", "Q26", "Q28", "Q29","Q30","Q31", "Q32"
#                     ,"Q33","Q36","Q39","Q41","Q44","Q45","Q47","Q48", "Q51","Q53","Q55","Q57","Q58","Q61"
#                     ,"Q64","Q68"] (from nn_gpu.jl, this list is wrong)
scc_data_labels = ["Q3", "Q33","Q36","Q39","Q41","Q44","Q45","Q47","Q48", "Q51","Q53","Q55","Q57","Q58","Q61"
                    ,"Q64","Q68"] 
# (Q8 & Q32 have 8 and 7 features respectively for some reason)
# Use every 1/4 pixels so we still include every patient
benign_data_labels = ["Q1", "Q2", "Q7", "Q9", "Q11", "Q13", "Q14", "Q16", "Q20", "Q21", "Q27", "Q34", "Q35", 
                      "Q37", "Q38"]

In [95]:
# benign_data_labels = ["Q1", "Q2", "Q4", "Q5", "Q6", "Q7"]
# scc_data_labels = ["Q8"]
# Q4, Q5, Q6 give bad results for some reason
target = []
data = []

benign_dict = {}
normal_dict = {}


for label in benign_data_labels:
    data_lesion = read_mat_files(glob.glob("/Users/byronpetselis/Desktop/Cancer_Data/Yeoh/ben/" + label + "_*_lesion*"))
    feat_15 = []
    feat_length = 0
    
    for feature in range(15):
        feat_name = list(data_lesion[feature].keys())[3]
        # '__header__', '__version__', '__globals__', 'tfast390m' are the key names
        feat_vals = data_lesion[feature][feat_name][0]
        # get 160x160 1D array associated with key name, [0] to get rid of nesting
        feat_15.append(feat_vals)
        # appends 160x160 1D array to array of 15 iteratively
        feat_length = len(feat_vals)
        # get actual length of 160x160 1D array
    
    feat_reshape = np.reshape(feat_15, (feat_length, 15))
    # reshapes (15, 160x160) to (160x160, 15) 2D array
    
#     target_app = [0] * feat_length
#     data.append(feat_reshape)
    
    for row in range(feat_length):
        if row % 2 == 0:
            target.append(1)
            data.append(feat_reshape[row])

for label in benign_data_labels:
    data_normal = read_mat_files(glob.glob("/Users/byronpetselis/Desktop/Cancer_Data/Yeoh/ben/" + label + "_*_normal*"))
    feat_15 = []
    feat_length = 0
    
    for feature in range(15):
        feat_name = list(data_normal[feature].keys())[3]
        feat_vals = data_normal[feature][feat_name][0]
        feat_15.append(feat_vals)
        feat_length = len(feat_vals)
    
    feat_reshape = np.reshape(feat_15, (feat_length, 15))
        
    for row in range(feat_length):
        if row % 2 == 0:
            target.append(0)
            data.append(feat_reshape[row])
            
print(len(data))
print(f"SCC: {target.count(1)}")
print(f"Benign: {target.count(0)}")

318919
SCC: 158238
Benign: 160681


In [96]:
# Error Checking

print(type(data))
print(type(target))
print(f"Cancer: {target.count(1)}")
print(f"Benign: {target.count(0)}")
print(f"Any Nan?: {np.any(np.isnan(data))}")
print(f"Any Inf?: {np.any(np.isinf(data))}")
print(f"Any NegInf?: {np.any(np.isneginf(data))}")
print(np.max(data))
print(np.min(data))
# print(np.sum(value > 100 for row in data for value in row))

for row in data:
    for value in row:
        if not np.issubdtype(type(value), np.float64):
            print("Non-Integer Exists")

<class 'list'>
<class 'list'>
Cancer: 158238
Benign: 160681
Any Nan?: False
Any Inf?: False
Any NegInf?: False
232358.10778177824
-11.07772779076678


In [97]:
# Attempt to Remove Outliers

def replace_outliers_with_mean(data, threshold=3):
    # Calculate mean and standard deviation of the data
    mean = np.mean(data)
    std = np.std(data)

    # Calculate z-scores for each data point
    z_scores = np.abs((data - mean) / std)
    
    outliers = np.where(z_scores > threshold)

    for index in range(len(outliers[0])):
        row = outliers[0][index]
        col = outliers[1][index]
        data[row][col] = mean

    return data

In [98]:
new_data = replace_outliers_with_mean(data)

In [99]:
# 30000 samples takes 88 seconds (w/ outliers)
# 30000 samples takes 60 seconds (w/o outliers) (Very inconsistent, took 2 seconds once)
# 60000 samples takes 332 seconds
# 120000 samples takes >2500 seconds (didn't finish)
# 320000 samples takes 169 seconds
# 640000 samples takes 417 seconds

In [100]:
X_train, X_test, Y_train, Y_test = train_test_split(data, target, test_size=0.8, random_state=209)

In [101]:
np.isfinite(X_train).all()

True

In [102]:
cls = svm.SVC(kernel="linear", C=1, verbose=1)
# generate the model

In [103]:
start = time.time()
cls.fit(X_train, Y_train)
# train the model

SVC(C=1, kernel='linear', verbose=1)

In [104]:
pred = cls.predict(X_test)
end = time.time()
print(end - start)
# predict the response

217.02362489700317


In [105]:
print(np.count_nonzero(pred))
# Compare prediction made from X_test to the correct values (Y_test)

20127


In [106]:
print("accuracy:", metrics.accuracy_score(Y_test,pred))
print()
print("precision:", metrics.precision_score(Y_test,pred))
print()
print("recall:", metrics.recall_score(Y_test,pred))
print()
print(metrics.classification_report(Y_test,pred))

accuracy: 0.5156191207826414

precision: 0.5748000198738014

recall: 0.09138663759735849

              precision    recall  f1-score   support

           0       0.51      0.93      0.66    128542
           1       0.57      0.09      0.16    126594

    accuracy                           0.52    255136
   macro avg       0.54      0.51      0.41    255136
weighted avg       0.54      0.52      0.41    255136



In [None]:
# Possible Problems
    # Unscaled Features
    # Improper Hyperparameters
    # Data Imbalance? 58959 vs 65038 is pretty decent though
    # Convergence Issues