# Use Extracted CellProfiler Features to Construct a KNN Network

## Import data and split into training/test set

In [None]:
import pandas as pd
import numpy as np

# Load the CellProfiler feature table. Forward slashes are used so the
# relative path resolves on Windows, Linux and macOS alike.
raw_data = pd.read_csv("../data/processed/cellprofiler_data.csv")

# Feature matrix: every column except "Phenotype" and "ImageNumber".
data = raw_data.drop(["Phenotype", "ImageNumber"], axis=1).to_numpy()

# Integer class codes for the strings stored in the "Phenotype" column.
# An unexpected label raises KeyError instead of being silently skipped.
csv_label_codes = {"alveolar": 0,
                   "marrow": 1,
                   "monocyte": 2}
labels = [csv_label_codes[label] for label in raw_data["Phenotype"]]

# Display names keyed to the class codes 0-2; later cells use this mapping
# for plot legends and for looking up a class code by name.
phenotype = {"TR": 0,
             "BMD": 1,
             "Monocyte": 2}

from random import shuffle

# Per-class split: shuffle the row indices of each class and hold out one
# fifth of them (rounded down) for testing, so both sets keep the class mix.
train_index = []
test_index = []
for class_code in range(len(phenotype)):
    members = [row for row, code in enumerate(labels) if code == class_code]
    shuffle(members)

    test_len = len(members) // 5
    test_index.extend(members[:test_len])
    train_index.extend(members[test_len:])

# Gather the rows for each split; indexing with the index lists keeps the
# samples and their labels in the same order.
label_array = np.array(labels)

train_samples = data[train_index]
train_labels = label_array[train_index]

test_samples = data[test_index]
test_labels = label_array[test_index]

## PCA of data

In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Standardise every feature with statistics learned from the training set only,
# then apply the same scaling to the held-out samples.
sc = StandardScaler().fit(train_samples)
std_data = sc.transform(train_samples)
std_test_data = sc.transform(test_samples)

# Fit PCA on the standardised training data and project it.
num_comp = 2
pca = PCA(n_components=num_comp)
pca_result = pca.fit(std_data).transform(std_data)

# Lookup from integer class code back to its display name.
reverse_pheno = {code: name for name, code in phenotype.items()}

# One row per training sample: the two component scores plus the class name.
graph_data = pd.DataFrame({
    'pca-one': pca_result[:, 0],
    'pca-two': pca_result[:, 1],
    'y': [reverse_pheno[label] for label in train_labels],
})

print('Explained variation per principal component: {}'.format(pca.explained_variance_ratio_))

import seaborn as sns
from matplotlib import pyplot as plt

# Scatter the training samples in the PCA plane, coloured by class.
plt.figure(figsize=(10,10))
sns.scatterplot(
    data=graph_data,
    x="pca-one", y="pca-two",
    hue="y",
    palette=sns.color_palette("hls", 3),
    alpha=0.3,
    legend="full",
)
plt.show()

## T-SNE of Data

In [None]:
from sklearn.manifold import TSNE

# Keep the estimator under its own name. Rebinding ``TSNE`` to the instance
# would hide the imported class, and running this cell a second time would
# then fail because an instance is not callable.
# NOTE(review): newer scikit-learn releases rename ``n_iter`` to ``max_iter`` —
# confirm against the installed version.
tsne = TSNE(n_components=2, perplexity=50, n_iter=5000, learning_rate=200)

# Embed the standardised training samples in two dimensions.
tsne_results = tsne.fit_transform(std_data)

In [None]:
# Plot the t-SNE embedding of the training samples, coloured by class name.
tsne_class_names = [reverse_pheno[label] for label in train_labels]
df_tsne = pd.DataFrame(
    data=tsne_results,
    columns=['t-sne-one', 't-sne-two'],
).assign(label=tsne_class_names)

plt.figure(figsize=(10,10))
sns.scatterplot(
    data=df_tsne,
    x="t-sne-one", y="t-sne-two",
    hue="label",
    palette=sns.color_palette("hls", 3),
    alpha=0.5,
    legend="full",
)
plt.show()

## Confusion Matrix Graphing Function 

In [None]:
def confusion_matrix(out, test_labels):
    """Build a row-normalised confusion matrix and the overall accuracy.

    Args:
        out: Sequence of predicted integer class codes.
        test_labels: Sequence of true integer class codes. Predictions and
            labels are paired with ``zip``, so surplus entries in the longer
            sequence are not counted.

    Returns:
        A ``(matrix, test_accuracy)`` tuple. ``matrix[p, t]`` is the fraction
        of samples predicted as class ``p`` whose true class is ``t``; every
        row sums to 1, except rows of classes that were never predicted,
        which are all zeros. ``test_accuracy`` is the fraction of counted
        pairs in which prediction and label agree (0.0 if nothing was
        counted).

    Raises:
        ValueError: If ``out`` is empty.
    """
    if len(out) == 0:
        raise ValueError("confusion_matrix() needs at least one prediction")

    # Size the matrix from BOTH sequences: a class that appears only among the
    # true labels must still get a row and a column, otherwise indexing below
    # would run out of bounds.
    highest_code = max(max(out), max(test_labels, default=-1))
    num_classes = int(highest_code) + 1

    counts = np.zeros((num_classes, num_classes))
    for predicted, actual in zip(out, test_labels):
        counts[int(predicted), int(actual)] += 1

    # Normalise each row by its total; rows with no predictions stay at zero
    # instead of producing 0/0 = NaN.
    row_totals = counts.sum(axis=1, keepdims=True)
    matrix = np.divide(counts, row_totals,
                       out=np.zeros_like(counts), where=row_totals != 0)

    # Correct predictions sit on the diagonal.
    total = counts.sum()
    test_accuracy = np.trace(counts) / total if total else 0.0
    return matrix, test_accuracy

def show_matrix(matrix, labels, title):
    """Draw a matrix as an annotated seaborn heatmap and display it.

    Args:
        matrix: 2-D array of values; needs one row and one column per entry
            of ``labels``.
        labels: Names used for both the row and the column ticks.
        title: Text placed above the plot.
    """
    sn.set(font_scale=1.4)  # label size
    frame = pd.DataFrame(data=matrix, index=labels, columns=labels)
    sn.heatmap(data=frame, annot=True, fmt='.2%')  # annotate cells as percentages
    plt.title(title)
    plt.show()
    

## Visualization of Boundaries in PCA
https://stackoverflow.com/questions/56153726/plot-k-nearest-neighbor-graph-with-8-features

In [None]:
import seaborn as sn
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
import matplotlib.ticker as ticker
from matplotlib.colors import ListedColormap

# Fit a KNN on the 2-D PCA projection using only the training samples whose
# class is not "Monocyte", so the model only ever predicts the other two codes.
relevant_indices = np.where(train_labels != phenotype["Monocyte"])[0]
knn_pca_graph_input = pca_result[relevant_indices]
knn_pca_graph_output = train_labels[relevant_indices]

knn_pca_graph = KNeighborsClassifier(n_neighbors=30)
knn_pca_graph.fit(knn_pca_graph_input, knn_pca_graph_output)

# Plot bounds: the extent of ALL projected training samples, padded by one
# unit on every side.
x_min, x_max = pca_result[:, 0].min() - 1, pca_result[:, 0].max() + 1
y_min, y_max = pca_result[:, 1].min() - 1, pca_result[:, 1].max() + 1

# NOTE(review): mesh_density is never read below; np.arange is called without
# a step, so the grid is sampled at the default spacing of 1 — confirm whether
# a finer grid was intended (a 0.01 step would produce a very large grid).
mesh_density = 0.01
x_grid, y_grid = np.meshgrid(np.arange(x_min, x_max), np.arange(y_min, y_max))
# Classify every grid node, then reshape the predictions back onto the grid.
Z = knn_pca_graph.predict(np.c_[x_grid.ravel(), y_grid.ravel()])
Z = Z.reshape(x_grid.shape)

fig, ax = plt.subplots(figsize=(10,10))
# Two light colours for the predicted regions, three bold colours for the
# scatter points of the three classes.
cmap_light = ListedColormap(['#ffd5d1', '#c4ffcf'])
cmap_bold  = ['#ed210e', '#43de64', '#7f78ff']
# Shade the predicted class of each grid cell.
plt.pcolormesh(x_grid,y_grid, Z, cmap=cmap_light, shading="auto")
# Overlay every training sample (Monocyte included), coloured by its true class.
for i, pheno in enumerate(["TR", "BMD", "Monocyte"]):
    ind = np.where(train_labels == phenotype[pheno])[0]
    ax.scatter(pca_result[ind, 0], pca_result[ind, 1], c=cmap_bold[i], label=pheno,
               s=25, alpha=0.3, edgecolor="k")
plt.xlabel("pca-one")
plt.ylabel("pca-two")
plt.legend()
plt.show()

# Score the two-class model on every training sample, Monocyte included.
out = list(knn_pca_graph.predict(pca_result))
# Workaround: the extra 2 makes max(out) equal 2, so confusion_matrix allocates
# a matrix covering all three class codes. confusion_matrix pairs predictions
# and labels with zip(), which stops at the shorter sequence, so this extra
# entry is not itself counted.
out.append(2)
matrix, accuracy = confusion_matrix(out, train_labels)

print(f"accuracy of knn: {accuracy:>0.2%}")

# Tick labels for both heatmap axes.
axis_labels = ["TR", "BMD", "monocyte"]

show_matrix(matrix, axis_labels, 'PCA Gating')



# KNN With Standardized Raw Data

In [None]:
# Tick labels for both heatmap axes.
axis_labels = ["TR", "BMD", "monocyte"]

# Fit a 30-neighbour classifier on the standardised training features.
knn = KNeighborsClassifier(n_neighbors=30).fit(std_data, train_labels)

# Predict the held-out samples and summarise the result.
out = list(knn.predict(std_test_data))
matrix, accuracy = confusion_matrix(out, test_labels)

print(f"accuracy of knn: {accuracy:>0.2%}")
show_matrix(matrix, axis_labels, 'KNN w/ Standardized Raw Features')


## Using PCA modified dataset with KNN

Features that are not correlated with the correct class can be detrimental to KNN. Here we use PCA dimensionality reduction to try to filter them out.

In [None]:
# Project the standardised test features with the PCA fitted on the training set.
pca_test_data = pca.transform(std_test_data)

# KNN on the PCA scores; the neighbour count equals the number of classes.
knn_pca = KNeighborsClassifier(n_neighbors=len(phenotype)).fit(pca_result, train_labels)

# Predict the held-out samples and summarise the result.
out = list(knn_pca.predict(pca_test_data))
matrix, accuracy = confusion_matrix(out, test_labels)

print(f"accuracy of PCA knn: {accuracy:>0.2%}")
show_matrix(matrix, axis_labels, 'KNN w/ PCA Features')

## Random forest with raw data

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with default settings on the standardised training features.
rfc = RandomForestClassifier().fit(std_data, train_labels)

# Predict the held-out samples and summarise the result.
out = list(rfc.predict(std_test_data))
matrix, accuracy = confusion_matrix(out, test_labels)

print(f"accuracy of random forest: {accuracy:>0.2%}")
show_matrix(matrix, axis_labels, 'Random Forest w/ Standardized Raw Features')

## Random forest with extracted features

In [None]:
# Random forest with default settings on the PCA scores of the training set.
rfc_pca = RandomForestClassifier().fit(pca_result, train_labels)

# Predict the PCA-projected held-out samples and summarise the result.
out = list(rfc_pca.predict(pca_test_data))
matrix, accuracy = confusion_matrix(out, test_labels)

print(f"accuracy of pca random forest: {accuracy:>0.2%}")
show_matrix(matrix, axis_labels,'Random Forest w/ PCA Features')

## Decision Tree to extract most relevant features

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Decision tree with a fixed seed, trained on the standardised features.
decision_tree = DecisionTreeClassifier(random_state=0).fit(std_data, train_labels)

# Predict the held-out samples and summarise the result.
out = list(decision_tree.predict(std_test_data))
matrix, accuracy = confusion_matrix(out, test_labels)

print(f"accuracy of decision tree: {accuracy:>0.2%}")
show_matrix(matrix, axis_labels, 'Decision Tree w/ Standardized Raw Features')

In [None]:
# Importance the fitted decision tree assigns to each input column.
feature_importances = decision_tree.feature_importances_
f_1 = np.argmax(feature_importances)

# Column indices ordered from most to least important.
importances_ranked = feature_importances.argsort()[::-1]

# The tree was trained on the matrix built from raw_data WITHOUT the
# "Phenotype" and "ImageNumber" columns, so the names must come from that same
# reduced column set. Taking them from raw_data directly shifts every name and
# can select the label column itself as a "feature".
feature_names = list(raw_data.drop(["Phenotype", "ImageNumber"], axis=1).columns)

# Collect and print the names of the (up to) 30 highest-ranked features.
important_features = []
print("Most Important Features from Decision Tree")
for rank, feature in enumerate(importances_ranked[:30]):
    name = feature_names[feature]
    important_features.append(name)
    print(rank, name, f"{feature_importances[feature]:.2%}")


## Training KNN with N Most relevant Features

In [None]:
# Keep only the columns named in important_features.
selected_data = raw_data[important_features].to_numpy()

# Reuse the existing train/test row indices so the splits stay aligned with
# train_labels and test_labels.
selected_train_samples = np.stack([selected_data[row] for row in train_index])
selected_test_samples = np.stack([selected_data[row] for row in test_index])

# Standardise with statistics from the selected training rows only.
# Note: this rebinds ``sc``, replacing any scaler stored under that name.
sc = StandardScaler().fit(selected_train_samples)
selected_train_samples = sc.transform(selected_train_samples)
selected_test_samples = sc.transform(selected_test_samples)


In [None]:
# KNN on the selected features; the neighbour count equals the number of classes.
knn_selected = KNeighborsClassifier(n_neighbors=len(phenotype)).fit(
    selected_train_samples, train_labels
)

# Predict the held-out samples and summarise the result.
out = list(knn_selected.predict(selected_test_samples))
matrix, accuracy = confusion_matrix(out, test_labels)

print(f"accuracy of knn: {accuracy:>0.2%}")
show_matrix(matrix, axis_labels, 'KNN w/ Selected Features')

In [None]:
# Random forest with default settings on the standardised selected features.
rfc_select = RandomForestClassifier()
rfc_select.fit(selected_train_samples, train_labels)

# Predict the held-out samples and summarise the result.
out = list(rfc_select.predict(selected_test_samples))
matrix, accuracy = confusion_matrix(out, test_labels)

# This model is trained on the selected features, not on PCA scores, so the
# printed label says so (the previous text was copied from the PCA cell).
print(f"accuracy of selected-feature random forest: {accuracy:>0.2%}")
show_matrix(matrix, axis_labels, 'Random Forest w/ Selected Features')