## Setup

In [None]:
import pandas as pd
import numpy as np
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import precision_recall_fscore_support, classification_report
from sklearn.metrics import accuracy_score
import random

import matplotlib.pyplot as plt

In [None]:
# Read data.csv into `df`.
# NOTE(review): presumably the per-sample gene-expression table (the `gene_*` columns used
# further down) keyed by an unnamed first column — confirm against the file.
df = pd.read_csv('data.csv')

In [None]:
# Show the loaded table (the notebook renders the cell's last expression).
df

In [None]:
# Read labels.csv into `labels` and display it.
# NOTE(review): presumably one tumour-type label ('Class') per sample, keyed by the same
# unnamed first column as data.csv — confirm against the file.
labels = pd.read_csv('labels.csv')
labels

In [None]:
# Join the labels to the expression table on their shared 'Unnamed: 0' column, then give
# that key column the more meaningful name 'Sample'.
data = (
    labels
    .merge(df, on='Unnamed: 0')
    .rename(columns={'Unnamed: 0': 'Sample'})
)
data

## Cancer Gene Identification

In [None]:
# Non-null counts of every column per tumour class, with 'Class' turned back into a
# regular column so the result displays as a flat table.
rows_by_class = data.groupby(data['Class'])
types = rows_by_class.count().reset_index()
types

In [None]:
# Genes important to certain cancer types:
# BRCA1 = gene_1588 important in breast/prostate cancer (BRCA/PRAD)
# BRCA2 = gene_1589 important in breast/prostate cancer (BRCA/PRAD)
# FAM83A = gene_6160 important in lung cancer (LUAD)
# CLCA1 = gene_3813 important in colon cancer (COAD)
# WT1 = gene_19636 important in kidney cancer (KIRC)

In [None]:
# Keep only the class label, the sample id and the five marker genes listed above.
# `.copy()` makes this an independent frame, so columns added to it later never write
# into a slice of `data` (the cause of pandas' SettingWithCopyWarning).
cancer_genes = data[['Class','Sample','gene_1588','gene_1589','gene_6160','gene_3813','gene_19636']].copy()

In [None]:
# Z-score each marker gene into a '<gene>_s' column: (value - column mean) / column std,
# using pandas' default (sample) standard deviation.
# DataFrame.assign builds a new frame instead of writing into `cancer_genes` in place, so
# no SettingWithCopyWarning has to be silenced globally, and re-running the cell gives the
# same result because the scaled columns are always derived from the raw ones.
gene_columns = ['gene_1588', 'gene_1589', 'gene_6160', 'gene_3813', 'gene_19636']
cancer_genes = cancer_genes.assign(**{
    gene + '_s': (cancer_genes[gene] - cancer_genes[gene].mean()) / cancer_genes[gene].std()
    for gene in gene_columns
})
cancer_genes

## BRCA/PRAD Tumor Analysis

In [None]:
# Binary target for this section: 1 = BRCA or PRAD tumour, 0 = any other tumour type.
dfMap = {'PRAD':1, 'BRCA':1, 'LUAD':0, 'COAD':0, 'KIRC':0}
# Map the labels explicitly rather than via DataFrame.replace: replace only produces an
# integer column through pandas' silent object downcasting (deprecated), whereas
# map + astype always yields integers and fails loudly on a class missing from dfMap.
# assign returns a new frame, so `cancer_genes` itself is left untouched.
cancer_copy = cancer_genes.assign(Class=cancer_genes['Class'].map(dfMap).astype(int))

def get_scores(k, features=('gene_1588_s', 'gene_1589_s'), n=100, random_state=0):
    """Score a k-nearest-neighbour majority vote on a random sample of ``cancer_copy``.

    The neighbour index is fitted on every row of the global ``cancer_copy`` frame, so
    each sampled patient is also part of the fitted data. For every sampled patient the
    ``k`` nearest rows are looked up, the patient's own row is removed if it is among
    them, and the remaining rows vote with their ``Class`` value (0 or 1).

    Parameters
    ----------
    k : int
        ``n_neighbors`` passed to ``NearestNeighbors``. Because the patient's own row
        is normally one of the hits, at most ``k - 1`` other rows vote.
    features : sequence of str, optional
        Columns of ``cancer_copy`` used as coordinates.
    n : int, optional
        Number of patients sampled for scoring.
    random_state : int or None, optional
        Seed for the sample. The fixed default scores every ``k`` on the same patients
        and makes the result reproducible; pass ``None`` for a fresh random sample on
        each call.

    Returns
    -------
    tuple
        ``(precision, recall, fbeta, support)`` from
        ``precision_recall_fscore_support`` restricted to label 1, each an array with
        one entry. Undefined ratios are reported as 0 without a warning.
    """
    feature_cols = list(features)

    nn = NearestNeighbors(n_neighbors=k, metric='euclidean', algorithm='auto')
    nn.fit(cancer_copy[feature_cols].values)

    patients = cancer_copy.sample(n, random_state=random_state)
    patientsY = patients['Class'].values

    _, indices = nn.kneighbors(patients[feature_cols].values)

    # Plain arrays for positional lookup; `indices` holds row positions in cancer_copy.
    row_labels = cancer_copy.index.values
    classes = cancer_copy['Class'].values

    y_pred = []
    for patient_label, neighbour_pos in zip(patients.index, indices):
        # Exclude the patient's own row (matched by index label) when it is a neighbour.
        others = row_labels[neighbour_pos] != patient_label
        votes = classes[neighbour_pos][others]

        falseCancer = np.count_nonzero(votes == 0)
        trueCancer = np.count_nonzero(votes == 1)
        # Ties, and the no-voter case, are predicted as class 1.
        y_pred.append(0 if falseCancer > trueCancer else 1)

    return precision_recall_fscore_support(patientsY, y_pred, labels=[1], zero_division=0)

kvals = range(2, 40)

# Score every candidate k, keep the single label-1 entry of each returned metric array,
# then transpose so that scores == [precisions, recalls, f1s, supports], each aligned
# with kvals.
per_k = [tuple(metric[0] for metric in get_scores(k)) for k in kvals]
scores = list(zip(*per_k))

# The F-score is the third value returned by get_scores.
f1_by_k = scores[2]
plt.plot(kvals, f1_by_k)
plt.xlabel("K-Value for BRCA & PRAD")
plt.ylabel("F1 Score")
plt.savefig("BRCA_PRAD_kvalue.pdf")

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support

# Repeated hold-out evaluation of the BRCA/PRAD classifier at a fixed k.
k = 7
nn = NearestNeighbors(n_neighbors=k, metric='euclidean', algorithm='auto')

X = cancer_copy[['gene_1588_s', 'gene_1589_s']].values
y = cancer_copy['Class'].values  # 1-D label vector

precision, recall, f1 = [], [], []

# 20 random 90/10 splits. Seeding each split with its repeat number keeps the splits
# different from one another but identical from one notebook run to the next.
for split_seed in range(20):

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.1, random_state=split_seed)

    fit = nn.fit(X_train)
    distances, indices = fit.kneighbors(X_test)

    # Held-out patients are not in the training set, so there is no "self" neighbour to
    # remove. The vote still uses only the k-1 closest training samples (the farthest
    # hit is left out) so that the number of voters matches get_scores, where one of
    # the k hits is normally the patient itself.
    votes = y_train[indices[:, :-1]]
    healthy = (votes == 0).sum(axis=1)
    sick = (votes == 1).sum(axis=1)
    # Ties are predicted as class 1.
    y_pred = np.where(healthy > sick, 0, 1)

    (p, r, f, s) = precision_recall_fscore_support(y_test, y_pred, labels=[1], zero_division=0)

    # Each metric is a one-element array (labels=[1]); store the plain number.
    precision.append(float(p[0]))
    recall.append(float(r[0]))
    f1.append(float(f[0]))

# Print each per-split metric list under its heading, then the mean F1 over all splits.
for heading, values in (
    ("Precision Scores:", precision),
    ("Recall Scores:", recall),
    ("F1 Scores:", f1),
):
    print(heading)
    print(values)

meanF1 = sum(f1) / len(f1)
print("Mean F1 Score: " + str(meanF1))

## LUAD Tumor Analysis

In [None]:
# Binary target for this section: 1 = LUAD tumour, 0 = any other tumour type.
dfMap = {'PRAD':0, 'BRCA':0, 'LUAD':1, 'COAD':0, 'KIRC':0}
# Map the labels explicitly rather than via DataFrame.replace: replace only produces an
# integer column through pandas' silent object downcasting (deprecated), whereas
# map + astype always yields integers and fails loudly on a class missing from dfMap.
# assign returns a new frame, so `cancer_genes` itself is left untouched.
cancer_copy = cancer_genes.assign(Class=cancer_genes['Class'].map(dfMap).astype(int))

def get_scores(k, features=('gene_6160_s',), n=100, random_state=0):
    """Score a k-nearest-neighbour majority vote on a random sample of ``cancer_copy``.

    The neighbour index is fitted on every row of the global ``cancer_copy`` frame, so
    each sampled patient is also part of the fitted data. For every sampled patient the
    ``k`` nearest rows are looked up, the patient's own row is removed if it is among
    them, and the remaining rows vote with their ``Class`` value (0 or 1).

    Parameters
    ----------
    k : int
        ``n_neighbors`` passed to ``NearestNeighbors``. Because the patient's own row
        is normally one of the hits, at most ``k - 1`` other rows vote.
    features : sequence of str, optional
        Columns of ``cancer_copy`` used as coordinates.
    n : int, optional
        Number of patients sampled for scoring.
    random_state : int or None, optional
        Seed for the sample. The fixed default scores every ``k`` on the same patients
        and makes the result reproducible; pass ``None`` for a fresh random sample on
        each call.

    Returns
    -------
    tuple
        ``(precision, recall, fbeta, support)`` from
        ``precision_recall_fscore_support`` restricted to label 1, each an array with
        one entry. Undefined ratios are reported as 0 without a warning.
    """
    feature_cols = list(features)

    nn = NearestNeighbors(n_neighbors=k, metric='euclidean', algorithm='auto')
    nn.fit(cancer_copy[feature_cols].values)

    patients = cancer_copy.sample(n, random_state=random_state)
    patientsY = patients['Class'].values

    _, indices = nn.kneighbors(patients[feature_cols].values)

    # Plain arrays for positional lookup; `indices` holds row positions in cancer_copy.
    row_labels = cancer_copy.index.values
    classes = cancer_copy['Class'].values

    y_pred = []
    for patient_label, neighbour_pos in zip(patients.index, indices):
        # Exclude the patient's own row (matched by index label) when it is a neighbour.
        others = row_labels[neighbour_pos] != patient_label
        votes = classes[neighbour_pos][others]

        falseCancer = np.count_nonzero(votes == 0)
        trueCancer = np.count_nonzero(votes == 1)
        # Ties, and the no-voter case, are predicted as class 1.
        y_pred.append(0 if falseCancer > trueCancer else 1)

    return precision_recall_fscore_support(patientsY, y_pred, labels=[1], zero_division=0)

kvals = range(2, 50)

# Score every candidate k, keep the single label-1 entry of each returned metric array,
# then transpose so that scores == [precisions, recalls, f1s, supports], each aligned
# with kvals.
per_k = [tuple(metric[0] for metric in get_scores(k)) for k in kvals]
scores = list(zip(*per_k))

# The F-score is the third value returned by get_scores.
f1_by_k = scores[2]
plt.plot(kvals, f1_by_k)
plt.xlabel("K-Value for LUAD")
plt.ylabel("F1 Score")
plt.savefig("LUAD_kvalue.pdf")

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support

# Repeated hold-out evaluation of the LUAD classifier at a fixed k.
k = 44
nn = NearestNeighbors(n_neighbors=k, metric='euclidean', algorithm='auto')

X = cancer_copy[['gene_6160_s']].values
y = cancer_copy['Class'].values  # 1-D label vector

precision, recall, f1 = [], [], []

# 20 random 90/10 splits. Seeding each split with its repeat number keeps the splits
# different from one another but identical from one notebook run to the next.
for split_seed in range(20):

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.1, random_state=split_seed)

    fit = nn.fit(X_train)
    distances, indices = fit.kneighbors(X_test)

    # Held-out patients are not in the training set, so there is no "self" neighbour to
    # remove. The vote still uses only the k-1 closest training samples (the farthest
    # hit is left out) so that the number of voters matches get_scores, where one of
    # the k hits is normally the patient itself.
    votes = y_train[indices[:, :-1]]
    healthy = (votes == 0).sum(axis=1)
    sick = (votes == 1).sum(axis=1)
    # Ties are predicted as class 1.
    y_pred = np.where(healthy > sick, 0, 1)

    (p, r, f, s) = precision_recall_fscore_support(y_test, y_pred, labels=[1], zero_division=0)

    # Each metric is a one-element array (labels=[1]); store the plain number.
    precision.append(float(p[0]))
    recall.append(float(r[0]))
    f1.append(float(f[0]))

# Print each per-split metric list under its heading, then the mean F1 over all splits.
for heading, values in (
    ("Precision Scores:", precision),
    ("Recall Scores:", recall),
    ("F1 Scores:", f1),
):
    print(heading)
    print(values)

meanF1 = sum(f1) / len(f1)
print("Mean F1 Score: " + str(meanF1))

## KIRC Tumor Analysis

In [None]:
# Binary target for this section: 1 = KIRC tumour, 0 = any other tumour type.
dfMap = {'PRAD':0, 'BRCA':0, 'LUAD':0, 'COAD':0, 'KIRC':1}
# Map the labels explicitly rather than via DataFrame.replace: replace only produces an
# integer column through pandas' silent object downcasting (deprecated), whereas
# map + astype always yields integers and fails loudly on a class missing from dfMap.
# assign returns a new frame, so `cancer_genes` itself is left untouched.
cancer_copy = cancer_genes.assign(Class=cancer_genes['Class'].map(dfMap).astype(int))

def get_scores(k, features=('gene_19636_s',), n=100, random_state=0):
    """Score a k-nearest-neighbour majority vote on a random sample of ``cancer_copy``.

    The neighbour index is fitted on every row of the global ``cancer_copy`` frame, so
    each sampled patient is also part of the fitted data. For every sampled patient the
    ``k`` nearest rows are looked up, the patient's own row is removed if it is among
    them, and the remaining rows vote with their ``Class`` value (0 or 1).

    Parameters
    ----------
    k : int
        ``n_neighbors`` passed to ``NearestNeighbors``. Because the patient's own row
        is normally one of the hits, at most ``k - 1`` other rows vote.
    features : sequence of str, optional
        Columns of ``cancer_copy`` used as coordinates.
    n : int, optional
        Number of patients sampled for scoring.
    random_state : int or None, optional
        Seed for the sample. The fixed default scores every ``k`` on the same patients
        and makes the result reproducible; pass ``None`` for a fresh random sample on
        each call.

    Returns
    -------
    tuple
        ``(precision, recall, fbeta, support)`` from
        ``precision_recall_fscore_support`` restricted to label 1, each an array with
        one entry. Undefined ratios are reported as 0 without a warning.
    """
    feature_cols = list(features)

    nn = NearestNeighbors(n_neighbors=k, metric='euclidean', algorithm='auto')
    nn.fit(cancer_copy[feature_cols].values)

    patients = cancer_copy.sample(n, random_state=random_state)
    patientsY = patients['Class'].values

    _, indices = nn.kneighbors(patients[feature_cols].values)

    # Plain arrays for positional lookup; `indices` holds row positions in cancer_copy.
    row_labels = cancer_copy.index.values
    classes = cancer_copy['Class'].values

    y_pred = []
    for patient_label, neighbour_pos in zip(patients.index, indices):
        # Exclude the patient's own row (matched by index label) when it is a neighbour.
        others = row_labels[neighbour_pos] != patient_label
        votes = classes[neighbour_pos][others]

        falseCancer = np.count_nonzero(votes == 0)
        trueCancer = np.count_nonzero(votes == 1)
        # Ties, and the no-voter case, are predicted as class 1.
        y_pred.append(0 if falseCancer > trueCancer else 1)

    return precision_recall_fscore_support(patientsY, y_pred, labels=[1], zero_division=0)

kvals = range(2, 30)

# Score every candidate k, keep the single label-1 entry of each returned metric array,
# then transpose so that scores == [precisions, recalls, f1s, supports], each aligned
# with kvals.
per_k = [tuple(metric[0] for metric in get_scores(k)) for k in kvals]
scores = list(zip(*per_k))

# The F-score is the third value returned by get_scores.
f1_by_k = scores[2]
plt.plot(kvals, f1_by_k)
plt.xlabel("K-Value for KIRC")
plt.ylabel("F1 Score")
plt.savefig("KIRC_kvalue.pdf")

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support

# Repeated hold-out evaluation of the KIRC classifier at a fixed k.
k = 5
nn = NearestNeighbors(n_neighbors=k, metric='euclidean', algorithm='auto')

X = cancer_copy[['gene_19636_s']].values
y = cancer_copy['Class'].values  # 1-D label vector

precision, recall, f1 = [], [], []

# 20 random 90/10 splits. Seeding each split with its repeat number keeps the splits
# different from one another but identical from one notebook run to the next.
for split_seed in range(20):

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.1, random_state=split_seed)

    fit = nn.fit(X_train)
    distances, indices = fit.kneighbors(X_test)

    # Held-out patients are not in the training set, so there is no "self" neighbour to
    # remove. The vote still uses only the k-1 closest training samples (the farthest
    # hit is left out) so that the number of voters matches get_scores, where one of
    # the k hits is normally the patient itself.
    votes = y_train[indices[:, :-1]]
    healthy = (votes == 0).sum(axis=1)
    sick = (votes == 1).sum(axis=1)
    # Ties are predicted as class 1.
    y_pred = np.where(healthy > sick, 0, 1)

    (p, r, f, s) = precision_recall_fscore_support(y_test, y_pred, zero_division=0, labels=[1])

    # Each metric is a one-element array (labels=[1]); store the plain number.
    precision.append(float(p[0]))
    recall.append(float(r[0]))
    f1.append(float(f[0]))

# Print each per-split metric list under its heading, then the mean F1 over all splits.
for heading, values in (
    ("Precision Scores:", precision),
    ("Recall Scores:", recall),
    ("F1 Scores:", f1),
):
    print(heading)
    print(values)

meanF1 = sum(f1) / len(f1)
print("Mean F1 Score: " + str(meanF1))

## COAD Tumor Analysis

In [None]:
# Binary target for this section: 1 = COAD tumour, 0 = any other tumour type.
dfMap = {'PRAD':0, 'BRCA':0, 'LUAD':0, 'COAD':1, 'KIRC':0}
# Map the labels explicitly rather than via DataFrame.replace: replace only produces an
# integer column through pandas' silent object downcasting (deprecated), whereas
# map + astype always yields integers and fails loudly on a class missing from dfMap.
# assign returns a new frame, so `cancer_genes` itself is left untouched.
cancer_copy = cancer_genes.assign(Class=cancer_genes['Class'].map(dfMap).astype(int))

def get_scores(k, features=('gene_3813_s',), n=100, random_state=0):
    """Score a k-nearest-neighbour majority vote on a random sample of ``cancer_copy``.

    The neighbour index is fitted on every row of the global ``cancer_copy`` frame, so
    each sampled patient is also part of the fitted data. For every sampled patient the
    ``k`` nearest rows are looked up, the patient's own row is removed if it is among
    them, and the remaining rows vote with their ``Class`` value (0 or 1).

    Parameters
    ----------
    k : int
        ``n_neighbors`` passed to ``NearestNeighbors``. Because the patient's own row
        is normally one of the hits, at most ``k - 1`` other rows vote.
    features : sequence of str, optional
        Columns of ``cancer_copy`` used as coordinates.
    n : int, optional
        Number of patients sampled for scoring.
    random_state : int or None, optional
        Seed for the sample. The fixed default scores every ``k`` on the same patients
        and makes the result reproducible; pass ``None`` for a fresh random sample on
        each call.

    Returns
    -------
    tuple
        ``(precision, recall, fbeta, support)`` from
        ``precision_recall_fscore_support`` restricted to label 1, each an array with
        one entry. Undefined ratios are reported as 0 without a warning.
    """
    feature_cols = list(features)

    nn = NearestNeighbors(n_neighbors=k, metric='euclidean', algorithm='auto')
    nn.fit(cancer_copy[feature_cols].values)

    patients = cancer_copy.sample(n, random_state=random_state)
    patientsY = patients['Class'].values

    _, indices = nn.kneighbors(patients[feature_cols].values)

    # Plain arrays for positional lookup; `indices` holds row positions in cancer_copy.
    row_labels = cancer_copy.index.values
    classes = cancer_copy['Class'].values

    y_pred = []
    for patient_label, neighbour_pos in zip(patients.index, indices):
        # Exclude the patient's own row (matched by index label) when it is a neighbour.
        others = row_labels[neighbour_pos] != patient_label
        votes = classes[neighbour_pos][others]

        falseCancer = np.count_nonzero(votes == 0)
        trueCancer = np.count_nonzero(votes == 1)
        # Ties, and the no-voter case, are predicted as class 1.
        y_pred.append(0 if falseCancer > trueCancer else 1)

    return precision_recall_fscore_support(patientsY, y_pred, labels=[1], zero_division=0)

kvals = range(2, 50)

# Score every candidate k, keep the single label-1 entry of each returned metric array,
# then transpose so that scores == [precisions, recalls, f1s, supports], each aligned
# with kvals.
per_k = [tuple(metric[0] for metric in get_scores(k)) for k in kvals]
scores = list(zip(*per_k))

# The F-score is the third value returned by get_scores.
f1_by_k = scores[2]
plt.plot(kvals, f1_by_k)
plt.xlabel("K-Value for COAD")
plt.ylabel("F1 Score")
plt.savefig("COAD_kvalue.pdf")

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support

# Repeated hold-out evaluation of the COAD classifier at a fixed k.
k = 34
nn = NearestNeighbors(n_neighbors=k, metric='euclidean', algorithm='auto')

X = cancer_copy[['gene_3813_s']].values
y = cancer_copy['Class'].values  # 1-D label vector

precision, recall, f1 = [], [], []

# 20 random 90/10 splits. Seeding each split with its repeat number keeps the splits
# different from one another but identical from one notebook run to the next.
for split_seed in range(20):

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.1, random_state=split_seed)

    fit = nn.fit(X_train)
    distances, indices = fit.kneighbors(X_test)

    # Held-out patients are not in the training set, so there is no "self" neighbour to
    # remove. The vote still uses only the k-1 closest training samples (the farthest
    # hit is left out) so that the number of voters matches get_scores, where one of
    # the k hits is normally the patient itself.
    votes = y_train[indices[:, :-1]]
    healthy = (votes == 0).sum(axis=1)
    sick = (votes == 1).sum(axis=1)
    # Ties are predicted as class 1.
    y_pred = np.where(healthy > sick, 0, 1)

    (p, r, f, s) = precision_recall_fscore_support(y_test, y_pred, zero_division=0, labels=[1])

    # Each metric is a one-element array (labels=[1]); store the plain number.
    precision.append(float(p[0]))
    recall.append(float(r[0]))
    f1.append(float(f[0]))

# Print each per-split metric list under its heading, then the mean F1 over all splits.
for heading, values in (
    ("Precision Scores:", precision),
    ("Recall Scores:", recall),
    ("F1 Scores:", f1),
):
    print(heading)
    print(values)

meanF1 = sum(f1) / len(f1)
print("Mean F1 Score: " + str(meanF1))