In [23]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
import sklearn

In [3]:
# Read the feature table and the target table. Both are tab-separated files
# whose first column is used as the row index.
X, y = (
    pd.read_csv(tsv_path, sep='\t', index_col=0)
    for tsv_path in ('inputs_cleaned.tsv', 'targets_cleaned.tsv')
)
# Display the names of the available target columns.
y.columns

Index(['Recurrence status (1, yes; 0, no)',
       'Survial status (1, dead; 0, alive)', 'histologic_grade',
       'histologic_type',
       'measure_of_success_of_outcome_at_last_available_follow-up',
       'pathologic_staging_primary_tumor'],
      dtype='object')

# Predicting survival status

In [25]:
# Reload both tables so this cell starts from the files on disk rather than
# from whatever state X / y were left in by other cells.
X = pd.read_csv('inputs_cleaned.tsv', sep='\t', index_col=0)
y = pd.read_csv('targets_cleaned.tsv', sep='\t', index_col=0)

# One-hot encode cancer_type and place the indicator columns in front of the
# remaining features.
cancer_type_one_hot = pd.get_dummies(X["cancer_type"])
X = X.drop(columns='cancer_type')
X = cancer_type_one_hot.join(X)

# Now we just need to cast bools to ints.
# Build the converted columns with astype and rebind X instead of writing them
# back through `X.loc[:, cols] = ...`: pandas 2.x treats that form as an
# in-place write that first tries to keep each column's existing dtype, so the
# cast is not guaranteed to stick.
above_reg_line_cols = X.columns[X.columns.str.startswith("above_reg_line_")]
X = X.astype({col: int for col in above_reg_line_cols})

# for survival fill missing with 2
# Selecting the column as a Series (single brackets) yields a 1-D label array.
# The previous (n_samples, 1) column vector is what made `clf.fit` emit a
# DataConversionWarning.
y = y['Survial status (1, dead; 0, alive)'].fillna(2)

y = y.to_numpy()
X = X.to_numpy()

# Hold out a third of the samples; fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

# Default k-nearest-neighbours classifier; the cell's output is the accuracy
# on the held-out split.
clf = KNeighborsClassifier()
clf.fit(X_train, y_train)
clf.score(X_test, y_test)

  clf.fit(X_train, y_train)


0.7777777777777778

# All targets


In [20]:
# Reload both tables so this cell starts from the files on disk.
X = pd.read_csv('inputs_cleaned.tsv', sep='\t', index_col=0)
y = pd.read_csv('targets_cleaned.tsv', sep='\t', index_col=0)

# One-hot encode cancer_type and place the indicator columns in front of the
# remaining features.
cancer_type_one_hot = pd.get_dummies(X["cancer_type"])
X = X.drop(columns='cancer_type')
X = cancer_type_one_hot.join(X)

# Now we just need to cast bools to ints.
# Build the converted columns with astype and rebind X instead of writing them
# back through `X.loc[:, cols] = ...`: pandas 2.x treats that form as an
# in-place write that first tries to keep each column's existing dtype, so the
# cast is not guaranteed to stick.
above_reg_line_cols = X.columns[X.columns.str.startswith("above_reg_line_")]
X = X.astype({col: int for col in above_reg_line_cols})

# One indicator matrix per target column, keyed by the target's column name.
# NOTE(review): with get_dummies' defaults a missing target value gets no
# indicator column of its own, whereas the survival-only cell encodes missing
# values as an explicit class (2) - confirm which treatment is intended.
ys = {col: pd.get_dummies(y[col]) for col in y.columns}

In [28]:
def baseline(X, ys):
    """Cross-validate a default KNeighborsClassifier against every target.

    Parameters
    ----------
    X : feature matrix shared by all targets.
    ys : dict mapping a target name to the labels for that target.

    Returns
    -------
    dict mapping each target name to the result of a 10-fold
    ``sklearn.model_selection.cross_validate`` run for that target.
    """
    # A fresh, default-configured classifier is created for each target.
    return {
        target: sklearn.model_selection.cross_validate(
            KNeighborsClassifier(), X, y, cv=10
        )
        for target, y in ys.items()
    }

In [29]:
# Cross-validation results of the default KNN model, one entry per target.
baseline_acc = baseline(X=X, ys=ys)

In [31]:
# Print the mean test score across folds for each target, with the target
# name right-aligned in a 60-character column.
for target in baseline_acc:
    res = baseline_acc[target]
    mean_acc = res["test_score"].mean()
    print(f"{target: >60}: {round(mean_acc, 2):0<4}")

                           Recurrence status (1, yes; 0, no): 0.78
                          Survial status (1, dead; 0, alive): 0.79
                                            histologic_grade: 0.41
                                             histologic_type: 0.34
   measure_of_success_of_outcome_at_last_available_follow-up: 0.27
                            pathologic_staging_primary_tumor: 0.28


# Optimized: randomized hyperparameter search

In [34]:
def random_search(
    X,
    ys,
    n_neighbors,
    algorithm,
    leaf_size,
    p,
    n_iter,
    cv=None,
    random_state=0,
):
    """Run a randomized KNN hyperparameter search separately for every target.

    Parameters
    ----------
    X : feature matrix shared by all targets.
    ys : dict mapping a target name to the labels for that target.
    n_neighbors, algorithm, leaf_size, p : candidate values for the
        KNeighborsClassifier parameters of the same name.
    n_iter : number of parameter settings sampled per target.
    cv : forwarded to ``RandomizedSearchCV``. The default ``None`` keeps
        scikit-learn's own default splitting, which is what this function used
        before the parameter existed. Pass ``cv=10`` to score with the same
        number of folds as ``baseline``, which hard-codes ``cv=10``.
    random_state : seed forwarded to ``RandomizedSearchCV``; defaults to the
        previously hard-coded value 0.

    Returns
    -------
    (scores, params) : two dicts keyed by target name, holding the search's
        ``best_score_`` and ``best_params_`` respectively.
    """
    # The same candidate grid is used for every target.
    param_distributions = {
        "n_neighbors": n_neighbors,
        "algorithm": algorithm,
        "leaf_size": leaf_size,
        "p": p,
    }

    scores = {}
    params = {}
    for target, y in ys.items():
        search = sklearn.model_selection.RandomizedSearchCV(
            estimator=KNeighborsClassifier(),
            param_distributions=param_distributions,
            n_iter=n_iter,
            cv=cv,
            random_state=random_state,
        ).fit(X, y)

        scores[target] = search.best_score_
        params[target] = search.best_params_

    return scores, params

In [40]:
# Randomized search sampling 10 parameter settings per target.
optimized_acc, optimized_params = random_search(
    X=X,
    ys=ys,
    n_neighbors=[3, 6, 8, 10, 20],
    algorithm=['auto', 'ball_tree', 'kd_tree', 'brute'],
    leaf_size=[20, 30, 40],
    p=[1, 2],
    n_iter=10,
)

# Print the best cross-validated score found for each target.
for target in optimized_acc:
    res = optimized_acc[target]
    print(f"{target: >60}: {round(res, 2):0<4}")

                           Recurrence status (1, yes; 0, no): 0.81
                          Survial status (1, dead; 0, alive): 0.83
                                            histologic_grade: 0.40
                                             histologic_type: 0.24
   measure_of_success_of_outcome_at_last_available_follow-up: 0.33
                            pathologic_staging_primary_tumor: 0.25


In [41]:
# Randomized search sampling 20 parameter settings per target.
optimized_acc, optimized_params = random_search(
    X=X,
    ys=ys,
    n_neighbors=[3, 6, 8, 10, 20],
    algorithm=['auto', 'ball_tree', 'kd_tree', 'brute'],
    leaf_size=[20, 30, 40],
    p=[1, 2],
    n_iter=20,
)

# Print the best cross-validated score found for each target.
for target in optimized_acc:
    res = optimized_acc[target]
    print(f"{target: >60}: {round(res, 2):0<4}")

                           Recurrence status (1, yes; 0, no): 0.81
                          Survial status (1, dead; 0, alive): 0.83
                                            histologic_grade: 0.40
                                             histologic_type: 0.24
   measure_of_success_of_outcome_at_last_available_follow-up: 0.33
                            pathologic_staging_primary_tumor: 0.25
