In [23]:
import altair as alt
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn
import sklearn.metrics
import sklearn.model_selection
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

In [3]:
# Load the cleaned feature table and the target table. Both files are
# tab-separated and their first column is used as the row index.
tsv_options = dict(sep='\t', index_col=0)
X = pd.read_csv('inputs_cleaned.tsv', **tsv_options)
y = pd.read_csv('targets_cleaned.tsv', **tsv_options)
# Show which target columns are available.
y.columns

Index(['Recurrence status (1, yes; 0, no)',
       'Survial status (1, dead; 0, alive)', 'histologic_grade',
       'histologic_type',
       'measure_of_success_of_outcome_at_last_available_follow-up',
       'pathologic_staging_primary_tumor'],
      dtype='object')

# predicting survival 

In [25]:
# Reload the raw tables so this cell does not depend on state left behind by
# earlier cells.
X = pd.read_csv('inputs_cleaned.tsv', sep='\t', index_col=0)
y = pd.read_csv('targets_cleaned.tsv', sep='\t', index_col=0)

# One-hot encode the cancer type and place the indicator columns in front of
# the remaining features.
cancer_type_one_hot = pd.get_dummies(X["cancer_type"])
X = X.drop(columns='cancer_type')
X = cancer_type_one_hot.join(X)

# Cast the boolean "above_reg_line_*" flags to ints. astype() with a
# per-column mapping returns a frame with the requested dtypes; how an
# all-rows `.loc[:, cols] = ...` assignment treats the existing dtype has
# changed between pandas versions, so it is avoided here.
above_reg_line_cols = X.columns[X.columns.str.startswith("above_reg_line_")]
X = X.astype({col: int for col in above_reg_line_cols})

# Survival target: missing labels are kept as a separate class encoded as 2.
# The column is selected as a Series (not a one-column frame) so that y ends
# up 1-D; a column-vector y makes the classifier's fit() emit a
# DataConversionWarning.
y = y['Survial status (1, dead; 0, alive)'].fillna(2)

y = y.to_numpy()
X = X.to_numpy()

# Fixed random_state keeps the 67/33 split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

# Default KNeighborsClassifier; the last expression displays the test score.
clf = KNeighborsClassifier()
clf.fit(X_train, y_train)
clf.score(X_test, y_test)

  clf.fit(X_train, y_train)


0.7777777777777778

# all targets


In [20]:
# Rebuild the feature matrix from the raw files so this section starts from a
# known state (the survival section converted X and y to NumPy arrays).
X = pd.read_csv('inputs_cleaned.tsv', sep='\t', index_col=0)
y = pd.read_csv('targets_cleaned.tsv', sep='\t', index_col=0)

# One-hot encode the cancer type and place the indicator columns in front of
# the remaining features.
cancer_type_one_hot = pd.get_dummies(X["cancer_type"])
X = X.drop(columns='cancer_type')
X = cancer_type_one_hot.join(X)

# Cast the boolean "above_reg_line_*" flags to ints. astype() with a
# per-column mapping returns a frame with the requested dtypes; how an
# all-rows `.loc[:, cols] = ...` assignment treats the existing dtype has
# changed between pandas versions, so it is avoided here.
above_reg_line_cols = X.columns[X.columns.str.startswith("above_reg_line_")]
X = X.astype({col: int for col in above_reg_line_cols})

# One one-hot encoded label frame per target column, keyed by the column name.
ys = {col: pd.get_dummies(y[col]) for col in y.columns}

In [165]:
def testingk(X, ys, k):
    """Cross-validate a k-nearest-neighbours classifier on every target.

    Parameters
    ----------
    X : feature matrix handed to ``cross_validate``.
    ys : dict mapping a target name to the labels for that target.
    k : value used for ``n_neighbors``.

    Returns
    -------
    dict mapping each target name to the result of a 10-fold
    ``cross_validate`` run for a fresh ``KNeighborsClassifier``.
    """
    return {
        target: sklearn.model_selection.cross_validate(
            KNeighborsClassifier(n_neighbors=k), X, labels, cv=10
        )
        for target, labels in ys.items()
    }

In [166]:
# Mean cross-validated test score per target with k=3 neighbours.
k_acc = testingk(X, ys, k=3)
for target, res in k_acc.items():
    mean_acc = res["test_score"].mean()
    # ".2f" gives a fixed two-decimal rendering for every value, NaN included.
    print(f"{target: >60}: {mean_acc:.2f}")

                           Recurrence status (1, yes; 0, no): 0.74
                          Survial status (1, dead; 0, alive): 0.75
                                            histologic_grade: 0.40
                                             histologic_type: 0.39
   measure_of_success_of_outcome_at_last_available_follow-up: 0.30
                            pathologic_staging_primary_tumor: 0.30


In [168]:
# Mean cross-validated test score per target with k=10 neighbours.
k_acc = testingk(X, ys, k=10)
for target, res in k_acc.items():
    mean_acc = res["test_score"].mean()
    # ".2f" gives a fixed two-decimal rendering for every value, NaN included.
    print(f"{target: >60}: {mean_acc:.2f}")

                           Recurrence status (1, yes; 0, no): 0.79
                          Survial status (1, dead; 0, alive): 0.81
                                            histologic_grade: 0.25
                                             histologic_type: 0.22
   measure_of_success_of_outcome_at_last_available_follow-up: 0.21
                            pathologic_staging_primary_tumor: 0.15


In [169]:
# Mean cross-validated test score per target with k=20 neighbours.
k_acc = testingk(X, ys, k=20)
for target, res in k_acc.items():
    mean_acc = res["test_score"].mean()
    # ".2f" gives a fixed two-decimal rendering for every value, NaN included.
    print(f"{target: >60}: {mean_acc:.2f}")

                           Recurrence status (1, yes; 0, no): 0.81
                          Survial status (1, dead; 0, alive): 0.83
                                            histologic_grade: 0.31
                                             histologic_type: 0.12
   measure_of_success_of_outcome_at_last_available_follow-up: 0.23
                            pathologic_staging_primary_tumor: 0.16


In [172]:
# Mean cross-validated test score per target with k=30 neighbours.
k_acc = testingk(X, ys, k=30)
for target, res in k_acc.items():
    mean_acc = res["test_score"].mean()
    # ".2f" gives a fixed two-decimal rendering for every value, NaN included.
    print(f"{target: >60}: {mean_acc:.2f}")

                           Recurrence status (1, yes; 0, no): 0.81
                          Survial status (1, dead; 0, alive): 0.83
                                            histologic_grade: 0.30
                                             histologic_type: 0.03
   measure_of_success_of_outcome_at_last_available_follow-up: 0.24
                            pathologic_staging_primary_tumor: 0.15


In [170]:
# Mean cross-validated test score per target with k=40 neighbours.
k_acc = testingk(X, ys, k=40)
for target, res in k_acc.items():
    mean_acc = res["test_score"].mean()
    # ".2f" gives a fixed two-decimal rendering for every value, NaN included.
    print(f"{target: >60}: {mean_acc:.2f}")

                           Recurrence status (1, yes; 0, no): 0.81
                          Survial status (1, dead; 0, alive): 0.83
                                            histologic_grade: 0.33
                                             histologic_type: 0.01
   measure_of_success_of_outcome_at_last_available_follow-up: 0.24
                            pathologic_staging_primary_tumor: 0.13


In [171]:
# Mean cross-validated test score per target with k=100 neighbours.
k_acc = testingk(X, ys, k=100)
for target, res in k_acc.items():
    mean_acc = res["test_score"].mean()
    # ".2f" gives a fixed two-decimal rendering for every value, NaN included.
    print(f"{target: >60}: {mean_acc:.2f}")

                           Recurrence status (1, yes; 0, no): 0.81
                          Survial status (1, dead; 0, alive): 0.83
                                            histologic_grade: 0.38
                                             histologic_type: 0.00
   measure_of_success_of_outcome_at_last_available_follow-up: 0.29
                            pathologic_staging_primary_tumor: 0.10


# Optimized

In [101]:
def random_search(
    X,
    ys,
    n_neighbors,
    algorithm,
    weights,
    leaf_size,
    p,
    n_iter,
    optimized_scores_list,
    optimized_params_list,
    optimized_charts_list,
    summary="mean",
):
    """Run a randomized KNN hyper-parameter search per target and record the results.

    For every entry of ``ys`` a ``RandomizedSearchCV`` over a
    ``KNeighborsClassifier`` is fitted with 10-fold cross-validation. Each
    candidate is scored on accuracy and on sample-averaged precision and
    recall; the search is refitted on accuracy.

    Parameters
    ----------
    X : feature matrix passed to ``RandomizedSearchCV.fit``.
    ys : dict mapping a target name to its label matrix. The precision and
        recall scorers use ``average="samples"``, so each label matrix is
        presumably a one-hot / indicator frame -- confirm against the caller.
    n_neighbors, algorithm, weights, leaf_size, p : candidate values for the
        ``KNeighborsClassifier`` parameters of the same name.
    n_iter : number of parameter settings sampled per target.
    optimized_scores_list : list; a (target x metric) score frame is appended.
    optimized_params_list : list; a dict ``{target: best_params_}`` is appended.
    optimized_charts_list : list; an Altair bar chart of the scores is appended.
    summary : ``"mean"`` (default) records each metric averaged over *all*
        sampled candidates, which includes weak settings and therefore
        understates the tuned model. ``"best"`` records the metrics of the
        candidate selected by the search (``best_index_``).

    Returns
    -------
    None. Results are delivered by appending to the three lists.

    Raises
    ------
    ValueError
        If ``summary`` is neither ``"mean"`` nor ``"best"``.
    """
    if summary not in ("mean", "best"):
        raise ValueError(f"summary must be 'mean' or 'best', got {summary!r}")

    metric_columns = ['mean_test_accuracy', 'mean_test_precision', 'mean_test_recall']

    def _sample_averaged(metric):
        # Scorer that averages the metric over samples; undefined cases count as 0.
        return sklearn.metrics.make_scorer(metric, average="samples", zero_division=0)

    params = {}
    summaries = {}
    for target, y in ys.items():
        search = sklearn.model_selection.RandomizedSearchCV(
            estimator=KNeighborsClassifier(),
            param_distributions={
                "n_neighbors": n_neighbors,
                'weights': weights,
                "algorithm": algorithm,
                "leaf_size": leaf_size,
                "p": p,
            },
            n_iter=n_iter,
            random_state=0,  # same sampled candidates on every run
            cv=10,
            n_jobs=-1,  # use all processors
            scoring={
                "accuracy": "accuracy",
                "precision": _sample_averaged(sklearn.metrics.precision_score),
                "recall": _sample_averaged(sklearn.metrics.recall_score),
            },
            refit="accuracy",
        ).fit(X, y)

        params[target] = search.best_params_

        # One row per sampled candidate, one column per metric.
        per_candidate = pd.DataFrame(search.cv_results_)[metric_columns]
        if summary == "best":
            summaries[target] = per_candidate.iloc[search.best_index_]
        else:
            summaries[target] = per_candidate.mean(axis=0)

    # Metrics as rows, targets as columns (in the iteration order of ``ys``).
    scores_df = pd.DataFrame(summaries)

    # Long format (metric, target, score) for the faceted bar chart.
    scores_df.index.name = "metric"
    chart_df = scores_df.reset_index().melt(
        id_vars="metric",
        var_name="target",
        value_name="score",
    )

    chart = alt.Chart(chart_df).mark_bar().encode(
        x="target",
        y=alt.Y(
            "score",
            scale=alt.Scale(
                domain=[0, 1]
            )
        ),
        color="target",
        column="metric"
    )

    # Stored table: one row per target (sorted by name), one column per metric.
    scores_df.columns.name = "target"
    scores_df.index.name = None
    scores_df = scores_df.T.sort_index()

    optimized_scores_list.append(scores_df)
    optimized_params_list.append(params)
    optimized_charts_list.append(chart)


       

In [157]:
# Fresh, independent accumulators that every random_search() call appends to:
# score tables, best-parameter dicts and charts, in call order.
optimized_scores_list, optimized_params_list, optimized_charts_list = [], [], []

In [None]:
#test different ranges of k

In [143]:
# Search small neighbourhoods. The range starts at 1 because
# KNeighborsClassifier needs at least one neighbour; a sampled value of 0 can
# only produce a failed fit.
random_search(
    X,
    ys,
    n_neighbors=range(1, 10),
    weights=['uniform', 'distance'],
    algorithm=['auto', 'ball_tree', 'kd_tree', 'brute'],
    leaf_size=[20, 30, 40],
    p=[1, 2],
    n_iter=10,
    optimized_scores_list=optimized_scores_list,
    optimized_params_list=optimized_params_list,
    optimized_charts_list=optimized_charts_list
)


In [144]:
# Neighbourhood sizes 10-14, coarse leaf sizes.
random_search(
    X,
    ys,
    n_neighbors=range(10, 15),
    weights=["uniform", "distance"],
    algorithm=["auto", "ball_tree", "kd_tree", "brute"],
    leaf_size=[20, 30, 40],
    p=[1, 2],
    n_iter=10,
    optimized_scores_list=optimized_scores_list,
    optimized_params_list=optimized_params_list,
    optimized_charts_list=optimized_charts_list,
)



In [145]:
# Neighbourhood sizes 15-19, coarse leaf sizes.
random_search(
    X,
    ys,
    n_neighbors=range(15, 20),
    weights=["uniform", "distance"],
    algorithm=["auto", "ball_tree", "kd_tree", "brute"],
    leaf_size=[20, 30, 40],
    p=[1, 2],
    n_iter=10,
    optimized_scores_list=optimized_scores_list,
    optimized_params_list=optimized_params_list,
    optimized_charts_list=optimized_charts_list,
)



In [None]:
# k stopped improving after 20

In [None]:
#test different ranges of leaf size

In [129]:
# Leaf sizes 10-19 with neighbourhood sizes 15-19.
random_search(
    X,
    ys,
    n_neighbors=range(15, 20),
    weights=["uniform", "distance"],
    algorithm=["auto", "ball_tree", "kd_tree", "brute"],
    leaf_size=range(10, 20),
    p=[1, 2],
    n_iter=10,
    optimized_scores_list=optimized_scores_list,
    optimized_params_list=optimized_params_list,
    optimized_charts_list=optimized_charts_list,
)



In [131]:
# Leaf sizes 20-29 with neighbourhood sizes 15-19.
random_search(
    X,
    ys,
    n_neighbors=range(15, 20),
    weights=["uniform", "distance"],
    algorithm=["auto", "ball_tree", "kd_tree", "brute"],
    leaf_size=range(20, 30),
    p=[1, 2],
    n_iter=10,
    optimized_scores_list=optimized_scores_list,
    optimized_params_list=optimized_params_list,
    optimized_charts_list=optimized_charts_list,
)

In [133]:
# Leaf sizes 30-39 with neighbourhood sizes 15-19.
random_search(
    X,
    ys,
    n_neighbors=range(15, 20),
    weights=["uniform", "distance"],
    algorithm=["auto", "ball_tree", "kd_tree", "brute"],
    leaf_size=range(30, 40),
    p=[1, 2],
    n_iter=10,
    optimized_scores_list=optimized_scores_list,
    optimized_params_list=optimized_params_list,
    optimized_charts_list=optimized_charts_list,
)

In [None]:
# Wide neighbourhood range (10-99) with leaf sizes 30-39.
random_search(
    X,
    ys,
    n_neighbors=range(10, 100),
    weights=["uniform", "distance"],
    algorithm=["auto", "ball_tree", "kd_tree", "brute"],
    leaf_size=range(30, 40),
    p=[1, 2],
    n_iter=10,
    optimized_scores_list=optimized_scores_list,
    optimized_params_list=optimized_params_list,
    optimized_charts_list=optimized_charts_list,
)

In [None]:
#test n_iter

In [158]:
# Broad search space sampled with 10 candidates per target.
random_search(
    X,
    ys,
    n_neighbors=range(10, 200),
    weights=["uniform", "distance"],
    algorithm=["auto", "ball_tree", "kd_tree", "brute"],
    leaf_size=range(2, 50),
    p=[1, 2],
    n_iter=10,
    optimized_scores_list=optimized_scores_list,
    optimized_params_list=optimized_params_list,
    optimized_charts_list=optimized_charts_list,
)

In [159]:
# Same broad search space, sampled with 100 candidates per target.
random_search(
    X,
    ys,
    n_neighbors=range(10, 200),
    weights=["uniform", "distance"],
    algorithm=["auto", "ball_tree", "kd_tree", "brute"],
    leaf_size=range(2, 50),
    p=[1, 2],
    n_iter=100,
    optimized_scores_list=optimized_scores_list,
    optimized_params_list=optimized_params_list,
    optimized_charts_list=optimized_charts_list,
)

In [160]:
# Scores of the second-to-last search (the n_iter=10 run on the broad space).
# Indexing from the end stays correct when the notebook is run top to bottom,
# where the earlier searches have also appended to the list.
optimized_scores_list[-2]

Unnamed: 0_level_0,mean_test_accuracy,mean_test_precision,mean_test_recall
target,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"Recurrence status (1, yes; 0, no)",0.812879,0.812879,0.812879
"Survial status (1, dead; 0, alive)",0.825284,0.825284,0.825284
histologic_grade,0.378731,0.378731,0.378731
histologic_type,0.003636,0.003636,0.003636
measure_of_success_of_outcome_at_last_available_follow-up,0.299687,0.299687,0.299687
pathologic_staging_primary_tumor,0.107405,0.107405,0.107405


In [161]:
# Scores of the last search (the n_iter=100 run on the broad space).
# Indexing from the end stays correct when the notebook is run top to bottom,
# where the earlier searches have also appended to the list.
optimized_scores_list[-1]

Unnamed: 0_level_0,mean_test_accuracy,mean_test_precision,mean_test_recall
target,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"Recurrence status (1, yes; 0, no)",0.812571,0.812571,0.812571
"Survial status (1, dead; 0, alive)",0.825039,0.825039,0.825039
histologic_grade,0.38822,0.38822,0.38822
histologic_type,0.016351,0.016351,0.016351
measure_of_success_of_outcome_at_last_available_follow-up,0.306613,0.306613,0.306613
pathologic_staging_primary_tumor,0.104345,0.104345,0.104345


In [173]:
# LaTeX table of the last search (the n_iter=100 run), rounded to 4 decimals.
# The last entry is used so the table matches that run on a top-to-bottom
# execution as well.
print(optimized_scores_list[-1].round(4).to_latex())

\begin{tabular}{lrrr}
\toprule
{} &  mean\_test\_accuracy &  mean\_test\_precision &  mean\_test\_recall \\
target                                             &                     &                      &                   \\
\midrule
Recurrence status (1, yes; 0, no)                  &              0.8126 &               0.8126 &            0.8126 \\
Survial status (1, dead; 0, alive)                 &              0.8250 &               0.8250 &            0.8250 \\
histologic\_grade                                   &              0.3882 &               0.3882 &            0.3882 \\
histologic\_type                                    &              0.0164 &               0.0164 &            0.0164 \\
measure\_of\_success\_of\_outcome\_at\_last\_available... &              0.3066 &               0.3066 &            0.3066 \\
pathologic\_staging\_primary\_tumor                   &              0.1043 &               0.1043 &            0.1043 \\
\bottomrule
\end{tabular}

