Import packages for training KNN

In [1]:
import tensorflow as tf

import pandas as pd

import numpy as np

import seaborn as sns

import matplotlib.pyplot as plt

from sklearn.neighbors import KNeighborsClassifier

from sklearn.model_selection import KFold

from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score, classification_report




**Load data, optionally dropping or keeping only the highest-variance columns**

In [2]:
def load(per = 0, isVar = False):
    """Read the training CSV and return features, labels and the selected columns.

    Preprocessing applied to the features: the 'Id' column is dropped, 'Class'
    is split off as the labels, 'EJ' is mapped from 'A'/'B' to 0/1 and missing
    values are replaced by the mean of their column.

    Parameters:
    per (float): Fraction of the feature columns, ranked by variance from
        highest to lowest, to select. 0 skips the selection step entirely.
    isVar (bool): When False the selected high-variance columns are dropped;
        when True only the selected high-variance columns are kept.

    Returns:
    tuple: (features, labels, selected_columns). selected_columns is None
        when per == 0, otherwise the index of the selected column names.
    """
    raw = pd.read_csv('/kaggle/input/icr-identify-age-related-conditions/train.csv')
    raw = raw.drop(columns='Id')

    labels = raw['Class']

    # Encode the categorical EJ column (A -> 0, B -> 1), then mean-impute nulls
    features = raw.drop(columns='Class').assign(
        EJ=lambda frame: frame['EJ'].map({'A': 0, 'B': 1})
    )
    features = features.fillna(features.mean())

    if per == 0:
        return features, labels, None

    # Rank columns by variance and take the top `per` fraction of them
    column_variance = features.var()
    n_selected = int(len(column_variance) * per)
    selected = column_variance.nlargest(n_selected).index

    if isVar:
        # Keep only the high-variance columns
        features = features[selected]
    else:
        # Remove the high-variance columns
        features = features.drop(columns=selected)

    return features, labels, selected

**Helper functions**

In [3]:
def k_NN(data, labels, k, testData):
    """Fit a k-nearest-neighbours classifier and score ``testData``.

    Parameters:
    data: Training feature rows.
    labels: Training labels aligned with ``data``.
    k (int): Number of neighbours the classifier uses.
    testData: Feature rows to score.

    Returns:
    The second column of ``predict_proba`` for every row of ``testData``
    (presumably the probability of class 1, assuming labels are 0/1).
    """
    classifier = KNeighborsClassifier(n_neighbors=k)
    classifier.fit(data, labels)
    class_probabilities = classifier.predict_proba(testData)
    return class_probabilities[:, 1]

def getResult(max_k, x_train, y_train, x_test):
    """Average the KNN probabilities over every k from 3 up to ``max_k``.

    One model is fitted per integer k in ``range(3, max_k + 1)`` -- every k,
    even and odd alike -- and the per-row probabilities returned by ``k_NN``
    are averaged across the models.

    Parameters:
    max_k (int): Largest neighbour count to include (inclusive).
    x_train: Training feature rows.
    y_train: Training labels aligned with ``x_train``.
    x_test: Feature rows to score.

    Returns:
    numpy.ndarray: One averaged probability per row of ``x_test``.

    Raises:
    ValueError: If ``max_k`` is below 3, because no model would be fitted
        and there would be nothing to average.
    """
    if max_k < 3:
        # Fail with a clear message instead of np.stack's "need at least one array"
        raise ValueError(f"max_k must be at least 3 to fit any KNN model, got {max_k}")

    per_k_probabilities = [k_NN(x_train, y_train, k, x_test) for k in range(3, max_k + 1)]
    return np.mean(np.stack(per_k_probabilities), axis=0)

def getClassesResult(max_k, x_train, y_train, x_test):
    """Turn the averaged probabilities from ``getResult`` into 0/1 predictions.

    A row is labelled 1 when its averaged probability is at least 0.5 and
    0 otherwise.

    Returns:
    numpy.ndarray: One 0/1 prediction per row of ``x_test``.
    """
    averaged = getResult(max_k, x_train, y_train, x_test)
    is_positive = averaged >= 0.5
    return np.where(is_positive, 1, 0)

def final_sub(max_k, x_train, y_train, x_test, ids):
    """Build the submission table with a probability column for each class.

    Parameters:
    max_k (int): Largest neighbour count passed on to ``getResult``.
    x_train: Training feature rows.
    y_train: Training labels aligned with ``x_train``.
    x_test: Feature rows to score.
    ids: Identifiers for the rows of ``x_test``.

    Returns:
    DataFrame: Columns 'Id', 'class_0' and 'class_1', where 'class_1' is the
        result of ``getResult`` and 'class_0' is its complement.
    """
    positive = getResult(max_k, x_train, y_train, x_test)
    columns = {'Id': ids, 'class_0': 1 - positive, 'class_1': positive}
    return pd.DataFrame(columns)

def percentage_equal(arr1, arr2):
    """Return the percentage of positions where ``arr1`` and ``arr2`` are equal.

    The denominator is ``arr1.size``, so ``arr1`` must expose a ``size``
    attribute (e.g. a NumPy array).
    """
    matches = arr1 == arr2
    fraction_equal = np.sum(matches) / arr1.size
    return fraction_equal * 100


**Main function**

In [4]:
def plot_accuracy_vs_i(train_df, test_df, message, show_plot=False):
    """
    Find the ensemble size ``i`` that gives the best hold-out accuracy.

    The data is split 80/20 with a fixed ``random_state``. For every integer
    ``i`` from 3 to 75 (all values, not only odd ones) the predictions of
    ``getClassesResult(i, ...)`` are compared with the held-out labels using
    ``percentage_equal``.

    Parameters:
    train_df (DataFrame): Feature columns.
    test_df (Series): Labels aligned with ``train_df``. Despite the name,
        these are the targets, not test features.
    message (str): Label for this run; only used in the summary line printed
        when ``show_plot`` is True.
    show_plot (bool): When True, draw the accuracy-vs-i curve and print the
        best result. Defaults to False, which computes silently.

    Returns:
    tuple: ``(best_i, best_accuracy)``, with the accuracy expressed as a
        percentage. On ties the smallest ``i`` is returned.
    """
    # Split data into training and hold-out sets
    X_train, X_test, y_train, y_test = train_test_split(train_df, test_df, test_size=0.2, random_state=4)

    # One bulk conversion instead of a Python-level iterrows() loop
    X_train_rows = X_train.to_numpy()
    X_test_rows = X_test.to_numpy()

    # Score every candidate i (step 1, so even and odd values alike)
    i_values = list(range(3, 76))
    accuracies = [
        percentage_equal(getClassesResult(i, X_train_rows, y_train, X_test_rows), y_test)
        for i in i_values
    ]

    j, acc = i_values[np.argmax(accuracies)], np.max(accuracies)

    if show_plot:
        plt.plot(i_values, accuracies, linestyle='-', color='b')
        plt.xlabel('i values')
        plt.ylabel('Test Accuracy')
        plt.title('Accuracy vs i')
        plt.grid(True)
        plt.show()
        print(f"{message} Best i is: {j} with acc of: {acc:.3f}")

    return j, acc

def run_diff_var():
    """Sweep the variance-column fraction and print the best (fraction, K) pair.

    The sweep is run twice: once keeping only the high-variance columns
    (``isVar=True``) and once dropping them (``isVar=False``). Each sweep
    tries the fractions 0.05, 0.10, ... in steps of 0.05 below 1, reloads the
    data for each fraction and prints the best-scoring combination.
    """
    def run(isVar, mes):
        candidates, scores = [], []
        for fraction in np.arange(0.05, 1, 0.05):
            percent_label = round(fraction * 100, 3)
            label = f"{mes} at {percent_label}% varianced columns"
            # Reload with this fraction of columns selected by variance
            features, labels, _ = load(per=fraction, isVar=isVar)
            best_k, best_acc = plot_accuracy_vs_i(features, labels, label)
            candidates.append((fraction, best_k))
            scores.append(best_acc)

        winner = candidates[np.argmax(scores)]
        top_score = np.max(scores)
        print(f"Best (per, K) is: {winner} with acc of: {top_score:.3f}")

    # First keep only the high-variance columns, then drop them instead
    for flag, text in ((True, "Using variance    "), (False, "Not using variance")):
        run(flag, text)


**Run Test**

In [5]:
# x, y, _ = load()  # Load without any variance
# plot_accuracy_vs_i(x, y, "No variance applied")
# run_diff_var()

**Best result: remove the top 89% highest-variance columns (`per=0.89`) and use 56 or 6 neighbors**

In [6]:
# Train on the full labelled set after dropping the top 89% highest-variance columns
x, y, high_variance_columns = load(per=0.89, isVar=False)
X_train_rows = x.to_numpy()

test_df_1 = pd.read_csv('/kaggle/input/icr-identify-age-related-conditions/test.csv')

# Mirror the preprocessing in load(): drop Id, encode EJ, drop the same columns
test_ds_pd = test_df_1.drop(columns='Id')
test_ds_pd['EJ'] = test_ds_pd['EJ'].map({'A': 0, 'B': 1})
test_ds_pd = test_ds_pd.drop(columns=high_variance_columns)

# Make the feature order match the training frame exactly; a missing column
# raises a KeyError here instead of silently misaligning features.
test_ds_pd = test_ds_pd[x.columns]

# Impute missing test values with the TRAINING column means rather than the
# test set's own means, so both sets are preprocessed identically and a test
# column that is entirely null still gets filled. x was already mean-imputed
# by load(), which leaves its column means unchanged.
test_ds_pd = test_ds_pd.fillna(x.mean())

X_test_rows = test_ds_pd.to_numpy()

# NOTE(review): the heading for this cell reports 56 or 6 as the best value,
# but 101 is used as max_k here -- confirm which one is intended.
df = final_sub(101, X_train_rows, y, X_test_rows, test_df_1.Id)
df.to_csv('/kaggle/working/submission.csv', index=False)
df

Unnamed: 0,Id,class_0,class_1
0,00eed32682bb,0.860971,0.139029
1,010ebe33f668,0.860971,0.139029
2,02fa521e1838,0.860971,0.139029
3,040e15f562a2,0.860971,0.139029
4,046e85c7cc7f,0.860971,0.139029
