## Part 1

### Imports

In [30]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.neighbors import NearestNeighbors
from scipy.stats import ttest_ind, chi2_contingency

import matplotlib.pyplot as plt

### Add csv

In [None]:
# '?' is the missing-value marker this notebook cleans out. Declaring it at read time
# lets pandas parse the affected columns as numbers; swapping it for NaN after parsing
# would leave those columns as object/string dtype.
df = pd.read_csv('cleveland.csv', na_values='?')

# Rename 'num' column to 'disease' and change 1,2,3,4 to 1
df = df.rename({'num':'disease'}, axis=1)
df['disease'] = df.disease.clip(upper=1)

# get rid of rows with missing data
df.dropna(inplace=True)
df

### Finding good features

Here, we categorize the different kinds of data that we have, and then compare the distribution of each feature in the two populations (with and without disease).

In [None]:
# Continuous measurements: compared between the two groups with density plots.
numeric = [
    'age',
    'trestbps',
    'chol',
    'thalach',
    'oldpeak',
]

# Coded / discrete measurements: compared between the two groups with grouped counts.
categorical = [
    'sex',
    'cp',
    'fbs',
    'restecg',
    'exang',
    'slope',
    'ca',
    'thal',
]

# One density panel per numeric feature. The grid width follows the list length, and
# squeeze=False keeps `axs` a 2-D array even if the list shrinks to a single entry.
fig, axs = plt.subplots(ncols=len(numeric), figsize=(20, 5), squeeze=False)
fig.suptitle('Numeric Features by presence of disease')
for feature, ax in zip(numeric, axs.flat):
    sns.kdeplot(data=df, x=feature, hue='disease', ax=ax, fill=True)
plt.tight_layout()
plt.show()

# Four count plots per row, with as many rows as the categorical list needs.
n_cols = 4
n_rows = -(-len(categorical) // n_cols)  # ceiling division
fig, axs = plt.subplots(ncols=n_cols, nrows=n_rows, figsize=(20, 5 * n_rows), squeeze=False)
fig.suptitle('Categorical Features by presence of disease')
for feature, ax in zip(categorical, axs.flat):
    sns.countplot(data=df, x=feature, hue='disease', ax=ax)
# Hide any panels left over when the list does not fill the last row.
for ax in axs.ravel()[len(categorical):]:
    ax.set_visible(False)
plt.tight_layout()
plt.show()

From those graphs, some features look like they are distributed differently in the two groups. Specifically, I will investigate whether there is an association between disease and:
1. age
2. thalach
3. oldpeak
4. sex
5. cp
6. slope
7. ca
8. thal

In [None]:
# Numeric features: two-sample t-test comparing the mean of the disease-free group
# (disease == 0) with the mean of the diseased group (disease == 1).
for column in ('age', 'thalach', 'oldpeak'):
    print(f't-test for {column}')
    print(ttest_ind(df.loc[df.disease == 0, column], df.loc[df.disease == 1, column]))

# Categorical features: chi-squared test on the feature-by-disease contingency table.
for column in ('sex', 'cp', 'slope', 'ca', 'thal'):
    print(f'chi2 test for {column}')
    print(chi2_contingency(pd.crosstab(df[column], df.disease)))

From that it looks like all of the selected attributes are extremely likely to be correlated with the presence of heart disease.

We will select all of these features for the model

In [34]:
# Model inputs: the standardized versions of the numeric columns (stored under a '_s'
# suffix by the standardization cell), followed by the categorical codes used as-is.
selected_features = [f'{name}_s' for name in ('age', 'thalach', 'oldpeak')]
selected_features += ['sex', 'cp', 'slope', 'ca', 'thal']

### Standardizing Attributes

In this case, the categorical features will not be standardized in any way. This is fine because all of the selected categorical features are either binary or ordinal, and thus the numeric representations make sense as euclidean dimensions.

In [35]:

# standardize numeric features: each column is z-scored (subtract its mean, divide by
# its standard deviation) into a new '<name>_s' column; the raw column is kept.
for column in ('age', 'thalach', 'oldpeak'):
    values = df[column]
    df[f'{column}_s'] = (values - values.mean()) / values.std()



### K optimization

We will now use k-fold cross-validation to test the performance of the model with k values ranging from 1 to 200

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import precision_recall_fscore_support
from sklearn.model_selection import train_test_split, cross_validate, KFold, GridSearchCV
from sklearn.neighbors import NearestNeighbors


# Create a function for prediction and evaluation
def predict_and_evaluate(X, y, k_values, n_splits=10, random_state=None):
    """Score a majority-vote k-nearest-neighbour classifier for several values of k.

    The rows are partitioned into ``n_splits`` shuffled folds once, and the same folds
    are reused for every k, so scores differ because of k rather than because each k
    happened to draw a different random partition. For each k, every held-out row is
    labelled with the most common label among its k nearest training rows (Euclidean
    distance). The out-of-fold predictions of all folds are pooled and scored for the
    positive class (label 1).

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Feature matrix.
    y : array-like of shape (n_samples,) or (n_samples, 1)
        Labels. For 2-D input only the first column is used.
    k_values : iterable of int
        Neighbour counts to evaluate. Each k must not exceed the size of a training
        fold, otherwise the neighbour query raises ``ValueError``.
    n_splits : int, default 10
        Number of cross-validation folds.
    random_state : int, RandomState instance or None, default None
        Forwarded to ``KFold``. ``None`` keeps the earlier behaviour of drawing the
        shuffle from NumPy's global random state; pass an int for a repeatable run.

    Returns
    -------
    list of tuple
        One ``(k, precision, recall, f_score, support)`` tuple per k, in the order
        given. The four metrics are exactly what
        ``precision_recall_fscore_support(..., labels=[1])`` returns.
    """
    features = np.asarray(X)
    labels = np.asarray(y)
    if labels.ndim > 1:
        labels = labels[:, 0]

    # Materialise the folds up front: calling split() again for each k would reshuffle
    # and compare the k values on different partitions.
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    folds = list(kf.split(features))

    results = []
    for k in k_values:
        y_pred = []
        y_true = []

        for train_index, test_index in folds:
            X_train, X_test = features[train_index], features[test_index]
            y_train, y_test = labels[train_index], labels[test_index]

            # Create nearest neighbors object
            nn = NearestNeighbors(n_neighbors=k, metric='euclidean', algorithm='auto')
            nn.fit(X_train)

            # Find the k nearest neighbors to the test set
            _, indices = nn.kneighbors(X_test)

            for neighbour_ids in indices:
                # Series.mode() returns its modes sorted, so a tied vote resolves to
                # the smallest label.
                y_pred.append(pd.Series(y_train[neighbour_ids]).mode()[0])
            y_true.extend(y_test)

        # Calculate precision, recall, and F1 scores
        (p, r, f, s) = precision_recall_fscore_support(y_true, y_pred, labels=[1])
        results.append((k, p, r, f, s))

    return results

# Main execution
X = df[selected_features].values
y = df[['disease']].values

# The cross-validation folds are shuffled. A KFold created without a random_state draws
# from NumPy's global RNG, so seeding it here makes the curve below repeatable.
np.random.seed(0)

# Define the range of k values to try
k_values = range(1, 201)  # Test k from 1 to 200
results = predict_and_evaluate(X, y, k_values)

# # Print the results
# for k, p, r, f, s in results:
#     print(f'k={k}, precision={p}, recall={r}, f-score={f}, support={s}')

# Each result is (k, precision, recall, f-score, support); plot the F-score against k.
plt.plot([x[0] for x in results], [x[3] for x in results])
plt.xlabel('k')
plt.ylabel('F1 Score')
plt.show()


The f1-score appears to be the highest when k is near 10, so we will select that as our k value. Interestingly, we see two maxima for this graph.

In [None]:
### Challenge Dataset (replace with name of test file)
# The held-out file is loaded under its own name: at this point `df` still holds the
# cleaned Cleveland frame (including its *_s columns), which is the reference set the
# neighbours are drawn from. Fitting on the challenge rows themselves would make every
# row its own nearest neighbour and leak its label into the vote.
challenge_df = (
    pd.read_csv('cleveland-test-sample.csv', na_values='?')  # '?' marks missing data
    .rename({'num': 'disease'}, axis=1)  # Rename 'num' column to 'disease'
    .dropna()                            # get rid of missing data
)

# Change 1,2,3,4 to 1, and standardize the numeric features with the TRAINING mean and
# standard deviation so both data sets live in the same coordinate system.
challenge_df = challenge_df.assign(
    disease=challenge_df.disease.clip(upper=1),
    **{
        f'{col}_s': (challenge_df[col] - df[col].mean()) / df[col].std()
        for col in ('age', 'thalach', 'oldpeak')
    },
)

X_train = df[selected_features].to_numpy(dtype=float)
y_train = df['disease'].to_numpy()
X_test = challenge_df[selected_features].to_numpy(dtype=float)
y_true = challenge_df['disease'].tolist()

nn = NearestNeighbors(n_neighbors=10, metric='euclidean', algorithm='auto')
nn.fit(X_train)

# For every challenge row, find its 10 nearest training rows
distances, indices = nn.kneighbors(X_test)

# Predict the most common label among those neighbours (a tied vote resolves to the
# smallest label because Series.mode() returns its modes sorted).
y_pred = [pd.Series(y_train[neighbour_ids]).mode()[0] for neighbour_ids in indices]

(p, r, f, s) = precision_recall_fscore_support(y_true, y_pred, labels=[1])
print(f'precision={p}, recall={r}, f-score={f}, support={s}')

## Part 2

In this part, we will analyze a breast cancer database from Wisconsin. The "diagnosis" label indicates whether the tumor was malignant ("M") or benign ("B"). The rest of the features are extracted from a digitized image of a fine needle aspirate of a breast mass.

### Import and clean data

In [None]:
# Load the data, relabel the target column 'diagnosis' as 'positive',
# and discard the id column.
df = (
    pd.read_csv('wdbc.csv')
    .rename(columns={'diagnosis': 'positive'})
    .drop(columns='id')
)

# Encode the target numerically: 'M' becomes 1, every other value becomes 0.
df['positive'] = (df['positive'] == 'M').astype('int64')
df

### Find good features

In this case, all of the features are continuous and numeric, so we can use the same tools for all

In [None]:
# Every column after the first is treated as a feature to plot.
features = df.columns[1:]

# 6 x 5 grid of density plots, filled row by row, one panel per feature,
# each split by the 'positive' label.
fig, axs = plt.subplots(ncols=5, nrows=6, figsize=(20, 20))
for position, feature in enumerate(features):
    row, col = divmod(position, 5)
    sns.kdeplot(data=df, x=feature, hue='positive', ax=axs[row, col], fill=True)
plt.tight_layout()
plt.show()


We will further investigate the following promising features:
1. radius1
2. perimeter1
3. area1
4. concavity1
5. concave_points1
6. radius2
7. perimeter2
8. area2
9. radius3
10. perimeter3
11. area3
12. concavity3
13. concave_points3

However, that's a lot of dimensions for a KNN model, so we will limit ourselves to the best options. We can score the options by their t-statistic.

In [None]:
# Candidate features carried over from the visual inspection, grouped by their
# numeric suffix.
selected_round1 = [
    'radius1', 'perimeter1', 'area1', 'concavity1', 'concave_points1',
    'radius2', 'perimeter2', 'area2',
    'radius3', 'perimeter3', 'area3', 'concavity3', 'concave_points3',
]

# Two-sample t-test per feature: rows with positive == 0 against rows with positive == 1.
ttests = {
    feature: ttest_ind(df.loc[df.positive == 0, feature], df.loc[df.positive == 1, feature])
    for feature in selected_round1
}

# Rank the (feature, test result) pairs, largest |t statistic| first.
results = list(ttests.items())
results.sort(key=lambda item: abs(item[1].statistic), reverse=True)

pd.DataFrame(results, columns=['feature', 'ttest'])

Now that we have a list of our selected features ordered from best to worst, we can do another kind of optimization! Starting with one parameter, we will increase the number of parameters that we consider one at a time, from top to bottom in that list. In this way we can determine the optimal number of parameters for this dataset.

In [None]:
# standardize selected features
for feature in selected_round1:
    df[feature] = (df[feature] - df[feature].mean())/df[feature].std()

# Feature names in ranked order (largest |t| first), and the labels both as the
# column vector used for neighbour look-ups and as the flat ground truth.
ranked_features = [name for name, _ in results]
y = df[['positive']].values
y_true = y[:, 0]

# find the best number of parameters: try the top 1, top 2, ..., up to ALL ranked features
feature_counts = range(1, len(ranked_features) + 1)
fscore_by_features = []
for n_features in feature_counts:
    selected = ranked_features[:n_features]
    X = df[selected].values

    nn = NearestNeighbors(metric='euclidean', algorithm='auto')
    nn.fit(X)
    # Querying without arguments returns, for every fitted row, its neighbours
    # EXCLUDING the row itself. Passing X back in would let each row vote with its
    # own label and inflate the score.
    distances, indices = nn.kneighbors()

    # Majority vote among the neighbours (a tied vote resolves to the smallest label).
    y_pred = [pd.Series(y[neighbour_ids].flatten()).mode()[0] for neighbour_ids in indices]

    (p, r, f, s) = precision_recall_fscore_support(y_true, y_pred, labels=[1])
    fscore_by_features.append(f)

# NOTE(review): the feature count picked in the following cells was read off an earlier
# version of this curve - re-check it after re-running.
plt.plot(feature_counts, fscore_by_features)
plt.xlabel('Number of Features')
plt.ylabel('F1 Score')
plt.show()


We see a peak at 7 parameters, so we will use the first 7.

In [43]:
# The seven features kept for the final model, in the order they were chosen.
selected_round2 = [
    'concave_points3', 'perimeter3', 'concave_points1', 'radius3',
    'perimeter1', 'area3', 'radius1',
]

### Optimize for K

In [None]:
# Main execution
X = df[selected_round2].values
y = df[['positive']].values

# The cross-validation folds are shuffled. A KFold created without a random_state draws
# from NumPy's global RNG, so seeding it here makes the curve below repeatable.
np.random.seed(0)

# Define the range of k values to try
k_values = range(1, 201)  # Test k from 1 to 200
results = predict_and_evaluate(X, y, k_values)

# Each result is (k, precision, recall, f-score, support); plot the F-score against k.
plt.plot([x[0] for x in results], [x[3] for x in results])
plt.xlabel('k')
plt.ylabel('F1 Score')
plt.show()


This time we see an early peak, and then a linear dropoff. It seems like the peak stays around .93 from k=3 to k=25, but there is much more variance in the early k values, so we will select k=25 in the hope that the output will be more consistent.