In [None]:
import pandas as pd
import numpy as np
from sklearn.neighbors import NearestNeighbors
import random
import matplotlib.pyplot as plt

In [None]:
# Mount Google Drive inside the Colab runtime so that files stored there can be read.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Switch to the project folder on the mounted Drive and list its contents;
# the relative file names used by later cells are resolved against this directory.
%cd "/content/drive/MyDrive/CS/Project4"
%ls 

# Part I

In [None]:
# Load the heart-disease table from the working directory and display it.
df = pd.read_csv('cleveland.csv')
df

## k-nearest neighbors

This dataset was obtained from https://archive.ics.uci.edu/ml/datasets/Heart+Disease (this is a great resource for datasets to try machine learning on). It has data on patients that are and are not diagnosed with heart disease.

The attributes are:
* age: age in years 
* sex: sex (1 = male; 0 = female) 
* cp: chest pain type 
 * -- Value 1: typical angina 
 * -- Value 2: atypical angina 
 * -- Value 3: non-anginal pain 
 * -- Value 4: asymptomatic 
* trestbps: resting blood pressure (in mm Hg on admission to the hospital) 
* chol: serum cholesterol in mg/dl 
* fbs: (fasting blood sugar > 120 mg/dl) (1 = true; 0 = false) 
* restecg: resting electrocardiographic results 
 * -- Value 0: normal 
 * -- Value 1: having ST-T wave abnormality (T wave inversions and/or ST elevation or depression of > 0.05 mV) 
 * -- Value 2: showing probable or definite left ventricular hypertrophy by Estes' criteria 
* thalach: maximum heart rate achieved 
* exang: exercise induced angina (1 = yes; 0 = no) 
* oldpeak: ST depression induced by exercise relative to rest 
* slope: the slope of the peak exercise ST segment 
 * -- Value 1: upsloping 
 * -- Value 2: flat 
 * -- Value 3: downsloping 
* ca: number of major vessels (0-3) colored by fluoroscopy 
* thal: 3 = normal; 6 = fixed defect; 7 = reversible defect 
* num: diagnosis of heart disease (angiographic disease status) 
 * -- Value 0: absence.
 * -- Value 1,2,3,4: presence of heart disease


## Explore the data

Read in the data, modify the dependent variable name and plot a histogram of the ages of patients, both healthy and those with heart disease.

In [None]:
# Rename the 'num' column to 'disease'. Only the name changes here; the values
# 1, 2, 3, 4 are collapsed to 1 in the next cell.
df = df.rename({'num':'disease'}, axis=1)
df

In [None]:
# Collapse the severity grades into a binary flag: 0 stays 0, every higher grade is capped at 1.
df['disease'] = df['disease'].map(lambda grade: min(grade, 1))
display(df.head(5))

# Class sizes: patients with heart disease first, then healthy patients.
for flag in (1, 0):
    print(len(df[df['disease'] == flag]))

In [None]:
# Age distribution, one panel per diagnosis group (top: healthy, bottom: heart disease).
fig, axes = plt.subplots(nrows=2, ncols=1)
# Extra vertical room so the upper x-label does not run into the lower title.
fig.subplots_adjust(hspace=0.5)

for axis, label, heading, shade in zip(axes, (0, 1), ('healthy', 'has heart disease'), (None, 'orange')):
    axis.hist(df.loc[df['disease'] == label, 'age'], color=shade)
    axis.set_xlabel('age')
    axis.set_ylabel('number of patients')
    # Identical axis ranges on both panels keep the two groups directly comparable.
    axis.set_xlim(20, 100)
    axis.set_ylim(0, 70)
    axis.set_title(heading)

plt.savefig('partI-age.pdf')

In [None]:
# Serum cholesterol distribution, one panel per diagnosis group.
fig, axes = plt.subplots(nrows=2, ncols=1)
# Extra vertical room between the two panels.
fig.subplots_adjust(hspace=0.5)

for axis, label, heading, shade in zip(axes, (0, 1), ('healthy', 'has heart disease'), (None, 'orange')):
    axis.hist(df.loc[df['disease'] == label, 'chol'], color=shade)
    axis.set_xlabel('chol')
    axis.set_ylabel('number of patients')
    # Shared ranges so the panels can be compared bar for bar.
    axis.set_xlim(50, 500)
    axis.set_ylim(0, 70)
    axis.set_title(heading)

plt.savefig('partI-chol.pdf')

In [None]:
# Resting blood pressure (mm Hg) distribution, one panel per diagnosis group
# (top: healthy, bottom: heart disease).
fig, axes = plt.subplots(2, 1)
# Extra vertical room between the two panels.
fig.subplots_adjust(hspace=0.5)

for axis, label, heading, shade in zip(axes, (0, 1), ('healthy', 'has heart disease'), (None, 'orange')):
    axis.hist(df.loc[df['disease'] == label, 'trestbps'], color=shade)
    axis.set_xlabel('trestbps')
    axis.set_ylabel('number of patients')
    # A (50, 500) window suits cholesterol, not blood pressure: it pushes every bar into
    # the left part of the axis. Use a range sized for resting blood pressure instead.
    # NOTE(review): widen the window if the data contain readings outside 80-220 mm Hg.
    axis.set_xlim(80, 220)
    axis.set_ylim(0, 70)
    axis.set_title(heading)

plt.savefig('partI-trestbps.pdf')

In [None]:
# Chest-pain type distribution (coded 1-4), one panel per diagnosis group.
fig, axes = plt.subplots(nrows=2, ncols=1)
# Extra vertical room between the two panels.
fig.subplots_adjust(hspace=0.5)

for axis, label, heading, shade in zip(axes, (0, 1), ('healthy', 'has heart disease'), (None, 'orange')):
    axis.hist(df.loc[df['disease'] == label, 'cp'], color=shade)
    axis.set_xlabel('cp')
    axis.set_ylabel('number of patients')
    # Fixed x-range around the four codes; the y-range is left to matplotlib.
    axis.set_xlim(0, 5)
    axis.set_title(heading)


In [None]:
# Sex distribution (0 = female, 1 = male) for healthy patients (top) and patients
# with heart disease (bottom).
fig, axes = plt.subplots(2, 1)
# Extra vertical room between the two panels.
fig.subplots_adjust(hspace=0.7)

for axis, label, heading, shade in zip(axes, (0, 1), ('healthy', 'has heart disease'), (None, 'orange')):
    axis.hist(df.loc[df['disease'] == label, 'sex'], color=shade)
    axis.set_xlabel('sex')
    axis.set_ylabel('number of patients')
    axis.set_ylim(0, 120)
    # Only the two category codes are meaningful tick positions. They are set on each
    # panel explicitly: plt.xticks acts on the current axes only, which would leave the
    # upper panel with fractional tick marks.
    axis.set_xticks([0, 1])
    axis.set_xticklabels(['0', '1'])
    axis.set_title(heading)


In [None]:
# Maximum heart rate achieved, one panel per diagnosis group; axis ranges are automatic.
fig, axes = plt.subplots(nrows=2, ncols=1)
# Extra vertical room between the two panels.
fig.subplots_adjust(hspace=0.5)

for axis, label, heading, shade in zip(axes, (0, 1), ('healthy', 'has heart disease'), (None, 'orange')):
    axis.hist(df.loc[df['disease'] == label, 'thalach'], color=shade)
    axis.set_xlabel('thalach')
    axis.set_ylabel('number of patients')
    axis.set_title(heading)

plt.savefig('partI-thalach.pdf')

In [None]:
# Class balance. The share of each class is its row count divided by the total row
# count, scaled by 100 so that the number printed really is a percentage.
total = len(df)
n_sick = int((df['disease'] == 1).sum())
n_healthy = int((df['disease'] == 0).sum())
print('The number of people who have heart disease is', n_sick, 'which is', round(100 * n_sick / total, 1), 'percent of the dataset')
print('The number of people without heart disease is', n_healthy, 'which is', round(100 * n_healthy / total, 1), 'percent of the dataset')
print('Total amount of people is', total)

In [None]:
# Fasting-blood-sugar flag (1 = above 120 mg/dl), one panel per diagnosis group.
fig, axes = plt.subplots(nrows=2, ncols=1)
# Extra vertical room between the two panels.
fig.subplots_adjust(hspace=0.5)

for axis, label, heading, shade in zip(axes, (0, 1), ('healthy', 'has heart disease'), (None, 'orange')):
    axis.hist(df.loc[df['disease'] == label, 'fbs'], color=shade)
    axis.set_xlabel('fbs')
    axis.set_ylabel('number of patients')
    axis.set_title(heading)


In [None]:
# Resting ECG result (coded 0-2), one panel per diagnosis group.
fig, axes = plt.subplots(nrows=2, ncols=1)
# Extra vertical room between the two panels.
fig.subplots_adjust(hspace=0.5)

for axis, label, heading, shade in zip(axes, (0, 1), ('healthy', 'has heart disease'), (None, 'orange')):
    axis.hist(df.loc[df['disease'] == label, 'restecg'], color=shade)
    axis.set_xlabel('restecg')
    axis.set_ylabel('number of patients')
    # Same y-range on both panels so bar heights are comparable.
    axis.set_ylim(0, 100)
    axis.set_title(heading)


In [None]:
# Exercise-induced angina flag (1 = yes), one panel per diagnosis group.
fig, axes = plt.subplots(nrows=2, ncols=1)
# Extra vertical room between the two panels.
fig.subplots_adjust(hspace=0.5)

for axis, label, heading, shade in zip(axes, (0, 1), ('healthy', 'has heart disease'), (None, 'orange')):
    axis.hist(df.loc[df['disease'] == label, 'exang'], color=shade)
    axis.set_xlabel('exang')
    axis.set_ylabel('number of patients')
    if label == 1:
        # Only the lower panel has a fixed y-range; the upper one is auto-scaled.
        axis.set_ylim(0, 110)
    axis.set_title(heading)


In [None]:
# ST depression induced by exercise (oldpeak), one panel per diagnosis group.
fig, axes = plt.subplots(nrows=2, ncols=1)
# Extra vertical room between the two panels.
fig.subplots_adjust(hspace=0.5)

for axis, label, heading, shade in zip(axes, (0, 1), ('healthy', 'has heart disease'), (None, 'orange')):
    axis.hist(df.loc[df['disease'] == label, 'oldpeak'], color=shade)
    axis.set_xlabel('oldpeak')
    axis.set_ylabel('number of patients')
    if label == 1:
        # Only the lower panel has a fixed y-range; the upper one is auto-scaled.
        axis.set_ylim(0, 110)
    axis.set_title(heading)


In [None]:
# Slope of the peak exercise ST segment (coded 1-3), one panel per diagnosis group.
fig, axes = plt.subplots(nrows=2, ncols=1)
# Extra vertical room between the two panels.
fig.subplots_adjust(hspace=0.5)

for axis, label, heading, shade in zip(axes, (0, 1), ('healthy', 'has heart disease'), (None, 'orange')):
    axis.hist(df.loc[df['disease'] == label, 'slope'], color=shade)
    axis.set_xlabel('slope')
    axis.set_ylabel('number of patients')
    if label == 1:
        # Only the lower panel has a fixed y-range; the upper one is auto-scaled.
        axis.set_ylim(0, 110)
    axis.set_title(heading)


In [None]:
# Discard rows that contain missing values, then preview the rows whose 'ca' entry is
# not the '?' placeholder. The preview is display-only: it is not assigned back to df.
df = df.dropna()
df[df['ca'] != '?']

In [None]:
# 'ca' uses '?' as a placeholder for unknown values. Blanking the '?' would only leave
# empty strings behind, which dropna() does not treat as missing, and a lone '?' is not
# a valid regular expression. Parse the column as numbers instead: anything that cannot
# be parsed becomes NaN and the affected rows are removed by dropna(). assign() builds a
# new frame, so no view of the previous df is modified.
# NOTE(review): check whether other columns also carry '?' placeholders.
df = df.assign(ca=pd.to_numeric(df['ca'], errors='coerce')).dropna()

In [None]:
# Number of major vessels (ca), one panel per diagnosis group; axis ranges are automatic.
fig, axes = plt.subplots(nrows=2, ncols=1)
# Extra vertical room between the two panels.
fig.subplots_adjust(hspace=0.75)

for axis, label, heading, shade in zip(axes, (0, 1), ('healthy', 'has heart disease'), (None, 'orange')):
    axis.hist(df.loc[df['disease'] == label, 'ca'], color=shade)
    axis.set_xlabel('ca')
    axis.set_ylabel('number of patients')
    axis.set_title(heading)


## Multiple Dimensions

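The continuous features are standardized (mean subtracted, divided by the standard deviation) so that each one contributes on a comparable scale to the distance computation.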

In [None]:
# Z-score the continuous features in place: subtract the column mean and divide by the
# column standard deviation, so that each feature contributes on a comparable scale to
# the Euclidean distances used by the neighbour search.
for feature in ('age', 'chol', 'trestbps', 'thalach'):
    column = df[feature]
    df[feature] = (column - column.mean()) / column.std()

In [None]:
# Feature matrix for the first experiment: one row per patient, columns age, trestbps, chol.
X = df.loc[:, ['age', 'trestbps', 'chol']].values
# Peek at the first two rows before showing the full array.
print(X[0], X[1], sep='\n')
X

In [None]:
# Labels as an (n_rows, 1) column vector: selecting with a list of names keeps two dimensions.
y = df[['disease']].values
y

In [None]:
# Create a nearest neighbors object: each query returns the 7 closest rows by Euclidean
# distance; algorithm='auto' leaves the choice of search structure to scikit-learn.
nn = NearestNeighbors(n_neighbors=7, metric='euclidean', algorithm='auto')
print(nn)

In [None]:
# This builds an index data structure under the hood for query performance.
# fit() returns the estimator itself, so `fit` and `nn` refer to the same object.
fit = nn.fit(X)
fit

In [None]:
# Pick one row position uniformly at random to act as the query patient.
i = random.randrange(len(X))
i

In [None]:
# Feature vector of the randomly chosen patient: row i of X, in the same column order as X
# (i = 0 is the first row, i = 1 the second, ...).
patientX = X[i]
patientX

In [None]:
# Label of the same patient, taken from row i of y (0 = no heart disease, 1 = heart disease).
patienty = y[i]
patienty

In [None]:
# Display every column for that patient; iloc is positional, matching the row order of X and y.
display(df.iloc[i])

In [None]:
# Find the k nearest neighbors to the patient. kneighbors takes a batch of query rows,
# hence the single-element list. Problem: the patient itself will be found in the list
# of neighbors, because it is one of the rows the index was fitted on.
distances, indices = fit.kneighbors([patientX])
print('distances=', distances, 'indices=', indices)

distances= [[0.         0.11588175 0.29697409 0.32925024 0.34938849 0.39593285
  0.46338588]] indices= [[263 128 134 260 186 255  99]]


In [None]:
# Look up the neighbour rows in df by position (indices[0] belongs to the single query).
df.iloc[indices[0]]

In [None]:
# Keep the neighbour rows in `nbrs` so the following cells can count their diagnoses.
nbrs = df.iloc[indices[0]]
display(nbrs)

In [None]:
# Tally the diagnoses among the neighbours. The query patient is one of the fitted rows
# and therefore appears among its own neighbours; it is left out of the tally so that
# its known label does not take part in the vote that is supposed to predict it.
# errors='ignore' covers the case where the patient is not in the neighbour list.
voters = nbrs.drop(index=df.index[i], errors='ignore')
healthy = int((voters['disease'] == 0).sum())
sick = int((voters['disease'] == 1).sum())
print('healthy: {}\nsick: {}'.format(healthy, sick))

In [None]:
# Majority vote among the neighbours: "healthy" needs a strict majority, a tie counts as sick.
predict = 1 if sick >= healthy else 0
# patienty is a length-1 array, so the comparison yields a single truth value.
actual = 1 if patienty != 0 else 0
success = (actual == predict)
print(success)

## Performing Multiple Tests

In [None]:
# Rebuild the feature matrix with two features only: age and resting blood pressure.
X = df[['age', 'trestbps']].values
X

In [None]:
# Labels as an (n_rows, 1) column vector.
y = df[['disease']].values

# This builds an index data structure under the hood for query performance.
# The existing `nn` object is refitted on the current X.
fit = nn.fit(X)

In [None]:
# Draw n row positions at random (repeats are possible) to use as query patients.
n = 7
pindices = [random.randrange(len(X)) for _ in range(n)]
pindices

In [None]:
# Select the sampled rows of X. Each row holds two features, age and trestbps
# (X was built as df[['age', 'trestbps']].values).
patientsX = X[pindices]
patientsX

In [None]:
# patientsy holds the labels of the sampled patients: 0 = no heart disease, 1 = heart disease.
patientsy = y[pindices]
patientsy

In [None]:
# Find the k nearest neighbors of every sampled patient in one call. Problem: each
# patient is itself a row of the fitted data, so it is still found among its own neighbors.
distances, indices = fit.kneighbors(patientsX)
print('indices of k-nearest neighbors for each patient(n=7):')
display(indices)

In [None]:
# Show the neighbour table of each sampled patient (n = 7, so i runs over 0..6).
# indices[i] holds row positions, hence the positional lookup with iloc.
for i in range(n):
    print('nearest neighbors to patient: {}:'.format(patientsX[i]))
    nbrs = df.iloc[indices[i]]
    display(nbrs)

### Finding a good value of k

In [None]:
from sklearn.metrics import precision_recall_fscore_support

def get_scores(k):
    """Score a k-nearest-neighbour vote for heart disease on 50 random patients.

    The neighbour index is fitted on every row of ``df`` using the features age,
    trestbps and chol. Each sampled patient is predicted from the majority label
    of its neighbours, with the patient's own row left out of the vote.

    Args:
        k: number of neighbours requested per patient. The patient itself is
           dropped when it is among them, so the vote is then taken over k - 1 rows.

    Returns:
        The (precision, recall, fscore, support) tuple returned by
        ``precision_recall_fscore_support`` restricted to label 1 (disease);
        every element is an array of length one.
    """
    features = ['age', 'trestbps', 'chol']

    # Fitting builds an index data structure under the hood for query performance.
    nn = NearestNeighbors(n_neighbors=k, metric='euclidean', algorithm='auto')
    fit = nn.fit(df[features].values)

    # Random patients to test on; they are drawn from the fitted rows themselves.
    n = 50
    patients = df.sample(n)
    patientsy = patients['disease'].values

    distances, indices = fit.kneighbors(patients[features].values)

    y_pred = []
    for i in range(n):
        # indices holds row positions; drop() works on labels, so the patient is
        # removed by its index label (ignored when it is not among the neighbours).
        nbrs = df.iloc[indices[i]].drop(patients.index[i], errors='ignore')
        healthy = (nbrs.disease == 0).sum()
        sick = (nbrs.disease == 1).sum()
        # Healthy needs a strict majority; a tie is predicted as disease.
        y_pred.append(0 if healthy > sick else 1)
    return precision_recall_fscore_support(patientsy, y_pred, labels=[1])

kvals = range(2, 50)
scores = [get_scores(k) for k in kvals]

# Unwrap the length-one arrays, then transpose into one sequence per metric:
# scores[0] = precision, scores[1] = recall, scores[2] = F1, scores[3] = support.
scores = [(p[0], r[0], f[0], s[0]) for (p,r,f,s) in scores]
scores = list(zip(*scores))

# Draw every metric named on the y-axis (not just F1) and tell the curves apart with a legend.
for metric_values, metric_name in zip(scores[:3], ('precision', 'recall', 'F1')):
    plt.plot(kvals, metric_values, label=metric_name)
plt.legend()
plt.ylabel('Precision, Recall and F1 scores')
plt.xlabel('K values')
plt.savefig('partI-kvalues.pdf')

## Split data into train/test and get precision/recall/f score by cross validation

In [None]:
# Z-score the four continuous features before the train/test evaluation.
# NOTE(review): the same columns were already standardized earlier in the notebook, so on
# a straight top-to-bottom run this leaves the values essentially unchanged; confirm it
# is still needed.
for feature in ('age', 'chol', 'trestbps', 'thalach'):
    values = df[feature]
    df[feature] = (values - values.mean()) / values.std()

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support

# Repeated random hold-out evaluation: 20 independent 75/25 splits, a k-NN majority vote
# on each, and the scores of the disease class collected per split.
k = 23
nn = NearestNeighbors(n_neighbors=k, metric='euclidean', algorithm='auto')

X = df[['age', 'trestbps', 'thalach', 'chol']].values
y = df[['disease']].values

precision = []
recall = []
f1 = []
support = []

for _ in range(20):
    # Use random_state if you want the same values each time you run for debugging,
    # but you should select the split randomly when you're ready to actually train
    # and test on the data.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)#, random_state=42)

    # This builds an index data structure under the hood for query performance
    fit = nn.fit(X_train)

    # Find the k nearest training rows for every test patient.
    distances, indices = fit.kneighbors(X_test)

    # y_train is (n_train, 1), so indexing with the (n_test, k) index array gives
    # (n_test, k, 1); dropping the last axis leaves one row of neighbour labels per
    # test patient. Predict disease when at least half of the k neighbours have it.
    nbr_labels = y_train[indices][:, :, 0]
    y_pred = (nbr_labels.sum(axis=1) >= k / 2).astype(int)

    # labels=[1] restricts the report to the disease class, so index 0 below is that
    # class. Without it, index 0 would be the healthy class, which is not the class
    # that get_scores used when k was chosen.
    (p,r,f,s) = precision_recall_fscore_support(y_test.ravel(), y_pred, labels=[1])

    print(p[0],r[0],f[0],s[0])
    precision.append(p[0])
    recall.append(r[0])
    f1.append(f[0])
    support.append(s[0])

print('mean of f1 scores=',sum(f1)/len(f1))

# Part II

In [None]:
# Load the red-wine table from the working directory. This rebinds `df`, so the
# heart-disease frame from Part I is no longer reachable under that name.
df = pd.read_csv('winequality-red.csv')
df

In [None]:
# Binarise the target: a score above 5 becomes 1 (good wine), anything else 0 (bad wine).
df['quality'] = df['quality'].gt(5).astype(int)
display(df.head(10))

In [None]:
# Class sizes after binarisation (1 = good wine, 0 = bad wine).
n_good = int((df['quality'] == 1).sum())
n_bad = int((df['quality'] == 0).sum())
print('quality level of 6 7 8 =', n_good)
print('quality level of 1 2 3 4 5 =', n_bad)

Explore the data

In [None]:
# Replace the spaces in the multi-word column names with underscores so the columns
# can be reached with attribute access (e.g. df.residual_sugar).
underscored_names = {
    'fixed acidity': 'fixed_acidity',
    'volatile acidity': 'volatile_acidity',
    'citric acid': 'citric_acid',
    'residual sugar': 'residual_sugar',
    'free sulfur dioxide': 'free_sulfur_dioxide',
    'total sulfur dioxide': 'total_sulfur_dioxide',
}
df = df.rename(columns=underscored_names)
df.head()

In [None]:
# pH distribution of bad wines (quality 0, top) and good wines (quality 1, bottom).
fig, axes = plt.subplots(2, 1)
# Extra vertical room between the two panels.
fig.subplots_adjust(hspace=0.7)

for axis, label, heading, shade in zip(axes, (0, 1), ('bad wine', 'good wine'), (None, 'orange')):
    axis.hist(df.loc[df['quality'] == label, 'pH'], color=shade)
    axis.set_xlabel('pH')
    # One spelling of the y-label for both panels (no 'nnumber' on the lower one).
    axis.set_ylabel('number of wine')
    axis.set_title(heading)

plt.savefig('partII-pH.pdf')

In [None]:
# Alcohol distribution of bad wines (quality 0, top) and good wines (quality 1, bottom).
fig, axes = plt.subplots(2, 1)
# Extra vertical room between the two panels.
fig.subplots_adjust(hspace=0.7)

for axis, label, heading, shade in zip(axes, (0, 1), ('bad wine', 'good wine'), (None, 'orange')):
    axis.hist(df.loc[df['quality'] == label, 'alcohol'], color=shade)
    axis.set_xlabel('alcohol')
    # One spelling of the y-label for both panels (no 'nnumber' on the lower one).
    axis.set_ylabel('number of wine')
    axis.set_title(heading)

plt.savefig('partII-alcohol.pdf')

In [None]:
# Chlorides distribution of bad wines (quality 0, top) and good wines (quality 1, bottom).
fig, axes = plt.subplots(2, 1)
# Extra vertical room between the two panels.
fig.subplots_adjust(hspace=0.7)

for axis, label, heading, shade in zip(axes, (0, 1), ('bad wine', 'good wine'), (None, 'orange')):
    axis.hist(df.loc[df['quality'] == label, 'chlorides'], color=shade)
    axis.set_xlabel('chlorides')
    # One spelling of the y-label for both panels (no 'nnumber' on the lower one).
    axis.set_ylabel('number of wine')
    axis.set_title(heading)

In [None]:
# Residual-sugar distribution of bad wines (quality 0, top) and good wines (quality 1, bottom).
fig, axes = plt.subplots(2, 1)
# Extra vertical room between the two panels.
fig.subplots_adjust(hspace=0.7)

for axis, label, heading, shade in zip(axes, (0, 1), ('bad wine', 'good wine'), (None, 'orange')):
    axis.hist(df.loc[df['quality'] == label, 'residual_sugar'], color=shade)
    axis.set_xlabel('residual_sugar')
    # One spelling of the y-label for both panels (no 'nnumber' on the lower one).
    axis.set_ylabel('number of wine')
    axis.set_title(heading)

plt.savefig('partII-residual-sugar.pdf')

In [None]:
# Citric-acid distribution of bad wines (quality 0, top) and good wines (quality 1, bottom).
fig, axes = plt.subplots(2, 1)
# Extra vertical room between the two panels.
fig.subplots_adjust(hspace=0.7)

for axis, label, heading, shade in zip(axes, (0, 1), ('bad wine', 'good wine'), (None, 'orange')):
    axis.hist(df.loc[df['quality'] == label, 'citric_acid'], color=shade)
    axis.set_xlabel('citric_acid')
    # One spelling of the y-label for both panels (no 'nnumber' on the lower one).
    axis.set_ylabel('number of wine')
    axis.set_title(heading)

In [None]:
# Citric-acid distribution of bad wines (quality 0, top) and good wines (quality 1, bottom).
# NOTE(review): this cell plots the same column as the cell directly above it; it may have
# been meant for a different feature, or it can be removed.
fig, axes = plt.subplots(2, 1)
# Extra vertical room between the two panels.
fig.subplots_adjust(hspace=0.7)

for axis, label, heading, shade in zip(axes, (0, 1), ('bad wine', 'good wine'), (None, 'orange')):
    axis.hist(df.loc[df['quality'] == label, 'citric_acid'], color=shade)
    axis.set_xlabel('citric_acid')
    # One spelling of the y-label for both panels (no 'nnumber' on the lower one).
    axis.set_ylabel('number of wine')
    axis.set_title(heading)

In [None]:
# Volatile-acidity distribution of bad wines (quality 0, top) and good wines (quality 1, bottom).
fig, axes = plt.subplots(2, 1)
# Extra vertical room between the two panels.
fig.subplots_adjust(hspace=0.7)

for axis, label, heading, shade in zip(axes, (0, 1), ('bad wine', 'good wine'), (None, 'orange')):
    axis.hist(df.loc[df['quality'] == label, 'volatile_acidity'], color=shade)
    axis.set_xlabel('volatile_acidity')
    # One spelling of the y-label for both panels (no 'nnumber' on the lower one).
    axis.set_ylabel('number of wine')
    axis.set_title(heading)

In [None]:
# Volatile-acidity distribution of bad wines (quality 0, top) and good wines (quality 1, bottom).
# NOTE(review): this cell plots the same column as the cell directly above it; it may have
# been meant for a different feature, or it can be removed.
fig, axes = plt.subplots(2, 1)
# Extra vertical room between the two panels.
fig.subplots_adjust(hspace=0.7)

for axis, label, heading, shade in zip(axes, (0, 1), ('bad wine', 'good wine'), (None, 'orange')):
    axis.hist(df.loc[df['quality'] == label, 'volatile_acidity'], color=shade)
    axis.set_xlabel('volatile_acidity')
    # One spelling of the y-label for both panels (no 'nnumber' on the lower one).
    axis.set_ylabel('number of wine')
    axis.set_title(heading)

In [None]:
# Density distribution of bad wines (quality 0, top) and good wines (quality 1, bottom).
fig, axes = plt.subplots(2, 1)
# Extra vertical room between the two panels.
fig.subplots_adjust(hspace=0.7)

for axis, label, heading, shade in zip(axes, (0, 1), ('bad wine', 'good wine'), (None, 'orange')):
    axis.hist(df.loc[df['quality'] == label, 'density'], color=shade)
    axis.set_xlabel('density')
    # One spelling of the y-label for both panels (no 'nnumber' on the lower one).
    axis.set_ylabel('number of wine')
    axis.set_title(heading)

## Standardize the data

In [None]:
# Z-score the three features used by the wine classifier: subtract the column mean and
# divide by the column standard deviation.
for feature in ('pH', 'alcohol', 'residual_sugar'):
    column = df[feature]
    df[feature] = (column - column.mean()) / column.std()

## Finding a good value of k

In [None]:
from sklearn.metrics import precision_recall_fscore_support

def get_scores(k):
    """Score a k-nearest-neighbour vote for wine quality on 50 random wines.

    The neighbour index is fitted on every row of ``df`` using the features pH,
    alcohol and residual_sugar. Each sampled wine is predicted from the majority
    label of its neighbours, with the wine's own row left out of the vote.
    This definition replaces the Part I function of the same name.

    Args:
        k: number of neighbours requested per wine. The wine itself is dropped
           when it is among them, so the vote is then taken over k - 1 rows.

    Returns:
        The (precision, recall, fscore, support) tuple returned by
        ``precision_recall_fscore_support`` restricted to label 1 (good wine);
        every element is an array of length one.
    """
    features = ['pH', 'alcohol', 'residual_sugar']

    # Fitting builds an index data structure under the hood for query performance.
    nn = NearestNeighbors(n_neighbors=k, metric='euclidean', algorithm='auto')
    fit = nn.fit(df[features].values)

    # Random wines to test on; they are drawn from the fitted rows themselves.
    n = 50
    wines = df.sample(n)
    winesy = wines['quality'].values

    distances, indices = fit.kneighbors(wines[features].values)

    y_pred = []
    for i in range(n):
        # indices holds row positions; drop() works on labels, so the wine is
        # removed by its index label (ignored when it is not among the neighbours).
        nbrs = df.iloc[indices[i]].drop(wines.index[i], errors='ignore')
        bad = (nbrs.quality == 0).sum()
        good = (nbrs.quality == 1).sum()
        # Bad needs a strict majority; a tie is predicted as good.
        y_pred.append(0 if good < bad else 1)
    return precision_recall_fscore_support(winesy, y_pred, labels=[1])

kvals = range(2, 50)
scores = [get_scores(k) for k in kvals]

# Unwrap the length-one arrays, then transpose into one sequence per metric:
# scores[0] = precision, scores[1] = recall, scores[2] = F1, scores[3] = support.
scores = [(p[0], r[0], f[0], s[0]) for (p,r,f,s) in scores]
scores = list(zip(*scores))

# Draw every metric named on the y-axis (not just F1) and tell the curves apart with a legend.
for metric_values, metric_name in zip(scores[:3], ('precision', 'recall', 'F1')):
    plt.plot(kvals, metric_values, label=metric_name)
plt.legend()
plt.ylabel('Precision, Recall and F1 scores')
plt.xlabel('K values')
plt.savefig('partII-kvalues.pdf')

Split the data into train and test sets and compute precision, recall and F-score over 10 random splits.

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support

# Repeated random hold-out evaluation: 10 independent 75/25 splits, a k-NN majority vote
# on each, and the scores of the good-wine class collected per split.
k = 25
nn = NearestNeighbors(n_neighbors=k, metric='euclidean', algorithm='auto')

X = df[['pH', 'alcohol','residual_sugar']].values
y = df[['quality']].values

precision = []
recall = []
f1 = []
support = []

for _ in range(10):
    # Use random_state if you want the same values each time you run for debugging,
    # but you should select the split randomly when you're ready to actually train
    # and test on the data.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)#, random_state=42)

    # This builds an index data structure under the hood for query performance
    fit = nn.fit(X_train)

    # Find the k nearest training rows for every test wine.
    distances, indices = fit.kneighbors(X_test)

    # y_train is (n_train, 1), so indexing with the (n_test, k) index array gives
    # (n_test, k, 1); dropping the last axis leaves one row of neighbour labels per
    # test wine. Predict good when at least half of the k neighbours are good.
    nbr_labels = y_train[indices][:, :, 0]
    y_pred = (nbr_labels.sum(axis=1) >= k / 2).astype(int)

    # labels=[1] restricts the report to the good-wine class, so index 0 below is that
    # class. Without it, index 0 would be the bad-wine class, which is not the class
    # that get_scores used when k was chosen.
    (p,r,f,s) = precision_recall_fscore_support(y_test.ravel(), y_pred, labels=[1])

    print(p[0],r[0],f[0],s[0])
    precision.append(p[0])
    recall.append(r[0])
    f1.append(f[0])
    support.append(s[0])

print('mean of f1 scores=',sum(f1)/len(f1))