In [30]:
%matplotlib inline
import pandas as pd
import sklearn
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

# Read the transfusion dataset from the working directory into a DataFrame,
# then leave it as the cell's last expression so the notebook renders it.
data = pd.read_csv(filepath_or_buffer='transfusion.data')
data

Unnamed: 0,Recency (months),Frequency (times),Monetary (c.c. blood),Time (months),whether he/she donated blood in March 2007
0,2,50,12500,98,1
1,0,13,3250,28,1
2,1,16,4000,35,1
3,2,20,5000,45,1
4,1,24,6000,77,0
...,...,...,...,...,...
743,23,2,500,38,0
744,21,2,500,52,0
745,23,3,750,62,0
746,39,1,250,39,0


In [36]:
# Why a held-out test split is used:
#   Underfitting is when the model is too simple (or too constrained) to
#       capture the pattern in the data, so it predicts poorly even on the
#       data it was trained on.
#
#   Overfitting is when the model fits the training data too closely (noise
#       included), so it doesn't work well with unseen data.
#
# Scoring on data the model never saw during training is how both problems
# are detected. First, split the table into essentially 'input' (features)
# and 'output' (target) sections, because those are passed separately to
# train_test_split, which divides them into training and testing data.
#
# drop() simply returns a new DataFrame with the specified column removed,
# without altering the previous data.
TARGET_COLUMN = 'whether he/she donated blood in March 2007'
X = data.drop(columns=TARGET_COLUMN)
y = data[TARGET_COLUMN]

# test_size is 0.1 for testing data, majority is in training for better results.
# random_state pins the shuffle so a Restart & Run All reproduces the same
# split, and therefore the same score.
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

# Init the model, call fit to train it, then get the score (mean accuracy on
# the held-out test data) to see how it has done.
model = KNeighborsClassifier()
model.fit(x_train, y_train)
score = model.score(x_test, y_test)
print(f'Score is: {score}.')

Score is: 0.8533333333333334.


In [37]:
# Begin predictions.
predicted = model.predict(x_test)

# Text form of each class label, indexed by the label itself. (0) - They did not. (1) - They did.
predictionValues = ['No', 'Yes']

# y_test is a Series; take its values as a plain array so they line up
# positionally with `predicted`. The 0/1 labels are used as-is: running them
# through LabelEncoder would renumber classes by sorted unique value, so a
# test split containing only 1s would be re-encoded to 0 and printed as 'No'.
actualValues = y_test.to_numpy()

# See what the model got right and what it did not.
for guess, truth in zip(predicted, actualValues):
    print(f'Model predicted: {predictionValues[guess]}. Actual value: {predictionValues[truth]}.')

Model predicted: No. Actual value: No.
Model predicted: No. Actual value: No.
Model predicted: No. Actual value: No.
Model predicted: No. Actual value: No.
Model predicted: No. Actual value: No.
Model predicted: No. Actual value: No.
Model predicted: No. Actual value: Yes.
Model predicted: No. Actual value: No.
Model predicted: Yes. Actual value: No.
Model predicted: No. Actual value: No.
Model predicted: No. Actual value: No.
Model predicted: No. Actual value: No.
Model predicted: No. Actual value: No.
Model predicted: Yes. Actual value: Yes.
Model predicted: Yes. Actual value: No.
Model predicted: No. Actual value: Yes.
Model predicted: No. Actual value: Yes.
Model predicted: No. Actual value: No.
Model predicted: Yes. Actual value: Yes.
Model predicted: No. Actual value: No.
Model predicted: Yes. Actual value: No.
Model predicted: No. Actual value: No.
Model predicted: No. Actual value: No.
Model predicted: Yes. Actual value: Yes.
Model predicted: No. Actual value: No.
Model predict