In [1]:
import pandas as pd
import numpy as np
from sklearn import metrics
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import KFold, cross_val_score
import matplotlib.pyplot as plt

In [55]:
# load and clean data
data = pd.read_csv('data.csv')

column_dict = {
    "What year are you in?": "current_year",
    "What faculty are you in?": "faculty", 
    "What was your high school average when you applied to the University of Waterloo?": "hs_average",
    "What is your nationality status?": "nationality_status",
    "What is the highest education level your parents have completed?  [Parent 1]": "parent1_education",
    "What is the highest education level your parents have completed?  [Parent 2]": "parent2_education",
    "On average, how much time do you spend per week participating in social activities during an academic term? (i.e. extracurricular activities, movies, eating out, bars, parties, hanging out, etc.)": "social_time",
    "On average, what percentage of classes do you feel that you attend during an academic term? ": "class_attendance",
    "On average, excluding studying, how much time do you spend looking at a screen, during an academic term? (i.e. Phone, Laptop, TV, etc) ": "screen_time",
    "On average, how much sleep do you get per night during an academic term? ": "sleep_time",
    "On average, how many days do you exercise each week during an academic term?": "excercise_time",
    "On average, how much time do you spend doing school work / studying on a given day during an academic term?": "school_work_time",
    "In relation to school work, how much time do you spend on coop preparation during an academic term? (ie. applications. interview prep. practice, interviews, etc)": "coop_time",
    "Is it a high priority for you to achieve an 80%+ average": "academic_priority",
    'What is your current cumulative average?': "current_average"
}
nationality_dict = {
    'International (You are not a Canadian Citizen and are here on a Visa)':'Internationl',
    '1st Generation Canadian Citizen (You were not born in Canada and You are a Canadian Citizen)':'1st_Gen',
    '2nd+ Generation Canadian Citizen (You were born in Canada and you are a Canadian Citizen)':'2nd+_Gen'
}
sleep_dict = {
    "Not enough (I'm always tired)":"Not Enough",
    "Enough (I'm rested most of the time)":"Enough",
    "More than enough (I'm always well rested)":"More than Enough"
}
screen_dict = {
    'Regularly, but not a significant amount':'Regularly',
    "I'm almost always looking at a screen":'Almost Always',
    'A significant amount':'A significant amount',
    'Almost never':'Almost never'
}
coop_dict = {
    'Almost none':'Almost none',
    'About the same if not more as school work':"Same or More",
    'A significant amount but still less than school work':'Significant,but less than school',
    'A lot less than school work':'A lot less than school work' 
}
social_dict = {
    'Once or twice a week':'Once/Twice Weekly',
    'Multiple days a week':'Multiple Weekly',
    'Rarely':'Rarely'
}
data = data.rename(index=str,columns = column_dict)
for i in range(0,12):
    data.faculty[i] ='Engineering'
    
data = data.drop(["Enter your email address OR phone number if you'd like to be entered for a chance to win 1 of 4 $20 amazon gift cards", "Timestamp", "faculty"], axis=1)
data.nationality_status = data.nationality_status.map(nationality_dict)
data.sleep_time = data.sleep_time.map(sleep_dict)
data.social_time = data.social_time.map(social_dict)
data.coop_time = data.coop_time.map(coop_dict)
data.screen_time = data.screen_time.map(screen_dict)

real_data = data.copy()

# bin data values to numerical points (0,1,2,3,4)
data['sleep_time'] = np.where(data.sleep_time == 'Not Enough', 0, data.sleep_time)
data['sleep_time'] = np.where(data.sleep_time == 'Enough', 1, data.sleep_time)
data['sleep_time'] = np.where(data.sleep_time == 'More than Enough', 2, data.sleep_time)

data['screen_time'] = np.where(data.screen_time == 'Almost never', 0, data.screen_time)
data['screen_time'] = np.where(data.screen_time == 'Regularly', 1, data.screen_time)
data['screen_time'] = np.where(data.screen_time == 'A significant amount', 2, data.screen_time)
data['screen_time'] = np.where(data.screen_time == 'Almost Always', 2, data.screen_time)

data['social_time'] = np.where(data.social_time == 'Rarely', 0, data.social_time)
data['social_time'] = np.where(data.social_time == 'Once/Twice Weekly', 1, data.social_time)
data['social_time'] = np.where(data.social_time == 'Multiple Weekly', 2, data.social_time)

values = ['current_average', 'social_time', 'screen_time', 'sleep_time']
data = data[values]
print(data)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


       current_average social_time screen_time sleep_time
0               70-74%           1           1          1
1               70-74%           0           1          1
2               70-74%           1           1          1
3               80-84%           0           2          0
4               75-79%           1           2          1
5               75-79%           1           2          0
6               65-69%           1           2          1
7               80-84%           2           2          0
8               85-89%           1           2          1
9               85-89%           1           2          0
10              80-84%           2           2          0
11              80-84%           1           2          1
12              80-84%           2           2          1
13              75-79%           1           2          1
14              85-89%           1           2          1
15              70-74%           2           2          1
16            

In [56]:
# prepare datasets to be fed in the model
# predict attend class given extra hours and grade

CV = data.current_average.values.reshape((len(data.current_average), ))
print(CV)
CV_data = (data[['social_time','screen_time','sleep_time']].values).reshape((len(data.current_average), 3))
print(CV_data)

['70-74%' '70-74%' '70-74%' '80-84%' '75-79%' '75-79%' '65-69%' '80-84%'
 '85-89%' '85-89%' '80-84%' '80-84%' '80-84%' '75-79%' '85-89%' '70-74%'
 '70-74%' '75-79%' '75-79%' '70-74%' '70-74%' '75-79%' '85-89%' '70-74%'
 '75-79%' '80-84%' '65-69%' '85-89%' '65-69%' '75-79%' '65-69%' '60-64%'
 '60-64%' '85-89%' '60-64%' '70-74%' '75-79%' '75-79%' '75-79%' '70-74%'
 '85-89%' '90-94%' '75-79%' '75-79%' '80-84%' '70-74%' 'Prefer not to say'
 '75-79%' '80-84%' '60-64%' '65-69%' '80-84%' '85-89%' '80-84%' '70-74%'
 '70-74%' 'Prefer not to say' '80-84%' '80-84%' '80-84%' '90-94%' '90-94%'
 '90-94%' '90-94%' '80-84%' '75-79%' '85-89%' '70-74%' '75-79%' '80-84%'
 '80-84%' '85-89%' '80-84%' '90-94%' '80-84%' '80-84%' '80-84%'
 'Prefer not to say' '70-74%' '80-84%' '85-89%' '80-84%' '75-79%' '80-84%'
 '65-69%' 'Prefer not to say' '70-74%' '75-79%' '80-84%' '65-69%' '80-84%'
 '65-69%' '75-79%' '80-84%' '70-74%' '80-84%' '80-84%' '80-84%' '65-69%'
 '75-79%' '80-84%' '70-74%' '75-79%' '80-84%' '75-79

In [57]:
# Use the KNeighboursClassifer
KNN = KNeighborsClassifier(n_neighbors=3)
KNN.fit(CV_data, CV)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=3, p=2,
           weights='uniform')

In [5]:
# Create a KNN object
KNN = KNeighborsClassifier(n_neighbors=3)

In [60]:
# Train the model using the training sets
KNN.fit(CV_data, CV)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=3, p=2,
           weights='uniform')

In [63]:
#predict the class for each data point
predicted = KNN.predict(CV_data)
print(predicted)
print("Predictions: \n", np.array([predicted]).T)
print(len(np.array([predicted]).T))

['60-64%' '85-89%' '60-64%' '75-79%' '75-79%' '65-69%' '75-79%'
 'Prefer not to say' '75-79%' '65-69%' 'Prefer not to say' '75-79%'
 '80-84%' '75-79%' '75-79%' '80-84%' '80-84%' 'Prefer not to say' '80-84%'
 '75-79%' '80-84%' '75-79%' '75-79%' '75-79%' '80-84%' '80-84%' '65-69%'
 '80-84%' 'Prefer not to say' '80-84%' 'Prefer not to say'
 'Prefer not to say' '65-69%' 'Prefer not to say' '80-84%'
 'Prefer not to say' 'Prefer not to say' 'Prefer not to say' '80-84%'
 '70-74%' '80-84%' '70-74%' '75-79%' '75-79%' '80-84%' '80-84%'
 'Prefer not to say' '80-84%' '70-74%' '70-74%' '65-69%' '75-79%' '80-84%'
 '80-84%' '75-79%' '80-84%' '75-79%' 'Prefer not to say' '70-74%' '75-79%'
 '80-84%' '70-74%' 'Prefer not to say' '65-69%' '65-69%'
 'Prefer not to say' '65-69%' '70-74%' '75-79%' '80-84%' '65-69%' '75-79%'
 'Prefer not to say' 'Prefer not to say' '65-69%' 'Prefer not to say'
 '75-79%' '65-69%' '75-79%' '75-79%' '75-79%' '80-84%' '80-84%' '70-74%'
 '75-79%' '75-79%' '65-69%' '60-64%' '75-79

In [65]:
# predict the probability/likelihood of the prediction
print("Probability of prediction: \n",KNN.predict_proba(CV_data))

Probability of prediction: 
 [[0.33333333 0.         0.33333333 ... 0.         0.         0.        ]
 [0.         0.         0.33333333 ... 0.         0.         0.        ]
 [0.33333333 0.         0.33333333 ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 0.         0.         0.66666667]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.33333333 0.         0.33333333 ... 0.         0.         0.        ]]


In [66]:
print("Neighbors and their Distance: \n",KNN.kneighbors(CV_data, return_distance=True))

Neighbors and their Distance: 
 (array([[0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
 

In [67]:
print("Accuracy score for the model: \n", KNN.score(CV_data,CV))

Accuracy score for the model: 
 0.234375


In [11]:
print(metrics.confusion_matrix(CV, predicted, labels=["Yes","No"]))

[[3 2]
 [3 2]]


In [12]:
# Calculating 5 fold cross validation results
model = KNeighborsClassifier()
kf = KFold(n_splits=5)
scores = cross_val_score(model, data, CV, cv=kf)
print("Accuracy of every fold in 5 fold cross validation: ", abs(scores))
print("Mean of the 5 fold cross-validation: %0.2f" % abs(scores.mean()))

Accuracy of every fold in 5 fold cross validation:  [0.5 0.5 0.5 0.5 0.5]
Mean of the 5 fold cross-validation: 0.50


In [13]:
datapoint = np.array([100,60]).reshape(1, -1)
print("Does he attend class, if he gets 60 after putting 100 hours of effort?\n ",
      KNN.predict(datapoint),'\n' ,KNN.predict_proba(datapoint), '\n' ,KNN.kneighbors(datapoint))

Does he attend class, if he gets 60 after putting 100 hours of effort?
  ['No'] 
 [[0.66666667 0.33333333]] 
 (array([[20.4       , 22.14158983, 25.35527559]]), array([[6, 4, 7]]))


### try different k

In [14]:
KNN_1 = KNeighborsClassifier(n_neighbors=1)
KNN_1.fit(data, CV)
print("Accuracy score for the model:", KNN_1.score(data,CV))
scores = cross_val_score(KNN_1, data, CV, cv=KFold(n_splits=5))
print("Accuracy of every fold in 5 fold cross validation: ", abs(scores))
print("Mean of the 5 fold cross-validation: %0.2f" % abs(scores.mean()))

Accuracy score for the model: 1.0
Accuracy of every fold in 5 fold cross validation:  [0.5 0.5 0.5 0.  0.5]
Mean of the 5 fold cross-validation: 0.40


In [15]:
KNN_2 = KNeighborsClassifier(n_neighbors=2)
KNN_2.fit(data, CV)
print("Accuracy score for the model:", KNN_2.score(data,CV))
scores = cross_val_score(KNN_2, data, CV, cv=KFold(n_splits=5))
print("Accuracy of every fold in 5 fold cross validation: ", abs(scores))
print("Mean of the 5 fold cross-validation: %0.2f" % abs(scores.mean()))

Accuracy score for the model: 0.6
Accuracy of every fold in 5 fold cross validation:  [0.5 0.5 0.5 0.5 0.5]
Mean of the 5 fold cross-validation: 0.50


In [16]:
KNN_4 = KNeighborsClassifier(n_neighbors=4)
KNN_4.fit(data, CV)
print("Accuracy score for the model:", KNN_4.score(data,CV))
scores = cross_val_score(KNN_4, data, CV, cv=KFold(n_splits=5))
print("Accuracy of every fold in 5 fold cross validation: ", abs(scores))
print("Mean of the 5 fold cross-validation: %0.2f" % abs(scores.mean()))

Accuracy score for the model: 0.5
Accuracy of every fold in 5 fold cross validation:  [0.5 0.5 0.5 0.5 0.5]
Mean of the 5 fold cross-validation: 0.50


In [17]:
KNN_10 = KNeighborsClassifier(n_neighbors=6)
KNN_10.fit(data, CV)
print("Accuracy score for the model:", KNN_10.score(data,CV))
scores = cross_val_score(KNN_10, data, CV, cv=KFold(n_splits=5))
print("Accuracy of every fold in 5 fold cross validation: ", abs(scores))
print("Mean of the 5 fold cross-validation: %0.2f" % abs(scores.mean()))

Accuracy score for the model: 0.5
Accuracy of every fold in 5 fold cross validation:  [0.5 0.5 0.5 0.5 0.5]
Mean of the 5 fold cross-validation: 0.50
