In [1]:
# Problem 1
# Fieldname        Sunny            Rainy

# Windy
#   Yes:    -((2/6)log2(2/6) + (4/6)log2(4/6)) = -( 1/3 * (-1.58) + 2/3 * (-0.58)) = 0.91
#   No :    -((6/8)log2(6/8) + (2/8)log2(2/8)) = -(.75 * -0.41 + .25 * -2)         = 0.81
#   Weighted : (6/14 * 0.91 + 8/14 * 0.81)     =  .852 bits of entropy

# Humidity
#   low :  -((3/3)log2(3/3) + 0) = 0 bits
#   mild:  -((4/6)log2(4/6) + (2/6)log2(2/6)) = -( 1/3 * -1.58 + 2/3 * -0.58) = 0.91
#   high:  -((1/5)log2(1/5) + (4/5)log2(4/5)) = -( .2 * -2.32 + .8 * -0.32)   = 0.72
#   Weighted: (3/14 * 0 + 6/14 * 0.91 + 5/14 * 0.72) = .647 bits of entropy

# Temp
#   low :  -((0/4) + (1)log2(1)) = 0 bits
#   mild:  -((3/5)log2(3/5) + (2/5)log2(2/5)) = -(0.6 * -.74 + 0.4 * -1.32) = .972 bits
#   high:  -((5/5)log2(5/5) + 0) = 0 bits
#   Weighted:  (4/14 * 0 + 5/14 * .972 + 5/14 * 0) = .347 bits of entropy

# Under this data set, Temperature is the best feature to be placed on top of the tree

In [2]:
# import the necessary libraries for creating a classification
from sklearn.neighbors import KNeighborsClassifier
import numpy as np
import pandas as pd

# read the iris csv file and assign it to a Pandas DataFrame
iris_df = pd.read_csv('https://raw.githubusercontent.com/mpourhoma/CS4661/master/iris.csv')


In [5]:
# Creating the Feature Matrix for iris dataset:

# create a python list of feature names that would like to pick from the dataset:
feature_cols = ['sepal_length','sepal_width','petal_length','petal_width']

# use the above list to select the features from the original DataFrame
X = iris_df[feature_cols] 

# select a Series of labels (the last column) from the DataFrame
y = iris_df['species'] # this is the original categorical labels (the latest revision of sklearn accepts non-numerical labels)


In [33]:
# import the test splitting function
from sklearn.model_selection import train_test_split

# create and assign the testing & training sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=6)

In [34]:
# assign a variable for the number of neighbors considered
k = 3

# create the K Neighbors Classifier object
knn = KNeighborsClassifier(n_neighbors=k)

# train the data on both the x and y training sets
knn.fit(X_train, y_train)

# test the model on the X_test set
print(knn.predict(X_test))

['setosa' 'virginica' 'setosa' 'setosa' 'virginica' 'versicolor'
 'virginica' 'setosa' 'virginica' 'versicolor' 'virginica' 'versicolor'
 'virginica' 'virginica' 'versicolor' 'versicolor' 'virginica'
 'versicolor' 'versicolor' 'setosa' 'setosa' 'virginica' 'setosa' 'setosa'
 'versicolor' 'versicolor' 'versicolor' 'virginica' 'setosa' 'versicolor'
 'setosa' 'versicolor' 'setosa' 'setosa' 'versicolor' 'virginica'
 'versicolor' 'virginica' 'versicolor' 'setosa' 'setosa' 'virginica'
 'versicolor' 'versicolor' 'setosa' 'setosa' 'versicolor' 'setosa'
 'setosa' 'versicolor' 'virginica' 'virginica' 'virginica' 'setosa'
 'virginica' 'setosa' 'setosa' 'setosa' 'versicolor' 'virginica']


In [24]:
# create an iterable for the alternate k values
alt_k_vals = [1,5,7,11,15,27,59]

from sklearn.metrics import accuracy_score
previous_accuracy = -1

for val in alt_k_vals:
    # re-train the data set with the new k value
    knn = KNeighborsClassifier(n_neighbors=val)
    knn.fit(X_train, y_train)
    prediction = knn.predict(X_test)
    
    current_accuracy = accuracy_score(y_test, prediction)
    # if there is no change between this prediction and the previous prediction, point it out
    if previous_accuracy >= current_accuracy:
        print("\nThere is increase in accuracy at k = ", val,"\nChange = ", current_accuracy-previous_accuracy)
    # else print normally
    else:
        print("k =", val, "\n", prediction)
    previous_accuracy = current_accuracy
    
    
# Increasing from 1 to 5 increased the accuracy; 
# however, increasing the number further generally resulted in a loss of accuracy

k = 1 
 ['setosa' 'virginica' 'setosa' 'setosa' 'virginica' 'versicolor'
 'versicolor' 'setosa' 'virginica' 'versicolor' 'virginica' 'versicolor'
 'virginica' 'virginica' 'versicolor' 'versicolor' 'virginica'
 'versicolor' 'versicolor' 'setosa' 'setosa' 'virginica' 'setosa' 'setosa'
 'versicolor' 'versicolor' 'versicolor' 'virginica' 'setosa' 'versicolor'
 'setosa' 'versicolor' 'setosa' 'setosa' 'versicolor' 'virginica'
 'versicolor' 'virginica' 'versicolor' 'setosa' 'setosa' 'virginica'
 'versicolor' 'versicolor' 'setosa' 'setosa' 'versicolor' 'setosa'
 'setosa' 'versicolor' 'virginica' 'virginica' 'virginica' 'setosa'
 'virginica' 'setosa' 'setosa' 'setosa' 'versicolor' 'virginica']
k = 5 
 ['setosa' 'virginica' 'setosa' 'setosa' 'virginica' 'versicolor'
 'virginica' 'setosa' 'virginica' 'versicolor' 'virginica' 'versicolor'
 'virginica' 'virginica' 'versicolor' 'virginica' 'virginica' 'versicolor'
 'versicolor' 'setosa' 'setosa' 'virginica' 'setosa' 'setosa' 'versicolor'
 'versicolo

In [42]:
# re-create the K Neighbors Classifier object using k=3
knn = KNeighborsClassifier(n_neighbors=k)

features = ['species', 'sepal_length', 'sepal_width', 'petal_length', 'petal_width']
accuracies = {}

for i in features:
    X_train, X_test, y_train, y_test = train_test_split(X, iris_df[i], test_size=0.4, random_state=6)
    knn.fit(X_train, y_train)
    accuracies[i] = accuracy_score(y_test, knn.predict(X_test))
    
print(accuracies)

ValueError: Unknown label type: 'continuous'