# Classify Car Value Problem
## Import the libraries that we will use

In [1]:
import numpy as np
import pandas as pd
from itertools import product

import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import accuracy_score

## Inspecting the data and prepare it

The first step in every data analysis experiment involves inspecting the data and making sure it is properly formatted.
You will find that the features in the provided dataset are categorical.
However, KNN requires the features to be real-valued numbers. To convert a categorical feature with K categories to a real-valued number, you can create K new binary features. 
The ith binary feature indicates whether the original feature belongs to the ith category or not.
This strategy is called ‘one-hot encoding.’

In [2]:
# Load the car dataset with pandas.
# The raw file has no header row, so the column names are supplied here.
headers = [
    "buying",
    "maint",
    "doors",
    "persons",
    "lug_boot",
    "safety",
    "target",
]

# Read the file into a DataFrame, treating every line as a data row
car_df = pd.read_csv("data/caroriginal.data", names=headers, header=None)

car_df.head()  # preview the first few rows of the file

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,target
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc


In [3]:
# Select every row that contains at least one missing value
# (an empty result means the data has no nulls)
car_df.loc[car_df.isna().any(axis="columns")]

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,target


In [4]:
# Show the dtype pandas assigned to each column
car_df.dtypes

buying      object
maint       object
doors       object
persons     object
lug_boot    object
safety      object
target      object
dtype: object

In [5]:
# Check the values in each category and convert to one-hot encoding using pandas method "get_dummies"
# NOTE: `value_counts()` is not the last expression of the cell, so its result is not displayed.
car_df["buying"].value_counts()
# `dtype=int` keeps the indicator columns numeric (0/1); recent pandas versions
# would otherwise create boolean columns by default.
car_df = pd.get_dummies(car_df, columns=["buying"], dtype=int)

In [6]:
# One-hot encode "maint".
# NOTE: `value_counts()` is not the last expression of the cell, so its result is not displayed.
car_df["maint"].value_counts()
# `dtype=int` keeps the indicator columns numeric (0/1); recent pandas versions
# would otherwise create boolean columns by default.
car_df = pd.get_dummies(car_df, columns=["maint"], dtype=int)

In [7]:
# One-hot encode "doors".
# NOTE: `value_counts()` is not the last expression of the cell, so its result is not displayed.
car_df["doors"].value_counts()
# `dtype=int` keeps the indicator columns numeric (0/1); recent pandas versions
# would otherwise create boolean columns by default.
car_df = pd.get_dummies(car_df, columns=["doors"], dtype=int)

In [8]:
# One-hot encode "persons".
# NOTE: `value_counts()` is not the last expression of the cell, so its result is not displayed.
car_df["persons"].value_counts()
# `dtype=int` keeps the indicator columns numeric (0/1); recent pandas versions
# would otherwise create boolean columns by default.
car_df = pd.get_dummies(car_df, columns=["persons"], dtype=int)

In [9]:
# One-hot encode "lug_boot".
# NOTE: `value_counts()` is not the last expression of the cell, so its result is not displayed.
car_df["lug_boot"].value_counts()
# `dtype=int` keeps the indicator columns numeric (0/1); recent pandas versions
# would otherwise create boolean columns by default.
car_df = pd.get_dummies(car_df, columns=["lug_boot"], dtype=int)

In [10]:
# One-hot encode "safety".
# NOTE: `value_counts()` is not the last expression of the cell, so its result is not displayed.
car_df["safety"].value_counts()
# `dtype=int` keeps the indicator columns numeric (0/1); recent pandas versions
# would otherwise create boolean columns by default.
car_df = pd.get_dummies(car_df, columns=["safety"], dtype=int)

In [11]:
# Count how many rows fall into each target category
car_df["target"].value_counts()

unacc    1210
acc       384
good       69
vgood      65
Name: target, dtype: int64

In [12]:
# Encode the target categories as integers
cleanup_target = {"target": {"unacc": 0, "acc": 1, "good": 2, "vgood": 3}}
# The nested dict limits the replacement to the "target" column. The explicit cast
# guarantees an integer column instead of relying on `replace` to downcast the
# object column implicitly (newer pandas versions deprecate that downcast); it also
# fails loudly if a category is missing from the mapping.
car_df = car_df.replace(cleanup_target).astype({"target": int})
car_df.head()

Unnamed: 0,target,buying_high,buying_low,buying_med,buying_vhigh,maint_high,maint_low,maint_med,maint_vhigh,doors_2,...,doors_5more,persons_2,persons_4,persons_more,lug_boot_big,lug_boot_med,lug_boot_small,safety_high,safety_low,safety_med
0,0,0,0,0,1,0,0,0,1,1,...,0,1,0,0,0,0,1,0,1,0
1,0,0,0,0,1,0,0,0,1,1,...,0,1,0,0,0,0,1,0,0,1
2,0,0,0,0,1,0,0,0,1,1,...,0,1,0,0,0,0,1,1,0,0
3,0,0,0,0,1,0,0,0,1,1,...,0,1,0,0,0,1,0,0,1,0
4,0,0,0,0,1,0,0,0,1,1,...,0,1,0,0,0,1,0,0,0,1


In [13]:
# Separate the feature vectors (X) from the target values (y)
X = car_df.drop(columns=["target"])  # every column except "target"
y = car_df["target"]

## Now the data is ready to be used for training: 

Define a method that can split the data into train/validate/test for cross-validation

In [14]:
def train_valid_test_split(X, y, train_size=0.75):
    """Split the data into train, validation and test subsets.

    Inputs:
    X: the features
    y: the target
    train_size: share of the data used for training; forwarded unchanged to
        sklearn's train_test_split

    Returns:
    X_train, y_train, X_valid, y_valid, X_test, y_test

    The shape of each returned subset is printed as a sanity check.
    """

    # First split off the training portion; everything else is the remainder.
    X_train, X_rem, y_train, y_rem = train_test_split(X, y, train_size=train_size)

    # Split the remainder in half so the validation and test sets are equally
    # sized (with the default train_size=0.75 that is 12.5% of the data each).
    X_valid, X_test, y_valid, y_test = train_test_split(X_rem, y_rem, test_size=0.5)

    # Print the shape of every subset, in the order they are returned
    for subset in (X_train, y_train, X_valid, y_valid, X_test, y_test):
        print(subset.shape)

    return X_train, y_train, X_valid, y_valid, X_test, y_test

Make sure to set the random number seed to 0 in general, for every assignment. 
This ensures consistent behavior each time the code is run.

In [15]:
# 1-  Use numpy library to set the seed to 0:
# Seeds NumPy's global random number generator so repeated runs behave consistently.
# NOTE(review): the split helper passes no random_state to sklearn, so it presumably
# draws from this global generator — confirm.
np.random.seed(0)

In cross-validation, we use:

(a) train data: to train the model

(b) validate data: to tune the hyper parameters and choose the best parameters. In KNN, it is k.

(c) test data: to test the model with the chosen tuned parameter.


In [16]:
# 2- Split the data with the predefined "train_valid_test_split" helper:
#    75% goes to training and the rest is shared between validation and test.
(X_train, y_train,
 X_valid, y_valid,
 X_test, y_test) = train_valid_test_split(X, y, train_size=0.75)

(1296, 21)
(1296,)
(216, 21)
(216,)
(216, 21)
(216,)


 Consider k = 1,3,5,...,23. For each k, report the training and validation accuracy. 
Identify the k with the highest validation accuracy, and report the test accuracy with this choice of k.
Note: if multiple values of k result in the highest validation accuracy, then report test accuracies for all such values of k.     

In [17]:
# 3- Fit a KNN model ("KNeighborsClassifier" from sklearn) for each of the k values: k = 1; 3; 5; ... ; 23.

scoresVal = []  # validation accuracy for each k
scoresK = []    # the k values, in the same order as scoresVal

print("k", '\t', "train", "\t\t\t", "validate\t\t", "test")
for k in range(1, 24, 2):
    neigh = KNeighborsClassifier(n_neighbors=k)
    neigh.fit(X_train, y_train)

    # Predictions of the fitted model on each of the three subsets
    y_expTrain = neigh.predict(X_train)
    y_expValid = neigh.predict(X_valid)
    y_expTest = neigh.predict(X_test)

    # Accuracy ("accuracy_score") of those predictions on the train/validate/test data
    scoreTrainEx = accuracy_score(y_train, y_expTrain)
    scoreValidEx = accuracy_score(y_valid, y_expValid)
    scoreTestEx = accuracy_score(y_test, y_expTest)

    scoresK.append(k)
    scoresVal.append(scoreValidEx)

    # One table row per k: train, validate and test accuracy
    print(k, '\t', scoreTrainEx, '\t', scoreValidEx, '\t', scoreTestEx)


# 4- Determine and print the k with the best validation accuracy.
# Note: if several values of k tie for the highest validation accuracy, all of them are printed.

print('\n')
bestScore = max(scoresVal)

# How many k values reach the best validation accuracy
count = scoresVal.count(bestScore)

if count == 1:
    print("The best k is ", scoresK[scoresVal.index(bestScore)], " with a score of ", bestScore)

elif count > 1:
    print("There are multiple best k:", '\n')
    for tied_k, valid_score in zip(scoresK, scoresVal):
        if valid_score == bestScore:
            print("The best k is ", tied_k, " with a score of ", bestScore)
        
    
    
    
        
        
    
    


k 	 train 			 validate		 test
1 	 1.0 	 0.7546296296296297 	 0.7546296296296297
3 	 0.9197530864197531 	 0.8009259259259259 	 0.7777777777777778
5 	 0.9282407407407407 	 0.8703703703703703 	 0.8703703703703703
7 	 0.9359567901234568 	 0.8981481481481481 	 0.8842592592592593
9 	 0.9421296296296297 	 0.9027777777777778 	 0.875
11 	 0.9498456790123457 	 0.9027777777777778 	 0.9074074074074074
13 	 0.9382716049382716 	 0.8796296296296297 	 0.8657407407407407
15 	 0.9313271604938271 	 0.8518518518518519 	 0.8564814814814815
17 	 0.9182098765432098 	 0.8379629629629629 	 0.8379629629629629
19 	 0.9074074074074074 	 0.8287037037037037 	 0.8472222222222222
21 	 0.9012345679012346 	 0.8101851851851852 	 0.8518518518518519
23 	 0.8858024691358025 	 0.8148148148148148 	 0.8287037037037037


There are multiple best k: 

The best k is  9  with a score of  0.9027777777777778
The best k is  11  with a score of  0.9027777777777778
