In [1]:
import adspy_shared_utilities as asu

In [2]:
%matplotlib notebook
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the tab-separated fruit measurements into a DataFrame.
fruits = pd.read_csv(
    'fruit_data_with_colors.txt',
    delimiter='\t',
)

In [3]:
# Preview the first rows of the freshly loaded fruit table.
fruits.head()

Unnamed: 0,fruit_label,fruit_name,fruit_subtype,mass,width,height,color_score
0,1,apple,granny_smith,192,8.4,7.3,0.55
1,1,apple,granny_smith,180,8.0,6.8,0.59
2,1,apple,granny_smith,176,7.4,7.2,0.6
3,2,mandarin,mandarin,86,6.2,4.7,0.8
4,2,mandarin,mandarin,84,6.0,4.6,0.79


In [4]:
# zip() pairs items by position and stops at the shorter input,
# so the last two names in `a` are left without a partner.
a = ("John", "Charles", "Mike", "Mike", "Charles")
b = ("Jenny", "Christy", "Monica")

x = zip(a, b)

# Unpack the iterator and print one pair per line.
print(*x, sep="\n")

('John', 'Jenny')
('Charles', 'Christy')
('Mike', 'Monica')


In [5]:
# Distinct numeric class labels, in order of first appearance.
fruits['fruit_label'].unique()

array([1, 2, 3, 4], dtype=int64)

In [6]:
# Distinct fruit names, in order of first appearance.
fruits['fruit_name'].unique()

array(['apple', 'mandarin', 'orange', 'lemon'], dtype=object)

In [7]:
# Map each numeric fruit label to its readable name.
# The (label, name) pairs are taken row by row, so the mapping stays correct even if the
# two columns would list their unique values in a different order.
lookup_fruit_name = dict(zip(fruits['fruit_label'], fruits['fruit_name']))
lookup_fruit_name

{1: 'apple', 2: 'mandarin', 3: 'orange', 4: 'lemon'}

#### Examine the data a little bit


In [8]:
# Show the first rows again before picking the feature and label columns.
fruits.head()


Unnamed: 0,fruit_label,fruit_name,fruit_subtype,mass,width,height,color_score
0,1,apple,granny_smith,192,8.4,7.3,0.55
1,1,apple,granny_smith,180,8.0,6.8,0.59
2,1,apple,granny_smith,176,7.4,7.2,0.6
3,2,mandarin,mandarin,86,6.2,4.7,0.8
4,2,mandarin,mandarin,84,6.0,4.6,0.79


In [9]:
# Feature matrix: the four numeric measurements of each fruit.
X = fruits.loc[:, ['mass', 'width', 'height', 'color_score']]

X.head()


Unnamed: 0,mass,width,height,color_score
0,192,8.4,7.3,0.55
1,180,8.0,6.8,0.59
2,176,7.4,7.2,0.6
3,86,6.2,4.7,0.8
4,84,6.0,4.6,0.79


In [10]:
# Target vector: the numeric class label of each fruit.
y = fruits.fruit_label
y.head()

0    1
1    1
2    1
3    2
4    2
Name: fruit_label, dtype: int64

In [11]:
from matplotlib import cm
from mpl_toolkits.mplot3d import Axes3D

# 3D scatter of width / height / color_score; marker color encodes the fruit label.
fig = plt.figure()
ax = fig.add_subplot(1, 1, 1, projection='3d')
ax.scatter(
    X['width'],
    X['height'],
    X['color_score'],
    c=y,
    marker='o',
    s=100,
)
ax.set_zlabel('color_score')
ax.set_ylabel('height')
ax.set_xlabel('width')
plt.show()

<IPython.core.display.Javascript object>

In [12]:
# Step one: hold out part of the data for testing.
# test_size=0.25 is scikit-learn's default, i.e. a 75% / 25% train-test split.
# Fixing random_state makes the shuffle, and therefore the split, identical on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=0
)

In [13]:
# Create the classifier object: a k-nearest-neighbors model that votes among 5 neighbors.
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(
    n_neighbors=5,
)

In [14]:
### Train the classifier (fit the estimator) using the training data
# fit() returns the estimator itself, which is what the cell displays.
knn.fit(X=X_train, y=y_train)

KNeighborsClassifier()

In [15]:
### Estimate the accuracy of the classifier on future data, using the test data
# Prints the training accuracy first and the test accuracy second, on one line.
print(*[knn.score(features, labels)
        for features, labels in ((X_train, y_train), (X_test, y_test))])

0.7954545454545454 0.5333333333333333


In [None]:
### Use the trained k-NN classifier model to classify new, previously unseen objects
# First example: a fruit with mass 250 g, width 4.3 cm, height 8.5 cm and a color score of 0.1.
# The query is wrapped in a DataFrame carrying the training column names: the classifier was
# fitted on a DataFrame, and a bare list makes scikit-learn warn about missing feature names.
new_fruit = pd.DataFrame([[250, 4.3, 8.5, 0.1]],
                         columns=['mass', 'width', 'height', 'color_score'])
fruit_prediction = knn.predict(new_fruit)

fruit_prediction[0]



In [None]:
# Translate the predicted numeric label into its fruit name via the lookup dict.
lookup_fruit_name[fruit_prediction[0]]

In [None]:
# Second example: a wider, flatter fruit with mass 100 g, width 8.3 cm, height 4.5 cm
# and a color score of 0.31.
# The query uses the training column names so scikit-learn can validate the features by name.
new_fruit = pd.DataFrame([[100, 8.3, 4.5, 0.31]],
                         columns=['mass', 'width', 'height', 'color_score'])
fruit_prediction = knn.predict(new_fruit)
lookup_fruit_name[fruit_prediction[0]]

In [None]:
### Plot the decision boundaries of the k-NN classifier
# plot_fruit_knn comes from the helper module imported as `asu`; its implementation is not
# shown in this notebook. The arguments look like (features, labels, n_neighbors, weights)
# — TODO confirm against the helper's signature.
asu.plot_fruit_knn(X_train, y_train, 5, 'uniform')   # we choose 5 nearest neighbors


### How sensitive is k-NN classification accuracy to the choice of the 'k' parameter?


In [None]:
# Fit one classifier per neighborhood size k = 1..19 and record its test accuracy.
k_range = range(1, 20)
scores = []

for k in k_range:
    knn = KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train)
    scores.append(knn.score(X_test, y_test))

# Accuracy as a function of k; the trailing semicolon hides the xticks return value.
plt.figure()
plt.scatter(k_range, scores)
plt.ylabel('accuracy')
plt.xlabel('k')
plt.xticks(range(0, 21, 5));

 



### We can repeat the same experiment using only the color score as a feature.


In [None]:

# Repeat the k sweep with the color score as the only feature.
# NOTE: this rebinds X and the train/test splits, so later cells see the single-feature data.
X = fruits[['color_score']]

# default is 75% / 25% train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

from sklearn.neighbors import KNeighborsClassifier

k_range = range(1,20)
scores = []

# One classifier per k; no separate fit is needed before the loop because every
# iteration builds and trains its own model.
for k in k_range:
    knn = KNeighborsClassifier(n_neighbors = k)
    knn.fit(X_train, y_train)
    scores.append(knn.score(X_test, y_test))

plt.figure()
plt.title('k-NN test accuracy (color_score only)')
plt.xlabel('k')
plt.ylabel('accuracy')
plt.scatter(k_range, scores)
plt.xticks([0,5,10,15,20]);



In [None]:
# Display the test accuracies collected by the most recent k sweep (one entry per k in k_range).
scores

### How sensitive is k-NN classification accuracy to the train/test split proportion?
#### Try at home

In [None]:
# Training-set fractions to evaluate.
t = [0.9, 0.8, 0.70, 0.6, 0.5, 0.4, 0.3, 0.2]

knn = KNeighborsClassifier(n_neighbors = 5)

plt.figure()

for s in t: # s is the training-set fraction

    scores = []
    # Average over 49 different random splits. Each split is seeded with its loop index,
    # so the whole plot is reproducible from one run to the next.
    for seed in range(1,50):
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 1-s, random_state = seed)
        knn.fit(X_train, y_train)
        scores.append(knn.score(X_test, y_test))
    plt.plot(s, np.mean(scores), 'bo')

# The x values are fractions between 0 and 1, not percentages.
plt.xlabel('Training set proportion (fraction of data)')
plt.ylabel('accuracy for test set');

# kNN regression.

In [None]:
# kNN regression
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_regression #function used for generating synthetic datasets

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.linear_model import Ridge, Lasso

### Try it for our house data set. 

In [None]:
from sklearn.neighbors import KNeighborsRegressor

# Read the semicolon-separated house price file (header row skipped) into a DataFrame.
house_price = pd.DataFrame(
    np.genfromtxt("house_price_data_2000.csv", delimiter=";", skip_header=1, dtype=None)
)
house_price.head()

# Features are the first three columns; the target is the last column.
X_house = house_price.iloc[:, :3]
y_house = house_price.iloc[:, -1]

X_train, X_test, y_train, y_test = train_test_split(X_house, y_house, random_state=2)

# 3-nearest-neighbors regressor trained on the training portion.
knnreg = KNeighborsRegressor(n_neighbors=3)
knnreg.fit(X_train, y_train)

print('R-squared test score: {:.3f}'.format(knnreg.score(X_test, y_test)))

# Next, let's try this with cross-validation, or look at how the number of neighbors affects the score.

## Now try it for customized computer. 
### We will also see a graph for C and alpha for ridge and k. 

In [None]:
# Load the cost spreadsheet.
CostData1 = pd.read_excel('CostV1.xlsx')

print("Dim-->", CostData1.ndim, "Length->", CostData1.shape)

# Replace missing values in the "AG-2" column with zeros, then preview the result.
CostData1 = CostData1.fillna({"AG-2": 0})
CostData1.head()


In [None]:
# One-hot encode the categorical columns so every feature is numeric.
CostData1 = pd.get_dummies(CostData1)

# Regression target: the 'Cost' column.
y_CostData1 = CostData1.loc[:, 'Cost']
y_CostData1.head()

In [None]:
# Feature matrix: every column except the 'Cost' target.
X_CostData1 = CostData1.drop(columns="Cost")
X_CostData1.head()

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_CostData1, y_CostData1,random_state=5)

# Ordinary least squares baseline. scikit-learn advises against Ridge(alpha=0) for
# numerical reasons and recommends LinearRegression for the unregularized model.
linreg = LinearRegression().fit(X_train, y_train)

print('LR: Training',round(linreg.score(X_train, y_train),3), 'Test',round(linreg.score(X_test, y_test),3))

#for CostV1.xlsx check for random state 1 and 2 and 5

# Regularized model for comparison against the baseline above.
linridge = Ridge(alpha=10).fit(X_train, y_train)
print('Ridge with alpha=10: Training',round(linridge.score(X_train, y_train),3), 'Test',round(linridge.score(X_test, y_test),3))

In [None]:
# Regularization strengths to try: 0.2, 0.7, ..., 9.7.
alphaList  = np.arange(0.2, 10, 0.5)

# R2 per alpha, keyed by the alpha value (insertion order follows alphaList).
ridgeTrainR2 = {}
ridgeTestR2 = {}

for i in alphaList:
    linridge = Ridge(alpha=i).fit(X_train,y_train)
    ridgeTrainR2[i]= round(linridge.score(X_train, y_train),3)
    ridgeTestR2[i]= round(linridge.score(X_test, y_test),3)


# Both curves are drawn, so the title, axis labels and legend identify each of them.
plt.figure()
plt.title('Ridge: train and test R2 vs. alpha')
plt.scatter(alphaList, list(ridgeTestR2.values()), marker= 'o', s=50, label='test')
plt.scatter(alphaList, list(ridgeTrainR2.values()), marker= 'x', s=50, label='train')
plt.xlabel('alpha')
plt.ylabel('R2')
plt.legend()

plt.show()


In [None]:
# Neighborhood sizes to try: 2, 3, ..., 17.
kList =  np.arange(2, 18, 1)

# R2 per k, keyed by k (insertion order follows kList).
kNNTrainR2= {}
kNNTestR2 = {}

for k in kList:
    kNN = KNeighborsRegressor(n_neighbors = k).fit(X_train, y_train)
    kNNTrainR2[k]= round(kNN.score(X_train, y_train),3)
    kNNTestR2[k]= round(kNN.score(X_test, y_test),3)


# The x axis is the number of neighbors k, and both train and test scores are shown.
plt.figure()
plt.title('kNN regression: train and test R2 vs. k')
plt.scatter(kList, list(kNNTestR2.values()), marker= 'o', s=50, label='test')
plt.scatter(kList, list(kNNTrainR2.values()), marker= 'x', s=50, label='train')
plt.xlabel('k (number of neighbors)')
plt.ylabel('R2')
plt.legend()

plt.show()
