In [14]:
%matplotlib notebook
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import recall_score, precision_score

from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
    


# read
fruits = pd.read_table('fruit_data_my.txt')
# Pair each row's label with that same row's name. Zipping two independent
# unique() results is only correct when labels and names happen to first
# appear in matching order and map one-to-one.
lookup_fruit_name = dict(zip(fruits['fruit_label'], fruits['fruit_name'])) # expected: {1: 'apple', 2: 'mandarin', 3: 'orange', 4: 'lemon'}

# split
X = fruits[['mass', 'width', 'height', 'color_score']]
y = fruits['fruit_label']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)              # default is 75% / 25% train-test split

# KNN
knn = KNeighborsClassifier(n_neighbors=7).fit(X_train, y_train)
print('knn test score: {:.3f}'.format(knn.score(X_test, y_test)))
# A classifier returns one of the training labels, so the prediction can be
# used as a lookup key directly (no rounding needed).
print('knn predicts lemon as: {:}'.format(lookup_fruit_name[knn.predict([[208, 6.7, 11.2, 48]])[0]]))
# Disabled drafts (`svm_predicted` is not defined anywhere in this cell):
# recall = recall_score(y_test, y_train)
# precision = precision_score(y_test, svm_predicted)

def scale_0_4(value):
    """Clamp ``value`` to the closed range [1, 4].

    Despite the name, the lower bound is 1, not 0.

    Returns ``4.`` when ``value`` is above 4, ``1.`` when it is below 1, and
    ``value`` itself (same object, same type) otherwise. For a one-element
    numpy array this means the result is a plain float when clamping happens
    and the untouched array when it does not.
    """
    if value > 4:
        return 4.
    if value < 1:
        return 1.
    return value

# scale (disabled: every model below is fitted on the raw, unscaled features)
# scaler = MinMaxScaler()
# X_train = scaler.fit_transform(X_train)
# X_test = scaler.transform(X_test)

# plot
# from library import adspy_shared_utilities as asu
# asu.plot_fruit_knn(X_train, y_train, 5, 'uniform')   

# Hand-entered sample in the column order of X (mass, width, height, color_score);
# the messages printed below treat it as a lemon.
LEMON_SAMPLE = [[208, 6.7, 11.2, 48]]


def predicted_fruit_name(model, clamp=False):
    """Return the fruit name ``model`` predicts for ``LEMON_SAMPLE``.

    The scalar prediction is extracted first, so the result no longer depends
    on whether ``scale_0_4`` hands back a float (clamped) or the original
    array (not clamped). The old inline code raised TypeError or IndexError
    depending on which of those two happened.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``.
    clamp : bool, default False
        When True the raw prediction is passed through ``scale_0_4`` before
        rounding, so an out-of-range regression output is limited to [1, 4]
        before the lookup.

    Returns
    -------
    The ``lookup_fruit_name`` entry for the rounded prediction.
    """
    raw = model.predict(LEMON_SAMPLE)[0]
    if clamp:
        raw = scale_0_4(raw)
    return lookup_fruit_name[int(np.round(raw))]


# KNN (identical to the earlier fit as long as the scaling step above stays disabled)
knn = KNeighborsClassifier(n_neighbors=7).fit(X_train, y_train)
print('knn test score: {:.3f}'.format(knn.score(X_test, y_test)))
print('knn predicts lemon as: {:}'.format(predicted_fruit_name(knn)))

# KNN regression
knnreg = KNeighborsRegressor(n_neighbors=7).fit(X_train, y_train)
print('knnreg test score: {:.3f}'.format(knnreg.score(X_test, y_test)))
print('knnreg predicts lemon as: {:}'.format(predicted_fruit_name(knnreg)))

# Linear regression
linreg = LinearRegression().fit(X_train, y_train)
print('linreg test score: {:.3f}'.format(linreg.score(X_test, y_test)))
print('linreg predicts lemon as: {:}'.format(predicted_fruit_name(linreg, clamp=True)))

# Ridge regression
linridge = Ridge(alpha=20.0).fit(X_train, y_train)
print('linridge score (test): {:.3f}'.format(linridge.score(X_test, y_test)))
print('linridge predicts lemon as: {:}'.format(predicted_fruit_name(linridge, clamp=True)))
# Ridge regression with regularization parameter: alpha
for this_alpha in [0, 1, 10, 20, 50, 100, 1000]:
    linridge = Ridge(alpha=this_alpha).fit(X_train, y_train)
    print('linridge alpha=',this_alpha,'score (test): {:.3f}'.format(linridge.score(X_test, y_test)))

# Lasso regression
linlasso = Lasso(alpha=0.5, max_iter=10000).fit(X_train, y_train)
print('linlasso score (test): {:.3f}'.format(linlasso.score(X_test, y_test)))
print('linlasso predicts lemon as: {:}'.format(predicted_fruit_name(linlasso, clamp=True)))
# Lasso regression with regularization parameter: alpha        
for alpha in [0.5, 1, 2, 3, 5, 10, 20, 50]:
    linlasso = Lasso(alpha=alpha, max_iter=10000).fit(X_train, y_train)
    print('linlasso alpha=',alpha,'score (test): {:.3f}'.format(linlasso.score(X_test, y_test)))

    
    
# Polynomial regression
# X_poly = PolynomialFeatures(degree=2).fit_transform(X)
# X_train_p, X_test_p, y_train_p, y_test_p = train_test_split(X_poly, y, random_state=0)
# linreg = LinearRegression().fit(X_train_p, y_train_p)
# print('poly + linreg test score: {:.3f}'.format(linreg.score(X_test_p, y_test_p)))
# linreg.predict([[208, 6.7, 11.2, 48]])
# print('poly + linreg predicts lemon as: {:}'.format(lookup_fruit_name[np.round(scale_0_4(linreg.predict([[208, 6.7, 11.2, 48]])),0)] ))


def add_features(line, deg):
    """Expand a feature table with polynomial terms up to degree ``deg``.

    Any data later passed to a model trained on the expanded features (for
    example the test split) must go through the same expansion first.

    Parameters
    ----------
    line : feature data handed to ``PolynomialFeatures.fit_transform``
        unchanged; the caller in this cell passes the X_train / X_test tables.
    deg : int
        Polynomial degree given to ``PolynomialFeatures``.

    Returns
    -------
    Whatever ``PolynomialFeatures(degree=deg).fit_transform(line)`` returns.
    """
    poly = PolynomialFeatures(degree=deg)
    return poly.fit_transform(line)
# Fit a plain linear regression on polynomially expanded features for several
# degrees and report the test score and test-set predictions for each.
for degree in [1, 3, 6, 9]:
    # prepare data: train and test get the same polynomial expansion
    X_train_poly = add_features(X_train, degree)
    X_test_poly = add_features(X_test, degree)

    # learn (note: this rebinds the `linreg` fitted earlier in the cell)
    linreg = LinearRegression().fit(X_train_poly, y_train)

    # score
    print('poly degree ', degree, 'score (test): {:.3f}'.format(linreg.score(X_test_poly, y_test)))

    # use: one continuous prediction per test row (not rounded to a fruit label)
    y_test_predict = linreg.predict(X_test_poly)
    print('poly degree ', degree, 'predicts: ',y_test_predict)




knn test score: 0.933
knn predicts lemon as: lemon
knn test score: 0.933
knn predicts lemon as: lemon
knnreg test score: 0.513
knnreg predicts lemon as: lemon
linreg test score: 0.541
linreg predicts lemon as: lemon
linridge score (test): 0.445
linridge predicts lemon as: lemon
linridge alpha= 0 score (test): 0.541
linridge alpha= 1 score (test): 0.536
linridge alpha= 10 score (test): 0.488
linridge alpha= 20 score (test): 0.445
linridge alpha= 50 score (test): 0.362
linridge alpha= 100 score (test): 0.292
linridge alpha= 1000 score (test): 0.160
linlasso score (test): 0.230
linlasso predicts lemon as: orange
linlasso alpha= 0.5 score (test): 0.230
linlasso alpha= 1 score (test): 0.126
linlasso alpha= 2 score (test): 0.119
linlasso alpha= 3 score (test): 0.113
linlasso alpha= 5 score (test): 0.099
linlasso alpha= 10 score (test): 0.050
linlasso alpha= 20 score (test): -0.017
linlasso alpha= 50 score (test): -0.017
poly degree  1 score (test): 0.541
poly degree  1 predicts:  [ 2.0700340

In [None]:
# next in Module 2:

# Linear models for classification
#     Logistic regression
#     Logistic regression for binary classification on fruits dataset using height, width features (positive class: apple, negative class: others)
#     Logistic regression on simple synthetic dataset
#     Logistic regression regularization: C parameter
#     Application to real dataset

# Support Vector Machines
#     Linear Support Vector Machine
#     Linear Support Vector Machine: C parameter
#     Application to real dataset
    
# Multi-class classification with linear models
#     LinearSVC with M classes generates M one vs rest classifiers.
#     Multi-class results on the fruit dataset
    
# Kernelized Support Vector Machines
#     Classification
#     Support Vector Machine with RBF kernel: gamma parameter
#     Support Vector Machine with RBF kernel: using both C and gamma parameter
#     Application of SVMs to a real dataset: unnormalized data
#     Application of SVMs to a real dataset: normalized data with feature preprocessing using minmax scaling
        
# Cross-validation
#     Example based on k-NN classifier with fruit dataset (2 features)
#     A note on performing cross-validation for more advanced scenarios.
#     Validation curve example

# Decision Trees
#     Setting max decision tree depth to help avoid overfitting
#     Visualizing decision trees
#     Pre-pruned version (max_depth = 3)
#     Feature importance
#     Decision Trees on a real-world dataset