# Imports

In [225]:
import pandas as pd
import numpy as np
import scipy as sp

from sklearn.preprocessing import RobustScaler, StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score

from sklearn.linear_model import LogisticRegression, LinearRegression

from sklearn import metrics
from sklearn.metrics import accuracy_score, classification_report

import matplotlib.pyplot as plt
import seaborn as sns

np.random.seed(42) # this is how to set a random seed.

#### Display options

In [226]:
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

# Jupytper Config
%config Completer.use_jedi = False
%config IPCompleter.greedy=True
pd.set_option('display.max_columns', None)


## Import Exernal Files

In [227]:
# Data file produced by tests.
uf = pd.read_csv('./CSV/colour_data_clustered.csv')

In [228]:
print(f'Unique Cluster Classifications: {uf.cluster_classification.nunique()}')
uf.head(3)

Unique Cluster Classifications: 4


Unnamed: 0,user,correct,recorded_result,mask_image,cb_type1,cb_type2,ncb,datetime,random_spread,pallet_used,pallet_values,ishihara_list,COLORS_ON,COLORS_OFF,cluster_classification
0,unknown_1,1,5,5,0,0,1,16/6/20 11:22,10,bear,"[#e26f39,#f6c458,#e09c3e,#e69e40,#ed7b3d,#f5be...","[(220,105,53),(239,192,86),(227,162,61),(222,1...","[(220,105,53),(239,192,86),(227,162,61),(222,1...","[(71,138,38),(92,73,38),(153,165,49),(221,221,...",0
1,unknown_1,1,C,C,0,0,1,16/6/20 11:20,10,bear,"[#e26f39,#f6c458,#e09c3e,#e69e40,#ed7b3d,#f5be...","[(232,116,65),(247,198,95),(221,152,56),(220,1...","[(232,116,65),(247,198,95),(221,152,56),(220,1...","[(65,149,38),(99,72,41),(168,165,50),(223,211,...",0
2,unknown_1,1,D,D,0,0,1,16/6/20 11:30,10,bear,"[#e26f39,#f6c458,#e09c3e,#e69e40,#ed7b3d,#f5be...","[(226,114,54),(253,186,87),(233,148,70),(236,1...","[(226,114,54),(253,186,87),(233,148,70),(236,1...","[(75,142,40),(96,64,34),(162,152,41),(224,219,...",0


### Categorical Feature Encoding

In [161]:
uf.columns

Index(['user', 'correct', 'recorded_result', 'mask_image', 'cb_type1',
       'cb_type2', 'ncb', 'datetime', 'random_spread', 'pallet_used',
       'pallet_values', 'ishihara_list', 'COLORS_ON', 'COLORS_OFF',
       'cluster_classification'],
      dtype='object')

# X and y


In [229]:
# pallet used is the most important feature
pallet_dummies = pd.get_dummies(uf['pallet_used'],prefix='pallet_')
X = uf.drop(['user','recorded_result','mask_image','datetime','random_spread','pallet_values','pallet_used','ishihara_list','COLORS_ON','COLORS_OFF','cluster_classification'],axis=1)
X = pd.concat([X,pallet_dummies],axis=1)
X

Unnamed: 0,correct,cb_type1,cb_type2,ncb,pallet__bear,pallet__bluey2,pallet__camo2,pallet__greys,pallet__inverse_ring,pallet__orn_grn,pallet__pinks,pallet__reversed_ring,pallet__ring,pallet__standard_1,pallet__standard_2,pallet__standard_3
0,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0
1,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0
2,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0
3,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5816,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,1
5817,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,1
5818,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,1
5819,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,1


In [238]:
y = uf['cluster_classification']
# We have a categorical target, so we need to labelEncode
le = LabelEncoder()
le.fit_transform(y)

print(f'Transformed {le.classes_.shape}/{uf.cluster_classification.nunique()} labels')
print(f'labels are {le.classes_}')
y

Transformed (4,)/4 labels
labels are [0 1 2 3]


0       0
1       0
2       0
3       0
4       0
       ..
5816    0
5817    0
5818    0
5819    0
5820    0
Name: cluster_classification, Length: 5821, dtype: int64

In [239]:
print(f'We have {len(y)}y and {len(X)}x values')

We have 5821y and 5821x values


## Baseline

In [240]:
# Predicting that someone is not colourblind.
baseline = uf.cluster_classification.value_counts(normalize = True)
print("Correct Answer Baseline:")
print(baseline)

baseline = np.mean(uf.cluster_classification == 0)

Correct Answer Baseline:
0    0.812232
3    0.078681
1    0.066140
2    0.042948
Name: cluster_classification, dtype: float64


## Scaling

In [241]:
from sklearn.preprocessing import MinMaxScaler
X = StandardScaler().fit_transform(X)
# X = MinMaxScaler().fit_transform(X)





# Classisification





### Logistic Regression

In [304]:
# Evaluate Models
def model_evaluate(model):
    X_train, X_test, y_train, y_test = train_test_split(X,y, random_state=42, test_size=0.15)
    m = model
    score = cross_val_score(m, X, y, cv = 10) #cross validation of 5.

    # Modeling on Testing
    m.fit(X_train, y_train)
    #better to do this at the start, rather than doing the whole thing.

    y_pred = m.predict(X_test)
    score_test = accuracy_score(y_test, y_pred)
    #Results
    print(f"{(y_test != y_pred).sum()}/{X_test.shape[0]} points mislabeled")
    print('Score Test: %.3f' %score_test)
    print('baseline: %.3f' %baseline)
    print("Accuracy: %0.2f (+/- %0.2f)" % (score.mean(), score.std() * 2))
    print('-'*55)
    print(classification_report(y_test, y_pred,zero_division=1))
    return m



### LogisticRegression

In [305]:
from sklearn.linear_model import LogisticRegression
m = model_evaluate(LogisticRegression(max_iter=300))
lg = m

188/874 points mislabeled
Score Test: 0.785
baseline: 0.812
Accuracy: 0.77 (+/- 0.25)
-------------------------------------------------------
              precision    recall  f1-score   support

           0       0.78      1.00      0.88       686
           1       1.00      0.00      0.00        76
           2       1.00      0.00      0.00        38
           3       1.00      0.00      0.00        74

    accuracy                           0.78       874
   macro avg       0.95      0.25      0.22       874
weighted avg       0.83      0.78      0.69       874



### Naive Bayes

In [262]:
from sklearn.naive_bayes import BernoulliNB
m = model_evaluate(BernoulliNB())
nb = m

188/874 points mislabeled
Score Test: 0.785
baseline: 0.812
Accuracy: 0.62 (+/- 0.57)
-------------------------------------------------------
              precision    recall  f1-score   support

           0       0.78      1.00      0.88       686
           1       1.00      0.00      0.00        76
           2       1.00      0.00      0.00        38
           3       1.00      0.00      0.00        74

    accuracy                           0.78       874
   macro avg       0.95      0.25      0.22       874
weighted avg       0.83      0.78      0.69       874



### Support Vector Classifier

In [263]:
from sklearn.svm import SVC
m = model_evaluate(SVC(C=1,kernel='rbf',degree=4))
svc = m

188/874 points mislabeled
Score Test: 0.785
baseline: 0.812
Accuracy: 0.76 (+/- 0.25)
-------------------------------------------------------
              precision    recall  f1-score   support

           0       0.78      1.00      0.88       686
           1       1.00      0.00      0.00        76
           2       1.00      0.00      0.00        38
           3       1.00      0.00      0.00        74

    accuracy                           0.78       874
   macro avg       0.95      0.25      0.22       874
weighted avg       0.83      0.78      0.69       874



### KNN Classifier

In [265]:
from sklearn.neighbors import KNeighborsClassifier
m = model_evaluate(KNeighborsClassifier(n_neighbors=5))
knn = m

192/874 points mislabeled
Score Test: 0.780
baseline: 0.812
Accuracy: 0.66 (+/- 0.37)
-------------------------------------------------------
              precision    recall  f1-score   support

           0       0.79      0.99      0.88       686
           1       1.00      0.00      0.00        76
           2       0.20      0.05      0.08        38
           3       1.00      0.00      0.00        74

    accuracy                           0.78       874
   macro avg       0.75      0.26      0.24       874
weighted avg       0.80      0.78      0.69       874



### Adaboost Classifier

In [266]:
from sklearn.ensemble import AdaBoostClassifier
m = model_evaluate(AdaBoostClassifier(n_estimators=100))
abc = m

188/874 points mislabeled
Score Test: 0.785
baseline: 0.812
Accuracy: 0.75 (+/- 0.40)
-------------------------------------------------------
              precision    recall  f1-score   support

           0       0.78      1.00      0.88       686
           1       1.00      0.00      0.00        76
           2       1.00      0.00      0.00        38
           3       1.00      0.00      0.00        74

    accuracy                           0.78       874
   macro avg       0.95      0.25      0.22       874
weighted avg       0.83      0.78      0.69       874



### MPL Classifier

In [267]:
from sklearn.neural_network import MLPClassifier
m = model_evaluate(MLPClassifier(solver='lbfgs', hidden_layer_sizes=(20, ), max_iter=1000, random_state=42))
mpl = m

188/874 points mislabeled
Score Test: 0.785
baseline: 0.812
Accuracy: 0.74 (+/- 0.39)
-------------------------------------------------------
              precision    recall  f1-score   support

           0       0.78      1.00      0.88       686
           1       1.00      0.00      0.00        76
           2       1.00      0.00      0.00        38
           3       1.00      0.00      0.00        74

    accuracy                           0.78       874
   macro avg       0.95      0.25      0.22       874
weighted avg       0.83      0.78      0.69       874



### MLP Regressor

In [249]:
# from sklearn.neural_network import MLPRegressor #multi-layer-perceptron-Regressor
# model_evaluate(MLPRegressor(activation='logistic',
#                     hidden_layer_sizes=(100, ),
#                     max_iter=1000,
#                     learning_rate='constant',
#                     learning_rate_init=0.001,
#                     random_state=42))

In [250]:
# from sklearn.ensemble import RandomForestRegressor
# model_evaluate(RandomForestRegressor(n_estimators=20, random_state=0))

In [251]:
# binary values only
# from sklearn.metrics import plot_roc_curve
# plot_roc_curve(svc,X_test,y_test)

## Rebuild X & Apply Labels

In [300]:
X = uf.drop(['user','recorded_result','mask_image','datetime','random_spread','pallet_values','pallet_used','ishihara_list','COLORS_ON','COLORS_OFF','cluster_classification'],axis=1)
X = pd.concat([X,pallet_dummies],axis=1)

In [306]:
# Evaluate Models
def predict_labels(model):
    m = model
    m.fit(X,y)
    y_pred = m.predict(X)
    return y_pred

In [307]:
predict_labels(LogisticRegression(max_iter=300))
uf['LogisticRegressor'] = y_pred

predict_labels(AdaBoostClassifier(n_estimators=100))
uf['AdaBoostClassifier'] = y_pred

# y_label = nb.predict(X)
# uf['BernoulliNB'] = y_pred

# y_label = svc.predict(X)
# uf['SVC'] = y_pred

# y_label = mpl.predict(X)
# uf['MLPClassifier'] = y_pred

# y_label = knn.predict(X)
# uf['KNN'] = y_pred

print('AdaBoostClassifier returned the following classifications:')
print(uf['AdaBoostClassifier'].value_counts())
print('\n')
# print('Naive Bayes Classifier returned the following classifications:')
# print(uf['BernoulliNB'].value_counts())
# print('\n')
# print('SVC Classifier returned the following classifications:')
# print(uf['SVC'].value_counts())
# print('\n')
# print('MLP Classifier returned the following classifications:')
# print(uf['MLPClassifier'].value_counts())

ValueError: Length of values does not match length of index

In [303]:
uf

Unnamed: 0,user,correct,recorded_result,mask_image,cb_type1,cb_type2,ncb,datetime,random_spread,pallet_used,pallet_values,ishihara_list,COLORS_ON,COLORS_OFF,cluster_classification,AdaBoostClassifier
0,unknown_1,1,5,5,0,0,1,16/6/20 11:22,10,bear,"[#e26f39,#f6c458,#e09c3e,#e69e40,#ed7b3d,#f5be...","[(220,105,53),(239,192,86),(227,162,61),(222,1...","[(220,105,53),(239,192,86),(227,162,61),(222,1...","[(71,138,38),(92,73,38),(153,165,49),(221,221,...",0,0
1,unknown_1,1,C,C,0,0,1,16/6/20 11:20,10,bear,"[#e26f39,#f6c458,#e09c3e,#e69e40,#ed7b3d,#f5be...","[(232,116,65),(247,198,95),(221,152,56),(220,1...","[(232,116,65),(247,198,95),(221,152,56),(220,1...","[(65,149,38),(99,72,41),(168,165,50),(223,211,...",0,0
2,unknown_1,1,D,D,0,0,1,16/6/20 11:30,10,bear,"[#e26f39,#f6c458,#e09c3e,#e69e40,#ed7b3d,#f5be...","[(226,114,54),(253,186,87),(233,148,70),(236,1...","[(226,114,54),(253,186,87),(233,148,70),(236,1...","[(75,142,40),(96,64,34),(162,152,41),(224,219,...",0,0
3,unknown_1,1,N,N,0,0,1,16/6/20 11:21,10,bear,"[#e26f39,#f6c458,#e09c3e,#e69e40,#ed7b3d,#f5be...","[(231,116,53),(253,203,81),(215,149,67),(224,1...","[(231,116,53),(253,203,81),(215,149,67),(224,1...","[(70,149,26),(96,73,48),(163,158,57),(219,214,...",0,0
4,unknown_1,0,N,E,0,0,1,16/6/20 11:22,10,bear,"[#e26f39,#f6c458,#e09c3e,#e69e40,#ed7b3d,#f5be...","[(223,102,47),(244,188,85),(217,147,55),(222,1...","[(223,102,47),(244,188,85),(217,147,55),(222,1...","[(63,147,29),(103,82,46),(155,153,53),(214,215...",0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5816,unknown_30,1,C,C,0,1,1,22:28.0,10,standard_3,"[#89B270, #7AA45E, #B6C674, #7AA45E, #B6C674, ...","[(134, 182, 116), (113, 155, 97), (189, 203, 1...","[(134, 182, 116), (113, 155, 97), (189, 203, 1...","[(240, 154, 32), (200, 119, 101), (234, 139, 9...",0,0
5817,unknown_30,1,D,D,0,1,1,23:02.2,10,standard_3,"[#89B270, #7AA45E, #B6C674, #7AA45E, #B6C674, ...","[(135, 185, 107), (127, 171, 89), (190, 203, 1...","[(135, 185, 107), (127, 171, 89), (190, 203, 1...","[(237, 152, 45), (203, 112, 99), (235, 142, 11...",0,0
5818,unknown_30,1,2,2,0,1,1,24:26.4,10,standard_3,"[#89B270, #7AA45E, #B6C674, #7AA45E, #B6C674, ...","[(135, 178, 121), (114, 159, 90), (186, 198, 1...","[(135, 178, 121), (114, 159, 90), (186, 198, 1...","[(244, 152, 36), (198, 126, 94), (229, 132, 10...",0,0
5819,unknown_30,1,5,5,0,1,1,25:41.5,10,standard_3,"[#89B270, #7AA45E, #B6C674, #7AA45E, #B6C674, ...","[(131, 179, 105), (114, 163, 100), (173, 207, ...","[(131, 179, 105), (114, 163, 100), (173, 207, ...","[(235, 157, 39), (195, 119, 86), (230, 146, 97...",0,0


In [74]:
for user in uf['user'].unique():
    evaluation = uf[uf['user'] == user]
    print(f'for user: {str(user)}')
    print(f"cluster_classification: {evaluation['cluster_classification'].unique()}")
    print(f"AdaBoostClassifier: {evaluation['AdaBoostClassifier'].unique()}")
    print(f"BernoulliNB: {evaluation['BernoulliNB'].unique()}")
    print(f"SVC: {evaluation['SVC'].unique()}")
    print(f"MLPClassifier: {evaluation['MLPClassifier'].unique()}")
    print('---'*10)


for user: 2NuSkbN87vtA3NUnHnk7MA
cluster_classification: [0]
AdaBoostClassifier: [0 2]
BernoulliNB: [3 4 2 1 0]
SVC: [0 2]
MLPClassifier: [0]
------------------------------
for user: br1
cluster_classification: [0]
AdaBoostClassifier: [0 2]
BernoulliNB: [3 4 0 1]
SVC: [0 2]
MLPClassifier: [0]
------------------------------
for user: cb1
cluster_classification: [1]
AdaBoostClassifier: [0 2]
BernoulliNB: [3 4 2 1 0]
SVC: [0 2]
MLPClassifier: [0]
------------------------------
for user: cni3eMiU88dNUpBCqRw7lg
cluster_classification: [2]
AdaBoostClassifier: [0 2]
BernoulliNB: [4 3 2 1 0]
SVC: [0 2]
MLPClassifier: [0]
------------------------------
for user: GbTwShXl567OHWUmeK_yMQ
cluster_classification: [0]
AdaBoostClassifier: [0 2]
BernoulliNB: [4 3 2 1 0]
SVC: [0 2]
MLPClassifier: [0]
------------------------------
for user: IHdylcItlOaIGDpN_L9RoQ
cluster_classification: [2]
AdaBoostClassifier: [0 2]
BernoulliNB: [4 3 2 1 0]
SVC: [0 2]
MLPClassifier: [0]
------------------------------
fo

## Save To File

In [245]:
uf.to_csv('./CSV/colour_data_classified.csv', header=True,index=False)

In [None]:
# fin.