In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

# Display options
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

# Jupytper Config
%config Completer.use_jedi = False
%config IPCompleter.greedy=True
pd.set_option('display.max_columns', None)

import matplotlib.pyplot as plt
import seaborn as sns

# SKLearn
from sklearn.preprocessing import StandardScaler # good bet, but max min may be better.
from sklearn.preprocessing import RobustScaler # will ignore outliers, but there shouldn't be any.
from sklearn.neighbors import NearestNeighbors
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN # clustering algos
from sklearn import metrics # for results

# SQL
import sqlalchemy
from sqlalchemy import create_engine

In [2]:
engine = create_engine('postgres://zfgtcbuzkpjiod:b52325b1a9244f1d3cf8500d14ae165af1369d7eff7841f306746f6a4e0733f4@ec2-52-20-248-222.compute-1.amazonaws.com:5432/d37uou3e7j5m64')

In [67]:
# df = pd.read_sql('colour_data', engine)
df = pd.read_csv('./Notebooks/CSV/colour_data_new.csv')

df = df.dropna(axis=0) # Drop nulls
df = df.groupby('user').filter(lambda x: len(x) > 30)

In [68]:
# build user list
user_list = list(df['user'].unique())
user_score = []

for i in df['user'].unique():
    isolated_user = df[df['user'] == i] # Create stats mask for individual stats for each iteration
    user_score.append(isolated_user['correct'].groupby(df['pallet_used']).mean())

user_scores = pd.DataFrame(user_score, index=user_list) # make a user sorted dataframe.

# find correlations
correlations = user_scores.T # make a second transposed dataframe for heatmap
correlations.columns = user_list # assign columns
correlations = correlations.corr().sort_index(ascending=True) # for 50% chart.

# create pallet list df
pallet_list = list(df.pallet_used.unique())
pallet_avg = []

for i in df.pallet_used.unique():
    pallet_stats = df[df['pallet_used'] == i] # individual stats for each iteration
    pallet_avg.append(pallet_stats['correct'].groupby(df['user']).mean())

pallet_avg = pd.DataFrame(pallet_avg, index=pallet_list) # make a user sorted dataframe.

# Add identfiers to columns
user_scores = user_scores.add_prefix('usr_scores_').add_suffix('_mean')
correlations = correlations.add_suffix('_corr') # reset columns names. No longer diagnoally aligned.

# Feature Engineering
# Group 1 - Sums of user responses per pallet category.
cb1_responses = []
cb2_responses = []
ncb_responses = []

# Group 2 - Means of user responses per pallet category.
cb1_percent_correct = []
cb2_percent_correct = []
ncb_percent_correct = []

# Group 3 - Score & Count (of binary responses)
percent_correct = []
responses = []

# Group 4 - list of correct responses by pallet.
user_sums = []

for i in df.user.unique():
    isolated_user = df[df['user'] == i] # make user mask for stats.
    
    # Group 1 - Sum of user responses per pallet category.
    cb1_responses.append(isolated_user['cb_type1'].sum()) # cb1 type total correct
    cb2_responses.append(isolated_user['cb_type2'].sum()) # cb2 type total correct
    ncb_responses.append(isolated_user['ncb'].sum()) # NCB type total correct
    
    # Group 2 - Mean of user responses per pallet category.
    cb1_percent_correct.append(isolated_user['cb_type1'].mean())
    cb2_percent_correct.append(isolated_user['cb_type2'].mean())
    ncb_percent_correct.append(isolated_user['ncb'].mean())
    
    # Group 3 - Score & Count (of binary responses)
    percent_correct.append(isolated_user['correct'].mean()) # % (float) of correct responses
    responses.append(isolated_user['correct'].value_counts().sum()) # number of correct responses
    
    # Group 4 - list of correct responses by pallet.
    user_sums.append(isolated_user['correct'].groupby(df['pallet_used']).sum())

# Group 1 - Sums
user_scores['cb1_responses'] = cb1_responses
user_scores['cb2_responses'] = cb2_responses
user_scores['ncb_responses'] = ncb_responses
    
# Group 2 - Means
user_scores['cb1_percent_correct'] = cb1_percent_correct
user_scores['cb2_percent_correct'] = cb2_percent_correct
user_scores['ncb_percent_correct'] = ncb_percent_correct

# Group 3 - Score & Count
user_scores['percent_correct'] = percent_correct
user_scores['responses'] = responses

# Group 4 - Totals per pallet
user_sums = pd.DataFrame(user_sums, index=user_list)
user_sums = user_sums.add_suffix('_sum') # identifyable as sum value. (Needs to be scaled!)

X = pd.concat([user_sums,user_scores,correlations], axis=1) # add all new dataframes together as X.b
X['user']= user_list

X = X.reindex(sorted(X.columns, reverse=True), axis=1) # Sort columns so User is index

# Write to CSV/SQL
# X.to_sql('colour_users', engine, if_exists = 'replace')

# Step 2
X.index = X['user']
Xusers = X[['user']].copy() # re-inserted later.
X.drop('user', axis=1, inplace=True)
Xss = StandardScaler().fit_transform(X) # Kmeans 0.325 DBSCAN 0.370 AG 0.392
Xrs = RobustScaler().fit_transform(X) # Kmeans 0.387 DBSCAN 0.416 AG 0.443

Xss = StandardScaler().fit_transform(X) # Kmeans 0.325 DBSCAN 0.370 AG 0.392
Xrs = RobustScaler().fit_transform(X) # Kmeans 0.387 DBSCAN 0.416 AG 0.443

baseline = X['percent_correct'].mean() # Define Baseline

all_clusters = pd.DataFrame(None,index = X.index) # Dataframe for recording cluster labels

cluster_loops = [3,4,5] # approx 3-5 types are expected. This will be revised.
Kmeans_results = pd.DataFrame()

for value in cluster_loops:
    km = KMeans(n_clusters=value).fit(Xrs)
    name = 'Kmeans_' + str(value)
    all_clusters[name] = km.labels_

neigh = NearestNeighbors(n_neighbors=3)
nbrs = neigh.fit(Xrs)
distances, indices = nbrs.kneighbors(Xrs)
distances = np.sort(distances, axis=0) # plotting shows approximate 85% shift.

eps_val =  np.quantile(distances, .85,).mean() # 85th quantile for distances. (Assume 85% regular vision based on previous data)
min_samples_val = 1 # 90% of the total pool required to make 1 cluster.

dbscan = DBSCAN(eps=eps_val, min_samples=min_samples_val, metric = 'euclidean').fit(Xrs) # eps with highest Silhouette score
all_clusters['DBSCAN'] = dbscan.labels_ # Assign Labels to 
    
# Add 1 to all values so 0 values are included.
for col in all_clusters.columns:
    if col in all_clusters.columns:
        all_clusters[col] = all_clusters[col] + 1

# cluster interated feature lables.
scaled_results = StandardScaler().fit_transform(all_clusters) #all_clusters.drop('DBSCAN',axis=1)
cluster_feature = DBSCAN(eps=2, min_samples=1, metric = 'euclidean').fit(scaled_results)
all_clusters['cluster_feature'] = cluster_feature.labels_ # Assign Labels to all_clusters

# Number of clusters in labels, ignoring noise if present.
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
n_noise_ = list(labels).count(-1)

# Unsupervised Feature
X = pd.concat([X,all_clusters['cluster_feature']],axis=1)

# Take the new cluster labels and apply them to the users of our colour data
# We will use these as classification labels.
classification_feature= []
for i in df['user']:
    for j in X.index:
        if i == j:
            classification_feature.append(X['cluster_feature'].loc[i])

df['cluster_classification'] = classification_feature

ValueError: n_samples=1 should be >= n_clusters=3.

In [69]:
# df.to_sql('colour_classified', engine, if_exists='replace')

In [None]:


# Classification

CALIBRATION_GROUP = pd.DataFrame() # add all selections from loop to df.

for pallet in df['pallet_used'].unique():

    df_pallet = df[df['pallet_used'] == pallet] # iterate for the pallet.
    
    # cluster_classification'] == 0 : ncb
    # cluster_classification'] == 1 : cb_type1
    # cluster_classification'] == 2 : outliers
    # cluster_classification'] == 3 : cb_type2

    # Set for type1 colourblind correct
    df1 = df_pallet[(df_pallet['cb_type1'] == 1) & (df_pallet['cluster_classification'] == 0) & (df_pallet['correct'] == 0)] 
    df2 = df_pallet[(df_pallet['cb_type1'] == 1) & (df_pallet['cluster_classification'] == 1) & (df_pallet['correct'] == 1)]
    df3 = df_pallet[(df_pallet['cb_type1'] == 1) & (df_pallet['cluster_classification'] == 2) & (df_pallet['correct'] == 0)]
    df4 = df_pallet[(df_pallet['cb_type1'] == 1) & (df_pallet['cluster_classification'] == 3) & (df_pallet['correct'] == 1)]

    # Set for type2 colourblind correct
    df5 = df_pallet[(df_pallet['cb_type2'] == 1) & (df_pallet['cluster_classification'] == 0) & (df_pallet['correct'] == 0)] 
    df6 = df_pallet[(df_pallet['cb_type2'] == 1) & (df_pallet['cluster_classification'] == 1) & (df_pallet['correct'] == 1)]
    df7 = df_pallet[(df_pallet['cb_type2'] == 1) & (df_pallet['cluster_classification'] == 2) & (df_pallet['correct'] == 0)]
    df8 = df_pallet[(df_pallet['cb_type2'] == 1) & (df_pallet['cluster_classification'] == 3) & (df_pallet['correct'] == 1)]

    # Set for ncb correct
    df9 = df_pallet[(df_pallet['ncb'] == 1) & (df_pallet['cluster_classification'] == 0) & (df_pallet['correct'] == 1)] 
    df10 = df_pallet[(df_pallet['ncb'] == 1) & (df_pallet['cluster_classification'] == 1) & (df_pallet['correct'] == 0)]
    df11 = df_pallet[(df_pallet['ncb'] == 1) & (df_pallet['cluster_classification'] == 2) & (df_pallet['correct'] == 0)]
    df12 = df_pallet[(df_pallet['ncb'] == 1) & (df_pallet['cluster_classification'] == 3) & (df_pallet['correct'] == 0)] 

    calibration = pd.concat([df1,df2,df3,df4,df5,df6,df7,df8,df9,df10,df11,df12],axis=0) #     This pulls together the criteria for improvement.
    calibration = calibration[['pallet_used','ishihara_list']]

    CALIBRATION_GROUP = pd.concat([CALIBRATION_GROUP,calibration],axis=0)

CALIBRATION_GROUP = CALIBRATION_GROUP.reset_index()

NEW_COLOURS_LIST = pd.DataFrame()
PALLET_NAMES = calibration['pallet_used'].unique()
CALIBRATION_REPORT = {}

In [None]:
def create_calibration_pallet(CALIBRATION_GROUP):
    for pallet in CALIBRATION_GROUP['pallet_used'].unique(): # this cycles for each pallet used in the data individually
        df_pallet = CALIBRATION_GROUP[CALIBRATION_GROUP['pallet_used'] == pallet] # iterate for the pallet.

        if len(df_pallet['pallet_used']) > 30: # check there are enough samples
            print(f"{pallet} has {len(df_pallet['pallet_used'])} suitable responses for calibration.")
        else:
            print(f"WARNING: for {pallet} we have less than 30 samples. Skipping")
            continue

        # Stage 1: Dictionary for iteration, made up of all colours.
        colours = {
            'colour_1' : {
                'red' : [colour[0][0] for colour in df_pallet['ishihara_list']],
                'green': [colour[0][1] for colour in df_pallet['ishihara_list']],
                'blue' : [colour[0][2] for colour in df_pallet['ishihara_list']],
            },
            'colour_2' : {
                'red' : [colour[1][0] for colour in df_pallet['ishihara_list']],
                'green': [colour[1][1] for colour in df_pallet['ishihara_list']],
                'blue' : [colour[1][2] for colour in df_pallet['ishihara_list']],
            },
            'colour_3' : {
                'red' : [colour[2][0] for colour in df_pallet['ishihara_list']],
                'green': [colour[2][1] for colour in df_pallet['ishihara_list']],
                'blue' : [colour[2][2] for colour in df_pallet['ishihara_list']],
            },
            'colour_4' :{
                'red' : [colour[3][0] for colour in df_pallet['ishihara_list']],
                'green': [colour[3][1] for colour in df_pallet['ishihara_list']],
                'blue' : [colour[3][2] for colour in df_pallet['ishihara_list']],
            },
            'colour_5' :{
                'red' : [colour[4][0] for colour in df_pallet['ishihara_list']],
                'green': [colour[4][1] for colour in df_pallet['ishihara_list']],
                'blue' : [colour[4][2] for colour in df_pallet['ishihara_list']],
            },
            'colour_6' :{
                'red' : [colour[5][0] for colour in df_pallet['ishihara_list']],
                'green': [colour[5][1] for colour in df_pallet['ishihara_list']],
                'blue' : [colour[5][2] for colour in df_pallet['ishihara_list']],
            },
            'colour_7' :{
                'red' : [colour[6][0] for colour in df_pallet['ishihara_list']],
                'green': [colour[6][1] for colour in df_pallet['ishihara_list']],
                'blue' : [colour[6][2] for colour in df_pallet['ishihara_list']],
            },
            'colour_8' :{
                'red' : [colour[7][0] for colour in df_pallet['ishihara_list']],
                'green': [colour[7][1] for colour in df_pallet['ishihara_list']],
                'blue' : [colour[7][2] for colour in df_pallet['ishihara_list']],
            },
            'colour_9' :{
                'red' : [colour[8][0] for colour in df_pallet['ishihara_list']],
                'green': [colour[8][1] for colour in df_pallet['ishihara_list']],
                'blue' : [colour[8][2] for colour in df_pallet['ishihara_list']],
            },
            'colour_10' :{
                'red' : [colour[9][0] for colour in df_pallet['ishihara_list']],
                'green': [colour[9][1] for colour in df_pallet['ishihara_list']],
                'blue' : [colour[9][2] for colour in df_pallet['ishihara_list']],
            },
            'colour_11' :{
                'red' : [colour[10][0] for colour in df_pallet['ishihara_list']],
                'green': [colour[10][1] for colour in df_pallet['ishihara_list']],
                'blue' : [colour[10][2] for colour in df_pallet['ishihara_list']],
            },
            'colour_12' :{
                'red' : [colour[11][0] for colour in df_pallet['ishihara_list']],
                'green': [colour[11][1] for colour in df_pallet['ishihara_list']],
                'blue' : [colour[11][2] for colour in df_pallet['ishihara_list']],
            }
        }

        # Stage 2: Convert Dictionary to pandas DataFrame
        coloursdf = pd.DataFrame(colours)

        # Create List of Pairs For Mashing.
        calibrated_pallet = []

        pallet_report = {}

        # Stage 3: Pairing & Clustering
        for colour,rgb in coloursdf.items():
            red_green = pd.DataFrame({'red': rgb[0],'green': rgb[1]})
            green_blue = pd.DataFrame({'green': rgb[1],'blue': rgb[2]})
            blue_red = pd.DataFrame({'blue': rgb[2],'red': rgb[0]})

            calibrator = [red_green,green_blue,blue_red] # List of vectors between R,G,B  X,Y,Z points.

            # Flexible DBSCAN Variables.
            eps_val = 15 # EPS Max dist is 20, for spread 10. 10 = 50% of range
            min_samples_val = int(len(calibrator[0].iloc[:, 0]) * .75) # 90% of the total pool required to make 1 cluster.

            # Stage 4: DBSCAN
            cluster_centers = [] # list of pair centers.
            for pair in calibrator:
                status = {}
                # prune outliers
                status['primary_count'] = len(pair.iloc[:, 0]) # Get data len # consider using .shape[0]
                pair = pair[(np.abs(stats.zscore(pair.iloc[:, 0])) < 3)] # uses the zscore function of df less than a SD of 3
                pair = pair[(np.abs(stats.zscore(pair.iloc[:, 1])) < 3)] # uses the zscore function of df less than a SD of 3
                status['secondary_count'] = len(pair.iloc[:, 0]) # Get data len
                status['primary_removed'] = status['primary_count'] - status['secondary_count'] # Get dif  in data len

                dbscan = DBSCAN(eps=eps_val, min_samples=min_samples_val, metric = 'euclidean').fit_predict(pair) # create a big cluster
                pair['dbscan'] = dbscan # assign it as an array
                dbscan = pair[pair['dbscan'] == 0] # Take cluster 0 
                dbscan = dbscan.drop('dbscan',axis=1)
                status['third_count'] = len(dbscan.iloc[:, 0]) # Get data len
                status['secondary_removed'] = status['secondary_count'] - status['third_count'] # Get dif  in data len

                # KMeans, finding new cluster centers.
                kmeans = KMeans(n_clusters=1).fit(dbscan) # fit KMEANS
                cluster_centers.append(kmeans.cluster_centers_) # Take the center value of the cluster.

                pallet_report[str(pair)] = status
            CALIBRATION_REPORT[pallet] = pallet_report

            red_corrected = int((cluster_centers[0][0][0] + cluster_centers[2][0][1]) / 2) # red
            green_corrected = int((cluster_centers[0][0][1] + cluster_centers[1][0][0]) / 2) # green
            blue_corrected = int((cluster_centers[1][0][1] + cluster_centers[2][0][0]) / 2) # blue

            calibrated_rgb=[red_corrected,green_corrected,blue_corrected]
            calibrated_rgb = '#%02x%02x%02x' % (red_corrected, green_corrected, blue_corrected) # Turn Tuple to Hex
            calibrated_pallet.append(calibrated_rgb) # add the list of colours to a pallet list

        NEW_COLOURS_LIST[pallet] = calibrated_pallet
    return f'All pallets have been calibrated.'

In [None]:
create_calibration_pallet(CALIBRATION_GROUP)

In [None]:
NEW_COLOURS_LIST.T

In [None]:
# Write back to pallet
pallet_dict = pd.read_csv('./CSV/pallets_dictionary_new.csv')