In [1]:
import os
import re
import pickle
import numpy as np
import pandas as pd
import humanize
from itertools import combinations
from operator import itemgetter
from sklearn.cluster import KMeans
from scipy.spatial.distance import euclidean

In [2]:
RESULTS_PATH = 'data/results'
FEATURES_PATH = 'data/features'

In [3]:
results_dfs = {}
for filename in os.listdir(RESULTS_PATH):
    archtecture = re.search(r'(?<=results_).*(?=.csv)', filename).group()
    results_dfs[archtecture] = pd.read_csv(os.path.join(RESULTS_PATH, filename))

In [4]:
metabase_df = pd.concat(results_dfs.values(), ignore_index=True)

In [5]:
metabase_df.head()

Unnamed: 0,Architecture,Classes,Accuracy_vector,Precision_vector,Recall_vector,F1_vector,Accuracy_avg,Precision_Avg,Recall_Avg,F1_avg
0,ResNet10,airplane-automobile,"[0.942, 0.939, 0.93, 0.9275, 0.916, 0.9315, 0....","[0.9203510204081633, 0.9051657032755298, 0.893...","[0.932, 0.958, 0.948, 0.939, 0.954, 0.938, 0.8...","[0.9414141414141414, 0.9401373895976447, 0.931...",0.92505,0.892683,0.9274,0.925149
1,ResNet10,airplane-bird,"[0.89, 0.889, 0.8935, 0.882, 0.875, 0.8895, 0....","[0.8614113785557986, 0.8492249488752557, 0.852...","[0.847, 0.878, 0.89, 0.886, 0.888, 0.89, 0.887...","[0.8850574712643677, 0.8877654196157735, 0.893...",0.8848,0.843588,0.8753,0.883662
2,ResNet10,airplane-cat,"[0.9245, 0.9325, 0.935, 0.926, 0.932, 0.925, 0...","[0.8897871921182265, 0.8980344995140912, 0.905...","[0.932, 0.947, 0.937, 0.938, 0.961, 0.909, 0.8...","[0.9250620347394541, 0.933464760965993, 0.9351...",0.9258,0.893557,0.9279,0.925876
3,ResNet10,airplane-deer,"[0.9475, 0.9125, 0.943, 0.93, 0.917, 0.936, 0....","[0.9234072781655036, 0.8553790534618757, 0.921...","[0.949, 0.983, 0.933, 0.98, 0.935, 0.928, 0.95...","[0.947578632051922, 0.9182624941616068, 0.9424...",0.9332,0.900529,0.946,0.934192
4,ResNet10,airplane-dog,"[0.948, 0.9515, 0.936, 0.947, 0.9345, 0.9475, ...","[0.924704, 0.9361238390092881, 0.9224043010752...","[0.948, 0.936, 0.901, 0.95, 0.914, 0.93, 0.955...","[0.948, 0.9507364144235654, 0.9336787564766839...",0.9447,0.923633,0.9363,0.944177


In [6]:
features = {}
for filename in os.listdir(FEATURES_PATH):
    class_ = re.search(r'\w+(?=.data)', filename).group()
    with open(os.path.join(FEATURES_PATH, filename), 'rb') as f:
        features[class_] = pickle.load(f)

In [7]:
centroids = {}
for classes in metabase_df['Classes'].unique():
    class_1, class_2 = classes.split('-')
    X = np.concatenate((features[class_1], features[class_2]), axis=0)
    kmeans = KMeans(n_clusters=2, random_state=0)
    kmeans.fit(X)
    centroids[classes] = kmeans.cluster_centers_

In [8]:
distances = {}
for tasks in combinations(metabase_df['Classes'].unique(), 2):
    task_1, task_2 = tasks
    distances[tasks] = min(
        np.mean([
            euclidean(centroids[task_1][0], centroids[task_2][0]),
            euclidean(centroids[task_1][1], centroids[task_2][1])
        ]),
        np.mean([
            euclidean(centroids[task_1][0], centroids[task_2][1]),
            euclidean(centroids[task_1][1], centroids[task_2][0])
        ])
    )

In [9]:
ranking_best_architecture_by_tasks = {}
for task in metabase_df['Classes'].unique():
    ranking_best_architecture_by_tasks[task] = metabase_df[
        metabase_df['Classes'] == task
    ].sort_values('Accuracy_avg', ascending=False)['Architecture'].tolist()

In [10]:
similar_tasks_by_tasks = {}
for task in metabase_df['Classes'].unique():
    hypotheses = []
    for tasks in distances.keys():
        if task in tasks:
            hypothese = tasks[1] if task == tasks[0] else tasks[0]
            hypotheses.append((hypothese, distances[tasks]))
    similar_tasks_by_tasks[task] = sorted(hypotheses, key=itemgetter(1))

In [11]:
recommendations_df = pd.DataFrame(
    metabase_df['Classes'].unique(),
    columns=['Task']
)

In [12]:
recommendations_df['Best_architecture'] = recommendations_df.apply(
    lambda row: ranking_best_architecture_by_tasks[row['Task']][0],
    axis=1
)

In [13]:
recommendations = []
for n in range(3):
    recommendation = []
    for task in recommendations_df['Task']:
        similar_task, centroids_distance = similar_tasks_by_tasks[task][n]
        recommended_architecture = ranking_best_architecture_by_tasks[
            similar_task
        ][0]
        recommended_architecture_rank = ranking_best_architecture_by_tasks[
            task
        ].index(recommended_architecture)
        recommendation.append('{} - {:.4f} - {} - {}'.format(
            similar_task,
            centroids_distance,
            recommended_architecture,
            humanize.ordinal(recommended_architecture_rank + 1)
        ))
    recommendations.append(recommendation)

In [14]:
(
    recommendations_df['First_recommendation'],
    recommendations_df['Second_recommendation'],
    recommendations_df['Third_recommendation']
) = recommendations

In [15]:
recommendations_df.head()

Unnamed: 0,Task,Best_architecture,First_recommendation,Second_recommendation,Third_recommendation
0,airplane-automobile,MiniVGG1,automobile-ship - 0.0521 - MiniGoogLeNet1 - 4th,airplane-truck - 0.0683 - MiniVGG3 - 2nd,ship-truck - 0.0835 - MiniVGG3 - 2nd
1,airplane-bird,MiniVGG1,airplane-deer - 0.0380 - MiniGoogLeNet1 - 2nd,bird-ship - 0.0513 - MiniVGG3 - 4th,deer-ship - 0.0599 - MiniGoogLeNet1 - 2nd
2,airplane-cat,MiniVGG3,cat-ship - 0.0517 - MiniVGG3 - 1st,airplane-deer - 0.0551 - MiniGoogLeNet1 - 4th,airplane-dog - 0.0604 - MiniGoogLeNet1 - 4th
3,airplane-deer,MiniGoogLeNet1,airplane-bird - 0.0380 - MiniVGG1 - 6th,deer-ship - 0.0509 - MiniGoogLeNet1 - 1st,airplane-cat - 0.0551 - MiniVGG3 - 2nd
4,airplane-dog,MiniGoogLeNet1,dog-ship - 0.0520 - MiniVGG1 - 4th,airplane-cat - 0.0604 - MiniVGG3 - 2nd,horse-ship - 0.0756 - MiniVGG3 - 2nd


In [16]:
recommendations_df.to_csv('recommendations.csv', index=False)