In [1]:
import torch
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from skimage import io
import skimage
from importlib import reload
import folded_dataset
reload(folded_dataset)
import utils
reload(utils)
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import f1_score, classification_report

In [8]:
# --- Experiment configuration ---------------------------------------------
# Which dataset to evaluate and with which classifier.
dataset = 'CP'
model_choice = 'knn'  # the training cell recognises 'knn' and 'sgd'

# Root folder holding the per-dataset metadata and the extracted features.
root_dir = '/scr/zchen/datasets/morphem_70k_2.0'

# Leave-one-out protocol: the task evaluated group-by-group (set to None for
# Allen) and the metadata column whose unique values define the groups.
leave_out = 'Task_four'
leaveout_label = 'Plate'

In [9]:
# Load the precomputed features and the per-image metadata table.
print('Load features...')

features_path = f'{root_dir}/features/{dataset}/pretrained_resnet18_features.npy'
df_path = f'{root_dir}/{dataset}/enriched_meta.csv'

features = np.load(features_path)
df = pd.read_csv(df_path)

# Every split below indexes `features` with row positions taken from `df`,
# so the two are expected to line up one-to-one. Fail loudly here instead of
# silently evaluating on misaligned rows.
assert len(features) == len(df), (
    f'features has {len(features)} rows but metadata has {len(df)} rows'
)

# Evaluation tasks are every value of `train_test_split` other than 'Train'.
tasks = list(df['train_test_split'].unique())
tasks.remove('Train')
if leave_out is not None:
    if leave_out not in tasks:
        raise ValueError(f'leave_out={leave_out!r} is not one of the tasks {tasks}')
    # Position of the leave-one-out task within `tasks`.
    leaveout_ind = tasks.index(leave_out)


# Row positions of the training set and of each evaluation task. Each task
# name is also a metadata column; its truthy rows form that task's test set.
train_indices = np.where(df['train_test_split'] == 'Train')[0]
all_test_indices = [np.where(df[task])[0] for task in tasks]

# Convert categorical labels to integers, numbered in order of first appearance.
target_value = list(df['Label'].unique())
encoded_target = {label: code for code, label in enumerate(target_value)}
df['encoded_label'] = df['Label'].map(encoded_target)

# Split features into the training matrix and one test matrix per task.
train_X = features[train_indices]
test_Xs = [features[test_indices] for test_indices in all_test_indices]

# All tasks share the same label vector; one entry per task is kept so the
# downstream cells can index labels by task position.
task_Ys = [df['encoded_label'].values for _ in tasks]
train_Ys = [task_Y[train_indices] for task_Y in task_Ys]
test_Ys = [task_Y[test_indices] for task_Y, test_indices in zip(task_Ys, all_test_indices)]

# Leave-one-out splits: every group of the held-out task is tested in turn,
# training on the 'Train' rows plus the remaining groups of that task.
if leave_out is not None:
    df_takeout = df[df[leave_out]]
    groups = list(df_takeout[leaveout_label].unique())
    leaveout_Y = task_Ys[leaveout_ind]

    all_group_indices, all_other_indices = [], []
    takeout_X, rest_X = [], []
    takeout_Y, rest_Y = [], []

    for held_out_group in groups:
        # Rows of the held-out task that belong / do not belong to this group.
        group_rows = df_takeout[df_takeout[leaveout_label] == held_out_group].index.values
        other_rows = df_takeout[df_takeout[leaveout_label] != held_out_group].index.values
        # Training rows for this fold: the regular training set plus the other groups.
        fit_rows = np.concatenate((train_indices, other_rows), axis=None)

        all_group_indices.append(group_rows)
        all_other_indices.append(other_rows)

        takeout_X.append(features[group_rows])
        rest_X.append(features[fit_rows])

        takeout_Y.append(leaveout_Y[group_rows])
        rest_Y.append(leaveout_Y[fit_rows])

print('Train classifiers...')
accuracies = []
f1scores_macro = []
reports_str = []
reports_dict = []

# Fixed seed so that SGD results are repeatable across kernel restarts.
SGD_RANDOM_STATE = 0


def make_classifier(choice):
    """Return a fresh, unfitted classifier for the requested model name.

    Args:
        choice: 'knn' for `utils.FaissKNeighbors(k=1)`, or 'sgd' for an
            `SGDClassifier(alpha=0.001, max_iter=100)`.

    Returns:
        A new model object exposing `fit` and `predict`.

    Raises:
        ValueError: if `choice` is neither 'knn' nor 'sgd'. Raising (rather
            than printing and leaving the loop) keeps the result lists from
            ending up shorter than `tasks`.
    """
    if choice == 'knn':
        return utils.FaissKNeighbors(k=1)
    if choice == 'sgd':
        return SGDClassifier(alpha=0.001, max_iter=100, random_state=SGD_RANDOM_STATE)
    raise ValueError(f'{choice} is not implemented. Try sgd or knn.')


for task_ind, task in enumerate(tasks):
    if task != leave_out:  # standard classification
        model = make_classifier(model_choice)
        model.fit(train_X, train_Ys[task_ind])
        predictions = model.predict(test_Xs[task_ind])
        ground_truth = test_Ys[task_ind]

    else:  # leave-one-out
        predictions = []
        ground_truth = []
        for group_ind, group in enumerate(groups):
            # Use the same classifier family as the other tasks so that the
            # reported `model_choice` matches what was actually trained.
            model = make_classifier(model_choice)

            model.fit(rest_X[group_ind], rest_Y[group_ind])
            group_predictions = model.predict(takeout_X[group_ind])
            group_ground_truth = takeout_Y[group_ind]

            predictions.append(group_predictions)
            ground_truth.append(group_ground_truth)

        # Pool the per-group folds so the metrics cover the whole task.
        predictions = np.concatenate(predictions)
        ground_truth = np.concatenate(ground_truth)

    # Compute evaluation metrics
    accuracy = np.mean(predictions == ground_truth)
    report_str = classification_report(ground_truth, predictions)
    report_dict = classification_report(ground_truth, predictions, output_dict=True)
    f1score_macro = f1_score(ground_truth, predictions, average='macro')

    accuracies.append(accuracy)
    f1scores_macro.append(f1score_macro)
    reports_str.append(report_str)
    reports_dict.append(report_dict)

# Show the text classification report of every task, each under its own header.
print('Results:')
for task_ind, task in enumerate(tasks):
    header = f'Results for {dataset} {task} with {model_choice} :'
    print(header, reports_str[task_ind], sep='\n')

Load features...
Train classifiers...
Results:
Results for CP Task_one with knn :
              precision    recall  f1-score   support

           0       0.69      0.78      0.73      4313
           1       0.45      0.71      0.55       393
           2       0.55      0.64      0.60      2702
           3       0.73      0.57      0.64      5657

    accuracy                           0.66     13065
   macro avg       0.61      0.68      0.63     13065
weighted avg       0.67      0.66      0.66     13065

Results for CP Task_two with knn :
              precision    recall  f1-score   support

           0       0.27      0.30      0.28      3778
           1       0.17      0.47      0.25       369
           2       0.18      0.32      0.23      2209
           3       0.65      0.46      0.54     10039

    accuracy                           0.41     16395
   macro avg       0.32      0.39      0.33     16395
weighted avg       0.49      0.41      0.43     16395

Results for C

In [None]:
# Bundle the label -> integer encoding with each task's report dict,
# keyed by task name.
full_reports_dict = {'target_encoding': encoded_target}
full_reports_dict.update(
    (task, reports_dict[task_ind]) for task_ind, task in enumerate(tasks)
)


In [None]:
# Persist the full per-task reports as JSON, one file per dataset/classifier.
dest_dir = '/scr/zchen/MorphEm_local/results'
dict_path = f'{dest_dir}/{dataset}_{model_choice}_full_results.json'

with open(dict_path, 'w') as report_file:
    report_file.write(json.dumps(full_reports_dict))

In [7]:
# One summary row per task for the current dataset / classifier combination.
n_tasks = len(tasks)
results_temp = pd.DataFrame({
    'source': [dataset] * n_tasks,
    'task': tasks,
    'model': [model_choice] * n_tasks,
    'accuracy': accuracies,
    'f1_score_macro': f1scores_macro,
})
results_temp

Unnamed: 0,source,task,model,accuracy,f1_score_macro
0,HPA,Task_one,knn,0.52065,0.522278
1,HPA,Task_two,knn,0.378378,0.334066
2,HPA,Task_three,knn,0.142424,0.08935


In [None]:
# Fold this run's summary rows into the cumulative `results` table.
# On a fresh kernel `results` does not exist yet, so start from this run
# alone instead of failing with a NameError.
try:
    previous_results = results
except NameError:
    previous_results = None

if previous_results is None:
    combined_results = results_temp
else:
    combined_results = pd.concat([previous_results, results_temp])

# Re-running this cell for the same (source, task, model) replaces the
# earlier row rather than appending a duplicate, which keeps the cell
# idempotent.
results = (
    combined_results
    .drop_duplicates(subset=['source', 'task', 'model'], keep='last')
    .reset_index(drop=True)
)
results

In [None]:
# Write the cumulative summary table to disk without the index column.
dest_dir = '/scr/zchen/MorphEm_local/results'
summary_csv_path = f'{dest_dir}/resnet18_knn_sgd.csv'
results.to_csv(summary_csv_path, index=False)