# This is an initial version for testing multi-class classification. This notebook will be converted to a script in the future.

In [None]:
import os
import pickle
import errno

import numpy as np
import pandas as pd
import six
import tensorflow as tf
from sklearn.dummy import DummyClassifier
from sklearn import (model_selection, linear_model, multiclass,
                     preprocessing)
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler

In [2]:
# This function loads the per-node labels from the node/label file. Only single-label files are supported; multi-label logic will be added in the future.
def load_labels(label_filename, vocab_size):
    """Load a single-label node/label file into a dense label vector.

    Every non-blank line of the file holds whitespace-separated integers:
    the node id first, followed by the label(s) of that node.

    Args:
        label_filename: Path of the label file to read.
        vocab_size: Length of the returned label vector; node ids are used
            as indices into it.

    Returns:
        A ``(raw_labels, labels)`` tuple. ``raw_labels`` maps each node id to
        the list of labels read for it. ``labels`` is an ``int32`` array of
        length ``vocab_size`` where ``labels[node_id]`` is the node's label
        and ``-1`` marks node ids that do not appear in the file.

    Raises:
        RuntimeError: If the file contains no entries, or some line has a
            node id without any label.
        NotImplementedError: If some line carries more than one label
            (multi-label files are not supported yet). This is a subclass
            of ``RuntimeError``.
        IndexError: If a node id lies outside ``[0, vocab_size)``.
    """
    raw_labels = {}
    min_labels = np.inf
    max_labels = 0
    with open(label_filename) as f:
        for line in f:
            values = [int(x) for x in line.split()]
            if not values:
                # Tolerate blank lines, e.g. an extra newline at end of file.
                continue
            raw_labels[values[0]] = values[1:]
            min_labels = min(len(values) - 1, min_labels)
            max_labels = max(len(values) - 1, max_labels)
    print("Raw Labels: {}".format(len(raw_labels)))

    # An empty file leaves min_labels at infinity, so test for it explicitly
    # instead of silently falling through and returning None.
    if not raw_labels or min_labels < 1:
        raise RuntimeError("Expected 1 or more labels in file {}"
                           .format(label_filename))
    if max_labels > 1:
        raise NotImplementedError(
            "Multi-label files are not supported yet (file {} has up to {} "
            "labels per node)".format(label_filename, max_labels))

    # NaN cannot be stored in an integer array (the cast result is
    # undefined), so unlabeled nodes are marked with -1 instead.
    labels = np.full(vocab_size, -1, dtype=np.int32)
    for index, label in raw_labels.items():
        if not 0 <= index < vocab_size:
            # A negative id would otherwise wrap around and overwrite the
            # label of a different node.
            raise IndexError("Node id {} is outside [0, {}) in file {}"
                             .format(index, vocab_size, label_filename))
        labels[index] = label[0]
    return raw_labels, labels

In [3]:
# This function builds the classification model used to evaluate the embeddings.
# A model for multi-label classification will be added in the future.
def eval_classification(labels, embeddings, seed, train_split):
    """Score embeddings with a cross-validated logistic-regression classifier.

    A ``LogisticRegression`` model is evaluated over 10 stratified shuffle
    splits; accuracy, macro-F1 and micro-F1 are averaged over the splits for
    both the training and the test portion, printed, and returned.

    Args:
        labels: Class label of each sample.
        embeddings: Feature vector of each sample, aligned with ``labels``.
        seed: Random seed used for both the classifier and the splitter.
        train_split: Fraction of the samples used for training; the
            remaining ``1.0 - train_split`` is used for testing.

    Returns:
        A dict with the mean scores under the keys ``train_acc``,
        ``test_acc``, ``train_macro_f1``, ``test_macro_f1``,
        ``train_micro_f1`` and ``test_micro_f1``.
    """
    classifier = linear_model.LogisticRegression(random_state=seed)

    # The splitter draws the random train/test partitions, so it must be
    # seeded too; otherwise the scores differ between runs even though a
    # seed was supplied.
    shuffle = model_selection.StratifiedShuffleSplit(
        n_splits=10, train_size=train_split, test_size=1.0 - train_split,
        random_state=seed)

    scoring = ['accuracy', 'f1_macro', 'f1_micro']

    cv_scores = model_selection.cross_validate(
        classifier, embeddings, labels, scoring=scoring, cv=shuffle,
        return_train_score=True)

    # cross_validate reports one score per split under '<part>_<metric>';
    # collapse each of them to its mean over the splits.
    train_acc = cv_scores['train_accuracy'].mean()
    train_macro_f1 = cv_scores['train_f1_macro'].mean()
    train_micro_f1 = cv_scores['train_f1_micro'].mean()
    test_acc = cv_scores['test_accuracy'].mean()
    test_macro_f1 = cv_scores['test_f1_macro'].mean()
    test_micro_f1 = cv_scores['test_f1_micro'].mean()

    print("Train acc: {:0.4f}, macro_f1: {:0.4f}, micro_f1: {:0.4f}".format(train_acc, train_macro_f1,train_micro_f1))
    print("Test acc: {:0.4f}, macro_f1: {:0.4f}, micro_f1: {:0.4f}".format(test_acc, test_macro_f1,test_micro_f1))

    return {
        'train_acc': train_acc,
        'test_acc': test_acc,
        'train_macro_f1': train_macro_f1,
        'test_macro_f1': test_macro_f1,
        'train_micro_f1': train_micro_f1,
        'test_micro_f1': test_micro_f1,
    }

In [1]:
# This function appends the final accuracy and F1 scores to the output file.
def save_scores(scores, prefix):
    """Append one comma-separated row of metrics to ``scores_fourone.txt``.

    The row lists, in order: train accuracy, train macro-F1, train micro-F1,
    test accuracy, test macro-F1 and test micro-F1, each with four decimals.

    Args:
        scores: Mapping holding the keys ``train_acc``, ``train_macro_f1``,
            ``train_micro_f1``, ``test_acc``, ``test_macro_f1`` and
            ``test_micro_f1``.
        prefix: Not used by this function; the output file name is fixed.
    """
    column_order = ("train_acc", "train_macro_f1", "train_micro_f1",
                    "test_acc", "test_macro_f1", "test_micro_f1")
    row_template = "{:0.4f}, {:0.4f}, {:0.4f}, {:0.4f}, {:0.4f}, {:0.4f}\n"
    # Append mode: every call adds a row, earlier results are kept.
    with open("scores_fourone.txt","a") as out_file:
        row_values = [scores[key] for key in column_order]
        out_file.write(row_template.format(*row_values))

In [5]:
# Read the embedding file and order the vectors by node id, so that row i of
# all_embeddings belongs to node i and can be matched with the label data.
emb_dict = {}
with open("emb_dgl_fourone.txt", "r") as pfile:
    # The first line is not an embedding row and is skipped
    # (presumably a "<count> <dim>" header - TODO confirm).
    next(pfile, None)
    for line in pfile:
        # split() without arguments drops the line terminator and tolerates
        # tabs or repeated/trailing spaces; slicing off the last character
        # would corrupt the final value of a file without a trailing newline.
        fields = line.split()
        if not fields:
            continue
        emb_dict[int(fields[0])] = [float(x) for x in fields[1:]]

# Node ids are expected to be exactly 0..n-1; a gap raises KeyError here.
all_embeddings = [emb_dict[node_id] for node_id in range(len(emb_dict))]
del emb_dict
v_size = len(all_embeddings)
print("Vocab size: {}".format(v_size))

Vocab size: 7624


In [6]:
# Load the node labels (vector sized to the embedding vocabulary) and print
# a few sanity checks.
r_labels, all_labels = load_labels('lastfm_asia_target.txt', v_size)

# Class distribution: label value -> number of entries carrying it.
un, counts = np.unique(all_labels, return_counts=True)
print({label: count for label, count in zip(un, counts)})

print("Labeled vocab size: {}".format(len(all_labels)))
# Label stored for node id 0.
print(all_labels[0])

Raw Labels: 7624
{0: 1098, 1: 54, 2: 73, 3: 515, 4: 16, 5: 391, 6: 655, 7: 82, 8: 468, 9: 58, 10: 1303, 11: 138, 12: 57, 13: 63, 14: 570, 15: 257, 16: 254, 17: 1572}
Labeled vocab size: 7624
8


In [3]:
# Run the evaluation once per (seed, train fraction) pair. The two lists are
# paired position by position, so four evaluations are performed and each one
# appends a row to the scores file.
seeds = [58125312, 58125333, 58125111, 58125000]
train_split = [0.4, 0.3, 0.2, 0.1]
for seed, split in zip(seeds, train_split):
    evals = eval_classification(
        labels=all_labels,
        embeddings=all_embeddings,
        seed=seed,
        train_split=split,
    )
    save_scores(scores=evals, prefix="scores")