In [None]:
import config
import models
import json
import numpy as np
import pandas as pd
import os
import time
import matplotlib.pyplot as plt

Define Variables

In [None]:
# Embedding model class to train (a class from the project's `models` module).
embedding_model = models.TransE

# Everything about this run is collected in one dict so it can be logged later.
model_info = {
    # main variables
    'dataset_name': "WN11",

    # hyperparameters
    'batch_size': 2048,      # mini-batch size (takes precedence over n_batches)
    'n_batches': None,       # number of batches
    'n_epochs': 1000,        # epochs
    'learning_rate': 0.001,
    'margin': 1.0,
    'k': 256,                # embedding dimension
    'bern': 1,               # use Bernoulli distribution for generating negative training examples
    'opt_method': 'adam',
    'score_norm': 'l2',      # implemented in only TransD so far

    # logging settings
    'log_on': 1,
    'log_type': 'batch',
    'log_print': True,

    # device settings: GPU and CPU
    'work_threads': 8,
    'CUDA_VISIBLE_DEVICES': "1",  # should be a string
}

Create config instance and set variables/hyperparameters

In [None]:
# Build the experiment configuration from the values collected in model_info.
con = config.Config()
dataset_path = "./benchmarks/{}/".format(model_info['dataset_name'])
con.set_in_path(dataset_path)

# Evaluation tasks: link prediction off, triple classification on.
con.set_test_link_prediction(False)
con.set_test_triple_classification(True)

con.set_log_on(model_info['log_on'],
               log_type=model_info['log_type'],
               log_print=model_info['log_print'])
con.set_train_times(model_info['n_epochs'])

# An explicit batch size takes precedence; n_batches is only used when
# batch_size is None.
if model_info['batch_size'] is not None:
    con.set_batch_size(model_info['batch_size'])
else:
    con.set_nbatches(model_info['n_batches'])

con.set_alpha(model_info['learning_rate'])
con.set_bern(model_info['bern'])
con.set_dimension(model_info['k'])
con.set_margin(model_info['margin'])
# Disabled settings, kept for reference:
# con.set_ent_neg_rate(1)
# con.set_rel_neg_rate(0)
con.set_opt_method(model_info['opt_method'])
con.set_work_threads(model_info['work_threads'])

# Environment variables must be strings, hence the string-valued setting.
# NOTE(review): presumably this has to be set before CUDA is first
# initialized for it to take effect -- confirm against config/models.
os.environ["CUDA_VISIBLE_DEVICES"] = model_info['CUDA_VISIBLE_DEVICES']

## Training

Train the model

In [None]:
# Record the name of the model class being trained.
model_info['model_name'] = embedding_model.__name__

# Each run gets its own export directory:
# ./results/<dataset name>/<model name>/<integer Unix timestamp>
path_parts = (
    model_info['dataset_name'],
    model_info['model_name'],
    int(time.time()),
)
export_path = "./results/{}/{}/{}".format(*path_parts)

# create export_path dir if it does not exist
def ensure_dir(file_path):
    """Create the directory that contains ``file_path`` if it is missing.

    Only the directory part of ``file_path`` (``os.path.dirname``) is created,
    so pass a path with a trailing separator to create the path itself as a
    directory.

    Args:
        file_path: Path to a file, or a directory path ending in a separator.

    Raises:
        OSError: If the directory cannot be created (for example, a regular
            file already exists at that location).
    """
    directory = os.path.dirname(file_path)
    # A bare file name has no directory component; os.makedirs("") would raise.
    if not directory or os.path.isdir(directory):
        return
    try:
        os.makedirs(directory)
    except OSError:
        # Another process may have created it between the check and the call;
        # only propagate the error if the directory really is not there.
        if not os.path.isdir(directory):
            raise
# The trailing "/" is required: ensure_dir() creates os.path.dirname() of its
# argument, which without the slash would be the parent of export_path.
ensure_dir(export_path + '/')

print("Model will be exported to {}. \n".format(export_path))



# Model file for this run; the testing cell loads the model back from this same path.
con.set_export_files("{}/model.pt".format(export_path))

# Model parameters will be exported to json files automatically.
con.set_out_files("{}/embedding.vec.json".format(export_path))

# Initialize experimental settings.
con.init()

# Pass the model class and the score norm setting to the configuration.
con.set_model(model=embedding_model,
              score_norm=model_info['score_norm'])

# Train the model.
con.run()

# Save the logged training curve as a tab-separated file in the export directory.
learning_curve = pd.DataFrame(con.log['training_curve'])
learning_curve.to_csv("{}/learning_curve.tsv".format(export_path), sep='\t')

# Keep the training time reported by the config log together with the other run info.
model_info['learning_time'] = con.log['learning_time']
print('\nModel was trained in {} seconds'.format(model_info['learning_time']))

Plot and save the learning curve

In [None]:
# Plot the training loss (left axis) and validation accuracy (right axis) and
# save the figure as an SVG in the export directory. The 'epoch' and 'batch'
# log types differ only in which columns are drawn and how the loss axis is
# labelled and limited, so each branch just prepares those pieces and one
# shared sequence does the plotting. Any other log type is skipped.
data = learning_curve
log_type = model_info['log_type']

if log_type in ('epoch', 'batch'):
    if log_type == 'epoch':
        # The upper y-limit is the loss one tenth of the way into training.
        # Clamp the row index so a log with fewer rows than that does not
        # raise an IndexError.
        ref_row = min(model_info['n_epochs'] // 10, len(data) - 1)
        loss_args = (data.epoch, data.epoch_loss)
        acc_args = (data.epoch, data.valid_acc)
        loss_label = 'Loss'
        axis_opts = dict(
            xlabel='Epoch',
            xlim=[0, model_info['n_epochs']],
            ylim=[data.epoch_loss.min(), data.epoch_loss.iloc[ref_row]],
        )
    else:
        # Batch log: the series are plotted against the DataFrame index.
        loss_args = (data.batch_loss,)
        acc_args = (data.valid_acc,)
        loss_label = 'Batch Loss'
        axis_opts = dict(xlabel='Batch (absolute)')

    fig, ax1 = plt.subplots()

    # Axis limits (when given) are set before plotting, as in the original.
    ax1.set(title='Training Loss and Validation Accuracy',
            ylabel='Loss',
            **axis_opts)
    ax1.plot(*loss_args, label=loss_label)
    ax1.legend(loc=2)

    # Second y-axis sharing the same x-axis for the accuracy curve.
    ax2 = ax1.twinx()
    ax2.plot(*acc_args, label='Valid Acc', color='orange')
    ax2.set(ylabel='Accuracy')
    ax2.legend(loc=1)

    fig.set_size_inches(18, 12)
    fig.savefig("{}/learning_curve.svg".format(export_path))

## Testing

In [None]:
# Path of the model file written during training.
model_file = "{}/model.pt".format(export_path)

# Registering the model again was found to be necessary before test() would
# run; the reason has not been established.
con.set_model(
    embedding_model,
    score_norm=model_info['score_norm'],
)
con.import_variables(model_file)  # loading model via torch.load()
con.test()

In [None]:
# Read the float symbol 'aveAcc' from the library object held by the config
# (con.lib) and record it as the test accuracy.
# NOTE(review): presumably con.test() fills in 'aveAcc' -- confirm in config.
from ctypes import c_float
res = c_float.in_dll(con.lib, 'aveAcc')
model_info['acc'] = res.value
# print() call form: valid on Python 2 and 3, and consistent with the other cells.
print("Average accuracy in the test set is {}".format(model_info['acc']))

## Saving logs and history

In [None]:
# Write the collected run information as a one-row, tab-separated table
# in the export directory.
model_info_file = '{}/model_info.tsv'.format(export_path)
model_info_frame = pd.DataFrame([model_info])
model_info_frame.to_csv(model_info_file, sep='\t')

In [None]:
# IPython line magic writing history.ipynb to the current working directory;
# the following cell moves that file into export_path.
%notebook history.ipynb

In [None]:
# Move the exported history notebook into this run's export directory.
# shutil.move replaces the former `mv -t` subprocess call: it does not depend
# on GNU mv, is not broken by whitespace in export_path, and raises on failure
# instead of failing silently.
import shutil
shutil.move("history.ipynb", os.path.join(export_path, "history.ipynb"))