In [2]:
# Mount Google Drive at /content/drive; the notebooks and config file referenced
# in later cells are addressed by paths under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
#coding:utf-8
import numpy as np
import json
# from model import ConceptFlow, use_cuda
# from preprocession import prepare_data, build_vocab, gen_batched_data
import torch
import warnings
import yaml
import os
from torch.autograd import Variable
import torch.nn as nn
from torch.nn import utils as nn_utils
warnings.filterwarnings('ignore')

In [4]:
# Select the compute device: the first CUDA GPU when it is requested and present,
# otherwise the CPU, so that `device` is always bound.
# NOTE(review): main() calls use_cuda(...) as a function, so this boolean flag is
# presumably rebound by one of the notebooks executed via %run - confirm.
use_cuda = True
device = torch.device('cuda:0' if use_cuda and torch.cuda.is_available() else 'cpu')

In [5]:
# Execute the project's other notebooks from Google Drive, in this order.
# NOTE(review): presumably these supply ConceptFlow, use_cuda, prepare_data,
# build_vocab and gen_batched_data, which later cells call but which are only
# referenced by the commented-out imports above - confirm.
%run '/content/drive/MyDrive/Commit_colab/PURDUE-2024-SPRING/CS592 HAI/Project/Central.ipynb'
%run '/content/drive/MyDrive/Commit_colab/PURDUE-2024-SPRING/CS592 HAI/Project/Embedding.ipynb'
%run '/content/drive/MyDrive/Commit_colab/PURDUE-2024-SPRING/CS592 HAI/Project/Outer.ipynb'
%run '/content/drive/MyDrive/Commit_colab/PURDUE-2024-SPRING/CS592 HAI/Project/Conceptflow.ipynb'
%run '/content/drive/MyDrive/Commit_colab/PURDUE-2024-SPRING/CS592 HAI/Project/Preprocession.ipynb'

# Utils

In [6]:
def padding(sent, l):
    """Terminate ``sent`` with '_EOS' and right-fill it with '_PAD' up to length ``l``.

    Args:
        sent: list of tokens; it is not modified.
        l: target length of the returned list.

    Returns:
        A new list: the tokens, one '_EOS', then ``l - len(sent) - 1`` copies of
        '_PAD'. When that count is zero or negative no '_PAD' is added, so the
        result can be longer than ``l``.
    """
    pad_count = l - len(sent) - 1
    filler = ['_PAD'] * pad_count
    return sent + ['_EOS'] + filler

def padding_triple_id(entity2id, triple, num, l):
    """Convert nested triples to ids and pad the result to ``num`` groups of ``l`` triples.

    Unlike a purely in-place conversion, this builds new lists and leaves
    ``triple`` untouched, so calling it again on the same data gives the same
    result instead of mapping the already-converted ids to '_NONE'.

    Args:
        entity2id: mapping from symbol to id. Must contain '_PAD_H', '_PAD_R'
            and '_PAD_T'; '_NONE' is required only when an unknown symbol occurs.
        triple: list of groups, each group a list of triples, each triple a
            sequence of symbols.
        num: number of groups in the output; missing groups are filled with
            padding triples.
        l: number of triples per group; short groups are filled with padding
            triples.

    Returns:
        A new nested list of ids. Groups longer than ``l`` and inputs with more
        than ``num`` groups are returned unpadded and untruncated.

    Raises:
        KeyError: if a required padding key is missing, or an unknown symbol is
            met and '_NONE' is missing.
    """
    # Looked up eagerly so a missing padding key fails even when no padding is needed.
    pad_fact = [entity2id['_PAD_H'], entity2id['_PAD_R'], entity2id['_PAD_T']]

    def to_id(symbol):
        return entity2id[symbol] if symbol in entity2id else entity2id['_NONE']

    newtriple = []
    for tri in triple:
        converted = [[to_id(symbol) for symbol in fact] for fact in tri]
        # Each padding triple is a fresh list so rows never alias one another.
        converted.extend(list(pad_fact) for _ in range(l - len(tri)))
        newtriple.append(converted)

    for _ in range(num - len(newtriple)):
        newtriple.append([list(pad_fact) for _ in range(l)])
    return newtriple

def build_kb_adj_mat(kb_adj_mats, fact_dropout):
    """Flatten per-sample sparse adjacency data into batch-level coordinate arrays.

    Args:
        kb_adj_mats: indexable container exposing ``shape[0]``; each entry
            unpacks as ``((idx_a, idx_b, vals), (idx_a, idx_b, vals))`` where the
            two value arrays have equal length.
        fact_dropout: fraction of facts to drop per sample; a random subset of
            size ``floor(num_fact * (1 - fact_dropout))`` is kept, drawn with
            ``np.random.permutation``.

    Returns:
        Two 4-tuples ``(batch_index, idx_a, idx_b, values)``, one per matrix.
        Both matrices of a sample share the same kept-fact selection.
    """
    def _fresh_columns():
        # Three integer columns (batch id and two indices) and one float column.
        return [np.array([], dtype=int), np.array([], dtype=int),
                np.array([], dtype=int), np.array([], dtype=float)]

    columns0 = _fresh_columns()
    columns1 = _fresh_columns()

    for sample_idx in range(kb_adj_mats.shape[0]):
        (first_a, first_b, first_vals), (second_a, second_b, second_vals) = kb_adj_mats[sample_idx]
        assert len(first_vals) == len(second_vals)
        fact_count = len(first_vals)
        keep_count = int(np.floor(fact_count * (1 - fact_dropout)))
        kept = np.random.permutation(fact_count)[ : keep_count]

        per_matrix = (
            (columns0, first_a, first_b, first_vals),
            (columns1, second_a, second_b, second_vals),
        )
        for columns, idx_a, idx_b, vals in per_matrix:
            pieces = (
                np.full(len(kept), sample_idx, dtype=int),
                idx_a[kept],
                idx_b[kept],
                vals[kept],
            )
            for slot, piece in enumerate(pieces):
                columns[slot] = np.append(columns[slot], piece)

    return tuple(columns0), tuple(columns1)

# Generating

In [7]:
# Module-level containers. Nothing in this notebook's visible cells reads or fills them.
# NOTE(review): presumably shared with code from the notebooks loaded via %run -
# confirm, or remove if unused.
csk_triples, csk_entities, kb_dict = [], [], []
dict_csk_entities, dict_csk_triples = {}, {}
class Config():
    """Run settings read from a YAML file and exposed as instance attributes."""

    def __init__(self, path):
        """Remember ``path`` as ``config_path`` and load every setting from it."""
        self.config_path = path
        self._get_config()

    def _get_config(self):
        """Parse the YAML file and copy each expected key onto the instance.

        Raises:
            KeyError: if an expected key is missing from the file.
        """
        # Order matters: list_all_member prints attributes in assignment order.
        expected_keys = (
            'is_train',
            'test_model_path',
            'embed_units',
            'symbols',
            'units',
            'layers',
            'batch_size',
            'data_dir',
            'num_epoch',
            'lr_rate',
            'lstm_dropout',
            'linear_dropout',
            'max_gradient_norm',
            'trans_units',
            'gnn_layers',
            'fact_dropout',
            'fact_scale',
            'pagerank_lambda',
            'result_dir_name',
            'generated_path',
        )
        # NOTE(review): FullLoader is kept as in the original; if the file only
        # holds plain scalars, yaml.safe_load would be the safer choice - confirm.
        with open(self.config_path, "r") as setting:
            config = yaml.load(setting, Loader=yaml.FullLoader)
        for key in expected_keys:
            setattr(self, key, config[key])

    def list_all_member(self):
        """Print every instance attribute as ``name = value``, one per line."""
        for member in vars(self).items():
            print('%s = %s' % member)


def run(model, data_train, config, word2id, entity2id):
    """Batch ``data_train`` with gen_batched_data and feed the batch to ``model``.

    Returns:
        ``(word_index, selector)`` when ``model.is_inference == True``;
        otherwise the eight values the model returns: the decoder loss, four
        sentence-perplexity values and three negative counts.
    """
    batch = gen_batched_data(data_train, config, word2id, entity2id)

    if not (model.is_inference == True):
        # Training/evaluation path: the model must yield exactly eight values.
        (loss, ppx, ppx_word, ppx_local, ppx_only_two,
         neg_word, neg_local, neg_only_two) = model(batch)
        return (loss, ppx, ppx_word, ppx_local, ppx_only_two,
                neg_word, neg_local, neg_only_two)

    # Inference path: the model must yield exactly two values.
    word_index, selector = model(batch)
    return word_index, selector


def generate(model, data_test, config, word2id, entity2id, epoch = 0, model_path = None):
    """Run inference over ``data_test`` batch by batch and append the decoded text to disk.

    Results are appended to ``config.generated_path + '/generated_res_Scr.txt'``
    as one JSON object per sample with the keys 'res_text', 'local' and
    'only_two'. When ``model_path`` is given, each batch is preceded by a line
    holding the JSON-encoded checkpoint path.

    Args:
        model: callable model; its ``is_inference`` attribute is set to True.
        data_test: sliceable sequence of samples.
        config: object providing ``batch_size`` and ``generated_path``.
        word2id: mapping from word to id; inverted here to decode output ids.
        entity2id: mapping passed through to ``run``.
        epoch: unused; kept so existing callers keep working.
        model_path: optional checkpoint to load (onto the CPU) before generating.

    Notes:
        The data is walked in steps of ``config.batch_size`` so every sample is
        processed exactly once, including a final partial batch, and no empty
        slice is ever handed to ``run``.
    """
    if model_path is not None:
        # NOTE(review): torch.load unpickles the file; only load trusted checkpoints.
        model.load_state_dict(torch.load(model_path, map_location=torch.device('cpu')))

    model.is_inference = True
    id2word = {index: word for word, index in word2id.items()}
    output_path = config.generated_path + '/generated_res_Scr.txt'

    def write_batch_res_text(word_index, id2word, selector = None):
        """Decode one batch of word ids and append it to the output file."""
        text = []
        # Identity check: `!= None` may compare element-wise on array-like selectors.
        if selector is not None:
            for i in range(len(word_index)):
                words = []
                for j in range(len(word_index[i])):
                    # Id 2 ends the sentence (presumably '_EOS' - confirm against the vocabulary).
                    if word_index[i][j] == 2:
                        break
                    words.append(id2word[word_index[i][j]])
                # Selector value 1 marks a 'local' word, value 2 an 'only_two' word.
                text.append({
                    'res_text': words,
                    'local': [words[j] for j in range(len(words)) if selector[i][j] == 1],
                    'only_two': [words[j] for j in range(len(words)) if selector[i][j] == 2],
                })

        # The context manager closes the file even if encoding or writing fails.
        with open(output_path, 'a') as w:
            if model_path is not None:
                # The newline goes outside json.dumps so the header is its own line.
                w.write(json.dumps(model_path) + '\n')
            for line in text:
                print(line)
                w.write(json.dumps(line) + '\n')

    batch_size = config.batch_size
    for iteration, start in enumerate(range(0, len(data_test), batch_size)):
        word_index, selector = run(model, data_test[start:start + batch_size],
                                   config, word2id, entity2id)

        if iteration % 50 == 0:
            print ("generate:", iteration)
        write_batch_res_text(word_index, id2word, selector=selector)


def main(config_path='/content/drive/MyDrive/Commit_colab/PURDUE-2024-SPRING/CS592 HAI/Project/config.yml'):
    """Load the configuration, build the model and write generated responses.

    Args:
        config_path: YAML settings file; defaults to the project's config on
            Google Drive, so calling ``main()`` with no arguments behaves as before.

    Steps: read and print the config, prepare the test data and vocabularies,
    build ConceptFlow, make sure the output directory exists, then call
    ``generate`` with the checkpoint named by ``config.test_model_path``.
    """
    config = Config(config_path)
    config.list_all_member()
    raw_vocab, _, data_test = prepare_data(config)
    word2id, entity2id, vocab, embed, entity_vocab, entity_embed, relation_vocab, relation_embed, entity_relation_embed = build_vocab(config.data_dir, raw_vocab, config = config)
    model = use_cuda(ConceptFlow(config, embed, entity_relation_embed))

    # No optimizer is created: this entry point only runs inference.
    # makedirs also creates missing parent directories and, with exist_ok, has no
    # window between checking for the directory and creating it.
    os.makedirs(config.generated_path, exist_ok=True)

    generate(model, data_test, config, word2id, entity2id, model_path=config.test_model_path)

In [27]:
# Scratch check: display the canonical form of the config path that main() reads.
# NOTE(review): this cell's execution count (27) is out of sequence with its
# neighbours; consider removing it before sharing the notebook.
os.path.realpath('/content/drive/MyDrive/Commit_colab/PURDUE-2024-SPRING/CS592 HAI/Project/config.yml')

'/content/drive/MyDrive/Commit_colab/PURDUE-2024-SPRING/CS592 HAI/Project/config.yml'

In [8]:
# Entry point: run the inference pipeline defined in main().
main()

config_path = /content/drive/MyDrive/Commit_colab/PURDUE-2024-SPRING/CS592 HAI/Project/config.yml
is_train = False
test_model_path = /content/drive/MyDrive/Commit_colab/PURDUE-2024-SPRING/CS592 HAI/Project/training_output/_epoch_4.pkl
embed_units = 300
symbols = 30000
units = 512
layers = 2
batch_size = 30
data_dir = /content/drive/MyDrive/Colab Notebooks/ConceptFlow(ECCF)_data
num_epoch = 5
lr_rate = 0.0001
lstm_dropout = 0.3
linear_dropout = 0.2
max_gradient_norm = 5
trans_units = 100
gnn_layers = 3
fact_dropout = 0.0
fact_scale = 1
pagerank_lambda = 0.8
result_dir_name = /content/drive/MyDrive/Commit_colab/PURDUE-2024-SPRING/CS592 HAI/Project/training_output
generated_path = /content/drive/MyDrive/Commit_colab/PURDUE-2024-SPRING/CS592 HAI/Project/inference_output
Creating word vocabulary...
Creating entity vocabulary...
Creating relation vocabulary...
Loading word vectors...
    processing line 0
    processing line 100000
    processing line 200000
    processing line 300000
    pr

ValueError: max() arg is an empty sequence