In [1]:
import os
import glob
import json
import torch
import numpy as np
from collections import defaultdict
import random

In [37]:
class jsonEncoder(object):
	def __init__(self, json_obj=None, mode = None):
		self.json_obj = json_obj
		self.mode = mode

	@classmethod
	def from_json(cls, path, review_filename, mode):
		try:
			return cls(open(os.path.join(path, 'Embeddings', review_filename)), mode=mode)
		except FileNotFoundError:
			return cls(None)

	def __call__(self):
		if not self.json_obj == None:
			encoded = json.load(self.json_obj)
			paper = np.asarray(encoded['paper'])
			reviews = []
			if self.mode == 'SCAFFOLDS':
				rec_score = []
				conf_score = []
				for i, review in enumerate(encoded['reviews']):
					reviews.append(np.asarray(review.get('review_text')))
					rec_score.append(review.get('RECOMMENDATION'))
					conf_score.append(review.get('CONFIDENCE'))		
				return paper, reviews, rec_score, conf_score
			elif self.mode == 'MAIN':
				significance_score = []
				for i, review in enumerate(encoded['reviews']):
					reviews.append(np.asarray(review.get('review_text')))
					significance_score.append([float(i) for i in review.get('SIGNIFICANCE')])
				return paper, reviews, significance_score
			else:
				rec_score = []
				conf_score = []
				significance_score = []
				for i, review in enumerate(encoded['reviews']):
					reviews.append(np.asarray(review.get('review_text')))
					rec_score.append(review.get('RECOMMENDATION'))
					conf_score.append(review.get('CONFIDENCE'))	
					significance_score.append([float(i) for i in review.get('SIGNIFICANCE')])
				return paper, reviews, rec_score, conf_score, significance_score
		else:
			return None

In [38]:
class NumpyEncoder(json.JSONEncoder):
    def default(self, obj):
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        return json.JSONEncoder.default(self, obj)

In [39]:
def write(path, mode=None):
    reviews_dir = os.listdir(os.path.join(path, 'reviews'))#[:n]
    for review_dir in reviews_dir:
        ret = jsonEncoder.from_json(path, review_dir, mode=mode)()
        if ret == None:
            continue
        for i, rev in enumerate(ret[1],0):
            json_obj = {}
            json_obj['paper'] = ret[0]
            json_obj['review'] = rev
            if mode == 'SCAFFOLDS':
                json_obj['recommendation'] = int(ret[2][i])
                json_obj['confidence'] = int(ret[3][i])
            elif mode == 'MAIN':
                json_obj['significance'] = ret[2][i]
            else:
                json_obj['recommendation'] = int(ret[2][i])
                json_obj['confidence'] = int(ret[3][i])
                json_obj['significance'] = ret[4][i]
            filename = '../test_data/' + review_dir + '_' + str(i)
            with open(os.path.join(path, filename), 'w') as f:
                f.write(json.dumps(json_obj, indent=8, ensure_ascii=False, cls=NumpyEncoder))

In [40]:
write('./Data/SignData/test', mode='TEST')

In [2]:
from __future__ import division
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms


class Transform(object):
	def __init__(self):
		pass
	def __call__(self, array, max_sents):
		return torch.from_numpy(np.pad(array, [(0, max_sents - array.shape[0]), (0,0)], mode = 'constant', constant_values = 0.0))
    
class ScaleSigScores(object):
    def __init__(self):
        pass
    def __call__(self, array, min_, max_):
        sent = array[2]
        array = (array-min_)/(max_ - min_)
        array = (array*9) + 1
        array[2] = (((sent - (-1))*9)/2) + 1
        return array
        

In [8]:
class dataset(Dataset):
    def __init__(self, path, mode = 'SCAFFOLDS', transform = None, sigtx = None):
        self.path = path
        self.data = os.listdir(path)
        self.mode = mode
        if self.mode == 'SCAFFOLDS':
            self.max_paper_sentences, self.max_review_sentences = self.forTransform()
        else:
            self.max_paper_sentences, self.max_review_sentences, self.sig_min, self.sig_max = self.forTransform()
            #790, 790, np.array([0.05428571,0.93013972,-0.99966815]), np.array([33.26288336,252.25067107,0.99966679])
        self.transform = transform
        self.SigTransform = sigtx
        
    def forTransform(self):
        max_paper_sents = 0
        max_review_sents = 0
        if self.mode != 'SCAFFOLDS':
            sig_scores = []
            
        for file in self.data:
            json_obj = open(os.path.join(self.path, file))
            ret = json.load(json_obj)
            if len(ret['paper']) > max_paper_sents:
                max_paper_sents = len(ret['paper'])
            if len(ret['review']) > max_review_sents:
                max_review_sents = len(ret['review'])
            if self.mode != 'SCAFFOLDS':
                sig_scores.append(ret['significance'])
        if self.mode != 'SCAFFOLDS':            
            return max_paper_sents, max_review_sents,np.asarray(sig_scores).min(axis=0),np.asarray(sig_scores).max(axis=0)
        else:
            return max_paper_sents, max_review_sents
        
    def __getitem__(self, index):
        file = self.data[index]
        json_obj = open(os.path.join(self.path, file))
        json_data = json.load(json_obj)

        if self.mode == 'SCAFFOLDS':
            if self.transform:
                return self.transform(np.asarray(json_data['paper']), self.max_paper_sentences), \
                    self.transform(np.asarray(json_data['review']), self.max_review_sentences), \
                    json_data['recommendation'], json_data['confidence']
            else:
                return json_data['paper'], json_data['review'], json_data['recommendation'], json_data['confidence']
        elif self.mode == 'MAIN':
            if self.transform:
                return self.transform(np.asarray(json_data['paper']), self.max_paper_sentences), \
                    self.transform(np.asarray(json_data['review']), self.max_review_sentences), \
                    self.SigTransform(np.asarray(json_data['significance']), self.sig_min, self.sig_max)
            else:
                return json_data['paper'],json_data['review'],json_data['significance']
        else:
            if self.transform:
                return self.transform(np.asarray(json_data['paper']), self.max_paper_sentences), \
                    self.transform(np.asarray(json_data['review']), self.max_review_sentences), \
                    json_data['recommendation'], json_data['confidence'], self.SigTransform(np.asarray(json_data['significance']), self.sig_min, self.sig_max)
            else:
                return json_data['paper'], json_data['review'],json_data['recommendation'], json_data['confidence'],json_data['significance'] 
    def __len__(self):
        return len(self.data)

In [9]:
d = dataset('./Data/SignData/test_data', mode = 'TEST', transform=Transform(), sigtx = ScaleSigScores())

In [10]:
d.sig_min

array([ 0.85428571,  9.56806387, -0.99761351])

In [11]:
d.sig_max

array([ 28.76591667, 328.43497006,   0.68505453])

In [27]:
d.sig_min

array([ 0.05428571,  0.93013972, -0.99966815])

In [28]:
d.sig_max

array([ 33.26288336, 252.25067107,   0.99966679])

In [35]:
d[2]

ValueError: index can't contain negative values

In [16]:
dl = DataLoader(d, batch_size = 2, shuffle = True, num_workers=4)

In [18]:
for i, d in enumerate(dl):
    paper, review, rec, conf, sig = d
    print(paper.shape, review.shape, rec.shape, conf.shape, sig.shape)
    break

torch.Size([2, 790, 768]) torch.Size([2, 790, 768]) torch.Size([2]) torch.Size([2]) torch.Size([2, 3])
