# Load SAOKE

In [1]:
# Colab-only: mount Google Drive under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
cd /content/drive/My Drive/0328/SAOKE

/content/drive/My Drive/0320/ORE/SAOKE


In [92]:
import json
import pandas as pd

import jieba
import re

In [93]:
# Read the corpus: the file is parsed line by line, one JSON object per line.
# The encoding is passed explicitly so the text is decoded the same way on
# every platform instead of depending on the locale default.
saoke = []
with open('SAOKE_DATA.json', 'r', encoding='utf-8') as f:
  for line in f:
    # Skip blank lines (for example a trailing empty line), which json.loads
    # would reject.
    if not line.strip():
      continue
    # json.loads accepts str directly; re-encoding the line to bytes is not needed.
    saoke.append(json.loads(line))

# Select suitable data

In [94]:
# Keep only the sentences that fit the length budget and that still have at
# least one triple left after the per-triple filter below.
max_seq_length = 128
data = []
# Zero or more characters in the range U+4E00..U+9FA5 (used for predicates).
all_chinese = re.compile('^[\u4e00-\u9fa5]*$')
# Same range plus underscore, ASCII letters and digits (used for subjects and objects).
all_chinese_english_num = re.compile('^[\u4e00-\u9fa5_a-zA-Z0-9]*$')

for sample in saoke:
  # Drop sentences without annotations or longer than max_seq_length characters.
  if sample['logic'] == [] or len(sample['natural']) > max_seq_length:
    continue

  kept = []
  for fact in sample['logic']:
    # "_" appears to be the corpus placeholder for a missing field -- TODO confirm.
    if fact['predicate'] == "_" or fact['subject'] == "_" or fact['object'] == []:
      continue

    objects = fact['object']
    # A triple is kept when the predicate, the subject and the single object
    # all match their character-class pattern.
    is_simple = (
        all_chinese.search(fact['predicate']) is not None
        and all_chinese_english_num.search(fact['subject']) is not None
        and len(objects) == 1
        and all_chinese_english_num.search(objects[0]) is not None
    )
    if is_simple:
      kept.append(fact)

  if kept:
    data.append({'natural': sample['natural'], 'logic': kept})

In [95]:
# Number of sentences that kept at least one triple after filtering.
len(data)

20204

In [96]:
# Peek at the first nine filtered records.
data[:9]

[{'logic': [{'object': ['您的最佳落脚点'],
    'place': '_',
    'predicate': '是',
    'qualifier': '_',
    'subject': '国际学生公寓旅舍',
    'time': '_'}],
  'natural': '国际学生公寓旅舍是您来纽约的最佳落脚点。'},
 {'logic': [{'object': ['布特的重要市场'],
    'place': '_',
    'predicate': '是',
    'qualifier': '_',
    'subject': '非洲',
    'time': '_'}],
  'natural': '战乱频发的非洲是布特的重要市场。'},
 {'logic': [{'object': ['叶片'],
    'place': '_',
    'predicate': '咬食',
    'qualifier': '_',
    'subject': '银纹夜蛾幼虫',
    'time': '夏秋季'}],
  'natural': '银纹夜蛾：幼虫咬食叶片，夏秋季发生。'},
 {'logic': [{'object': ['照片'],
    'place': '_',
    'predicate': '有',
    'qualifier': '_',
    'subject': '微博',
    'time': '_'},
   {'object': ['解说文字'],
    'place': '_',
    'predicate': '配有',
    'qualifier': '_',
    'subject': '微博',
    'time': '_'}],
  'natural': '微博内容不仅有哈士奇的照片，而且还配有生动的解说文字。'},
 {'logic': [{'object': ['一个难题'],
    'place': '_',
    'predicate': '似乎将成为',
    'qualifier': '_',
    'subject': '这',
    'time': '_'}],
  'natural': '这似乎将成为一个难题，不能给

# Analyze data

In [97]:
# Pre-allocate the statistics table: one row per entry of `data`, with both
# columns initialised to 0.
data_analysis = pd.DataFrame({'text_len':[0] * len(data), 'logic_count':[0] * len(data)})

In [98]:
# Fill each column with a single whole-column assignment.
# The element-wise form `data_analysis['text_len'][i] = ...` is chained
# assignment: pandas warns about it, and with Copy-on-Write enabled it does not
# update the frame at all.
# text_len is the character count of the sentence; logic_count is the number of
# triples kept for it.
data_analysis['text_len'] = [len(one_data['natural']) for one_data in data]
data_analysis['logic_count'] = [len(one_data['logic']) for one_data in data]

In [99]:
# How many sentences have each character length.
data_analysis['text_len'].value_counts()

22     407
33     405
30     401
31     390
24     389
      ... 
126     15
116     15
114     15
124     12
4        4
Name: text_len, Length: 125, dtype: int64

In [100]:
# How many sentences have each number of kept triples.
data_analysis['logic_count'].value_counts()

1     10499
2      5409
3      2440
4      1027
5       436
6       220
7        98
8        44
9        11
10        9
11        8
14        2
13        1
Name: logic_count, dtype: int64

# Build dataset

In [101]:
# Convert every filtered sentence into the record layout that is written to disk:
#   {'_id': <position in `data`>,
#    'doc': [{'text': <sentence>,
#             'triples': [[subject, object, predicate], ...],
#             'entity_idx': {<subject or object>: 0, ...}}]}
dataset = []

# enumerate() supplies the record id directly; the earlier `id = i` assignment
# overwrote the builtin `id` for the rest of the kernel session.
for idx, one_data in enumerate(data):
  triples = []
  entities = dict()

  for logic in one_data['logic']:
    subj = logic['subject']
    obj = logic['object'][0]  # only the first object of each triple is used
    # Note the element order: subject, object, predicate.
    triples.append([subj, obj, logic['predicate']])

    # Register each distinct subject/object once.
    # NOTE(review): every entity maps to 0; presumably a placeholder index to be
    # filled in by a later step -- confirm.
    entities.setdefault(subj, 0)
    entities.setdefault(obj, 0)

  dataset.append({
      '_id': idx,
      'doc': [{'text': one_data['natural'], 'triples': triples, 'entity_idx': entities}],
  })


In [102]:
# Peek at the first nine converted records.
dataset[:9]

[{'_id': 0,
  'doc': [{'entity_idx': {'国际学生公寓旅舍': 0, '您的最佳落脚点': 0},
    'text': '国际学生公寓旅舍是您来纽约的最佳落脚点。',
    'triples': [['国际学生公寓旅舍', '您的最佳落脚点', '是']]}]},
 {'_id': 1,
  'doc': [{'entity_idx': {'布特的重要市场': 0, '非洲': 0},
    'text': '战乱频发的非洲是布特的重要市场。',
    'triples': [['非洲', '布特的重要市场', '是']]}]},
 {'_id': 2,
  'doc': [{'entity_idx': {'叶片': 0, '银纹夜蛾幼虫': 0},
    'text': '银纹夜蛾：幼虫咬食叶片，夏秋季发生。',
    'triples': [['银纹夜蛾幼虫', '叶片', '咬食']]}]},
 {'_id': 3,
  'doc': [{'entity_idx': {'微博': 0, '照片': 0, '解说文字': 0},
    'text': '微博内容不仅有哈士奇的照片，而且还配有生动的解说文字。',
    'triples': [['微博', '照片', '有'], ['微博', '解说文字', '配有']]}]},
 {'_id': 4,
  'doc': [{'entity_idx': {'一个难题': 0, '这': 0},
    'text': '这似乎将成为一个难题，不能给出同样的薪水却期望得到同样的人。',
    'triples': [['这', '一个难题', '似乎将成为']]}]},
 {'_id': 5,
  'doc': [{'entity_idx': {'揎': 0, '裂': 0, '襟袖': 0},
    'text': '襟袖结裂不可揎[4]，蹠破指伤流血般[5]。',
    'triples': [['襟袖', '裂', '结'], ['襟袖', '揎', '不可']]}]},
 {'_id': 6,
  'doc': [{'entity_idx': {'不发达国家': 0, '发展中国家': 0, '赞比亚': 0},
    'text': '赞比亚在

# Split dataset

In [103]:
# Number of records available for splitting.
len(dataset)

20204

In [108]:
import random

# Fixed seed so the shuffle, and therefore the train/test membership, is the
# same on every run.
SPLIT_SEED = 42
TRAIN_SIZE = 1300
TEST_SIZE = 300

# Shuffle a copy with a dedicated generator: `dataset` keeps its original order,
# so re-running this cell always yields the same split instead of reshuffling
# an already shuffled list.
shuffled = list(dataset)
random.Random(SPLIT_SEED).shuffle(shuffled)

# NOTE(review): this 80% cut point is computed but not used; the slices below
# take fixed-size subsets instead. Confirm which split is intended.
partition = int(0.8 * len(dataset))

train_data = shuffled[:TRAIN_SIZE]
test_data = shuffled[TRAIN_SIZE:TRAIN_SIZE + TEST_SIZE]

print(len(train_data), len(test_data))

1300 300


In [109]:
import json

def _write_jsonl(path, records):
  """Write `records` to `path` as JSON Lines: one JSON document per line.

  json.dump keeps its default ensure_ascii=True, so non-ASCII text is stored
  as \\uXXXX escapes.
  """
  with open(path, 'w', encoding='utf-8') as f:
    for record in records:
      json.dump(record, f)
      f.write('\n')


# The loop variable here was previously named `data`, which replaced the
# filtered corpus list of the same name with a single record. The helper keeps
# its loop variable local, so `data` stays intact for re-running earlier cells.
_write_jsonl('train_data.json', train_data)
_write_jsonl('test_data.json', test_data)