# 1. Get token example from item
> Note: this uses the general ('linear') tokenization method, which means formulas are not parsed.

In [1]:
# coding: utf-8
import json
from tqdm import tqdm
from EduNLP.SIF.segment import seg
from EduNLP.SIF.tokenization import tokenize
from EduNLP.Pretrain import GensimWordTokenizer

def load_items():
    """Yield the built-in demo question items one at a time.

    Every item is a dict with a single 'ques_content' key that holds the raw
    question text.
    """
    samples = (
        {'ques_content':'有公式$\\FormFigureID{wrong1?}$和公式$\\FormFigureBase64{wrong2?}$，如图$\\FigureID{088f15ea-8b7c-11eb-897e-b46bfc50aa29}$,若$x,y$满足约束条件$\\SIFSep$，则$z=x+7 y$的最大值为$\\SIFBlank$'},
        {'ques_content':'如图$\\FigureID{088f15ea-8b7c-11eb-897e-b46bfc50aa29}$,若$x,y$满足约束条件$\\SIFSep$，则$z=x+7 y$的最大值为$\\SIFBlank$'},
        {'ques_content':'<div>Below is a discussion on a website.<br><table border=\1'},
    )
    yield from samples
    # Alternatively, stream the items from your own JSON-lines file:
    # work_file_path = "../../../data/OpenLUNA.json"
    # with open(work_file_path, 'r', encoding="utf-8") as f:
    #     for line in f:
    #         yield json.loads(line)


# Build the tokenizer once, outside the loop: its configuration is the same
# for every item, so re-creating it per iteration is wasted work.
# symbol="gmas" transforms the 'g', 'm', 'a' and 's' content types into special
# marks, leaving text ('t') and formula ('f') as they are.
# general=True symbolizes formulas given in figure format and uses the 'linear'
# method for formula segmentation.
tokenizer = GensimWordTokenizer(symbol="gmas", general=True)

token_items = []
for item in tqdm(load_items(), "sifing"):
    token_item = tokenizer(item["ques_content"])
    # Keep only items for which the tokenizer returned a truthy result.
    if token_item:
        token_items.append(token_item.tokens)

token_items[0]

sifing: 3it [00:00,  6.16it/s]


['公式',
 '[FORMULA]',
 '公式',
 '[FORMULA]',
 '如图',
 '[FIGURE]',
 'x',
 ',',
 'y',
 '约束条件',
 '[SEP]',
 'z',
 '=',
 'x',
 '+',
 '7',
 'y',
 '最大值',
 '[MARK]']

In [2]:
# Number of token lists collected by the tokenization loop
# (one per item whose tokenizer result was truthy).
len(token_items)

3

# 2. Load Model and test item

In [7]:

from urllib.request import urlopen
import os,os.path
import zipfile


def down_file(subject):
    """Download the pretrained ``general_<subject>_256.zip`` archive.

    The archive is fetched from the model zoo URL and written to
    ``../../../data/d2v/zip/`` (relative to the current working directory)
    under the same file name as on the server.

    Parameters
    ----------
    subject : str
        Subject part of the archive name, e.g. "science".

    Notes
    -----
    Errors raised by ``urlopen`` or ``open`` propagate to the caller. Both the
    HTTP response and the output file are closed even when the transfer fails.
    """
    url = "http://base.ustc.edu.cn/data/model_zoo/EduNLP/d2v/general_" + subject + "_256.zip"
    file_name = "../../../data/d2v/zip/" + url.split('/')[-1]
    # Create the download directory up front; open() below would otherwise
    # fail on a fresh checkout where it does not exist yet.
    os.makedirs(os.path.dirname(file_name), exist_ok=True)

    block_sz = 8192
    with urlopen(url) as u, open(file_name, 'wb') as f:
        file_info = u.getheaders()
        print("[down file] file info : ", file_info)
        # Stream in fixed-size blocks so the archive is never held in memory
        # as a whole.
        while True:
            buffer = u.read(block_sz)
            if not buffer:
                break
            f.write(buffer)
    print("[down file] finish !")


def unzip_file(subject):
    """Extract ``general_<subject>_256.zip`` into the models directory.

    Reads ``../../../data/d2v/zip/general_<subject>_256.zip`` and writes every
    file member below ``../../../data/d2v/models/`` (both relative to the
    current working directory), creating directories as needed.

    Parameters
    ----------
    subject : str
        Subject part of the archive name, e.g. "science".

    Raises
    ------
    ValueError
        If a member's path would resolve outside the models directory.
    """
    zipfilename = "../../../data/d2v/zip/general_" + subject + "_256.zip"
    unziptodir = "../../../data/d2v/models/"
    print("[unzip file] start ...")
    # makedirs instead of mkdir: missing intermediate directories are created
    # too, and an already existing directory is not an error.
    os.makedirs(unziptodir, exist_ok=True)
    target_root = os.path.abspath(unziptodir)

    with zipfile.ZipFile(zipfilename) as zfobj:
        for info in zfobj.infolist():
            # Normalise Windows-style separators for the output path only; the
            # member itself is read through `info`, so lookup uses the name
            # exactly as stored in the archive.
            name = info.filename.replace('\\', '/')
            if name.endswith('/'):
                # Directory entry: nothing to write.
                continue
            ext_filename = os.path.join(unziptodir, name).replace('\\', '/')
            # Refuse members such as "../x" or absolute paths that would be
            # written outside the target directory.
            if os.path.commonpath([target_root, os.path.abspath(ext_filename)]) != target_root:
                raise ValueError("[unzip file] unsafe member path: " + info.filename)
            print("save ======> ", ext_filename)
            os.makedirs(os.path.dirname(ext_filename), exist_ok=True)
            with open(ext_filename, 'wb') as outfile:
                outfile.write(zfobj.read(info))
    print("[unzip file] finish !")

def getData(subject = "english"):
    """Download the pretrained archive for ``subject`` and unpack it.

    Values listed for ``subject``: english | liberal | science | all
    """
    # Step 1: fetch the zip archive.
    down_file(subject)
    # Step 2: extract the archive that was just downloaded.
    unzip_file(subject)


# Download and unpack the pretrained model files for the "science" subject.
# NOTE: the recorded output of this cell reports a Content-Length of
# 2035517115 bytes (about 2 GB), so expect this cell to take a while.
work_subject = "science"
getData(work_subject)

[down file] file info :  [('Server', 'nginx'), ('Date', 'Thu, 08 Jul 2021 14:05:55 GMT'), ('Content-Type', 'application/zip'), ('Content-Length', '2035517115'), ('Connection', 'close'), ('Last-Modified', 'Thu, 08 Jul 2021 13:24:26 GMT'), ('ETag', '"60e6fc8a-795386bb"'), ('Accept-Ranges', 'bytes')]
[down file] finish !
[unzip file] start ...
[unzip file] finish !


In [4]:
# Show the first token list, which is the input passed to the D2V model in the next cell.
# NOTE(review): the recorded output of this cell differs from the token_items[0]
# shown in the first cell, and the execution counts are out of order; restart the
# kernel and run all cells to refresh the outputs.
print(token_items[0])

['如图', '[FIGURE]', 'x', ',', 'y', '约束条件', '[SEP]', 'z', '=', 'x', '+', '7', 'y', '最大值', '[MARK]']


In [4]:
from EduNLP.Vector import D2V

# Load the pretrained model file extracted for this subject and call it on the
# first token list; the cell's last expression is displayed as its output.
work_subject = "science"
d2v = D2V(f"../../../data/d2v/models/general_{work_subject}_256/general_{work_subject}_256.bin")
d2v(token_items[0])

array([ 0.0650754 , -0.00764359,  0.00102682, -0.09162476,  0.05634515,
       -0.11749917, -0.1045712 , -0.05478571, -0.10104678,  0.06869555,
       -0.0279559 ,  0.01929354, -0.05042625,  0.1995408 ,  0.06944881,
       -0.07930482,  0.02208041, -0.09138293,  0.0048805 ,  0.12356721,
        0.17302142,  0.06938677, -0.04870617,  0.05332801,  0.04250436,
        0.08110414,  0.02373151,  0.03123289,  0.06333841,  0.04644187,
       -0.05552559,  0.03619028,  0.04306177, -0.15928595, -0.1739715 ,
       -0.09467822, -0.03689221,  0.10378218, -0.03774287,  0.01273248,
       -0.03909611,  0.04232696, -0.1317193 , -0.00897106, -0.03322024,
        0.10972358,  0.09367326,  0.10724379, -0.08290622, -0.11172097,
       -0.07732891,  0.00606115,  0.0490447 , -0.01932557,  0.04300616,
       -0.00384228,  0.05668037,  0.08008637, -0.11094984, -0.06255185,
        0.02104814, -0.0395325 , -0.06146542, -0.05361609,  0.07272391,
       -0.04243532, -0.16048154, -0.06604826, -0.12449021,  0.05