# 1. Get token example from item
> Note: this example uses the general ('linear') tokenization method, which means formulas are not parsed.

In [1]:
# coding: utf-8
import json
from tqdm import tqdm
from EduNLP.SIF.segment import seg
from EduNLP.SIF.tokenization import tokenize


def load_items():
    """Yield the demo question records one at a time.

    Every record is a dict with a single ``'ques_content'`` key holding the
    raw item text. To feed your own data instead, replace the body with a
    JSON-lines reader such as::

        work_file_path = "../../../data/OpenLUNA.json"
        with open(work_file_path, 'r', encoding="utf-8") as f:
            for line in f:
                yield json.loads(line)
    """
    # Item mixing a figure reference, formulas, a separator and a blank.
    figure_question = {
        'ques_content': '如图$\\FigureID{088f15ea-8b7c-11eb-897e-b46bfc50aa29}$,若$x,y$满足约束条件$\\SIFSep$，则$z=x+7 y$的最大值为$\\SIFBlank$',
    }
    # Item made of raw HTML markup.
    html_question = {
        'ques_content': '<div>Below is a discussion on a website.<br><table border=\1',
    }
    yield from (figure_question, html_question)


# Collect one token list per question item.
token_items = []
for item in tqdm(load_items(), "sifing"):
    # Segment the raw content. symbol="gmas" asks seg to turn the 'g', 'm', 'a' and 's'
    # segment types into special marks, leaving text ('t') and formula ('f') segments as they are.
    seg_ret = seg(item["ques_content"], symbol="gmas")

    # Tokenize formulas with the 'linear' method instead of 'ast'.
    tokenization_params = {
        "formula_params": {
            "method": "linear",
        }
    }
    token_item = tokenize(seg_ret, **tokenization_params)

    # Keep only items for which tokenize returned a truthy result.
    if token_item:
        token_items.append(token_item.tokens)

# Show the first item's tokens as the cell output.
token_items[0]

sifing: 2it [00:00,  3.89it/s]


['如图',
 '[FIGURE]',
 'x',
 ',',
 'y',
 '约束条件',
 '[SEP]',
 'z',
 '=',
 'x',
 '+',
 '7',
 'y',
 '最大值',
 '[MARK]']

In [2]:
# Number of items that produced a token list.
len(token_items)

2

# 2. Load Model and test item

In [7]:

from urllib.request import urlopen
import os,os.path
import zipfile


def down_file(subject):
    """Download the pretrained ``general_<subject>_256`` d2v archive.

    The archive is fetched over HTTP and stored as
    ``../../../data/d2v/zip/general_<subject>_256.zip`` (relative to the
    current working directory). The target directory is created if it does
    not exist yet. The response headers are printed before the transfer.

    Parameters
    ----------
    subject : str
        Subject tag used to build the archive name, e.g. ``"science"``.

    Raises
    ------
    urllib.error.URLError
        If the URL cannot be opened.
    OSError
        If the target directory or file cannot be created or written.
    """
    url = "http://base.ustc.edu.cn/data/model_zoo/EduNLP/d2v/general_" + subject + "_256.zip"
    zip_dir = "../../../data/d2v/zip/"
    file_name = zip_dir + url.split('/')[-1]
    block_sz = 8192

    # Context managers close the HTTP response and the output file even when
    # the transfer is interrupted half-way.
    with urlopen(url) as u:
        file_info = u.getheaders()
        print("[down file] file info : ", file_info)

        # open() does not create missing directories, so make sure the
        # target folder exists (a fresh checkout has no data/d2v/zip yet).
        os.makedirs(zip_dir, exist_ok=True)
        with open(file_name, 'wb') as f:
            # Stream in fixed-size blocks; the archive is far too large to
            # hold in memory at once.
            while True:
                buffer = u.read(block_sz)
                if not buffer:
                    break
                f.write(buffer)
    print("[down file] finish !")


def unzip_file(subject):
    """Extract ``general_<subject>_256.zip`` into the d2v models directory.

    Reads ``../../../data/d2v/zip/general_<subject>_256.zip`` and writes every
    file member below ``../../../data/d2v/models/`` (both relative to the
    current working directory), creating intermediate directories as needed.
    Directory entries are skipped; backslashes in member names are treated as
    path separators. Each extracted path is printed.

    Parameters
    ----------
    subject : str
        Subject tag used to build the archive name, e.g. ``"science"``.

    Raises
    ------
    FileNotFoundError
        If the archive does not exist.
    zipfile.BadZipFile
        If the file is not a valid zip archive.
    ValueError
        If a member would be written outside the models directory.
    """
    import shutil  # local import: only this function needs it

    zipfilename = "../../../data/d2v/zip/general_" + subject + "_256.zip"
    unziptodir = "../../../data/d2v/models/"
    print("[unzip file] start ...")

    # makedirs creates missing parents and tolerates an existing directory.
    os.makedirs(unziptodir, exist_ok=True)
    dest_root = os.path.abspath(unziptodir)

    with zipfile.ZipFile(zipfilename) as zfobj:
        for info in zfobj.infolist():
            # Normalise only the on-disk path; the member itself is read via
            # its ZipInfo so names stored with backslashes are still found.
            name = info.filename.replace('\\', '/')
            if name.endswith('/'):
                continue
            ext_filename = os.path.join(unziptodir, name)
            ext_filename = ext_filename.replace('\\', '/')

            # The archive comes from the network: refuse members such as
            # "../x" or absolute paths that would land outside the target.
            if os.path.commonpath([dest_root, os.path.abspath(ext_filename)]) != dest_root:
                raise ValueError("unsafe path in archive: " + info.filename)

            print("save ======> ", ext_filename)
            os.makedirs(os.path.dirname(ext_filename), exist_ok=True)
            # Stream the member to disk instead of loading it into memory.
            with zfobj.open(info) as src, open(ext_filename, 'wb') as outfile:
                shutil.copyfileobj(src, outfile)
    print("[unzip file] finish !")

def getData(subject = "english"):
  """Download and unpack the pretrained d2v archive for ``subject``.

  Calls ``down_file(subject)`` and then ``unzip_file(subject)``.

  subject: one of "english", "liberal", "science" or "all" (default "english").
  """
  down_file(subject)
  unzip_file(subject)


# Subject whose pretrained model is fetched here and loaded further down.
# NOTE: this downloads the whole archive; the recorded run reports a
# Content-Length of 2035517115 bytes (roughly 2 GB).
work_subject = "science"
getData(work_subject)

[down file] file info :  [('Server', 'nginx'), ('Date', 'Thu, 08 Jul 2021 14:05:55 GMT'), ('Content-Type', 'application/zip'), ('Content-Length', '2035517115'), ('Connection', 'close'), ('Last-Modified', 'Thu, 08 Jul 2021 13:24:26 GMT'), ('ETag', '"60e6fc8a-795386bb"'), ('Accept-Ranges', 'bytes')]
[down file] finish !
[unzip file] start ...
[unzip file] finish !


In [4]:
# Print the first item's tokens; the same list is passed to the d2v model in the next cell.
print(token_items[0])

['如图', '[FIGURE]', 'x', ',', 'y', '约束条件', '[SEP]', 'z', '=', 'x', '+', '7', 'y', '最大值', '[MARK]']


In [8]:
from EduNLP.Vector import D2V

# Load the pretrained model for work_subject from the models directory
# (the .bin file is expected to have been extracted there by getData — verify the path if loading fails).
d2v = D2V("../../../data/d2v/models/general_" + work_subject +"_256/general_" + work_subject + "_256.bin")
# Calling the model on a token list produces the item's vector, shown as the cell output.
d2v(token_items[0])

array([ 1.12057403e-01,  2.52121985e-02, -1.84135586e-02, -1.93015739e-01,
        7.21611977e-02, -2.33779430e-01, -1.64141074e-01, -6.87493235e-02,
       -1.90396085e-01,  8.75402018e-02,  2.38416390e-03,  6.11983947e-02,
       -6.36335239e-02,  1.64499998e-01,  1.03401735e-01, -1.27245992e-01,
        4.49625924e-02, -1.13446854e-01, -2.65566111e-02,  7.19655454e-02,
        2.18087152e-01, -5.27603514e-02, -6.00272790e-02, -3.24151537e-04,
        4.26347973e-03,  6.99777529e-02,  7.29825273e-02,  4.88277040e-02,
        9.19435546e-02,  8.59248787e-02, -6.78287968e-02,  9.07242820e-02,
        3.56323123e-02, -8.70092660e-02, -1.92239523e-01, -8.89661834e-02,
       -1.03428471e-03,  1.70044407e-01, -4.78864461e-02,  3.79877910e-02,
        1.59824491e-02, -5.50671928e-02, -2.07250416e-01, -3.63758020e-02,
       -3.72698829e-02,  1.72938913e-01,  1.26007214e-01,  1.48316160e-01,
       -1.35055989e-01, -1.00548312e-01, -6.66367710e-02,  4.60145958e-02,
       -1.65324789e-02, -