# d2v_general

## 1. Get token example from item
> Note: this uses the general ('linear') tokenization method, which means formulas are not parsed.

In [1]:
# coding: utf-8
import json
from tqdm import tqdm
from EduNLP.SIF.segment import seg
from EduNLP.SIF.tokenization import tokenize
from EduNLP.Pretrain import GensimWordTokenizer

def load_items():
    """Yield the built-in demo question items one at a time.

    Every yielded item is a dict with a single 'ques_content' key that holds
    the raw question text.
    """
    demo_contents = (
        '有公式$\\FormFigureID{wrong1?}$和公式$\\FormFigureBase64{wrong2?}$，如图$\\FigureID{088f15ea-8b7c-11eb-897e-b46bfc50aa29}$,若$x,y$满足约束条件$\\SIFSep$，则$z=x+7 y$的最大值为$\\SIFBlank$',
        '如图$\\FigureID{088f15ea-8b7c-11eb-897e-b46bfc50aa29}$,若$x,y$满足约束条件$\\SIFSep$，则$z=x+7 y$的最大值为$\\SIFBlank$',
        '<div>Below is a discussion on a website.<br><table border=\1',
    )
    yield from ({'ques_content': content} for content in demo_contents)
    # Alternatively, stream the items from your own JSON-lines file:
    # work_file_path = "../../../data/OpenLUNA.json"
    # with open(work_file_path, 'r', encoding="utf-8") as f:
    #     for line in f:
    #         yield json.loads(line)


# Build the tokenizer once and reuse it for every item; its arguments never change.
# symbol="gmas": content is replaced by special marks ('g','m','a','s'), except text ('t') and formula ('f').
# general=True: symbolize formulas given in figure format and use the 'linear' method for formula segmentation.
tokenizer = GensimWordTokenizer(symbol="gmas", general=True)

token_items = []
for item in tqdm(load_items(), "sifing"):
    token_item = tokenizer(item["ques_content"])
    # Keep only items for which the tokenizer returned a truthy result.
    if token_item:
        token_items.append(token_item.tokens)

# Display the tokens of the first item as the cell output.
token_items[0]

sifing: 3it [00:00,  5.07it/s]


['公式',
 '[FORMULA]',
 '公式',
 '[FORMULA]',
 '如图',
 '[FIGURE]',
 'x',
 ',',
 'y',
 '约束条件',
 '[SEP]',
 'z',
 '=',
 'x',
 '+',
 '7',
 'y',
 '最大值',
 '[MARK]']

In [2]:
len(token_items)

3

## 2. Load Model and test item

In [7]:

from urllib.request import urlopen
import os,os.path
import zipfile


def down_file(subject):
    """Download the pretrained ``general_<subject>_256.zip`` archive.

    The archive is fetched from the EduNLP model zoo URL and stored under
    ``../../../data/d2v/zip/`` (relative to the current working directory).
    That directory is created when it does not exist yet.

    Parameters
    ----------
    subject : str
        Subject part of the archive name.
    """
    url = "http://base.ustc.edu.cn/data/model_zoo/EduNLP/d2v/general_" + subject + "_256.zip"
    file_name = "../../../data/d2v/zip/" + url.split('/')[-1]
    # open() does not create missing parent directories, so make sure they exist.
    os.makedirs(os.path.dirname(file_name), exist_ok=True)

    block_sz = 8192
    # The context managers close the connection and the file even if the transfer fails midway.
    with urlopen(url) as u, open(file_name, 'wb') as f:
        file_info = u.getheaders()
        print("[down file] file info : ", file_info)
        # Copy in fixed-size chunks so the whole archive is never held in memory.
        while True:
            buffer = u.read(block_sz)
            if not buffer:
                break
            f.write(buffer)
    print("[down file] finish !")


def unzip_file(subject):
    """Extract ``general_<subject>_256.zip`` into ``../../../data/d2v/models/``.

    The archive is read from ``../../../data/d2v/zip/`` and every file entry
    is written below the models directory; intermediate directories are
    created as needed. Directory entries are skipped and backslashes in entry
    names are treated as path separators.

    Parameters
    ----------
    subject : str
        Subject part of the archive name.

    Raises
    ------
    ValueError
        If an entry would be written outside the models directory.
    """
    zipfilename = "../../../data/d2v/zip/general_" + subject + "_256.zip"
    unziptodir = "../../../data/d2v/models/"
    print("[unzip file] start ...")
    # makedirs also creates missing parents and tolerates an already existing directory.
    os.makedirs(unziptodir, exist_ok=True)
    root = os.path.abspath(unziptodir)
    with zipfile.ZipFile(zipfilename) as zfobj:
        for member in zfobj.namelist():
            name = member.replace('\\', '/')
            if name.endswith('/'):
                continue
            ext_filename = os.path.join(unziptodir, name).replace('\\', '/')
            # Refuse entries (e.g. "../x" or absolute paths) that would land outside the target directory.
            target = os.path.abspath(ext_filename)
            if os.path.commonpath([root, target]) != root:
                raise ValueError("unsafe path in archive: " + member)
            print("save ======> ", ext_filename)
            # Entries may be nested several levels deep, so create the whole directory chain.
            os.makedirs(os.path.dirname(ext_filename), exist_ok=True)
            with open(ext_filename, 'wb') as outfile:
                # Read by the stored entry name; the normalized name may not exist in the archive.
                outfile.write(zfobj.read(member))
    print("[unzip file] finish !")

def getData(subject = "english"):
    """Download the model archive for ``subject`` and then unpack it.

    subject: english | liberal | science | all
    """
    # Order matters: the archive has to be downloaded before it can be extracted.
    for step in (down_file, unzip_file):
        step(subject)


work_subject = "science"
# The archive is about 2 GB, so download and unpack it only when the extracted
# model file is not on disk yet. Delete that file to force a fresh download.
model_path = "../../../data/d2v/models/general_" + work_subject + "_256/general_" + work_subject + "_256.bin"
if not os.path.exists(model_path):
    getData(work_subject)

[down file] file info :  [('Server', 'nginx'), ('Date', 'Thu, 08 Jul 2021 14:05:55 GMT'), ('Content-Type', 'application/zip'), ('Content-Length', '2035517115'), ('Connection', 'close'), ('Last-Modified', 'Thu, 08 Jul 2021 13:24:26 GMT'), ('ETag', '"60e6fc8a-795386bb"'), ('Accept-Ranges', 'bytes')]
[down file] finish !
[unzip file] start ...
[unzip file] finish !


In [4]:
# Show the token list of the first item; the same list is later passed to d2v.
# NOTE(review): the saved output of this cell differs from the token_items[0]
# displayed earlier, so it appears to come from a different run. Confirm by
# re-running the notebook top to bottom.
print(token_items[0])

['如图', '[FIGURE]', 'x', ',', 'y', '约束条件', '[SEP]', 'z', '=', 'x', '+', '7', 'y', '最大值', '[MARK]']


In [2]:
from EduNLP.Vector import D2V
# Load the pretrained model file that belongs to the chosen subject.
work_subject = "science"
d2v = D2V("../../../data/d2v/models/general_{0}_256/general_{0}_256.bin".format(work_subject))
print(d2v.vector_size)
# Result of applying the model to the first tokenized item (shown as the cell output).
d2v(token_items[0])

256


array([ 6.68359101e-02, -6.85622962e-03,  1.71755534e-03, -9.45999995e-02,
        5.71297631e-02, -1.14749409e-01, -1.06426410e-01, -5.48244826e-02,
       -1.01055816e-01,  6.82074800e-02, -3.01527120e-02,  1.88328531e-02,
       -5.40650599e-02,  1.96987823e-01,  7.23450258e-02, -7.86591992e-02,
        2.52593309e-02, -8.93113762e-02,  5.15675824e-03,  1.25454620e-01,
        1.75611585e-01,  7.01171979e-02, -4.82840873e-02,  5.61073385e-02,
        4.38053571e-02,  8.21266770e-02,  2.25354582e-02,  2.86612101e-02,
        6.49044961e-02,  4.38563228e-02, -5.53747378e-02,  3.68891433e-02,
        4.41701710e-02, -1.57279179e-01, -1.71185300e-01, -9.53545198e-02,
       -3.68149281e-02,  1.03217609e-01, -4.01013494e-02,  1.34829208e-02,
       -3.90383117e-02,  4.31797989e-02, -1.31486431e-01, -6.81887381e-03,
       -3.09619904e-02,  1.09645449e-01,  9.19818357e-02,  1.05142176e-01,
       -8.25446919e-02, -1.10780641e-01, -7.99699128e-02,  4.87378612e-03,
        5.09812087e-02, -