In [1]:
import pandas as pd
import json

In [2]:
converters = {'json': json.loads}
train = pd.read_csv('../data/train_9k_valid.csv', index_col=0, converters=converters).sample(frac=1, random_state=42)
val_set = pd.read_csv('../data/val_set_300_sb_valid.csv', index_col=0, converters=converters).sample(frac=1, random_state=42)
manual_test = pd.read_csv('../data/manual_test_100.csv', index_col=0, converters=converters).sample(frac=1, random_state=42)

In [28]:
import re
import json

class SepTokenJSONProcessor:
  spec_tokens = ["<B>", "<T>", "<P>", "<C1>", "<C2>"]
  def process_json(self, json):
    return ''.join([f"<B><T>{d['Title']}<P>{d['Price']}<C1>{d['Count']}<C2>{d['Currency']}" for d in json])

  def unprocess_json(self, s):
    json = []
    for t in s.split('<B>')[1:]:
      try:
        json.append({
          'Title': re.findall(r'<T>(.*?)(<(B|P|C1|C2)>|$)', t)[0][0],
          'Price': re.findall(r'<P>(.*?)(<(T|B|C1|C2)>|$)', t)[0][0],
          'Count': re.findall(r'<C1>(.*?)(<(T|B|P|C2)>|$)', t)[0][0],
          'Currency': re.findall(r'<C2>(.*?)(<(T|B|P|C1)>|$)', t)[0][0]
        })
      except Exception as e:
        print(t)
        raise e
    return json

In [29]:
proc = SepTokenJSONProcessor()
assert train.loc[train.index[0], 'json'] == proc.unprocess_json(proc.process_json(train.loc[train.index[0], 'json']))

In [31]:
s = '<B><T> столик-подставка<P> 50<C1> 1<C2> лари<B><T> столик-подставка<P> 50<C1> 1<C2> лари'
proc.unprocess_json(s)

[{'Title': ' столик-подставка',
  'Price': ' 50',
  'Count': ' 1',
  'Currency': ' лари'},
 {'Title': ' столик-подставка',
  'Price': ' 50',
  'Count': ' 1',
  'Currency': ' лари'}]

In [13]:
import re
import json

class TextJSONProcessor:
  spec_tokens = []
  def process_json(self, json):
    return 'Продается: ' + ';'.join([f"{d['Title']} в количестве {d['Count']}, цена {d['Price']} ({d['Currency']})" for d in json])

  def unprocess_json(self, s):
    json = []
    if s.startswith('Продается: '):
      s = s[11:]
    for t in s.split(';'):
      try:
        json.append({
          'Title': re.findall(r'(.*?) в количестве ', t)[0],
          'Count': re.findall(r' в количестве (.*?), цена ', t)[0],
          'Price': re.findall(r', цена (.*?) \(', t)[0],
          'Currency': re.findall(r', цена [^()]*\((.*?)\)(;|$)', t)[0][0]
        })
      except Exception as e:
        print(t)
        raise e
    return json

In [14]:
proc = TextJSONProcessor()
assert train.loc[train.index[0], 'json'] == proc.unprocess_json(proc.process_json(train.loc[train.index[0], 'json']))

In [46]:
s = proc.process_json(train.loc[train.index[0], 'json'])

In [49]:
re.findall(r', цена .*\((.*?)\)', s.split('\n')[1])

['RUB']

In [48]:
s.split('\n')[1]

'Свёкла кормовая в количестве 1 мешок, цена 250 (RUB)'