In [1]:
import pandas as pd
import logging
import io
from itertools import tee
from collections.abc import Iterator

https://radimrehurek.com/gensim/models/poincare.html

In [2]:
pd.set_option("plotting.backend", "plotly")

In [3]:
import docx2txt

In [4]:
text = docx2txt.process("Rus_Passages_Fixed_01.08.docx")

In [10]:
prefix = [
    "format",
    "query:",
    "context:",
    "answer:",
]

In [17]:
def chunkify(f, chunksize=10_000_000, sep="\n"):
    """
    Read a file separating its content lazily.

    Usage:

    >>> with open('INPUT.TXT') as f:
    >>>     for item in chunkify(f):
    >>>         process(item)
    """
    chunk = None
    remainder = None  # data from the previous chunk.
    while chunk != "":
        chunk = f.read(chunksize)
        if remainder:
            piece = remainder + chunk
        else:
            piece = chunk
        pos = None
        while pos is None or pos >= 0:
            pos = piece.find(sep)
            if pos >= 0:
                if pos > 0:
                    yield piece[:pos]
                piece = piece[pos + 1 :]
                remainder = None
            else:
                remainder = piece
    if remainder:  # This statement will be executed iff @remainder != ''
        yield remainder

In [40]:
class NIterator(Iterator):

    def __init__(self, it):
        self.it = iter(it)
        self._is_next = None
        self._next = None
    
    def has_next(self) -> bool:
        if self._is_next is None:
            try:
                self._next = next(self.it)
            except:
                self._is_next = False
            else:
                self._is_next = True
        return self._is_next
    
    def __iter__(self):
        return self
    
    def __next__(self):
        return self.next()
    
    def next(self):
        if self._is_next :
            response = self._next
        else:
            response = next(self.it)
        self._is_next = None
        return response

In [57]:
next(iter({"a": 1, "b":2, "c": 3}))

'a'

In [None]:
[
    ("title:", "title:")
    ("format:", "format:"),
    ("query:", "query:"),
    ("")
]

In [108]:
import abc
from typing import List
import collections

class Runnable(abc.ABC):

    @abc.abstractmethod
    def run(self, x, **kwargs):
        pass

    @abc.abstractmethod
    def next(self, x):
        pass

class Action:
    def __init__(self, action=None):
        self.action = action
    def __str__(self): return self.action
    def __repr__(self): return self.action
    def __cmp__(self, other):
        return cmp(self.action, other.action)
    # Necessary when __cmp__ or __eq__ is defined
    # in order to make this class usable as a
    # dictionary key:
    def __hash__(self):
        return hash(self.action)
    
    def __eq__(self, other):
        """Overrides the default implementation"""
        if isinstance(other, Action):
            return self.action == other.action
        return NotImplemented

Action.title = Action("title:") # title -> format
Action.format = Action("format:") # format -> query
Action.query = Action("query:") # query -> query | query -> context
Action.context = Action("context:") # context -> context | context -> answer
Action.answer = Action("answer:") # answer -> answer | answer -> title | answer -> format
Action.fill = Action("fill:") #


class IState(Runnable):

    def __init__(self, name: str, transitions = None, data=None):
        super(IState, self).__init__()
        self.name = name
        self.g = collections.defaultdict(set) # Action (aka Input) -> new IState
        # transitions is a list of pairs (input, next->state)
        # Every possible state from current
        # Note, that there can be 1...i  input(s) acceptable
        # Transition can also happen to 1..j new_state(s) 
        if isinstance(transitions, collections.Iterable):
            for observe in transitions:
                for next_state in transitions[source]:
                    self.g[observe].add(next_state)
    
    def next(self, obs, data=None):
        # Can we yield each state here sequentially?
        # TODO:
        # How to props the prev data sequentially?
        if self.g.has_key(obs):
            return self.g[obs]
        raise ValueError(f"Got unexpected {str(obs)} action being in {self.name} state")

    def process(self, x, **kwargs):
        print(f"processing (aka running) {self.name}")

class DState(IState):

    def __init__(self, name: str, transitions= None, fn=None, fn_check=None):
        super(DState, self).__init__(name, transitions)
        self.store = collections.deque()
    
    # How to add check and report on done
    
    def run(self, x, **kwargs):
        self.store

class TitleState(IState):
    pass

class FormatState(IState):
    pass

class QueryState(StoringState):
    pass

class ContextState(StoringState):
    pass

class AnswerState(StoringState):
    pass

class StateMachine:

    def __init__(self, states, start_state: str, it):
        self.states = states
        self.state = start_state
        self.it = it

    def move(self, action: str, data=None):
        return self.state.next(Action(action))

    # Template method:
    def start(self):
        #
        while self.it.has_next():
            item = self.it.next().strip()
            # <TODO:>
            # fastnumbers try to convert and if it is simply a number => continue
            if item.lower() in ("extractive", "abstractive"):
                item = "format:" + item
            # splitting
            # TODO: join with the action object here
            try:
                action, ctx = item.split(":", maxsplit=1)
                action = action.strip().lower()
            except:
                # ...
                action = "fill:"
                self.buffer.append(item)
            else:
                self.state = self.move(action, data=ctx)
            
            
        # for i in inputs:
        #     self.currentState = self.currentState.next(i)
        #     self.currentState.run()

In [114]:
"query: какие правила голодных игр".split(":", maxsplit=1)

['query', ' какие правила голодных игр: freear?']

In [109]:
del d
d = collections.defaultdict(set)

In [110]:
d[Action("title:")].add(StoringState("format:"))
d[Action("format:")].add(StoringState("query:"))

In [112]:
d.get(Action("title:"))

{<__main__.StoringState at 0x7f81c9049450>}

In [111]:
list(d.keys())[0]

title:

In [101]:
d.get(Action("title:"))

In [61]:
mapped = map(
    Action,
    [
        "title: ",
        "format: ",
        "query: "
    ]
)

In [71]:
test = {
    "query:": "action on query"
}

In [None]:
test.has_key()

In [70]:
Action.query

<__main__.Action at 0x7f81c94b3510>

In [None]:
IState("title", transitions={
    "", IState()
})

In [35]:
import collections

class AIProcessor:

    prefix = [
        "title:",
        "format:",
        "query:",
        "context:",
        "answer:",
    ]

    book_prefix = "title:"

    def __init__(self):
        self._command = None
        self._store = None

    def _on_prefix(self, s: str):
        return any([chunk.lower().startswith(_) for _ in self.prefix])

    def process(self, chunk, p=None):
        chunk = chunk.strip()

        if any([chunk.lower().startswith(_) for _ in self.prefix]):
            self._command = chunk.split(":", maxsplit=1)[0]
        if self._command:
            _next, done = None, False
            _current = tee(p)
            while current.has_next() and not done:
                _next = p.next().strip()
                if self._on_prefix(_next):
                    done = True
                

        

SyntaxError: invalid syntax (42994340.py, line 15)

In [None]:
processor = AIProcessor()

In [41]:
ptr = NIterator(chunkify(io.StringIO(text)))

In [42]:
ptr.next()

'Title: Голодные игры - Сьюзен Коллинз'

In [48]:
ptr, current = tee(ptr)

In [54]:
next(ptr)

'abstractive'

In [55]:
next(current)

'query: Как выглядел кот Лютик?'

In [30]:
# with io.StringIO(text) as p:
#     for chunk in chunkify(p):
#         chunk = chunk.strip()

In [56]:
ptr.next()

AttributeError: 'itertools._tee' object has no attribute 'next'