In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys
sys.path.append('/home/odimka/p/crossword')

In [152]:
import os
import pickle
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import cv2

import cv
import ocr
import parsing
import base
import qdb
import word_embeddings
import sentence_embeddings
import scoring
import eval

In [4]:
DATA_DIR = '/home/odimka/p/crossword/data'

In [147]:
#@title Load questions

# img_path = '/home/odimka/d/puzzles/images/1.jpeg'
# img_path = '/home/odimka/d/puzzles/images/51475.png'
# QUESTIONS = ocr.recognize(img_path)

html_path = os.path.join(DATA_DIR,'crosswords', '51475.html')
QUESTIONS = parsing.parse(html_path)
# len(QUESTIONS)

QUESTION2ID = {q.text: i for i, q in enumerate(QUESTIONS)}

In [148]:
#@title Load data

QDF = qdb.get_questions_db(os.path.join(DATA_DIR, 'baza.csv'))

GRAPH = base.get_intersection_graph(QUESTIONS)

VOCAB = list(map(lambda x: x.strip().lower(),
                 open(os.path.join(DATA_DIR, 'hagen_nouns.txt')).readlines()))
VOCAB = [w for w in VOCAB if '-' not in w]
VOCAB = list(set(VOCAB) | set(QDF['answer']))
WORD2ID = {w: i for i, w in enumerate(VOCAB)}

print('# Questions:', len(QUESTIONS))
print('# Intersections:', sum(map(len, GRAPH.values())) / 2)
print('# Words:', len(VOCAB))
print('# Questions DB:', len(QDF))

# Questions: 48
# Intersections: 80.0
# Words: 111026
# Questions DB: 137147


In [168]:
#@title BERT based scoring fn

QUESTION_BERT_EMBEDS = sentence_embeddings.get_bert_embeds([q.text for q in QUESTIONS])

QDF_EMBEDS = pickle.load(open(os.path.join(DATA_DIR, 'qdf_embeds.pkl'), 'rb'))
DB_BERT_EMBEDS = np.array([QDF_EMBEDS[x] for x in QDF['question']])
DB_ANSWERS = list(QDF['answer'])
BERT_DB_SIMILARITY = cosine_similarity(QUESTION_BERT_EMBEDS, DB_BERT_EMBEDS)

def get_db_bert_similarity_scoring_fn(words, q):
  return scoring.get_db_similarity_scoring_fn(
      QUESTION2ID, DB_ANSWERS, BERT_DB_SIMILARITY, words, q)
  
def get_scoring_fn(words, q):
  word_scores = word_embeddings.get_rusvectores_emb_scoring_fn(words, q)
  db_scores = get_db_bert_similarity_scoring_fn(words, q)
  return scoring.demote_question_words(words, q, np.fmax(word_scores, db_scores))

In [169]:
#@title Scoring

from collections import defaultdict

VOCAB_INDICES_BY_LENGTH = defaultdict(list)
VOCAB_TO_VOCAB_BY_LENGTH = []
for i, w in enumerate(VOCAB):
  VOCAB_TO_VOCAB_BY_LENGTH.append(len(VOCAB_INDICES_BY_LENGTH[len(w)]))
  VOCAB_INDICES_BY_LENGTH[len(w)].append(i)


class Scorer:
  def __init__(self, score_fn):
    self.scores = []
    for q in QUESTIONS:
      words = [VOCAB[i] for i in VOCAB_INDICES_BY_LENGTH[q.length]]
      self.scores.append(score_fn(words, q))

  def score(self, q_index, w_index):
    return self.scores[q_index][VOCAB_TO_VOCAB_BY_LENGTH[w_index]]


class ActiveVocab:
  def __init__(self, q_index, scored_word_indices):
    self.q_index = q_index
    self.scored_word_indices = scored_word_indices
    self.scored_word_indices_by_pos = {}
    for _, pos, _ in GRAPH[self.q_index]:
      if pos not in self.scored_word_indices_by_pos:
        self.scored_word_indices_by_pos[pos] = defaultdict(list)
      for score, word_index in self.scored_word_indices:
        self.scored_word_indices_by_pos[pos][VOCAB[word_index][pos]].append((score, word_index))
    
  def restrict(self, pos, char):
    return ActiveVocab(self.q_index, self.scored_word_indices_by_pos[pos][char])

  def max_restricted_score(self, pos, char):
    t = self.scored_word_indices_by_pos[pos][char]
    if not t:
      return None
    else:
      return t[0][0]
  
  def max_score(self):
    return self.scored_word_indices[0][0]
  
  @classmethod
  def create(self, scorer, q_index):
    length = QUESTIONS[q_index].length
    scored_word_indices = [(scorer.score(q_index, w_index), w_index)
                           for w_index in VOCAB_INDICES_BY_LENGTH[length]]
    scored_word_indices = sorted(scored_word_indices, reverse=True)
    return ActiveVocab(q_index, scored_word_indices)

scorer = Scorer(get_scoring_fn)
ACTIVE_VOCABS = [ActiveVocab.create(scorer, i) for i in range(len(QUESTIONS))]

In [24]:
#@title Export

def export(prefix, vocab, questions, active_vocabs, graph):
  if not os.path.exists(prefix):
    os.makedirs(prefix)
  
  open(os.path.join(prefix, 'vocab.txt'), 'w').write('\n'.join(vocab))

  open(os.path.join(prefix, 'answers.txt'), 'w').write('\n'.join(
      [str(q.length) for q in questions]))

  with open(os.path.join(prefix, 'scores.txt'), 'w') as out:
    for v in active_vocabs:
      for score, word_index in v.scored_word_indices:
        out.write('%s %s %s\n' % (v.q_index, word_index, score))

  with open(os.path.join(prefix, 'graph.txt'), 'w') as out:
    for src, edges in graph.items():
      for (dst, src_pos, dst_pos) in edges:
        out.write('%d %d %d %d\n' % (src, dst, src_pos, dst_pos))

export(os.path.join(DATA_DIR, 'f'), VOCAB, QUESTIONS, ACTIVE_VOCABS, GRAPH)

In [44]:
import subprocess

SOLVER_BINARY = '/home/odimka/p/crossword/solver'

args = (
    SOLVER_BINARY,
    f'{DATA_DIR}/f/vocab.txt',
    f'{DATA_DIR}/f/answers.txt',
    f'{DATA_DIR}/f/scores.txt',
    f'{DATA_DIR}/f/graph.txt',
    f'{DATA_DIR}/f/res.txt'
)
popen = subprocess.Popen(args, stdout=subprocess.PIPE)
popen.wait()
output = popen.stdout.read()
print(output)

b'Vocab size: 111026\nNum questions: 52\nNum edges: 82\n17.5849\n17.3817\n17.2281\n17.2004\n******* \xd0\xbf\xd1\x80\xd1\x8f\xd0\xb4\xd0\xb5\xd0\xb2\xd0\xbe\n******* \xd1\x8d\xd1\x81\xd1\x82\xd1\x80\xd0\xb0\xd0\xb4\xd0\xb0\n********* \xd0\xb3\xd0\xb8\xd0\xbf\xd0\xbd\xd0\xbe\xd1\x82\xd0\xb8\xd0\xb7\xd0\xbc\n**** \xd1\x83\xd1\x88\xd0\xba\xd0\xbe\n**** \xd0\xb8\xd0\xb7\xd0\xb1\xd0\xb0\n***** \xd1\x88\xd0\xb0\xd1\x81\xd1\x81\xd0\xb8\n**** \xd0\xb1\xd1\x80\xd0\xb0\xd0\xba\n****** \xd1\x81\xd0\xb8\xd0\xb5\xd1\x81\xd1\x82\xd0\xb0\n***** \xd0\xb0\xd0\xb3\xd0\xb5\xd0\xbd\xd1\x82\n***** \xd1\x80\xd1\x83\xd0\xb8\xd0\xbd\xd0\xb0\n***** \xd1\x88\xd0\xb0\xd0\xba\xd0\xb0\xd0\xbb\n***** \xd0\xb0\xd1\x84\xd0\xb8\xd1\x88\xd0\xb0\n**** \xd0\xb7\xd0\xb0\xd0\xbb\xd0\xbf\n****** \xd0\xba\xd1\x80\xd0\xbe\xd0\xba\xd1\x83\xd1\x81\n**** \xd0\xb0\xd0\xbb\xd0\xbb\xd0\xbe\n**** \xd0\xbc\xd0\xbe\xd0\xb0\xd0\xb8\n****** \xd0\xb1\xd0\xb5\xd1\x85\xd0\xb0\xd0\xb9\xd0\xbc\n**** \xd1\x8f\xd0\xb4\xd1\x80\xd0\xbe\n***** \x

In [35]:
import copy

def read_answers(filepath):
  answers = []
  with open(filepath) as f:
    for line in f.readlines():
      answers.append(line.strip())
  return answers

answers = read_answers(os.path.join(DATA_DIR, 'f', 'res.txt'))
for q, a in zip(QUESTIONS, answers):
  q.answer = a
questions = copy.deepcopy(QUESTIONS)
questions = sorted(questions, key=lambda q: (q.d, q.number))
answers = [q.answer for q in questions]

correct_answers = read_answers(os.path.join(DATA_DIR, '1.answers'))
assert len(answers) == len(correct_answers)

num_correct = 0
for a, correct_a in zip(answers, correct_answers):
  if a == correct_a:
    num_correct += 1
  else:
    print(a, correct_a)

print('Acc: %f (%d out of %d)' % (num_correct / len(answers), num_correct, len(answers)))

прядево кружево
брак шлак
шакал тариф
алло реле
бехайм битком
иафет индюк
мормо пуаро
сельджуки ессентуки
статика стрекач
оранжад гранула
гоброн горлан
атлас анфас
аббе асаи
парагенезис приключение
урэр убор
эльдорадо бладхаунд
обод опор
камелек капуста
радда марал
Acc: 0.634615 (33 out of 52)


In [47]:
os.environ['GOOGLE_APPLICATION_CREDENTIALS']

'/home/odimka/key.json'

In [41]:
def show_questions_grid(questions):
  n = 0
  m = 0
  for q in questions:
    if q.d == 1:
      n = max(n, q.i + q.length)
    else:
      m = max(m, q.j + q.length)
  
  board = [['#'] * m for _ in range(n)]
  for q in questions:
    i = q.i
    j = q.j
    for k in range(q.length):
      board[i][j] = q.answer[k]
      k += 1
      if q.d == 0:
        j += 1
      else:
        i += 1
  return '\n'.join([' '.join(row) for row in board])

print(show_questions_grid(QUESTIONS))

п р я д е в о # э с т р а д а
# и # # # ы # п # т # # # и #
# к # г и п н о т и з м # а #
у ш к о # у # я # л # и з б а
# а # б # ш а с с и # н # е #
# # б р а к # н # с и е с т а
г # # о # а г е н т # р # # т
р у и н а # # н # # ш а к а л
у # # # # а ф и ш а # л # # а
з а л п # п # е # к р о к у с
# б # а л л о # # к # г # р #
# б # р # о # # м о а и # э #
б е х а й м # э # р # я д р о
о # # г # б о л и д # # # # б
и а ф е т # # ь # # м а р с о
н # # н # к а д е т # ф # # д
г а л е р а # о # р и г а # #
# р # з # м о р м о # а # р #
л о р и # е # а # т # н р а в
# м # с е л ь д ж у к и # д #
# а # # # е # о # а # # # д #
с т а т и к а # о р а н ж а д


## Scratch

In [83]:
#@title display_image_in_actual_size

from matplotlib.pyplot import imshow
%matplotlib inline

import matplotlib as mpl
import matplotlib.pyplot as plt


def display_image(im_data):
  dpi = mpl.rcParams['figure.dpi']
  height, width = im_data.shape

  # What size does the figure need to be in inches to fit the image?
  figsize = width / float(dpi), height / float(dpi)

  # Create a figure of the right size with one axes that takes up the full figure
  fig = plt.figure(figsize=figsize)
  ax = fig.add_axes([0, 0, 1, 1])

  # Hide spines, ticks, etc.
  ax.axis('off')

  # Display the image.
  ax.imshow(im_data, cmap='gray')

  plt.show()

# display_image_in_actual_size(questions_img)

In [None]:
questions1 = sorted(questions1, key=lambda q: (q.i, q.j, q.d))

print(len(questions))
print(len(questions1))
assert len(questions) == len(questions1)

for q1, q2 in zip(questions, questions1):
  if q1.text != q2.text:
    print('%s---%s' % (q1, q2))

In [304]:
img = questions_img.copy()
for b in boxes:
  img = cv2.rectangle(
    img,
    (b.top_left.x, b.top_left.y),
    (b.bottom_right.x, b.bottom_right.y),
    (0,255,0), 3)

img = cv2.rectangle(
    img,
    (int(thresholds[2]), 0),
    (int(thresholds[2]) + 5, 3000),
    (0,255,0),3)

display_image(img)