In [1]:
# Solving redactle puzzle using only BERT predictions on a sentence level

In [2]:
%%bash
pip install tqdm boto3 requests regex sentencepiece sacremoses 
pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting boto3
  Downloading boto3-1.26.21-py3-none-any.whl (132 kB)
Collecting sentencepiece
  Downloading sentencepiece-0.1.97-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
Collecting sacremoses
  Downloading sacremoses-0.0.53.tar.gz (880 kB)
Collecting s3transfer<0.7.0,>=0.6.0
  Downloading s3transfer-0.6.0-py3-none-any.whl (79 kB)
Collecting jmespath<2.0.0,>=0.7.1
  Downloading jmespath-1.0.1-py3-none-any.whl (20 kB)
Collecting boto3
  Downloading boto3-1.26.20-py3-none-any.whl (132 kB)
Collecting botocore<1.30.0,>=1.29.20
  Downloading botocore-1.29.20-py3-none-any.whl (10.2 MB)
Collecting urllib3<1.27,>=1.25.4
  Downloading urllib3-1.26.13-py2.py3-none-any.whl (140 kB)
  Downloading urllib3-1.25.11-py2.py3-none-any.whl (127 kB)
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py): started
  Building wheel for sacremoses 

In [3]:
import re
from transformers import BertTokenizer, BertForMaskedLM
import torch

In [4]:
# Redactle stopwords found here: https://github.com/benjamin-brady/redactle/blob/main/src/routes/index.svelte
stopwords = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't"]

In [5]:
# Tokenization on articles by whitespace or non alpha-numeric (Keeps non-alpha numeric tokens)
def tokenize(article, keep_space = True):
  split = ''
  # Keeps all punctuation to leave the article untouched
  if keep_space:
    split = re.split('(\W+)',article)
  # Splits on punctuation and removes it as well (may change what the article looks like)
  else:
    split = re.split('\W+',article)
  return split

# Tokenization to sentences to make use of bert sentence prediction
def sent_tokenize(article):
  return re.split('[\.\n]', article)

In [6]:
# Dictionary full of revealed stopwords for a row in the data
def build_revealed():
  revealed_dict = {}
  for word in stopwords:
    revealed_dict[word] = None
  return revealed_dict

In [7]:
# Redacts any word not in revealed dict, or non-alpha numeric
def redact_article(tokens,revealed,bert_mask = False):
  redacted = []
  for t in tokens:
    #print(t)
    if re.search('\W+',t):
      redacted.append(t)
    elif t in revealed:
      redacted.append(t)
    else:
      if bert_mask:
        redacted.append('[MASK]')
      else:
        redacted.append('█' * len(t))
  return redacted

In [8]:
# Adds case-insensitive versions of word to revealed_dict
def add_revealed(revealed_dict,word):
  revealed_dict[word.casefold()] = None
  revealed_dict[word.capitalize()] = None

In [9]:
test_input = '''
Architecture (from Latin  architectura; from Ancient Greek  ἀρχιτέκτων (arkhitéktōn) 'architect'; from  ἀρχι- (arkhi-) 'chief', and  τέκτων (téktōn) 'creator') is the art and technique of designing and building, as distinguished from the skills associated with construction. It is both the process and the product of sketching, conceiving, planning, designing, and constructing buildings or other structures. Architectural works, in the material form of buildings, are often perceived as cultural symbols and as works of art. Historical civilizations are often identified with their surviving architectural achievements.The practice, which began in the prehistoric era, has been used as a way of expressing culture for civilizations on all seven continents. For this reason, architecture is considered to be a form of art. Texts on architecture have been written since ancient times. The earliest surviving text on architectural theories is the 1st century AD treatise De architectura by the Roman architect Vitruvius, according to whom a good building embodies firmitas, utilitas, and venustas (durability, utility, and beauty). Centuries later, Leon Battista Alberti developed his ideas further, seeing beauty as an objective quality of buildings to be found in their proportions. Giorgio Vasari wrote Lives of the Most Excellent Painters, Sculptors, and Architects and put forward the idea of style in the Western arts in the 16th century. In the 19th century, Louis Sullivan declared that "form follows function". "Function" began to replace the classical "utility" and was understood to include not only practical but also aesthetic, psychological and cultural dimensions. The idea of sustainable architecture was introduced in the late 20th century.
Architecture began as rural, oral vernacular architecture that developed from trial and error to successful replication. Ancient urban architecture was preoccupied with building religious structures and buildings symbolizing the political power of rulers until Greek and Roman architecture shifted focus to civic virtues. Indian and Chinese architecture influenced forms all over Asia and Buddhist architecture in particular took diverse local flavors. In fact, During the European Middle Ages, pan-European styles of Romanesque and Gothic cathedrals and abbeys emerged while the Renaissance favored Classical forms implemented by architects known by name. Later, the roles of architects and engineers became separated. Modern architecture began after World War I as an avant-garde movement that sought to develop a completely new style appropriate for a new post-war social and economic order focused on meeting the needs of the middle and working classes. Emphasis was put on modern techniques, materials, and simplified geometric forms, paving the way for high-rise superstructures. Many architects became disillusioned with modernism which they perceived as ahistorical and anti-aesthetic, and postmodern and contemporary architecture developed.
Over the years, the field of architectural construction has branched out to include everything from ship design to interior decorating.


== Definitions ==
Architecture can mean:

A general term to describe buildings and other physical structures.
The art and science of designing buildings and (some) nonbuilding structures.
The style of design and method of construction of buildings and other physical structures.
A unifying or coherent form or structure.
Knowledge of art, science, technology, and humanity.
The design activity of the architect, from the macro-level (urban design, landscape architecture) to the micro-level (construction details and furniture). The practice of the architect, where architecture means offering or rendering professional services in connection with the design and construction of buildings, or built environments.


== Theory of architecture ==

The philosophy of architecture is a branch of philosophy of art, dealing with aesthetic value of architecture, its semantics and in relation with development of culture. Many philosophers and theoreticians from Plato to Michel Foucault, Gilles Deleuze, Robert Venturi and Ludwig Wittgenstein have concerned themselves with the nature of architecture and whether or not architecture is distinguished from building.


=== Historic treatises ===
The earliest surviving written work on the subject of architecture is De architectura by the Roman architect Vitruvius in the early 1st century AD. According to Vitruvius, a good building should satisfy the three principles of firmitas, utilitas, venustas, commonly known by the original translation – firmness, commodity and delight. An equivalent in modern English would be:

Durability – a building should stand up robustly and remain in good condition
Utility – it should be suitable for the purposes for which it is used
Beauty – it should be aesthetically pleasingAccording to Vitruvius, the architect should strive to fulfill each of these three attributes as well as possible. Leon Battista Alberti, who elaborates on the ideas of Vitruvius in his treatise, De re aedificatoria, saw beauty primarily as a matter of proportion, although ornament also played a part. For Alberti, the rules of proportion were those that governed the idealized human figure, the Golden mean. The most important aspect of beauty was, therefore, an inherent part of an object, rather than something applied superficially, and was based on universal, recognizable truths. The notion of style in the arts was not developed until the 16th century, with the writing of Giorgio Vasari. By the 18th century, his Lives of the Most Excellent Painters, Sculptors, and Architects had been translated into Italian, French, Spanish, and English.
In the 16th century, Italian Mannerist architect, painter and theorist Sebastiano Serlio wrote Tutte L'Opere D'Architettura et Prospetiva (Complete Works on Architecture and Perspective). This treatise exerted immense influence throughout Europe, being the first handbook that emphasized the practical rather than the theoretical aspects of architecture, and it was the first to catalog the five orders.In the early 19th century, Augustus Welby Northmore Pugin wrote Contrasts (1836) that, as the title suggested, contrasted the modern, industrial world, which he disparaged, with an idealized image of neo-medieval world. Gothic architecture, Pugin believed, was the only "true Christian form of architecture." The 19th-century English art critic, John Ruskin, in his Seven Lamps of Architecture, published 1849, was much narrower in his view of what constituted architecture. Architecture was the "art which so disposes and adorns the edifices raised by men … that the sight of them" contributes "to his mental health, power, and pleasure". For Ruskin, the aesthetic was of overriding significance. His work goes on to state that a building is not truly a work of architecture unless it is in some way "adorned". For Ruskin, a well-constructed, well-proportioned, functional building needed string courses or rustication, at the very least.On the difference between the ideals of architecture and mere construction, the renowned 20th-century architect Le Corbusier wrote: "You employ stone, wood, and concrete, and with these materials you build houses and palaces: that is construction. Ingenuity is at work. But suddenly you touch my heart, you do me good. I am happy and I say: This is beautiful. That is Architecture". Le Corbusier's contemporary Ludwig Mies van der Rohe said "Architecture starts when you carefully put two bricks together. There it begins."


=== Modern concepts ===
The notable 19th-century architect of skyscrapers, Louis Sullivan, promoted an overriding precept to architectural design: "Form follows function". While the notion that structural and aesthetic considerations should be entirely subject to functionality was met with both popularity and skepticism, it had the effect of introducing the concept of "function" in place of Vitruvius' "utility". "Function" came to be seen as encompassing all criteria of the use, perception and enjoyment of a building, not only practical but also aesthetic, psychological and cultural.
Nunzia Rondanini stated, "Through its aesthetic dimension architecture goes beyond the functional aspects that it has in common with other human sciences. Through its own particular way of expressing values, architecture can stimulate and influence social life without presuming that, in and of itself, it will promote social development.... To restrict the meaning of (architectural) formalism to art for art's sake is not only reactionary; it can also be a purposeless quest for perfection or originality which degrades form into a mere instrumentality".Among the philosophies that have influenced modern architects and their approach to building design are Rationalism, Empiricism, Structuralism, Poststructuralism, Deconstruction and Phenomenology.
In the late 20th century a new concept was added to those included in the compass of both structure and function, the consideration of sustainability, hence sustainable architecture. To satisfy the contemporary ethos a building should be constructed in a manner which is environmentally friendly in terms of the production of its materials, its impact upon the natural and built environment of its surrounding area and the demands that it makes upon non-sustainable power sources for heating, cooling, water and waste management, and lighting.



	
'''


In [10]:
test_tokens = tokenize(test_input)
reveal_dict = build_revealed()
test_redact = redact_article(test_tokens,reveal_dict,bert_mask = True)


In [11]:
''.join(test_redact)

'[MASK]\n[MASK] (from [MASK]  [MASK]; from [MASK] [MASK]  [MASK] ([MASK]) \'[MASK]\'; from  [MASK]- ([MASK]-) \'[MASK]\', and  [MASK] ([MASK]) \'[MASK]\') is the [MASK] and [MASK] of [MASK] and [MASK], as [MASK] from the [MASK] [MASK] with [MASK]. [MASK] is both the [MASK] and the [MASK] of [MASK], [MASK], [MASK], [MASK], and [MASK] [MASK] or other [MASK]. [MASK] [MASK], in the [MASK] [MASK] of [MASK], are [MASK] [MASK] as [MASK] [MASK] and as [MASK] of [MASK]. [MASK] [MASK] are [MASK] [MASK] with their [MASK] [MASK] [MASK].[MASK] [MASK], which [MASK] in the [MASK] [MASK], has been [MASK] as a [MASK] of [MASK] [MASK] for [MASK] on all [MASK] [MASK]. [MASK] this [MASK], [MASK] is [MASK] to be a [MASK] of [MASK]. [MASK] on [MASK] have been [MASK] [MASK] [MASK] [MASK]. [MASK] [MASK] [MASK] [MASK] on [MASK] [MASK] is the [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] by the [MASK] [MASK] [MASK], [MASK] to whom a [MASK] [MASK] [MASK] [MASK], [MASK], and [MASK] ([MASK], [MASK], and [MASK]). [MASK

In [12]:
from collections import Counter, OrderedDict

# Uses pre-trained BERT models to attempt to predict [MASK] in each sentence of the articles
# Returns most frequent word predicted by BERT models in all sentences as a guess with the most hits
class Bert_Guesser:

  def __init__(self):
    self.bert_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    self.bert_model = BertForMaskedLM.from_pretrained("bert-base-uncased")
    self.stop_dict = build_revealed()
    self.history = {}

  # We do not want BERT to predict stopwords and tokens with length of only 1, so remove them
  def remove_stopwords(self,tokens):
    token_list = []
    for t in tokens:
      if (not (t in self.stop_dict)) and (len(t) > 1):
        token_list.append(t)
    return token_list

  # Prevent making guesses that have already been guessed (found in history)
  def remove_history(self,tokens):
    token_list = []
    for t in tokens:
      if (not (t in self.history)):
        token_list.append(t)
    return token_list

  # Add token to guess history
  def update_history(self,token):
    self.history[token.casefold()] = None
    self.history[token.capitalize()] = None

  # Uses copy-paste from redactle as input
  # replaces redacted items with [MASK] for bert sentence prediction
  def process_article(self,article):
    mask = re.sub('█+','[MASK]',article)
    return mask

  # Recieves list of tokens with [MASK] in place of redacted items
  # Outputs most frequently guessed word by bert pretrained models
  def best_sentence_guess(self,sentence):
    bert_token_input = self.bert_tokenizer(''.join(sentence),return_tensors="pt")
    with torch.no_grad():
      logits = self.bert_model(**bert_token_input).logits

    mask_token_index = (bert_token_input.input_ids == self.bert_tokenizer.mask_token_id)[0].nonzero(as_tuple=True)[0]
    predicted_token_id = logits[0, mask_token_index].argmax(axis=-1)

    predictions = self.bert_tokenizer.decode(predicted_token_id)

    prediction_tokens = tokenize(predictions,keep_space = False)

    prediction_tokens = self.remove_stopwords(prediction_tokens)
    prediction_tokens = self.remove_history(prediction_tokens)
    # Counter finds the most common token that occurs, first one if its a tie
    token_counter = Counter(prediction_tokens)
    return token_counter.most_common(1)

  # Returns a list of guesses to go through, with the intention that best guess is called again 
  # Whenever the article recieves a hit
  def best_guess(self,article):
    sentence_guesses = []
    sentences = sent_tokenize(article)
    for sentence in sentences:
      sentence_guesses.append(self.best_sentence_guess(sentence))

    sent_guess_dict = {}
    for guess in sentence_guesses:
      if len(guess) > 0:
        try:
          sent_guess_dict[guess[0][0]] += guess[0][1]
        except KeyError:
          sent_guess_dict[guess[0][0]] = guess[0][1]

    #max_guess = max(sent_guess_dict,key = sent_guess_dict.get)
    #print("Max Guess: ")
    #print(max(sent_guess_dict,key = sent_guess_dict.get))
    sorted_list = sorted(sent_guess_dict.items(), key=lambda x: x[1],reverse=True)
    sorted_list = [x[0] for x in sorted_list]
    return sorted_list

In [13]:

bert_guess_2 = Bert_Guesser()
redacted_article = str(''.join(test_redact))

guesses = bert_guess_2.best_guess(redacted_article)

for g in guesses:
  print(g)


Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


love
women
also
room
people
children
film
history
new
said
god
combination
religion
death
medicine
line
would
based
world
half
one
education
song
case
things
finally
first
well
friends
working
turned
journal
part
number
book
according
man
used
possible
john
true
important
level
last
apparently
onewed
thought
went
difference
life
say
damn
born
idea
little


In [14]:
# Game Loop

# 1. Tokenize and prepare redactle article (tokenization, applying mask, other preprocessing)
# 2. Put redacted article into bert guesser to get dictionary of guesses
# 3. Go through sorted dictionary of guesses and apply them. Save a history of guesses so that we dont enter an infinite loop
# If there are 0 hits, continue down the sorted dictionary
# If there is at least 1 hit, reveal the parts and move back to part 1
# If the topic is guessesd, end the game

In [None]:

hit = True
bert_guess = Bert_Guesser()
guess_list = []

while True:
  full_break = False
  if full_break:
    break
  if hit:
    print("Enter the most up to date redactle article, QUIT to stop : ")
    article = input()
    if article == "QUIT":
      full_break = True
      break
    masked = bert_guess.process_article(article)
    
    guess_list = bert_guess.best_guess(masked)


  guess_break = False
  for guess in guess_list:
    print("Best guess based on BERT sentence prediction : ")
    print(guess)

    print("Was this guess a hit ? y/n")
    response = input()

    while True:
      if response == 'y':
        bert_guess.update_history(guess)
        hit = True
        guess_break = True
        break
      elif response == 'n':
        bert_guess.update_history(guess)
        hit = False
        break
      elif response == 'QUIT':
        full_break = True
        guess_break = True
        break
      else:
        print("Unknown response (not y/n)")
        break

    if guess_break:
      break
