In [None]:
# Run this file first
import os

# Check that the cached dataset files exist in the working directory and
# download whichever ones are missing.

if not {"word2vec.model", "word2vec.model.vectors.npy"}.issubset(set(os.listdir())):
    # Get the pretrained Google Word2Vec dataset
    # This might take a couple minutes
    print("Downloading Google word2vec dataset...")

    import gensim.downloader
    wv = gensim.downloader.load("word2vec-google-news-300")
    wv.save("./word2vec.model")

    print("Download complete.")


if "english-words.txt" not in os.listdir():
    print("Downloading list of English words...")

    import requests

    # The word list is filtered against the word2vec vocabulary. When the model
    # files were already on disk the branch above did not run, so `wv` may not
    # be bound yet -- load it from the saved file instead of failing.
    if "wv" not in globals():
        from gensim.models import KeyedVectors
        wv = KeyedVectors.load("word2vec.model", mmap="r")

    response = requests.get(
        "https://raw.githubusercontent.com/dwyl/english-words/master/words_alpha.txt")
    # Fail loudly rather than writing an HTTP error page out as the word list.
    response.raise_for_status()
    words = response.content.decode().strip().split("\n")

    # Keep only the words that have an embedding; strip() also removes any
    # carriage returns left over from the downloaded file's line endings.
    word2vec_words = set(wv.index_to_key)
    words = [w.strip() for w in words if w.strip() in word2vec_words]

    with open("./english-words.txt", "w") as fout:
        fout.write("\n".join(words))

    print("Download complete.")


print("Setup complete.")

Downloading Google word2vec dataset...
Download complete.
Downloading list of English words...
Download complete.
Setup complete.


In [None]:
import requests

def query(input_data):
    """POST `input_data` to the Semantle scoring endpoint and return the decoded JSON reply."""
    payload = {'data': input_data}
    return requests.post('http://semantle.advml.com/score', json=payload).json()

query("asteroids")

In [None]:
from scipy import spatial
from gensim.models import KeyedVectors
import os
import numpy as np
from tqdm import tqdm
import random

# Make sure datasets are downloaded
needed_files = ["word2vec.model",
                "word2vec.model.vectors.npy", "english-words.txt"]
existing_files = set(os.listdir())
# Compare the files we NEED against what is on disk. (Iterating over
# `existing_files` and testing membership in the same set can never fail.)
missing_files = [file for file in needed_files if file not in existing_files]
if missing_files:
    print("Run 'setup.py' to download the datasets first.")

# Load datasets
print("Loading datasets...")
# Load the vectors saved by the setup cell. `mmap="r"` is forwarded to
# KeyedVectors.load -- presumably a read-only memory map of the vectors file; see gensim docs.
wv = KeyedVectors.load("word2vec.model", mmap="r")
# english-words.txt holds one word per line; strip() removes any trailing
# newline so the split does not produce an empty last entry.
with open("english-words.txt") as fin:
    english_words = fin.read().strip().split("\n")
print("Datasets loaded.")
print()

In [None]:
# Problem One

In [None]:


def find_possible_vectorized(guess, reported_sim, words_to_consider, tolerance=0.005):
    """
    Return the list of all words in [words_to_consider] whose similarity to [guess]
    is within [tolerance] of [reported_sim].

    This version is vectorized and exploits the fast computation of matrix-vector products.
    Similarities are cosine similarities multiplied by 100:

        cos(A, B) = dot(A, B) / (sqrt(sum(A^2)) * sqrt(sum(B^2)))

    Here B is a matrix with one row per candidate word, so we get a vector of cosines,
    from which we look up the matching words.

    Args:
        guess: the guessed word; must be a key of the module-level `wv`.
        reported_sim: similarity reported for the guess, on the cosine * 100 scale.
        words_to_consider: iterable of candidate words (all must be keys of `wv`).
        tolerance: maximum absolute difference allowed between a candidate's
            similarity and [reported_sim].

    Returns:
        A list of the matching words (empty when there are no candidates).

    Raises:
        KeyError: if [guess] or a candidate is missing from `wv`.
    """
    # Need to map between indices of the matrix and words. Materialise the
    # candidates once so the matrix rows and this lookup share one ordering.
    id_to_word = words_to_consider if isinstance(words_to_consider, list) else list(words_to_consider)

    # np.stack refuses an empty sequence; no candidates simply means no matches.
    if not id_to_word:
        return []

    # The context manager closes the progress bar even if a lookup raises.
    with tqdm(total=3) as p_bar:
        guess_v = wv[guess]  # (features,)
        # TODO: This operation takes long because for-loop over words, is there a way to load word2vec directly as a matrix?
        words_matrix = np.stack([wv[word] for word in id_to_word])  # num_words x features

        p_bar.update(1)

        # (num_words x features) @ (features,) = (num_words,)
        numerator = words_matrix @ guess_v  # dot product of every candidate with guess_v

        # Row-wise squared norms: 'ij,ij->i' multiplies element-wise and sums over the
        # feature axis, i.e. the diagonal of words_matrix @ words_matrix.T without
        # computing any of the off-diagonal entries.
        norms = np.sqrt(np.einsum('ij,ij->i', words_matrix, words_matrix))

        denominator = norms * np.linalg.norm(guess_v, 2)  # (num_words,)

        # NOTE: no "1 - cosine" here: spatial.distance.cosine returns cosine DISTANCE,
        # whereas this computes cosine SIMILARITY directly.
        cosines = (numerator / denominator) * 100  # cosines[i] = similarity of guess with the ith word

        p_bar.update(1)

        # indices of all candidates whose similarity is within tolerance of the reported one
        candidates = np.where(np.abs(cosines - reported_sim) <= tolerance)[0]

        p_bar.update(1)

    return [id_to_word[i] for i in candidates]


def similarity(word1, word2):
    """Return the cosine similarity of the `wv` vectors of the two words, multiplied by 100."""
    cosine_distance = spatial.distance.cosine(wv[word1], wv[word2])
    return (1 - cosine_distance) * 100


def find_possible(guess, reported_sim, words_to_consider, tolerance=0.005):
    """
    Return the list of possible words given a guess and its similarity to the secret word.

    A word is kept when `similarity(guess, word)` is within [tolerance] of
    [reported_sim]. This is the plain-loop counterpart of
    `find_possible_vectorized` and takes the same `tolerance` parameter.

    Args:
        guess: the guessed word.
        reported_sim: similarity reported for the guess (cosine * 100 scale).
        words_to_consider: iterable of candidate words.
        tolerance: maximum allowed absolute difference (default 0.005).

    Returns:
        A list of matching words, in iteration order of [words_to_consider].
    """
    return [
        w for w in tqdm(words_to_consider)
        if abs(similarity(guess, w) - reported_sim) <= tolerance
    ]


def make_list(words):
    """
    Format a sequence of words as a quoted English list.

    Zero words give "", one gives "'a'", two give "'a' and 'b'", and three or
    more give "'a', 'b', and 'c'" (with a comma before the final "and").
    """
    count = len(words)
    if count >= 3:
        leading = "', '".join(words[:-1])
        return "'" + leading + "', and '" + words[-1] + "'"
    if count == 2:
        return f"'{words[0]}' and '{words[1]}'"
    if count == 1:
        return f"'{words[0]}'"
    return ""


# Start guessing process
def do_run(start_word="attention", max_guesses=10, score_scale=10, tolerance=1):
    """
    Repeatedly query the scoring service and narrow down the candidate words.

    Each round submits the current guess, multiplies the returned score by
    [score_scale], keeps only the candidates whose similarity to the guess is
    within [tolerance] of that value, and picks a random survivor as the next guess.

    Args:
        start_word: first word to submit.
        max_guesses: upper bound on the number of queries.
        score_scale: factor applied to the service's "message" score before it is
            compared with the cosine * 100 similarities.
        tolerance: tolerance forwarded to `find_possible_vectorized`.

    Returns:
        None. Progress is reported with print().
    """
    possible = set(english_words)

    print("Enter guesses as '<guess>, <similarity>'")
    print(f"There are {len(possible)} possible words remaining.")
    print()
    guess = start_word

    remaining_guesses = max_guesses
    while len(possible) > 1 and remaining_guesses > 0:
        remaining_guesses -= 1

        print(f"submitting {guess}")
        result = query(guess)
        print(result)

        # The reply was already printed above; stop when it carries no score.
        if "message" not in result:
            break

        reported_sim = result["message"] * score_scale

        possible = possible.intersection(
            find_possible_vectorized(guess, reported_sim, possible, tolerance=tolerance))

        # No candidate matches the reported similarity: nothing left to guess.
        if not possible:
            print()
            print("out of words")
            return

        # sorted() gives random.sample a sequence (it cannot sample from a set).
        sample_words = random.sample(sorted(possible), min(3, len(possible)))

        guess = sample_words[0]
        print(f"next guess: {guess}")

        print(f"There are {len(possible)} possible words remaining: {make_list(sample_words)}.")




In [None]:
do_run()

Enter guesses as '<guess>, <similarity>'
There are 97419 possible words remaining.

submitting attention
{'message': 0.78}


100%|██████████| 3/3 [00:02<00:00,  1.21it/s]


sim metrics
[ 6.922587    0.92039305 10.869526   ...  7.0107183   1.6071666
 -9.745776  ]
7.800000000000001

next guess: shirring
There are 9982 possible words remaining: 'shirring', 'repackage', and 'bumps'.
submitting shirring
{'message': 0.74}


100%|██████████| 3/3 [00:00<00:00, 30.67it/s]

sim metrics
[21.462091  10.130891  26.03762   ... 12.817133   4.403933   5.1435685]
7.4

next guess: incitements
There are 928 possible words remaining: 'incitements', 'diversification', and 'pigeonhole'.
submitting incitements





{'message': 0.75}


100%|██████████| 3/3 [00:00<00:00, 89.15it/s]

sim metrics
[ 1.23200102e+01  1.92812004e+01  2.73667068e+01  4.43256426e+00
  2.50281639e+01  1.99696999e+01  2.65306091e+01  1.31282167e+01
  1.70877552e+00  1.85033054e+01  4.66263390e+00  5.69729662e+00
 -3.87126422e+00  2.37033882e+01 -3.02642679e+00  4.34650612e+00
  2.30434799e+01  4.02459526e+00  2.29038448e+01  3.81360590e-01
  1.86591587e+01  1.30262003e+01  6.84307051e+00  1.80633640e+01
  1.06489639e+01  1.00231218e+01  8.72429562e+00  1.79203281e+01
  1.47526922e+01 -1.48146820e+00  1.55805321e+01  8.71843624e+00
  1.18232937e+01 -4.08709526e+00 -3.49312687e+00 -6.20905256e+00
  5.59824419e+00  7.43867445e+00  1.21102314e+01  7.16193771e+00
  4.58490133e+00  5.83055639e+00  2.65609150e+01  1.03038511e+01
  7.95871162e+00  6.39781356e-01  5.74322510e+00  2.22112160e+01
 -2.33518147e+00  2.45565796e+00  1.39877481e+01  1.98693657e+01
  1.58727837e+01  4.52231598e+00  6.97993135e+00  1.18322048e+01
  2.48447590e+01  1.14831562e+01  1.62746010e+01 -7.07250178e-01
  1.08681488e




{'message': 0.77}


100%|██████████| 3/3 [00:00<00:00, 587.60it/s]

sim metrics
[-3.72751307e+00  7.71386194e+00 -2.88834900e-01  1.53985138e+01
  2.78452244e+01  3.71353381e-04  1.13884258e+01  8.38618946e+00
  2.40017605e+00  1.98257256e+01 -3.41025066e+00  5.55519390e+00
  6.29943705e+00  4.52690506e+00 -3.86224604e+00  1.00991983e+01
  1.42184000e+01  3.93707490e+00  8.83966255e+00  3.96269608e+00
  2.61942444e+01 -5.50797367e+00  1.36067028e+01  8.21339703e+00
  5.44327307e+00  8.31383705e+00  5.49881554e+00  5.31138611e+00
  1.11462140e+00 -1.52658720e+01  1.38860111e+01  1.00000000e+02
  1.78479443e+01  2.54118977e+01  1.05571890e+01  6.20073366e+00
  1.36844950e+01  8.72223186e+00 -4.35724115e+00 -3.97454238e+00
  8.18801880e+00  1.65977848e+00  1.39932871e+01  2.30138645e+01
  2.46375847e+01  1.48886175e+01  1.89366837e+01  1.36455479e+01
 -5.04696488e-01  6.00392246e+00  8.94685173e+00  1.06019764e+01
  3.74289703e+00  1.32858191e+01  1.89334564e+01  1.21732626e+01
  5.91321087e+00  1.44910364e+01  4.80929375e+00  1.50979674e+00
 -1.70415568e




{'message': 0.75}


100%|██████████| 3/3 [00:00<00:00, 1364.59it/s]

sim metrics
[  4.762543    1.2883575   9.283867  100.         16.95606     1.1646332
   7.717012   16.009117 ]
7.5

next guess: invites
There are 1 possible words remaining: 'invites'.





In [None]:
# (Removed a stray bare expression `a`: no visible cell defines that name, so
# evaluating it would raise NameError and interrupt "Run All".)

In [None]:
# Start guessing process
def do_run(start_word="attention", max_guesses=10, score_scale=100, tolerance=0.005):
    """
    Repeatedly query the scoring service and narrow down the candidate words.

    Each round submits the current guess, multiplies the returned score by
    [score_scale], keeps only the candidates whose similarity to the guess is
    within [tolerance] of that value, and picks a random survivor as the next guess.

    Args:
        start_word: first word to submit.
        max_guesses: upper bound on the number of queries.
        score_scale: factor applied to the service's "message" score before it is
            compared with the cosine * 100 similarities.
        tolerance: tolerance forwarded to `find_possible_vectorized`.

    Returns:
        None. Progress is reported with print().
    """
    possible = set(english_words)

    print("Enter guesses as '<guess>, <similarity>'")
    print(f"There are {len(possible)} possible words remaining.")
    print()

    guess = start_word

    remaining_guesses = max_guesses
    while len(possible) > 1 and remaining_guesses > 0:
        remaining_guesses -= 1

        print(f"submitting {guess}")
        result = query(guess)
        print(result)

        # The reply was already printed above; stop when it carries no score.
        if "message" not in result:
            break

        reported_sim = result["message"] * score_scale

        possible = possible.intersection(
            find_possible_vectorized(guess, reported_sim, possible, tolerance=tolerance))

        # No candidate matches the reported similarity: nothing left to guess.
        if not possible:
            print("Done guessing")
            return

        # sorted() gives random.sample a sequence (it cannot sample from a set).
        sample_words = random.sample(sorted(possible), min(3, len(possible)))

        guess = sample_words[0]
        print(f"next guess: {guess}")

        print(f"There are {len(possible)} possible words remaining: {make_list(sample_words)}.")


do_run()

Enter guesses as '<guess>, <similarity>'
There are 97419 possible words remaining.

submitting attention
{'message': 0.78}


100%|██████████| 3/3 [00:03<00:00,  1.02s/it]

Done guessing





In [None]:
similarity("dog", "cat")

76.0945737361908

In [None]:
similarity("apple", "bible")

14.98883068561554

In [None]:
similarity("apple", "john")

17.741386592388153

In [None]:
similarity("apple", "apple")

100

In [None]:
import spacy
import nltk
import numpy as np
nltk.download('words')
from nltk.corpus import words

# Load spaCy model
nlp = spacy.load("en_core_web_md")

# Simulated query function
def query(word):
    """Stand-in for the real API call: 1 for "banana", otherwise a fixed demo score of 0.5."""
    if word == "banana":
        return 1
    return 0.5  # Assume a score of 0.5 for demonstration

# Function to get weighted score
def weighted_score(word, target_word, api_score):
    """Return the spaCy similarity of `word` and `target_word`, multiplied by `api_score`."""
    word_doc = nlp(word)
    target_doc = nlp(target_word)
    return word_doc.similarity(target_doc) * api_score

# Initial random word
target_word = "apple"

english_words = words.words()
np.random.shuffle(english_words)  # Shuffle for randomness

# Words already submitted. Without this the argmax below keeps choosing the
# word most similar to the current guess -- the guess itself whenever it is in
# the list -- and the search never moves.
tried_words = set()

for _ in range(100):  # Limit the number of iterations to avoid infinite loop
    api_score = query(target_word)
    if api_score == 1:
        print(f"Found the word: {target_word}")
        break

    tried_words.add(target_word)
    candidates = [word for word in english_words if word not in tried_words]
    if not candidates:
        break  # every word has been tried

    scores = [weighted_score(word, target_word, api_score) for word in candidates]
    next_index = int(np.argmax(scores))
    target_word = candidates[next_index]


In [None]:
import numpy as np
from tqdm import tqdm

def find_next_word(guess, reported_sim, words_to_consider, tolerance=0.005):
    """
    Pick the next guess among the words consistent with the reported similarity.

    A word qualifies when `similarity(guess, word)` is within `tolerance` of
    `reported_sim`. Of the qualifying words the one with the highest similarity
    is returned (the first such word on ties); None when nothing qualifies.
    """
    scored_words = ((word, similarity(guess, word)) for word in tqdm(words_to_consider))
    close_matches = [
        pair for pair in scored_words
        if abs(pair[1] - reported_sim) <= tolerance
    ]

    if not close_matches:
        return None

    best_word, _ = max(close_matches, key=lambda pair: pair[1])
    return best_word

def do_run(tolerance=0.005):
    """
    Follow a chain of guesses: query each guess and move to the candidate word
    whose similarity to it best matches the reported score.

    Args:
        tolerance: tolerance forwarded to `find_next_word` (defaults to that
            function's own default of 0.005).

    Returns:
        None. Progress is reported with print().
    """
    possible = set(english_words)
    guess = "attention"
    count = 10

    # `possible` is never narrowed below, so the loop is bounded by `count`
    # (or ends early when no next guess is found).
    while len(possible) > 1 and count > 0:
        count -= 1

        result = query(guess)
        print(f"Result for '{guess}': {result}")

        api_score = result.get("message", 0) * 100  # Adjust based on actual API response

        # The original call read `tolerance=)`, which is a syntax error.
        next_guess = find_next_word(guess, api_score, possible, tolerance=tolerance)
        if next_guess:
            guess = next_guess
            print(f"Next guess: {guess}")
        else:
            print("No suitable next guess found.")
            break

# Assuming query is a function that takes a word as input and returns a dictionary with a "message" key holding the score.
do_run()


Result for 'attention': {'message': 0.78}


100%|██████████| 97419/97419 [00:07<00:00, 13151.45it/s]

No suitable next guess found.





In [None]:
import gensim.downloader as api
from nltk.corpus import words
import nltk

# Downloading the word2vec word embeddings
# NOTE(review): an earlier cell already loads vectors from "word2vec.model" into `wv`;
# this binds the same-named dataset to a second variable -- confirm both are needed.
word2vec_model = api.load("word2vec-google-news-300")  # you can choose another model if you prefer

# Downloading the words corpus
nltk.download('words')
# NOTE: this rebinds `english_words` (earlier read from english-words.txt) to the NLTK word list.
english_words = words.words()

# Now you can use `word2vec_model` to get the embeddings and `english_words` to get the English words


[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Package words is already up-to-date!


In [None]:
import requests

def query(input_data):
    """Send `input_data` to the Semantle scoring endpoint; return the parsed JSON body."""
    endpoint = 'http://semantle.advml.com/score'
    reply = requests.post(endpoint, json={'data': input_data})
    return reply.json()

query("attention")

{'message': 0.78}

In [None]:

def find_high_score_words(word2vec_model, english_words, target_score=1.0):
    """
    Query every word, plus its five nearest word2vec neighbours, and collect
    the ones whose reported score equals `target_score`.

    Each distinct word is sent to the scoring service at most once, so a word
    that is a neighbour of several others is neither re-queried nor reported
    more than once.

    Args:
        word2vec_model: model supporting `in` and `most_similar(word, topn=...)`.
        english_words: iterable of words to try.
        target_score: score a word must have to be collected.

    Returns:
        List of matching words, in the order they were first found.
    """
    high_score_words = []
    checked_words = set()

    def _check(candidate):
        # Query a word once and record it when it hits the target score.
        if candidate in checked_words:
            return
        checked_words.add(candidate)
        response = query(candidate)
        if response.get("message") == target_score:
            high_score_words.append(candidate)

    for word in english_words:
        # Query the word itself
        _check(word)

        # Find similar words and query them
        if word in word2vec_model:
            for similar_word, _ in word2vec_model.most_similar(word, topn=5):
                _check(similar_word)

    return high_score_words

# Assuming word2vec_model and english_words are already loaded
high_score_words = find_high_score_words(word2vec_model, english_words)

print(f"Words with score {1.0}: {high_score_words}")


In [None]:
word2vec_model.most_similar("apple", topn=5)

[('apples', 0.720359742641449),
 ('pear', 0.6450697183609009),
 ('fruit', 0.6410146355628967),
 ('berry', 0.6302295327186584),
 ('pears', 0.613396167755127)]

In [None]:
word2vec_model["apple"]

array([-0.06445312, -0.16015625, -0.01208496,  0.13476562, -0.22949219,
        0.16210938,  0.3046875 , -0.1796875 , -0.12109375,  0.25390625,
       -0.01428223, -0.06396484, -0.08056641, -0.05688477, -0.19628906,
        0.2890625 , -0.05151367,  0.14257812, -0.10498047, -0.04736328,
       -0.34765625,  0.35742188,  0.265625  ,  0.00188446, -0.01586914,
        0.00195312, -0.35546875,  0.22167969,  0.05761719,  0.15917969,
        0.08691406, -0.0267334 , -0.04785156,  0.23925781, -0.05981445,
        0.0378418 ,  0.17382812, -0.41796875,  0.2890625 ,  0.32617188,
        0.02429199, -0.01647949, -0.06494141, -0.08886719,  0.07666016,
       -0.15136719,  0.05249023, -0.04199219, -0.05419922,  0.00108337,
       -0.20117188,  0.12304688,  0.09228516,  0.10449219, -0.00408936,
       -0.04199219,  0.01409912, -0.02111816, -0.13476562, -0.24316406,
        0.16015625, -0.06689453, -0.08984375, -0.07177734, -0.00595093,
       -0.00482178, -0.00089264, -0.30664062, -0.0625    ,  0.07

In [None]:
import numpy as np
import requests
import random

def query(input_data, queried_words):
    """
    Return the score for `input_data`, using `queried_words` as a cache.

    On a cache miss the word is posted to the scoring endpoint, the returned
    "message" value is printed and stored in `queried_words`.
    """
    if input_data not in queried_words:
        reply = requests.post('http://semantle.advml.com/score', json={'data': input_data})
        fresh_score = reply.json()["message"]
        print(f"query: {input_data} {fresh_score}")
        queried_words[input_data] = fresh_score
    return queried_words[input_data]

def calculate_gradient(start_word_vec, word_vectors, scores, epsilon):
    """
    Shift `start_word_vec` by `epsilon` times the mean of the score-weighted vectors.

    Scores and vectors are paired positionally with zip(), so any surplus
    entries in the longer of the two are ignored.
    """
    weighted_vectors = []
    for score, word_vec in zip(scores, word_vectors):
        weighted_vectors.append(score * word_vec)

    mean_direction = np.mean(weighted_vectors, axis=0)
    return start_word_vec + epsilon * mean_direction

def optimize_word(word2vec_model, english_words, start_word, n_steps=5, n_sample=5, epsilon=0.1):
    """
    Walk through embedding space towards words that score well.

    At every step up to `n_sample` random, not-yet-queried words are scored, the
    current word's vector is shifted by `epsilon` times the mean of the
    score-weighted sample vectors, and the word nearest to the shifted vector
    becomes the new current word.

    Args:
        word2vec_model: model supporting `in`, item lookup and `similar_by_vector`.
        english_words: sequence of words to sample from.
        start_word: word to start from; must be in the model vocabulary.
        n_steps: number of update steps.
        n_sample: number of random words scored per step.
        epsilon: step size applied to the mean weighted vector.

    Returns:
        (current_word, queried_words): the final word and the dict of every
        queried word mapped to its score.

    Raises:
        Exception: if the current word is not in the model vocabulary.
    """
    current_word = start_word
    queried_words = {}

    for step in range(n_steps):
        # Check before spending any queries on this step.
        if current_word not in word2vec_model:
            print(f"Step {step+1}: The word '{current_word}' is not in the model vocabulary.")
            raise Exception(f"'{current_word}' is not in the model vocabulary")

        # Sample distinct, unqueried, in-vocabulary words. Filtering here (and
        # not when building the vector list) keeps vectors and scores aligned.
        sampled_words = []
        attempts_left = max(1000, 100 * n_sample)  # bound the search for usable words
        while len(sampled_words) < n_sample and attempts_left > 0:
            attempts_left -= 1
            word = random.choice(english_words)
            if word in queried_words or word in sampled_words or word not in word2vec_model:
                continue
            sampled_words.append(word)

        if not sampled_words:
            # Nothing to average over; the mean of an empty list would be NaN.
            print(f"Step {step+1}: No unqueried in-vocabulary words left to sample.")
            break

        word_vectors = [word2vec_model[word] for word in sampled_words]
        scores = [query(word, queried_words) for word in sampled_words]

        # Record the current word's score in the cache as well; it is not part
        # of the weighted mean (it has no matching sample vector).
        query(current_word, queried_words)

        updated_vector = calculate_gradient(word2vec_model[current_word], word_vectors, scores, epsilon)
        similar_words = word2vec_model.similar_by_vector(updated_vector, topn=1)

        if similar_words:
            current_word = similar_words[0][0]
            print(f"Step {step+1}: New word is '{current_word}'")
        else:
            print(f"Step {step+1}: No similar words found.")

    return current_word, queried_words

# Assuming word2vec_model and english_words are already loaded
optimized_word, queried_words_dict = optimize_word(
    word2vec_model, english_words, start_word="attention", n_steps=5, n_sample=5, epsilon=1000_000
)
print(f"Optimized word: '{optimized_word}'")
print(f"Queried words and their scores: {queried_words_dict}")


query: Bumbledom 0.79
query: yawmeter 0.76
query: roomy 0.77
query: rampler 0.79
query: merchant 0.78
query: attention 0.78
Step 1: New word is 'roomy'
query: redemptrice 0.76
query: unfurnished 0.74
query: tympanal 0.77
query: duke 0.78
query: urethropenile 0.78
Step 2: New word is 'unfurnished'
query: billethead 0.77
query: thelyplasty 0.77
query: Valsaceae 0.71
query: seraphtide 0.78
query: hogan 0.78
Step 3: New word is 'hogan'
query: parumbilical 0.78
query: hexose 0.76
query: demology 0.76
query: tiklin 0.77
query: cerago 0.78
Step 4: New word is 'hexose'
query: foliosity 0.75
query: magnochromite 0.78
query: chinching 0.76
query: defacement 0.73
query: retenant 0.75
Step 5: New word is 'defacement'
Optimized word: 'defacement'
Queried words and their scores: {'Bumbledom': 0.79, 'yawmeter': 0.76, 'roomy': 0.77, 'rampler': 0.79, 'merchant': 0.78, 'attention': 0.78, 'redemptrice': 0.76, 'unfurnished': 0.74, 'tympanal': 0.77, 'duke': 0.78, 'urethropenile': 0.78, 'billethead': 0.77, 

The key with the maximum value is 'b' with a value of 50


In [None]:
import requests

class QueryManager:
    """Caches scores from the Semantle endpoint so each word is requested only once."""

    def __init__(self):
        # word -> score returned by the service
        self.queried_words = {}

    def query(self, input_data):
        """Return the score for `input_data`, contacting the service only on a cache miss."""
        cache = self.queried_words
        if input_data not in cache:
            reply = requests.post('http://semantle.advml.com/score', json={'data': input_data})
            fresh_score = reply.json()["message"]
            print(f"query: {input_data} {fresh_score}")
            cache[input_data] = fresh_score
        return cache[input_data]

    def get_max_score_word(self):
        """Return (word, score) for the best-scoring cached word (the first one on ties)."""
        best_word, best_score = max(self.queried_words.items(), key=lambda entry: entry[1])
        return best_word, best_score

# Example usage:
manager = QueryManager()

# Perform some queries
manager.query('stoner')
manager.query('sediment')

# Get the word with the maximum score
max_word, max_score = manager.get_max_score_word()
print(f"The word with the maximum score is '{max_word}' with a score of {max_score}")


query: stoner 0.81
query: sediment 0.8
The word with the maximum score is 'stoner' with a score of 0.81


In [None]:
manager.queried_words

{'stoner': 0.81,
 'sediment': 0.8,
 'precipice': 0.78,
 'lignose': 0.78,
 'nudibranchian': 0.77,
 'myelosyphilis': 0.74,
 'prodigiosity': 0.74,
 'acetylenediurein': 0.73,
 'bucky': 0.78,
 'leucocytopenia': 0.74,
 'Ortheris': 0.81,
 'interseminal': 0.77,
 'urinomancy': 0.76,
 'uninjurious': 0.74,
 'oothecal': 0.78,
 'Myxogastrales': 0.75,
 'synchronously': 0.74,
 'nitrosochloride': 0.73,
 'ruttiness': 0.78,
 'lupulus': 0.79,
 'unbred': 0.77,
 'polymath': 0.77,
 'indiminishable': 0.77,
 'polenta': 0.78,
 'shraddha': 0.76,
 'civil': 0.78,
 'lavishment': 0.74}

In [None]:
def get_max_value_key(input_dict):
    """Return (key, value) for the entry with the largest value (the first one on ties)."""
    best_key, best_value = max(input_dict.items(), key=lambda item: item[1])
    return best_key, best_value



def calculate_gradient(current_word_vec, new_word_vec, score_difference):
    """
    Return the unit vector pointing from `current_word_vec` to `new_word_vec`,
    scaled by `score_difference`.

    When both vectors are identical there is no direction to follow, so a zero
    vector is returned instead of dividing by a zero norm (which would give NaNs).
    """
    gradient = new_word_vec - current_word_vec
    norm = np.linalg.norm(gradient)
    if norm == 0:
        return np.zeros_like(gradient, dtype=float)
    return (gradient / norm) * score_difference


def optimize_word(word2vec_model, english_words, start_word, n_steps=5, n_sample=5, epsilon=0.1):
    """
    Hill-climb towards higher-scoring words using the module-level `manager`.

    For each of `n_steps` steps, `n_sample` random unqueried words are scored.
    Whenever one beats the current score, the current vector is pushed towards
    it with growing step sizes (`epsilon * 10**attempt`) until the word nearest
    to the shifted vector scores better than the current word.

    Args:
        word2vec_model: model supporting `in`, item lookup and `similar_by_vector`.
        english_words: sequence of words to sample from.
        start_word: word to start from.
        n_steps: number of outer steps.
        n_sample: random words scored per step.
        epsilon: base step size.

    Returns:
        The best word reached (`start_word` if nothing improved on it).
    """
    current_word = start_word

    for step in range(n_steps):
        current_score = manager.query(current_word)

        for _ in range(n_sample):
            new_word = random.choice(english_words)
            while new_word in manager.queried_words:
                new_word = random.choice(english_words)

            new_score = manager.query(new_word)

            # Both words need an embedding before a direction can be computed.
            if not (new_score > current_score
                    and new_word in word2vec_model
                    and current_word in word2vec_model):
                continue

            gradient = calculate_gradient(
                word2vec_model[current_word],
                word2vec_model[new_word],
                new_score - current_score
            )

            for attempt in range(5):
                step_size = epsilon * (10 ** attempt)
                new_vector = word2vec_model[current_word] + step_size * gradient
                similar_word = word2vec_model.similar_by_vector(new_vector, topn=1)[0][0]
                similar_word_score = manager.query(similar_word)

                if similar_word_score > current_score:
                    current_word = similar_word
                    current_score = similar_word_score
                    print(f"Step {step+1}: New word is '{current_word}' with score {current_score}")
                    # The gradient was computed from the previous position, so
                    # stop stepping along it once the current word has changed.
                    break
                else:
                    print(f"Step {step+1}, Attempt {attempt+1}: No improvement, trying a bigger step.")

    return current_word


optimized_word = optimize_word(
    word2vec_model, english_words, start_word="Ortheris", n_steps=5, n_sample=5, epsilon=1
)
print(f"Optimized word: '{optimized_word}'")
max_word, max_score = manager.get_max_score_word()
print(f"The word with the maximum score is '{max_word}' with a score of {max_score}")


query: commensality 0.73
query: Pleurosteon 0.77
query: acockbill 0.79
query: enchylema 0.77
query: Bryanism 0.75
query: meanwhile 0.77
query: tamas 0.77
query: trimethylmethane 0.76
query: goodman 0.77
query: Actinozoa 0.78
query: reirrigation 0.74
query: possessorship 0.74
query: drifting 0.78
query: tritium 0.8
query: manufactural 0.77
query: nephrolysin 0.75
query: antagonistic 0.77
query: bombarder 0.8
query: experience 0.77
query: hisingerite 0.8
query: transfusive 0.75
query: pairedness 0.75
query: meekheartedness 0.73
query: fencible 0.75
query: dorsimesal 0.77
Optimized word: 'Ortheris'
The word with the maximum score is 'stoner' with a score of 0.81


In [None]:
# import numpy as np
# import requests
# import random

# class QueryManager:
#     def __init__(self):
#         self.queried_words = {}

#     def query(self, input_data):
#         if input_data in self.queried_words:
#             return self.queried_words[input_data]

#         response = requests.post('http://semantle.advml.com/score', json={'data': input_data})
#         score = response.json()["message"]
#         print(f"query: {input_data} {score}")
#         self.queried_words[input_data] = score
#         return score

# def calculate_gradient(current_word_vec, new_word_vec, score_difference):
#     gradient = new_word_vec - current_word_vec
#     normalized_gradient = gradient / np.linalg.norm(gradient)
#     return normalized_gradient * score_difference

def optimize_word(word2vec_model, english_words, start_word, n_steps=5, n_sample=5, epsilon=0.1):
    """
    Hill-climb towards higher-scoring words, using a fresh QueryManager for this run.

    Each random sample whose score differs from the current one defines a
    direction: towards the sample when it scores higher, away from it when it
    scores lower. The current vector is moved along that direction with step
    sizes `epsilon * 10**attempt` (up to 10 attempts) until the nearest word
    beats the current score.

    Returns the best word reached.
    """
    # NOTE: this local manager is separate from any module-level `manager`.
    manager = QueryManager()
    current_word = start_word

    for step in range(n_steps):
        current_score = manager.query(current_word)

        for _ in range(n_sample):
            new_word = random.choice(english_words)
            while new_word in manager.queried_words:
                new_word = random.choice(english_words)

            new_score = manager.query(new_word)

            score_differs = new_score > current_score or new_score < current_score
            if not score_differs or new_word not in word2vec_model:
                continue

            # A single expression covers both cases: the unit vector towards
            # `new_word` times (new_score - current_score) points towards a
            # better word and away from a worse one.
            base_vector = word2vec_model[current_word]
            gradient = calculate_gradient(
                base_vector,
                word2vec_model[new_word],
                new_score - current_score
            )

            for attempt in range(10):
                step_size = epsilon * (10 ** attempt)
                shifted_vector = base_vector + step_size * gradient
                nearest = word2vec_model.similar_by_vector(shifted_vector, topn=1)
                similar_word = nearest[0][0]
                similar_word_score = manager.query(similar_word)

                if similar_word_score <= current_score:
                    print(f"Step {step+1}, Attempt {attempt+1}: No improvement, trying a bigger step.")
                    continue

                current_word = similar_word
                current_score = similar_word_score
                print(f"Step {step+1}: New word is '{current_word}' with score {current_score}")
                break

    return current_word

optimized_word = optimize_word(
    word2vec_model, english_words, start_word="stoner", n_steps=5, n_sample=5, epsilon=0.1
)
print(f"Optimized word: '{optimized_word}'")

max_word, max_score = manager.get_max_score_word()
print(f"The word with the maximum score is '{max_word}' with a score of {max_score}")


query: stoner 0.81
query: textrine 0.77
query: hyostyly 0.77
query: figbird 0.78
query: Elaeagnus 0.74
query: idiomelon 0.79
query: motleyness 0.78
query: argillomagnesian 0.76
query: chaute 0.77
query: buckwashing 0.72
query: Naticidae 0.73
query: panterer 0.79
query: iridodesis 0.77
query: divertingly 0.76
Step 3, Attempt 1: No improvement, trying a bigger step.
Step 3, Attempt 2: No improvement, trying a bigger step.
Step 3, Attempt 3: No improvement, trying a bigger step.
Step 3, Attempt 4: No improvement, trying a bigger step.
Step 3, Attempt 5: No improvement, trying a bigger step.
query: tapermaker 0.76
query: tidological 0.77
query: anemometric 0.76
query: pyramidologist 0.79
query: hemocytolysis 0.73
query: scorpionic 0.78
query: bridebed 0.76
query: renickel 0.77
query: whisperer 0.77
Step 5, Attempt 1: No improvement, trying a bigger step.
Step 5, Attempt 2: No improvement, trying a bigger step.
Step 5, Attempt 3: No improvement, trying a bigger step.
Step 5, Attempt 4: No i

In [None]:
def optimize_word(word2vec_model, english_words, start_word, n_steps=5, n_sample=5, epsilon=0.1):
    """
    Hill-climb towards higher-scoring words using the module-level `manager`.

    Each step scores `n_sample` random, unqueried, in-vocabulary words. The best
    sample is the target when it beats the current score, otherwise the worst
    sample is. The signed score difference makes the gradient point towards a
    better target and away from a worse one. The current vector is moved along
    it with step sizes `epsilon * 10**attempt` (up to 10 attempts) until the
    nearest word scores better than the current word.

    Args:
        word2vec_model: model supporting `in`, item lookup and `similar_by_vector`.
        english_words: sequence of words to sample from.
        start_word: word to start from.
        n_steps: number of outer steps.
        n_sample: random words scored per step.
        epsilon: base step size.

    Returns:
        The best word reached (`start_word` if nothing improved on it).
    """
    current_word = start_word

    for step in range(n_steps):
        current_score = manager.query(current_word)

        best_word, best_score = None, -float('inf')
        worst_word, worst_score = None, float('inf')

        for _ in range(n_sample):
            new_word = random.choice(english_words)
            while new_word in manager.queried_words or new_word not in word2vec_model:
                new_word = random.choice(english_words)

            new_score = manager.query(new_word)
            if new_score > best_score:
                best_word, best_score = new_word, new_score
            if new_score < worst_score:
                worst_word, worst_score = new_word, new_score

        if best_score > current_score:
            target_word, target_score = best_word, best_score
        else:
            target_word, target_score = worst_word, worst_score

        # Skip steps that cannot make progress:
        #  - no sample was drawn (n_sample == 0), so there is no target word;
        #  - the target has the same score, which gives a zero gradient and
        #    leaves every attempt below at the current vector;
        #  - the current word has no embedding to step from.
        if (target_word is None
                or target_score == current_score
                or current_word not in word2vec_model):
            continue

        base_vector = word2vec_model[current_word]
        gradient = calculate_gradient(
            base_vector,
            word2vec_model[target_word],
            target_score - current_score
        )

        for attempt in range(10):
            step_size = epsilon * (10 ** attempt)
            new_vector = base_vector + step_size * gradient
            similar_word = word2vec_model.similar_by_vector(new_vector, topn=1)[0][0]
            similar_word_score = manager.query(similar_word)

            if similar_word_score > current_score:
                current_word = similar_word
                current_score = similar_word_score
                print(f"Step {step+1}: New word is '{current_word}' with score {current_score}")
                break
            else:
                print(f"Step {step+1}, Attempt {attempt+1}: No improvement, trying a bigger step.")

    return current_word


optimized_word = optimize_word(
    word2vec_model, english_words, start_word="stoner", n_steps=5, n_sample=5, epsilon=1
)
print(f"Optimized word: '{optimized_word}'")
max_word, max_score = manager.get_max_score_word()
print(f"The word with the maximum score is '{max_word}' with a score of {max_score}")


query: Madi 0.76
query: nonsectarian 0.75
query: dastardly 0.76
query: unedifying 0.75
query: middy 0.76
Step 1, Attempt 1: No improvement, trying a bigger step.
Step 1, Attempt 2: No improvement, trying a bigger step.
Step 1, Attempt 3: No improvement, trying a bigger step.
Step 1, Attempt 4: No improvement, trying a bigger step.
Step 1, Attempt 5: No improvement, trying a bigger step.
Step 1, Attempt 6: No improvement, trying a bigger step.
Step 1, Attempt 7: No improvement, trying a bigger step.
Step 1, Attempt 8: No improvement, trying a bigger step.
Step 1, Attempt 9: No improvement, trying a bigger step.
Step 1, Attempt 10: No improvement, trying a bigger step.
query: regret 0.77
query: superficial 0.78
query: surliness 0.74
query: stude 0.79
query: unsigned 0.77
Step 2, Attempt 1: No improvement, trying a bigger step.
Step 2, Attempt 2: No improvement, trying a bigger step.
Step 2, Attempt 3: No improvement, trying a bigger step.
Step 2, Attempt 4: No improvement, trying a bigge

In [None]:
'obstacle'
'banana'

optimized_word = optimize_word(
    word2vec_model, english_words, start_word="banana", n_steps=5, n_sample=5, epsilon=0.1
)
print(f"Optimized word: '{optimized_word}'")
max_word, max_score = manager.get_max_score_word()
print(f"The word with the maximum score is '{max_word}' with a score of {max_score}")

query: banana 0.81
query: faun 0.8
query: constrictor 0.77
query: curator 0.77
query: pried 0.76
query: surplice 0.76
Step 1, Attempt 1: No improvement, trying a bigger step.
Step 1, Attempt 2: No improvement, trying a bigger step.
Step 1, Attempt 3: No improvement, trying a bigger step.
Step 1, Attempt 4: No improvement, trying a bigger step.
Step 1, Attempt 5: No improvement, trying a bigger step.
Step 1, Attempt 6: No improvement, trying a bigger step.
Step 1, Attempt 7: No improvement, trying a bigger step.
Step 1, Attempt 8: No improvement, trying a bigger step.
Step 1, Attempt 9: No improvement, trying a bigger step.
Step 1, Attempt 10: No improvement, trying a bigger step.
query: iconostasis 0.78
query: Llandeilo 0.74
query: cuff 0.75
query: coonskin 0.75
query: grievous 0.76
Step 2, Attempt 1: No improvement, trying a bigger step.
Step 2, Attempt 2: No improvement, trying a bigger step.
Step 2, Attempt 3: No improvement, trying a bigger step.
Step 2, Attempt 4: No improvement, 

In [None]:
manager.get_max_score_word()

('stoner', 0.81)

In [None]:
'obstacle'
'banana'

optimized_word = optimize_word(
    word2vec_model, english_words, start_word="chondritic", n_steps=20, n_sample=1, epsilon=10
)

print(f"Optimized word: '{optimized_word}'")
max_word, max_score = manager.get_max_score_word()
print(f"The word with the maximum score is '{max_word}' with a score of {max_score}")

query: Jenson 0.78
Step 1, Attempt 1: No improvement, trying a bigger step.
Step 1, Attempt 2: No improvement, trying a bigger step.
Step 1, Attempt 3: No improvement, trying a bigger step.
Step 1, Attempt 4: No improvement, trying a bigger step.
Step 1, Attempt 5: No improvement, trying a bigger step.
Step 1, Attempt 6: No improvement, trying a bigger step.
Step 1, Attempt 7: No improvement, trying a bigger step.
Step 1, Attempt 8: No improvement, trying a bigger step.
Step 1, Attempt 9: No improvement, trying a bigger step.
Step 1, Attempt 10: No improvement, trying a bigger step.
query: sinking 0.78
Step 2, Attempt 1: No improvement, trying a bigger step.
Step 2, Attempt 2: No improvement, trying a bigger step.
Step 2, Attempt 3: No improvement, trying a bigger step.
Step 2, Attempt 4: No improvement, trying a bigger step.
Step 2, Attempt 5: No improvement, trying a bigger step.
Step 2, Attempt 6: No improvement, trying a bigger step.
Step 2, Attempt 7: No improvement, trying a bigg

In [None]:
# Function to optimize word based on closest words and their scores
def optimize_word_closest(word2vec_model, start_word, n_steps=5, n_words=10):
    """Greedy hill-climb over word2vec neighbours, guided by the remote score.

    Each step scores the ``n_words`` nearest neighbours of the current word
    through the module-level ``manager`` and moves to the best one only if it
    strictly beats the current word's score. The search stops as soon as a
    step brings no improvement, because every later step would look at the
    same neighbourhood again.

    Args:
        word2vec_model: Object exposing ``most_similar(word, topn=...)`` that
            returns ``(word, similarity)`` pairs.
        start_word: Word the search starts from.
        n_steps: Maximum number of hill-climbing steps.
        n_words: Number of nearest neighbours scored per step.

    Returns:
        The last word the search moved to (``start_word`` if nothing improved).
    """
    current_word = start_word

    for step in range(n_steps):
        # Nearest neighbours of the current word (similarities are discarded).
        closest_words = [word for word, _ in word2vec_model.most_similar(current_word, topn=n_words)]
        if not closest_words:
            # Nothing to compare against; max() below would fail on an empty dict.
            print(f"Step {step+1}: No improvement.")
            break

        # Score every neighbour, then the current word (same order as before).
        scores = {word: manager.query(word) for word in closest_words}
        best_word, best_score = max(scores.items(), key=lambda x: x[1])

        if best_score > manager.query(current_word):
            current_word = best_word
            print(f"Step {step+1}: New word is '{current_word}' with score {best_score}")
        else:
            print(f"Step {step+1}: No improvement.")
            # The state cannot change any more: stop instead of recomputing
            # the same neighbourhood for the remaining steps.
            break

    return current_word

# Example of usage
optimized_word = optimize_word_closest(word2vec_model, start_word="asteroidal", n_steps=50, n_words=50)
print(f"Optimized word: '{optimized_word}'")

max_word, max_score = manager.get_max_score_word()
print(f"The word with the maximum score is '{max_word}' with a score of {max_score}")


query: floresiensis 0.78
query: olivine_crystals 0.78
query: plagioclase 0.8
query: isotope_ratios 0.76
query: asteroidal 0.92
query: spheroidal 0.83
query: stromatolite 0.84
query: Fig._1b 0.76
query: granitoids 0.8
query: planetesimal 0.88
query: stony_meteorite 0.86
query: phyllosilicates 0.77
query: lunar_meteorites 0.87
query: nebular 0.85
query: mineralogically 0.78
query: circumstellar_disk 0.85
query: Cretaceous_Paleogene_boundary 0.78
query: post_perovskite 0.77
query: graviton 0.82
query: olivines 0.78
query: comet_nuclei 0.82
query: pyroxene 0.79
query: anorthosite 0.81
query: moonlet 0.85
query: electrons_orbiting 0.8
query: lunar_highlands 0.8
query: planktonic_foraminifera 0.8
query: aragonite 0.81
query: hematite_mineral 0.79
query: silicate_mineral 0.78
query: phlogopite 0.78
query: circumstellar 0.85
query: uraninite 0.8
query: magma_ocean 0.8
query: allanite 0.78
Step 1: New word is 'asteroidal' with score 0.92
query: protoplanet 0.84
query: icy_moons 0.8
query: extra

In [None]:
# Function to optimize word based on closest words and their scores
def optimize_word_closest(word2vec_model, start_word, n_steps=5, n_words=10):
    """Walk through word2vec neighbours, always moving to the best unvisited one.

    Unlike a strict hill-climb, every step moves to the highest-scoring
    neighbour that has not been a "current word" yet, even when that score is
    lower than the current one. Scores come from the module-level ``manager``.

    Args:
        word2vec_model: Object exposing ``most_similar(word, topn=...)`` that
            returns ``(word, similarity)`` pairs.
        start_word: Word the walk starts from.
        n_steps: Maximum number of moves.
        n_words: Number of nearest neighbours requested per step.

    Returns:
        The word the walk ended on.
    """
    visited_words = {start_word}
    current_word = start_word

    for step in range(n_steps):
        # Keep only neighbours the walk has not stood on before.
        closest_words = []
        for candidate, _similarity in word2vec_model.most_similar(current_word, topn=n_words):
            if candidate in visited_words:
                continue
            closest_words.append(candidate)

        if len(closest_words) == 0:
            print(f"No unvisited closest words found for '{current_word}'. Ending optimization.")
            break

        # Score each remaining neighbour in neighbour order.
        scores = {}
        for candidate in closest_words:
            scores[candidate] = manager.query(candidate)

        # First key with the highest score wins ties.
        current_word = max(scores, key=scores.get)
        best_score = scores[current_word]
        visited_words.add(current_word)
        print(f"Step {step+1}: New word is '{current_word}' with score {best_score}")

    return current_word



optimized_word = optimize_word_closest(word2vec_model, start_word="asteroidal", n_steps=50, n_words=50)
print(f"Optimized word: '{optimized_word}'")

max_word, max_score = manager.get_max_score_word()
print(f"The word with the maximum score is '{max_word}' with a score of {max_score}")


In [None]:
# Function to optimize word based on closest words and their scores
def optimize_word_closest(word2vec_model, start_word, n_steps=5, n_words=10):
    """Walk through word2vec neighbours, always moving to the best unvisited one.

    Every step moves to the highest-scoring neighbour that has not been a
    "current word" yet, even when that score is lower than the current one.
    Scores come from the module-level ``manager``; a ``KeyError`` raised while
    scoring is reported (offending word, then the error) and re-raised.

    Args:
        word2vec_model: Object exposing ``most_similar(word, topn=...)`` that
            returns ``(word, similarity)`` pairs.
        start_word: Word the walk starts from.
        n_steps: Maximum number of moves.
        n_words: Number of nearest neighbours requested per step.

    Returns:
        The word the walk ended on.

    Raises:
        KeyError: Propagated from ``manager.query``.
    """
    visited_words = {start_word}
    current_word = start_word

    for step in range(n_steps):
        # Keep only neighbours the walk has not stood on before.
        closest_words = []
        for candidate, _similarity in word2vec_model.most_similar(current_word, topn=n_words):
            if candidate in visited_words:
                continue
            closest_words.append(candidate)

        if len(closest_words) == 0:
            print(f"No unvisited closest words found for '{current_word}'. Ending optimization.")
            break

        # Score each remaining neighbour, surfacing the word that broke scoring.
        scores = {}
        for candidate in closest_words:
            try:
                candidate_score = manager.query(candidate)
            except KeyError as e:
                print(candidate)
                print(e)
                raise e
            scores[candidate] = candidate_score

        # First key with the highest score wins ties.
        current_word = max(scores, key=scores.get)
        best_score = scores[current_word]
        visited_words.add(current_word)
        print(f"Step {step+1}: New word is '{current_word}' with score {best_score}")

    return current_word



optimized_word = optimize_word_closest(word2vec_model, start_word="asteroids_comets", n_steps=50, n_words=50)
print(f"Optimized word: '{optimized_word}'")

max_word, max_score = manager.get_max_score_word()
print(f"The word with the maximum score is '{max_word}' with a score of {max_score}")


In [None]:
optimized_word = optimize_word_closest(word2vec_model, start_word="asteroidal", n_steps=5, n_words=200)
print(f"Optimized word: '{optimized_word}'")

max_word, max_score = manager.get_max_score_word()
print(f"The word with the maximum score is '{max_word}' with a score of {max_score}")

In [None]:
query("asteroids", [])

In [None]:

import json

# Your dictionary
data_dict = manager.queried_words

# Writing to json
with open('data.json', 'w', encoding='utf-8') as file:
    json.dump(data_dict, file, ensure_ascii=False, indent=4)


# Second challenge

In [None]:
import requests

input_data = "cat jungle hat space water"

def query(input_data):
    """POST ``input_data`` to the semantle2 scoring endpoint and return the decoded JSON reply."""
    payload = {'data': input_data}
    return requests.post('http://semantle2.advml.com/score', json=payload).json()

query(input_data)

{'message': 0.75}

In [None]:
import requests

class QueryManager:
    """Memoising client for the single-word scoring endpoint.

    Every submitted string is remembered together with its score, so a repeat
    query is answered locally without another HTTP request.
    """

    def __init__(self):
        # Maps each submitted string to the score the endpoint returned for it.
        self.queried_words = {}

    def query(self, input_data):
        """Return the score for ``input_data``, contacting the endpoint only on a cache miss.

        On a miss the score is read from the ``"message"`` field of the JSON
        reply, echoed to stdout and stored in ``queried_words``.
        """
        if input_data not in self.queried_words:
            reply = requests.post('http://semantle.advml.com/score', json={'data': input_data})
            score = reply.json()["message"]
            print(f"query: {input_data} {score}")
            self.queried_words[input_data] = score
        return self.queried_words[input_data]

    def get_max_score_word(self):
        """Return ``(word, score)`` for the highest cached score (first one wins ties).

        Raises ``ValueError`` (from ``max``) when nothing has been queried yet.
        """
        max_key, max_score = max(self.queried_words.items(), key=lambda entry: entry[1])
        return max_key, max_score

# Example usage:
manager = QueryManager()

In [None]:
# Function to optimize word based on closest words and their scores
def optimize_word_closest(word2vec_model, start_word, n_steps=5, n_words=10):
    """Walk through word2vec neighbours, always moving to the best unvisited one.

    Every step moves to the highest-scoring neighbour that has not been a
    "current word" yet, even when that score is lower than the current one.
    Scores come from the module-level ``manager``; a ``KeyError`` raised while
    scoring is reported (offending word, then the error) and re-raised.

    Args:
        word2vec_model: Object exposing ``most_similar(word, topn=...)`` that
            returns ``(word, similarity)`` pairs.
        start_word: Word the walk starts from.
        n_steps: Maximum number of moves.
        n_words: Number of nearest neighbours requested per step.

    Returns:
        The word the walk ended on.

    Raises:
        KeyError: Propagated from ``manager.query``.
    """
    visited_words = {start_word}
    current_word = start_word

    for step in range(n_steps):
        # Keep only neighbours the walk has not stood on before.
        closest_words = []
        for candidate, _similarity in word2vec_model.most_similar(current_word, topn=n_words):
            if candidate in visited_words:
                continue
            closest_words.append(candidate)

        if len(closest_words) == 0:
            print(f"No unvisited closest words found for '{current_word}'. Ending optimization.")
            break

        # Score each remaining neighbour, surfacing the word that broke scoring.
        scores = {}
        for candidate in closest_words:
            try:
                candidate_score = manager.query(candidate)
            except KeyError as e:
                print(candidate)
                print(e)
                raise e
            scores[candidate] = candidate_score

        # First key with the highest score wins ties.
        current_word = max(scores, key=scores.get)
        best_score = scores[current_word]
        visited_words.add(current_word)
        print(f"Step {step+1}: New word is '{current_word}' with score {best_score}")

    return current_word



optimized_word = optimize_word_closest(word2vec_model, start_word="asteroids_comets", n_steps=50, n_words=50)
print(f"Optimized word: '{optimized_word}'")

max_word, max_score = manager.get_max_score_word()
print(f"The word with the maximum score is '{max_word}' with a score of {max_score}")


In [None]:
# Second try

In [None]:
! pip install gensim
! pip install nltk



In [None]:
import gensim.downloader as api
from nltk.corpus import words
import nltk

# Downloading the word2vec word embeddings
word2vec_model = api.load("word2vec-google-news-300")  # you can choose another model if you prefer

# Downloading the words corpus
nltk.download('words')
english_words = words.words()

# Now you can use `word2vec_model` to get the embeddings and `english_words` to get the English words


[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Package words is already up-to-date!


In [None]:
import json
import requests

class QueryManager:
    """Memoising client for the multi-word (semantle2) scoring endpoint.

    Word lists are joined with single spaces; that joined string is both the
    payload sent to the endpoint and the cache key.
    """

    def __init__(self):
        # Maps each space-joined combination to the score returned for it.
        self.queried_words = {}

    def query(self, words_list):
        """Return the score for ``words_list``, contacting the endpoint only on a cache miss.

        ``words_list`` is an iterable of strings. On a miss the raw reply is
        printed when it contains a ``"flag"`` key, the score is read from
        ``"message"``, echoed to stdout and cached.
        """
        words_key = ' '.join(words_list)  # payload and cache key
        cache = self.queried_words
        if words_key not in cache:
            response = requests.post('http://semantle2.advml.com/score', json={'data': words_key})
            response_data = response.json()

            if 'flag' in response_data:
                print(response.text)

            # NOTE(review): a reply without "message" raises KeyError here and
            # is not cached — confirm whether flag replies always carry it.
            score = response_data["message"]
            print(f"query: {words_key} {score}")
            cache[words_key] = score
        return cache[words_key]

    def get_max_score_word(self):
        """Return ``(combination, score)`` for the highest cached score (first one wins ties).

        Raises ``ValueError`` (from ``max``) when nothing has been queried yet.
        """
        max_key, max_score = max(self.queried_words.items(), key=lambda entry: entry[1])
        return max_key, max_score

# Example usage:
manager = QueryManager()

# Reading from json and assigning it to manager's queried_words
# with open('data.json', 'r', encoding='utf-8') as file:
#     manager.queried_words = json.load(file)

# Uncomment the two lines above to pre-populate manager.queried_words from data.json; otherwise the cache starts empty.


manager.query(["dog", "forest", "hat", "space", "wind"])
manager.query(["dog", "jogging", "table", "earth", "wind"])
manager.query(['dogs', 'snowboarding_mountain_biking', 'dining_room', 'mankind', 'wind'])
manager.query(['dogs', 'snowboarding_mountain_biking', 'dining_room', 'mankind', 'gale'])
manager.get_max_score_word()


max_combination, max_score = manager.get_max_score_word()
print(f"The combination with the maximum score is '{max_combination}' with a score of {max_score}")


query: dog forest hat space wind 0.74
query: dog jogging table earth wind 0.77
query: dogs snowboarding_mountain_biking dining_room mankind wind 0.83
query: dogs snowboarding_mountain_biking dining_room mankind gale 0.83
The combination with the maximum score is 'dogs snowboarding_mountain_biking dining_room mankind wind' with a score of 0.83


In [None]:
import numpy as np
import random

def softmax(scores, temperature=1.0):
    """Turn raw scores into a probability distribution with temperature scaling.

    The scaled scores are shifted by their maximum before exponentiation. This
    leaves the result mathematically unchanged but keeps ``np.exp`` from
    overflowing to ``inf`` (and the output from turning into ``nan``) when the
    scores are large or the temperature is small.

    Args:
        scores: Sequence of numeric scores.
        temperature: Divisor applied to the scores before exponentiation;
            must be non-zero.

    Returns:
        numpy array with one probability per score, summing to 1 (an empty
        array for empty input).
    """
    scaled = np.asarray(scores, dtype=float) / temperature
    if scaled.size == 0:
        # Nothing to normalise; max() would fail on an empty array.
        return scaled
    exp_scores = np.exp(scaled - scaled.max(axis=0))
    return exp_scores / exp_scores.sum(axis=0)

def intelligent_sampling(similar_words, word_scores, temperature=1.0):
    """Draw one entry of ``similar_words`` at random, weighted by the softmax of its score.

    ``word_scores`` must hold one score per entry of ``similar_words``.
    """
    weights = softmax(word_scores, temperature)
    positions = range(len(similar_words))
    chosen = np.random.choice(positions, p=weights)
    return similar_words[chosen]

def optimize_word_closest(word2vec_model, start_words, n_steps=5, n_words=20, temperature=0.5):
    """Stochastic coordinate search over five-word combinations.

    For each position in turn, the word is swapped for each of its word2vec
    neighbours, every resulting combination is scored through the module-level
    ``manager``, and one neighbour is drawn with ``intelligent_sampling``. The
    best *sampled* combination seen so far is remembered and becomes the
    starting point of the next step.

    Args:
        word2vec_model: Object exposing ``most_similar(word, topn=...)``.
        start_words: List of exactly five words.
        n_steps: Number of passes over the five positions.
        n_words: Half the number of neighbours requested per word.
        temperature: Softmax temperature used when sampling a neighbour.

    Returns:
        The best sampled combination (a list), or ``[]`` if none was sampled.

    Raises:
        ValueError: If ``start_words`` is not a list of five items.
    """
    if not (isinstance(start_words, list) and len(start_words) == 5):
        raise ValueError("start_words must be a list of exactly five words.")

    current_words = list(start_words)
    best_combination = []
    best_score = float('-inf')

    for step in range(n_steps):
        for i, word in enumerate(current_words):
            # Ask for twice n_words neighbours, then drop those already in use.
            neighbours = word2vec_model.most_similar(word, topn=n_words * 2)
            similar_words = [cand for cand, _ in neighbours if cand not in current_words]

            # One score per candidate, reading the manager's cache directly on a hit.
            word_scores = []
            for cand in similar_words:
                trial = current_words[:i] + [cand] + current_words[i + 1:]
                trial_key = ' '.join(trial)
                known = manager.queried_words
                word_scores.append(known[trial_key] if trial_key in known else manager.query(trial))

            if len(word_scores) == 0:
                continue

            # Sample a replacement for position i and score the result.
            selected_word = intelligent_sampling(similar_words, word_scores, temperature=temperature)
            new_combination = current_words[:i] + [selected_word] + current_words[i + 1:]
            score = manager.query(new_combination)

            if not (score > best_score):
                continue
            best_score, best_combination = score, new_combination[:]
            print(f"Step {step+1}, Word {i+1}: New combination is '{new_combination}' with score {score}")

        # The next pass starts from the best sampled combination.
        current_words = best_combination
        if len(best_combination) == 0:
            print(f"No improvement found in step {step+1}. Ending optimization.")
            break

    return best_combination

# Assuming the existence of the required objects and methods:
# - word2vec_model: A word2vec model object with a most_similar method
# - manager: An instance of QueryManager with an appropriate query method

sw = ['dogs', 'snowboarding_mountain_biking', 'dining_room', 'mankind', 'gale']

optimized_combination = optimize_word_closest(
    word2vec_model, start_words=sw, n_steps=10, n_words=50
)
print(f"Optimized combination: '{optimized_combination}'")

max_combination, max_score = manager.get_max_score_word()
print(f"The combination with the maximum score is '{max_combination}' with a score of {max_score}")


In [None]:
import numpy as np
import random

def genetic_optimization(word2vec_model, start_words, n_steps=5, population_size=10, mutation_rate=0.1):
    """Evolve five-word combinations with selection, crossover and mutation.

    The initial population is ``start_words`` plus random reorderings of it.
    Each generation scores every combination through the module-level
    ``manager``, keeps the top ``population_size // 2`` as survivors, breeds
    children by one-point crossover between survivors, and mutates some
    children by replacing one word with its nearest word2vec neighbour.

    Args:
        word2vec_model: Object exposing ``most_similar(word, topn=...)``.
        start_words: List of exactly five words.
        n_steps: Number of generations.
        population_size: Size of the initial population; half of it is the
            survivor count per generation.
        mutation_rate: Probability that a child is mutated.

    Returns:
        The highest-scoring combination evaluated (first one wins ties).

    Raises:
        ValueError: If ``start_words`` is not a list of five items.
    """
    if not (isinstance(start_words, list) and len(start_words) == 5):
        raise ValueError("start_words must be a list of exactly five words.")

    # Seed population: the start list itself plus shuffled copies of it.
    population = [start_words] + [
        random.sample(start_words, len(start_words)) for _ in range(population_size - 1)
    ]

    best_combination = []
    best_score = float('-inf')

    for step in range(n_steps):
        # Evaluation: score everything and track the overall best.
        scored_population = []
        for combination in population:
            score = manager.query(combination)
            scored_population.append((score, combination))
            if score > best_score:
                best_score, best_combination = score, combination

        # Selection: stable sort by score, highest first, keep the top half.
        ranked = sorted(scored_population, key=lambda pair: pair[0], reverse=True)
        survivors = ranked[:population_size // 2]

        # Crossover: each of the first half of the survivors breeds with every
        # survivor ranked after it, using a random cut point.
        new_population = []
        for i in range(len(survivors) // 2):
            combo1 = survivors[i][1]
            for _, combo2 in survivors[i + 1:]:
                point = random.randint(1, len(combo1) - 2)
                new_population.append(combo1[:point] + combo2[point:])

        # Mutation: swap one randomly chosen word for its nearest neighbour.
        for child in new_population:
            if not (random.random() < mutation_rate):
                continue
            word_to_replace = random.choice(child)
            replacements = word2vec_model.most_similar(word_to_replace, topn=1)
            if replacements:
                child[child.index(word_to_replace)] = replacements[0][0]

        # Next generation: survivors followed by their children.
        population = [combo for _, combo in survivors]
        population.extend(new_population)

        print(f"Step {step+1}: Best combination so far is '{best_combination}' with score {best_score}")

    return best_combination

# Assuming the existence of the required objects and methods:
# - word2vec_model: A word2vec model object with a most_similar method
# - manager: An instance of QueryManager with an appropriate query method

optimized_combination = genetic_optimization(
    word2vec_model, start_words=["word1", "word2", "word3", "word4", "word5"],
    n_steps=50, population_size=20, mutation_rate=0.2
)

print(f"Optimized combination: '{optimized_combination}'")

max_combination, max_score = manager.get_max_score_word()
print(f"The combination with the maximum score is '{max_combination}' with a score of {max_score}")


In [None]:
import numpy as np
import random

def softmax(scores, temperature=1.0):
    """Turn raw scores into a probability distribution with temperature scaling.

    The scaled scores are shifted by their maximum before exponentiation. This
    leaves the result mathematically unchanged but keeps ``np.exp`` from
    overflowing to ``inf`` (and the output from turning into ``nan``) when the
    scores are large or the temperature is small.

    Args:
        scores: Sequence of numeric scores.
        temperature: Divisor applied to the scores before exponentiation;
            must be non-zero.

    Returns:
        numpy array with one probability per score, summing to 1 (an empty
        array for empty input).
    """
    scaled = np.asarray(scores, dtype=float) / temperature
    if scaled.size == 0:
        # Nothing to normalise; max() would fail on an empty array.
        return scaled
    exp_scores = np.exp(scaled - scaled.max(axis=0))
    return exp_scores / exp_scores.sum(axis=0)

def intelligent_sampling(similar_words, word_scores, temperature=1.0):
    """Draw one entry of ``similar_words`` at random, weighted by the softmax of its score.

    ``word_scores`` must hold one score per entry of ``similar_words``.
    """
    weights = softmax(word_scores, temperature)
    positions = range(len(similar_words))
    chosen = np.random.choice(positions, p=weights)
    return similar_words[chosen]

def optimize_word_closest(word2vec_model, start_words, n_steps=5, n_words=20, temperature=0.5):
    """Stochastic coordinate search over five-word combinations with a fallback move.

    For each position in turn, the word is swapped for each of its word2vec
    neighbours, every resulting combination is scored through the module-level
    ``manager``, and one neighbour is drawn with ``intelligent_sampling``.
    Sampled combinations are recorded in a visited set, and the best one seen
    becomes the starting point of the next step.

    When a whole step yields no new best, the search instead moves the last
    processed position to its highest-scoring candidate whose full combination
    has not been visited yet. It stops when no such candidate is left.

    Args:
        word2vec_model: Object exposing ``most_similar(word, topn=...)``.
        start_words: List of exactly five words.
        n_steps: Maximum number of passes over the five positions.
        n_words: Half the number of neighbours requested per word.
        temperature: Softmax temperature used when sampling a neighbour.

    Returns:
        The best sampled combination (a list), or ``[]`` if none was sampled.
        Later exploratory moves never modify it.

    Raises:
        ValueError: If ``start_words`` is not a list of five items.
    """
    if not isinstance(start_words, list) or len(start_words) != 5:
        raise ValueError("start_words must be a list of exactly five words.")

    current_words = start_words[:]
    best_score = -float('inf')
    best_combination = []
    visited_combinations = set()

    for step in range(n_steps):
        improvement_found = False
        for i, word in enumerate(current_words):
            # Ask for twice n_words neighbours, then drop those already in use.
            all_similar_words = word2vec_model.most_similar(word, topn=n_words * 2)
            similar_words = [w for w, _ in all_similar_words if w not in current_words]

            # One score per candidate substituted at position i.
            word_scores = []
            for similar_word in similar_words:
                combined_words = current_words[:]
                combined_words[i] = similar_word
                combined_words_str = ' '.join(combined_words)
                if combined_words_str in manager.queried_words:
                    word_scores.append(manager.queried_words[combined_words_str])
                else:
                    word_scores.append(manager.query(combined_words))

            # No candidates for this position: move on to the next one.
            if not word_scores:
                continue

            # Sample a replacement and evaluate it unless already visited.
            selected_word = intelligent_sampling(similar_words, word_scores, temperature=temperature)
            new_combination = current_words[:]
            new_combination[i] = selected_word
            new_combination_str = ' '.join(new_combination)
            if new_combination_str not in visited_combinations:
                visited_combinations.add(new_combination_str)
                score = manager.query(new_combination)
                if score > best_score:
                    best_score = score
                    best_combination = new_combination[:]
                    improvement_found = True
                    print(f"Step {step+1}, Word {i+1}: New combination is '{new_combination}' with score {score}")

        if improvement_found:
            # Copy so that later in-place edits of current_words can never
            # alter the combination that will be returned.
            current_words = best_combination[:]
            continue

        # Fallback: i, similar_words and word_scores still describe the last
        # position processed above. Keep candidates whose *full* combination
        # is unvisited (the check must use the joined combination, not the
        # single candidate word).
        unseen = []
        for candidate_score, candidate_word in zip(word_scores, similar_words):
            trial = current_words[:]
            trial[i] = candidate_word
            if ' '.join(trial) not in visited_combinations:
                unseen.append((candidate_score, candidate_word))

        if not unseen:
            print(f"No improvement found in step {step+1}. Ending optimization.")
            break

        # Highest score first; ties resolve to the lexicographically larger word.
        _, new_word = max(unseen)
        current_words = current_words[:]
        current_words[i] = new_word
        visited_combinations.add(' '.join(current_words))

    return best_combination

# Assuming the existence of the required objects and methods
optimized_combination = optimize_word_closest(
    word2vec_model, start_words=['dogs', 'snowboarding_mountain_biking', 'dining_room', 'mankind', 'gale'],
    n_steps=10, n_words=50
)
print(f"Optimized combination: '{optimized_combination}'")

max_combination, max_score = manager.get_max_score_word()
print(f"The combination with the maximum score is '{max_combination}' with a score of {max_score}")


In [None]:

import json

# Your dictionary
data_dict = manager.queried_words

# Writing to json
with open('data.json', 'w', encoding='utf-8') as file:
    json.dump(data_dict, file, ensure_ascii=False, indent=4)


In [None]:
# print(f"Optimized combination: '{optimized_combination}'")

max_combination, max_score = manager.get_max_score_word()
print(f"The combination with the maximum score is '{max_combination}' with a score of {max_score}")


The combination with the maximum score is 'dogs snowboarding_mountain_biking dining_room mankind Repair_Remote_Monitoring' with a score of 0.84


In [None]:
sw = ['snowboarder', 'mankind', 'ski', 'human_beings', 'diningroom']

manager.query(sw)


max_combination, max_score = manager.get_max_score_word()
print(f"The combination with the maximum score is '{max_combination}' with a score of {max_score}")


query: snowboarder mankind ski human_beings diningroom 0.85
The combination with the maximum score is 'snowboarder mankind ski human_beings diningroom' with a score of 0.85


In [None]:
import numpy as np
import itertools
import random

def softmax(scores, temperature=1.0):
    """Turn raw scores into a probability distribution with temperature scaling.

    The scaled scores are shifted by their maximum before exponentiation. This
    leaves the result mathematically unchanged but keeps ``np.exp`` from
    overflowing to ``inf`` (and the output from turning into ``nan``) when the
    scores are large or the temperature is small.

    Args:
        scores: Sequence of numeric scores.
        temperature: Divisor applied to the scores before exponentiation;
            must be non-zero.

    Returns:
        numpy array with one probability per score, summing to 1 (an empty
        array for empty input).
    """
    scaled = np.asarray(scores, dtype=float) / temperature
    if scaled.size == 0:
        # Nothing to normalise; max() would fail on an empty array.
        return scaled
    exp_scores = np.exp(scaled - scaled.max(axis=0))
    return exp_scores / exp_scores.sum(axis=0)

def intelligent_sampling(similar_words, word_scores, temperature=1.0):
    """Draw one entry of ``similar_words`` at random, weighted by the softmax of its score.

    ``word_scores`` must hold one score per entry of ``similar_words``.
    """
    weights = softmax(word_scores, temperature)
    positions = range(len(similar_words))
    chosen = np.random.choice(positions, p=weights)
    return similar_words[chosen]

def optimize_two_words_sampling(word2vec_model, start_words, n_steps=5, n_words=20, temperature=0.5, combinations_to_sample=10):
    """Search five-word combinations by replacing two positions at a time.

    Every step visits all position pairs in random order. For each pair, the
    neighbour lists of the two words are crossed, shuffled and truncated to
    ``combinations_to_sample`` replacement pairs, and each resulting unvisited
    combination is scored through the module-level ``manager``. The search
    continues from the best combination found and stops after the first step
    that brings no new best.

    Args:
        word2vec_model: Object exposing ``most_similar(word, topn=...)``.
        start_words: List of exactly five words.
        n_steps: Maximum number of steps.
        n_words: Number of neighbours requested per word.
        temperature: Accepted for signature parity; not used by this function.
        combinations_to_sample: Maximum replacement pairs tried per position pair.

    Returns:
        The best combination scored (a list), or ``[]`` if none was scored.

    Raises:
        ValueError: If ``start_words`` is not a list of five items.
    """
    if not (isinstance(start_words, list) and len(start_words) == 5):
        raise ValueError("start_words must be a list of exactly five words.")

    current_words = list(start_words)
    best_combination = []
    best_score = float('-inf')
    visited_combinations = set()

    for step in range(n_steps):
        improvement_found = False

        # All (i, j) with i < j, visited in random order.
        index_pairs = list(itertools.combinations(range(len(current_words)), 2))
        random.shuffle(index_pairs)

        for i, j in index_pairs:
            neighbours_i = [w for w, _ in word2vec_model.most_similar(current_words[i], topn=n_words)]
            neighbours_j = [w for w, _ in word2vec_model.most_similar(current_words[j], topn=n_words)]

            # Cross the two neighbour lists, shuffle, and keep at most
            # combinations_to_sample replacement pairs.
            all_combinations = [(a, b) for a in neighbours_i for b in neighbours_j]
            random.shuffle(all_combinations)

            for sim_word1, sim_word2 in all_combinations[:combinations_to_sample]:
                if sim_word1 == sim_word2:
                    continue
                if sim_word1 in current_words or sim_word2 in current_words:
                    continue

                new_combination = current_words[:]
                new_combination[i] = sim_word1
                new_combination[j] = sim_word2
                combination_key = ' '.join(new_combination)

                if combination_key in visited_combinations:
                    continue
                visited_combinations.add(combination_key)

                score = manager.query(new_combination)
                if score > best_score:
                    best_score, best_combination = score, new_combination
                    improvement_found = True
                    print(f"Step {step+1}: New best combination is '{best_combination}' with score {best_score}")

        if not improvement_found:
            print(f"No improvement found in step {step+1}. Ending optimization.")
            break
        current_words = best_combination

    return best_combination

# Assuming the existence of the required objects and methods
sw = ['snowboarder', 'mankind', 'ski', 'human_beings', 'diningroom']

optimized_combination = optimize_two_words_sampling(
    word2vec_model,
    start_words=sw,
    n_steps=10, n_words=50, combinations_to_sample=20
)
print(f"Optimized combination: '{optimized_combination}'")

max_combination, max_score = manager.get_max_score_word()
print(f"The combination with the maximum score is '{max_combination}' with a score of {max_score}")


query: freestyle_skiers mankind ski civilized_society diningroom 0.81
Step 1: New best combination is '['freestyle_skiers', 'mankind', 'ski', 'civilized_society', 'diningroom']' with score 0.81
query: skiiers mankind ski Humans diningroom 0.82
Step 1: New best combination is '['skiiers', 'mankind', 'ski', 'Humans', 'diningroom']' with score 0.82
query: snowmobiler mankind ski mortals diningroom 0.82
query: freeskiing mankind ski sentient_beings diningroom 0.81
query: Snowboarder mankind ski sentient_beings diningroom 0.82
query: downhiller mankind ski Untermenschen diningroom 0.81
query: snowboard mankind ski fallible_humans diningroom 0.82
query: snowboarders mankind ski homo_sapiens diningroom 0.82
query: Tricia_Byrnes mankind ski humanness diningroom 0.82
query: snowboarders mankind ski sentient_creatures diningroom 0.82
query: skiers mankind ski humankind diningroom 0.82
query: freeskiing mankind ski beings diningroom 0.82
query: Jossi_Wells mankind ski subhumans diningroom 0.8
que

In [None]:
import numpy as np
import itertools
import random

def softmax(scores, temperature=1.0):
    """Turn raw scores into a probability distribution with temperature scaling.

    The scaled scores are shifted by their maximum before exponentiation. This
    leaves the result mathematically unchanged but keeps ``np.exp`` from
    overflowing to ``inf`` (and the output from turning into ``nan``) when the
    scores are large or the temperature is small.

    Args:
        scores: Sequence of numeric scores.
        temperature: Divisor applied to the scores before exponentiation;
            must be non-zero.

    Returns:
        numpy array with one probability per score, summing to 1 (an empty
        array for empty input).
    """
    scaled = np.asarray(scores, dtype=float) / temperature
    if scaled.size == 0:
        # Nothing to normalise; max() would fail on an empty array.
        return scaled
    exp_scores = np.exp(scaled - scaled.max(axis=0))
    return exp_scores / exp_scores.sum(axis=0)

def intelligent_sampling(similar_words, word_scores, temperature=1.0):
    """Draw one entry of ``similar_words`` at random, weighted by the softmax of its score.

    ``word_scores`` must hold one score per entry of ``similar_words``.
    """
    weights = softmax(word_scores, temperature)
    positions = range(len(similar_words))
    chosen = np.random.choice(positions, p=weights)
    return similar_words[chosen]

def optimize_all_words_sampling(word2vec_model, start_words, n_steps=5, n_words=20, temperature=0.5,
                                combinations_to_sample=None):
    """Search five-word combinations by replacing all five words at once.

    Each step takes the ``n_words`` nearest neighbours of every current word
    and scores combinations built from one neighbour per position through the
    module-level ``manager``. The search continues from the best combination
    found so far and stops after the first step that brings no new best.

    Args:
        word2vec_model: Object exposing ``most_similar(word, topn=...)``.
        start_words: List of exactly five words.
        n_steps: Maximum number of steps.
        n_words: Number of neighbours requested per word.
        temperature: Accepted for signature parity; not used by this function.
        combinations_to_sample: ``None`` (default) tries every combination in
            random order, i.e. up to ``n_words ** 5`` queries per step. An
            integer instead draws that many random combinations per step,
            which keeps the number of queries bounded.

    Returns:
        The best combination scored (a list), or ``[]`` if none was scored.

    Raises:
        ValueError: If ``start_words`` is not a list of five items.
    """
    if not isinstance(start_words, list) or len(start_words) != 5:
        raise ValueError("start_words must be a list of exactly five words.")

    current_words = start_words[:]
    best_score = -float('inf')
    best_combination = []
    visited_combinations = set()

    for step in range(n_steps):
        # Neighbour words per position (similarity values are dropped).
        neighbour_lists = [
            [w for w, _ in word2vec_model.most_similar(word, topn=n_words)]
            for word in current_words
        ]

        if combinations_to_sample is None:
            # Exhaustive: every combination, visited in random order.
            candidates = list(itertools.product(*neighbour_lists))
            random.shuffle(candidates)
        elif all(neighbour_lists):
            # Bounded: draw one random neighbour per position, repeatedly.
            candidates = [
                tuple(random.choice(options) for options in neighbour_lists)
                for _ in range(combinations_to_sample)
            ]
        else:
            # Some position has no neighbours, so no combination can be built.
            candidates = []

        # Tracked per step, so the stop condition reflects this step only.
        improvement_found = False
        for candidate in candidates:
            new_combination = list(candidate)
            # Skip combinations that reuse any word of the current list.
            if any(word in current_words for word in new_combination):
                continue

            new_combination_str = ' '.join(new_combination)
            if new_combination_str in visited_combinations:
                continue
            visited_combinations.add(new_combination_str)

            score = manager.query(new_combination)
            if score > best_score:
                best_score = score
                best_combination = new_combination[:]
                improvement_found = True
                print(f"Step {step+1}: New best combination is '{best_combination}' with score {best_score}")

        if not improvement_found:
            print(f"No improvement found in step {step+1}. Ending optimization.")
            break
        # Only reached after an improvement, so best_combination is non-empty.
        current_words = best_combination[:]

    return best_combination


sw = ['snowboarder', 'mankind', 'ski', 'human_beings', 'diningroom']
# Assuming the existence of the required objects and methods
optimized_combination = optimize_all_words_sampling(
    word2vec_model, start_words=sw,
    n_steps=10, n_words=20
)
print(f"Optimized combination: '{optimized_combination}'")

max_combination, max_score = manager.get_max_score_word()
print(f"The combination with the maximum score is '{max_combination}' with a score of {max_score}")


In [None]:
max_combination, max_score = manager.get_max_score_word()
print(f"The combination with the maximum score is '{max_combination}' with a score of {max_score}")


The combination with the maximum score is 'snowboarder mankind ski human_beings diningroom' with a score of 0.85


In [None]:
data_dict = manager.queried_words

# Writing to json
with open('data.json', 'w', encoding='utf-8') as file:
    json.dump(data_dict, file, ensure_ascii=False, indent=4)


In [None]:
sw =  ["sandwich", "banana", "snowing", "awareness", "housepets"]

optimized_combination = optimize_word_closest(
    word2vec_model, start_words=sw,
    n_steps=20, n_words=50
)
print(f"Optimized combination: '{optimized_combination}'")

max_combination, max_score = manager.get_max_score_word()
print(f"The combination with the maximum score is '{max_combination}' with a score of {max_score}")


In [None]:
max_combination, max_score = manager.get_max_score_word()
print(f"The combination with the maximum score is '{max_combination}' with a score of {max_score}")

The combination with the maximum score is 'snowboarder mankind ski human_beings diningroom' with a score of 0.85


In [None]:
manager.query(["snow", "snow", "snow", "snow", "snow"])

query: snow snow snow snow snow 0.8


0.8

In [None]:
manager.query(["ski", "ski", "snow", "ski", "snow"])

query: ski ski snow ski snow 0.81


0.81

In [None]:
manager.query(["ski", "ski", "ski", "ski", "ski"])

query: ski ski ski ski ski 0.8


0.8

In [None]:
manager.query(["human", "human", "human", "human", "ski"])

query: human human human human ski 0.82


0.82

In [None]:
manager.query(["human", "human", "human", "human", "human"])

query: human human human human human 0.83


0.83

In [None]:
manager.query(["human", "person", "snowman", "woman", "snow"])

0.85

In [None]:
manager.query(["snowing", "falling", "snowman", "woman", "snowaaabadasdfas"])

query: snowing falling snowman woman snowaaabadasdfas 0.83


0.83

In [None]:
manager.query(["snowing", "skldahfaipuwohf", "snowman", "woman", "snowaaabadasdfas"])

query: snowing skldahfaipuwohf snowman woman snowaaabadasdfas 0.82


0.82

In [None]:
manager.query(["asdkl;fjaslkj", "skldahfaipuwohf", "aeawf", "a", "snowaaabadasdfas"])

query: asdkl;fjaslkj skldahfaipuwohf aeawf a snowaaabadasdfas 0.77


0.77

In [None]:
# Probe with five unrelated short tokens. The comma after "gh" matters: without
# it Python concatenates the adjacent literals and only four tokens are sent.
manager.query(["ab", "cd", "ef", "gh", "ijkas"])

query: ab cd ef ghijkas 0.78


0.78

In [None]:
max_combination, max_score = manager.get_max_score_word()
print(f"The combination with the maximum score is '{max_combination}' with a score of {max_score}")

The combination with the maximum score is 'snowboarder mankind ski human_beings diningroom' with a score of 0.85


In [None]:
import numpy as np
import random

def softmax(scores, temperature=1.0):
    """Turn raw scores into a probability distribution with temperature scaling.

    The scaled scores are shifted by their maximum before exponentiation. This
    leaves the result mathematically unchanged but keeps ``np.exp`` from
    overflowing to ``inf`` (and the output from turning into ``nan``) when the
    scores are large or the temperature is small.

    Args:
        scores: Sequence of numeric scores.
        temperature: Divisor applied to the scores before exponentiation;
            must be non-zero.

    Returns:
        numpy array with one probability per score, summing to 1 (an empty
        array for empty input).
    """
    scaled = np.asarray(scores, dtype=float) / temperature
    if scaled.size == 0:
        # Nothing to normalise; max() would fail on an empty array.
        return scaled
    exp_scores = np.exp(scaled - scaled.max(axis=0))
    return exp_scores / exp_scores.sum(axis=0)

def intelligent_sampling(similar_words, word_scores, temperature=1.0):
    """Draw one candidate at random, weighted by the softmax of its score.

    Args:
        similar_words: Candidate words, aligned index-for-index with ``word_scores``.
        word_scores: Score of each candidate.
        temperature: Temperature forwarded to ``softmax``.

    Returns:
        The element of ``similar_words`` at the sampled index.
    """
    weights = softmax(word_scores, temperature)
    positions = range(len(similar_words))
    picked = np.random.choice(positions, p=weights)
    return similar_words[picked]

def optimize_word_closest(word2vec_model, start_words, n_steps=5, n_words=20, n_words_random=10, temperature=0.5):
    """Search for a higher-scoring five-word phrase by swapping one word at a time.

    In every step each position of the current phrase is visited in turn. The word
    there is replaced by every candidate (embedding neighbours plus random words
    from the global ``english_words``), each resulting phrase is scored through
    the global ``manager``, and one candidate is then drawn with softmax weights.
    The drawn phrase becomes the best-so-far when its score is at least the best
    score; the best-so-far phrase seeds the next step.

    Args:
        word2vec_model: Object exposing ``most_similar(word, topn=...)`` that
            yields ``(word, similarity)`` pairs.
        start_words: List of exactly five words to start from.
        n_steps: Number of passes over the five positions.
        n_words: Half the number of neighbours requested per position
            (``topn`` is ``n_words * 2``).
        n_words_random: Number of random words sampled per position.
        temperature: Softmax temperature used when drawing the replacement.

    Returns:
        The best phrase found, as a list of words, or an empty list if no
        candidate was ever scored.

    Raises:
        ValueError: If ``start_words`` is not a list of exactly five words.
    """
    if not isinstance(start_words, list) or len(start_words) != 5:
        raise ValueError("start_words must be a list of exactly five words.")

    current_words = start_words[:]
    best_score = -float('inf')
    best_combination = []

    all_words = english_words

    for step in range(n_steps):
        # current_words is only replaced after the inner loop, so every position
        # in this step is varied against the same starting phrase.
        for i, word in enumerate(current_words):
            # Ask for the nearest neighbours of the word at this position.
            try:
                all_similar_words = word2vec_model.most_similar(word, topn=n_words * 2)
            except KeyError:
                # A random corpus word picked in an earlier step may be unknown to
                # the embedding model; continue with random candidates only.
                all_similar_words = []
            similar_words = [w for w, _ in all_similar_words if w not in current_words]

            # Get n_words_random random words from the corpus
            random_words = random.sample(all_words, n_words_random)
            random_words = [w for w in random_words if w not in current_words]

            # Combine similar and random words
            candidate_words = similar_words + random_words

            # Gather scores for candidate words to enable intelligent sampling
            word_scores = []
            for candidate_word in candidate_words:
                combined_words = current_words[:]
                combined_words[i] = candidate_word
                combined_words_str = ' '.join(combined_words)
                if combined_words_str in manager.queried_words:
                    word_scores.append(manager.queried_words[combined_words_str])
                else:
                    score = manager.query(combined_words)
                    word_scores.append(score)

            # Nothing to sample from for this position: move on to the next word
            if not word_scores:
                continue

            # Select a word using intelligent sampling
            selected_word = intelligent_sampling(candidate_words, word_scores, temperature=temperature)
            new_combination = current_words[:]
            new_combination[i] = selected_word
            score = manager.query(new_combination)

            # Update the best score and combination found so far (ties are accepted)
            if score >= best_score:
                best_score = score
                best_combination = new_combination[:]
                print(f"Step {step+1}, Word {i+1}: New combination is '{new_combination}' with score {score}")

        current_words = best_combination
        # best_combination is still empty only when no candidate has been scored at all.
        if not best_combination:
            print(f"No improvement found in step {step+1}. Ending optimization.")
            break

    return best_combination

# Objects this cell expects to exist already in the kernel:
# - word2vec_model: embedding model; the optimizer only calls its most_similar method
# - manager: QueryManager instance providing query() and get_max_score_word()
# - english_words: word list the optimizer samples its random candidates from

# Earlier seed kept for reference; it is overwritten by the next assignment.
sw = ['dogs', 'snowboarding_mountain_biking', 'dining_room', 'mankind', 'gale']
sw = ["human", "person", "snowman", "woman", "snow"]

optimized_combination = optimize_word_closest(
    word2vec_model, start_words=sw, n_steps=10, n_words=50, n_words_random=50
)
print(f"Optimized combination: '{optimized_combination}'")

# Report the best phrase across everything the manager has cached, not just this run.
max_combination, max_score = manager.get_max_score_word()
print(f"The combination with the maximum score is '{max_combination}' with a score of {max_score}")


In [None]:
len(english_words)

236736

In [None]:
sw = 'snowboarder mankind telemarkers human_beings diningroom'.split()
manager.query(sw)

query: snowboarder mankind telemarkers human_beings diningroom 0.86


0.86

In [None]:
max_combination, max_score = manager.get_max_score_word()
print(f"The combination with the maximum score is '{max_combination}' with a score of {max_score}")


The combination with the maximum score is 'snowboarder mankind telemarkers human_beings diningroom' with a score of 0.86


In [None]:

import json

# Your dictionary
data_dict = manager.queried_words

# Writing to json
with open('data.json', 'w', encoding='utf-8') as file:
    json.dump(data_dict, file, ensure_ascii=False, indent=4)


In [None]:
# Build `common_words`: the most frequent lower-cased tokens of the Brown corpus.
import nltk
from nltk.corpus import brown

nltk.download('brown')
freq_dist = nltk.FreqDist(w.lower() for w in brown.words())
common_words = [word for word, count in freq_dist.most_common(20_000)]  # keep the 20,000 most frequent tokens

print(len(common_words))

In [None]:
import numpy as np
import random
from json.decoder import JSONDecodeError



def softmax(scores, temperature=1.0):
    """Convert a 1-D sequence of scores into a probability distribution.

    Args:
        scores: 1-D sequence of numeric scores.
        temperature: Divisor applied to the scores before exponentiation.

    Returns:
        A NumPy array with one probability per score, summing to 1
        (an empty array when ``scores`` is empty).
    """
    scaled = np.asarray(scores, dtype=float) / temperature
    if scaled.size == 0:
        return scaled
    # Shifting by the maximum leaves the result mathematically unchanged but keeps
    # np.exp from overflowing to inf (and the ratio from turning into nan).
    exp_scores = np.exp(scaled - scaled.max())
    return exp_scores / exp_scores.sum()

def intelligent_sampling(similar_words, word_scores, temperature=1.0):
    """Draw one candidate at random, weighted by the softmax of its score.

    Args:
        similar_words: Candidate words, aligned index-for-index with ``word_scores``.
        word_scores: Score of each candidate.
        temperature: Temperature forwarded to ``softmax``.

    Returns:
        The element of ``similar_words`` at the sampled index.
    """
    weights = softmax(word_scores, temperature)
    positions = range(len(similar_words))
    picked = np.random.choice(positions, p=weights)
    return similar_words[picked]

def optimize_word_closest(word2vec_model, start_words, n_steps=5, n_words=20, n_words_random=10, temperature=0.5):
    """Search for a higher-scoring five-word phrase by swapping one word at a time.

    In every step each position of the current phrase is visited in turn. The word
    there is replaced by every candidate (embedding neighbours plus random words
    from the global ``common_words``), each resulting phrase is scored through
    the global ``manager``, and one candidate is then drawn with softmax weights.
    The drawn phrase becomes the best-so-far when its score is at least the best
    score; the best-so-far phrase seeds the next step.

    Args:
        word2vec_model: Object exposing ``most_similar(word, topn=...)`` that
            yields ``(word, similarity)`` pairs.
        start_words: List of exactly five words to start from.
        n_steps: Number of passes over the five positions.
        n_words: Half the number of neighbours requested per position
            (``topn`` is ``n_words * 2``).
        n_words_random: Number of random words sampled per position.
        temperature: Softmax temperature used when drawing the replacement.

    Returns:
        The best phrase found, as a list of words, or an empty list if no
        candidate was ever scored.

    Raises:
        ValueError: If ``start_words`` is not a list of exactly five words.
        JSONDecodeError: Re-raised after logging when a score reply cannot be decoded.
    """
    if not isinstance(start_words, list) or len(start_words) != 5:
        raise ValueError("start_words must be a list of exactly five words.")

    current_words = start_words[:]
    best_score = -float('inf')
    best_combination = []

    all_words = common_words

    for step in range(n_steps):
        # current_words is only replaced after the inner loop, so every position
        # in this step is varied against the same starting phrase.
        for i, word in enumerate(current_words):
            # Ask for the nearest neighbours of the word at this position.
            try:
                all_similar_words = word2vec_model.most_similar(word, topn=n_words * 2)
            except KeyError:
                # A random corpus word picked in an earlier step may be unknown to
                # the embedding model; continue with random candidates only.
                all_similar_words = []
            similar_words = [w for w, _ in all_similar_words if w not in current_words]

            # Get n_words_random random words from the corpus
            random_words = random.sample(all_words, n_words_random)
            random_words = [w for w in random_words if w not in current_words]

            # Combine similar and random words
            candidate_words = similar_words + random_words

            # Gather scores for candidate words to enable intelligent sampling
            word_scores = []
            for candidate_word in candidate_words:
                combined_words = current_words[:]
                combined_words[i] = candidate_word
                combined_words_str = ' '.join(combined_words)
                if combined_words_str in manager.queried_words:
                    word_scores.append(manager.queried_words[combined_words_str])
                else:
                    try:
                        score = manager.query(combined_words)
                    except JSONDecodeError:
                        # Show which phrase produced the undecodable reply, then propagate.
                        print("An error occurred while trying to decode the JSON response.")
                        print(combined_words)
                        raise
                    word_scores.append(score)

            # Nothing to sample from for this position: move on to the next word
            if not word_scores:
                continue

            # Select a word using intelligent sampling
            selected_word = intelligent_sampling(candidate_words, word_scores, temperature=temperature)
            new_combination = current_words[:]
            new_combination[i] = selected_word
            score = manager.query(new_combination)

            # Update the best score and combination found so far (ties are accepted)
            if score >= best_score:
                best_score = score
                best_combination = new_combination[:]
                print(f"Step {step+1}, Word {i+1}: New combination is '{new_combination}' with score {score}")

        current_words = best_combination
        # best_combination is still empty only when no candidate has been scored at all.
        if not best_combination:
            print(f"No improvement found in step {step+1}. Ending optimization.")
            break

    return best_combination

# Objects this cell expects to exist already in the kernel:
# - word2vec_model: embedding model; the optimizer only calls its most_similar method
# - manager: QueryManager instance providing query() and get_max_score_word()
# - common_words: word list the optimizer samples its random candidates from

# Earlier seeds kept for reference; only the last assignment to `sw` takes effect.
sw = ['dogs', 'snowboarding_mountain_biking', 'dining_room', 'mankind', 'gale']
sw = ["human", "person", "snowman", "woman", "snow"]
sw = 'snowboarder mankind telemarkers human_beings diningroom'.split()

optimized_combination = optimize_word_closest(
    word2vec_model, start_words=sw, n_steps=10, n_words=10, n_words_random=20
)
print(f"Optimized combination: '{optimized_combination}'")

# Report the best phrase across everything the manager has cached, not just this run.
max_combination, max_score = manager.get_max_score_word()
print(f"The combination with the maximum score is '{max_combination}' with a score of {max_score}")


In [None]:
max_combination, max_score = manager.get_max_score_word()
print(f"The combination with the maximum score is '{max_combination}' with a score of {max_score}")


The combination with the maximum score is 'snowboarder mankind telemarkers human_beings diningroom' with a score of 0.86


In [None]:
def get_all_max_score_words(manager):
    """Return every cached combination that ties for the highest score.

    Args:
        manager: Object whose ``queried_words`` attribute maps a phrase to its score.

    Returns:
        A ``(max_combinations, max_score)`` tuple: the list of phrases sharing the
        top score, in cache order, and that score. ``([], None)`` when the cache
        is empty.
    """
    scored = manager.queried_words
    if not scored:
        # max() would raise ValueError on an empty cache; report "nothing yet" instead.
        return [], None

    max_score = max(scored.values())
    max_combinations = [key for key, value in scored.items() if value == max_score]
    return max_combinations, max_score


# Usage:
max_combinations, max_score = get_all_max_score_words(manager)
for combination in max_combinations:
    print(f"The combination '{combination}' has a score of {max_score}")


The combination 'snowboarder mankind telemarkers human_beings diningroom' has a score of 0.86
The combination 'avid_snowboarder mankind telemarkers human_beings diningroom' has a score of 0.86
The combination 'snowboarder mankind telemarkers Human_beings diningroom' has a score of 0.86
The combination 'snowboarder mankind telemarkers humans diningroom' has a score of 0.86
The combination 'snowboarder mankind telemarkers humanity diningroom' has a score of 0.86
The combination 'snowboarder mankind telemarkers human diningroom' has a score of 0.86
The combination 'snowboarder mankind telemarkers Humans diningroom' has a score of 0.86
The combination 'snowboarder mankind telemarkers humanness diningroom' has a score of 0.86
The combination 'snowboarder mankind telemarkers humanbeing diningroom' has a score of 0.86
The combination 'snowboarder mankind telemarkers human_beings stainless_appliances' has a score of 0.86
The combination 'cameras mankind telemarkers humanity Palladian_window' h

In [None]:
manager.query("snowboarder mankind telemarkers sun-tan tile_backsplashes".split())

query: snowboarder mankind telemarkers sun-tan tile_backsplashes 0.83


0.83

In [None]:
data_dict = manager.queried_words

# Writing to json
with open('data.json', 'w', encoding='utf-8') as file:
    json.dump(data_dict, file, ensure_ascii=False, indent=4)


In [None]:
max_combination, max_score = manager.get_max_score_word()
print(f"The combination with the maximum score is '{max_combination}' with a score of {max_score}")





data_dict = manager.queried_words

# Writing to json
with open('data.json', 'w', encoding='utf-8') as file:
    json.dump(data_dict, file, ensure_ascii=False, indent=4)


The combination with the maximum score is 'snowboarder mankind telemarkers human_beings diningroom' with a score of 0.86


In [None]:
sw = 'snowboarder mankind telemarkers human diningroom'.split()




optimized_combination = optimize_word_closest(
    word2vec_model, start_words=sw, n_steps=50, n_words=100, n_words_random=100
)
print(f"Optimized combination: '{optimized_combination}'")

max_combination, max_score = manager.get_max_score_word()
print(f"The combination with the maximum score is '{max_combination}' with a score of {max_score}")





data_dict = manager.queried_words

# Writing to json
with open('data.json', 'w', encoding='utf-8') as file:
    json.dump(data_dict, file, ensure_ascii=False, indent=4)


In [None]:
manager.query(['snowboarder', 'mankind', 'telemarkers', 'human', 'glazed_atrium'])

query: snowboarder mankind telemarkers human glazed_atrium 0.84


0.84

In [None]:
# Rebuild `common_words` from the Brown corpus with a larger cut-off.
import nltk
from nltk.corpus import brown

nltk.download('brown')
freq_dist = nltk.FreqDist(w.lower() for w in brown.words())
common_words = [word for word, count in freq_dist.most_common(50_000)]  # request up to 50,000 tokens; the recorded run yielded 49,815

print(len(common_words))

[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Package brown is already up-to-date!


49815


In [None]:
import numpy as np
import itertools
import random

def softmax(scores, temperature=1.0):
    """Convert a 1-D sequence of scores into a probability distribution.

    Args:
        scores: 1-D sequence of numeric scores.
        temperature: Divisor applied to the scores before exponentiation.

    Returns:
        A NumPy array with one probability per score, summing to 1
        (an empty array when ``scores`` is empty).
    """
    scaled = np.asarray(scores, dtype=float) / temperature
    if scaled.size == 0:
        return scaled
    # Shifting by the maximum leaves the result mathematically unchanged but keeps
    # np.exp from overflowing to inf (and the ratio from turning into nan).
    exp_scores = np.exp(scaled - scaled.max())
    return exp_scores / exp_scores.sum()

def intelligent_sampling(similar_words, word_scores, temperature=1.0):
    """Draw one candidate at random, weighted by the softmax of its score.

    Args:
        similar_words: Candidate words, aligned index-for-index with ``word_scores``.
        word_scores: Score of each candidate.
        temperature: Temperature forwarded to ``softmax``.

    Returns:
        The element of ``similar_words`` at the sampled index.
    """
    weights = softmax(word_scores, temperature)
    positions = range(len(similar_words))
    picked = np.random.choice(positions, p=weights)
    return similar_words[picked]

def optimize_all_words_sampling(word2vec_model, start_words, n_steps=5, n_words=20, n_words_random=20, temperature=0.5,
                                patience=10, max_combinations_per_step=None):
    """Search for a higher-scoring five-word phrase by replacing all words at once.

    Each step builds a candidate pool per position (embedding neighbours plus
    random words from the global ``common_words``, minus the words of the current
    phrase), then scores combinations of those pools in random order through the
    global ``manager``. The best phrase seen so far seeds the next step.

    Args:
        word2vec_model: Object exposing ``most_similar(word, topn=...)`` that
            yields ``(word, similarity)`` pairs.
        start_words: List of exactly five words to start from.
        n_steps: Maximum number of steps.
        n_words: Number of neighbours requested per position.
        n_words_random: Number of random words sampled per position.
        temperature: Accepted for signature compatibility; not used.
        patience: Stop once more than this many consecutive steps bring no
            improvement.
        max_combinations_per_step: Upper bound on the combinations drawn per
            step. ``None`` tries every combination, which is only practical
            for small pools because the pool sizes multiply.

    Returns:
        The best phrase found, as a list of words, or an empty list if nothing
        was scored.

    Raises:
        ValueError: If ``start_words`` is not a list of exactly five words.
        JSONDecodeError: Re-raised after logging when a score reply cannot be decoded.
    """
    # Imported here so the function does not depend on another cell's import.
    from json.decoder import JSONDecodeError

    if not isinstance(start_words, list) or len(start_words) != 5:
        raise ValueError("start_words must be a list of exactly five words.")

    current_words = start_words[:]
    best_score = -float('inf')
    best_combination = []
    visited_combinations = set()

    all_words = common_words

    steps_without_improvement = 0
    for step in range(n_steps):
        # No word of the current phrase may reappear in a new combination.
        excluded_words = set(current_words)

        candidate_lists = []
        for word in current_words:
            try:
                # most_similar yields (word, similarity) pairs; keep only the words so
                # they can be mixed with the plain-string random words below.
                neighbours = [w for w, _ in word2vec_model.most_similar(word, topn=n_words)]
            except KeyError:
                # The word is unknown to the embedding model: use random words only.
                neighbours = []
            random_words = random.sample(all_words, n_words_random)
            candidate_lists.append([w for w in neighbours + random_words if w not in excluded_words])

        total_combinations = 1
        for candidates in candidate_lists:
            total_combinations *= len(candidates)

        if max_combinations_per_step is None:
            n_to_try = total_combinations
        else:
            n_to_try = max(0, min(total_combinations, max_combinations_per_step))

        improved_this_step = False
        # Draw distinct flat indices instead of materialising the full cartesian
        # product; each index is decoded into one word per position (mixed radix).
        for flat_index in random.sample(range(total_combinations), n_to_try):
            new_combination = []
            for candidates in candidate_lists:
                flat_index, position = divmod(flat_index, len(candidates))
                new_combination.append(candidates[position])

            new_combination_str = ' '.join(new_combination)
            if new_combination_str in visited_combinations:
                continue
            visited_combinations.add(new_combination_str)

            try:
                score = manager.query(new_combination)
            except JSONDecodeError:
                # Show which phrase produced the undecodable reply, then propagate.
                print("An error occurred while trying to decode the JSON response.")
                print(new_combination)
                raise

            if score > best_score:
                best_score = score
                best_combination = new_combination[:]
                improved_this_step = True
                print(f"Step {step+1}: New best combination is '{best_combination}' with score {best_score}")

        if improved_this_step:
            steps_without_improvement = 0
            current_words = best_combination[:]
        else:
            steps_without_improvement += 1
            if steps_without_improvement > patience:
                print(f"No improvement found in step {step+1}. Ending optimization.")
                break

    return best_combination


# Earlier seed kept for reference; it is overwritten by the next assignment.
sw = ['snowboarder', 'mankind', 'ski', 'human_beings', 'diningroom']
sw = 'snowboarder mankind telemarkers humanity diningroom'.split()


# Expects word2vec_model, manager and common_words to exist already in the kernel.
# With 10 neighbours and 40 random words there are up to 50 candidates per position,
# i.e. up to 50**5 combinations per step.
optimized_combination = optimize_all_words_sampling(
    word2vec_model, start_words=sw,
    n_steps=100,
    n_words_random=40,
    n_words=10
)
print(f"Optimized combination: '{optimized_combination}'")

# Report the best phrase across everything the manager has cached, not just this run.
max_combination, max_score = manager.get_max_score_word()
print(f"The combination with the maximum score is '{max_combination}' with a score of {max_score}")


In [None]:
data_dict = manager.queried_words

# Writing to json
with open('data.json', 'w', encoding='utf-8') as file:
    json.dump(data_dict, file, ensure_ascii=False, indent=4)

In [None]:
import requests

input_data = "cat jungle hat space water"

def query(input_data):
    """POST ``input_data`` to the semantle2 scoring endpoint and return the decoded JSON reply."""
    payload = {'data': input_data}
    reply = requests.post('http://semantle2.advml.com/score', json=payload)
    return reply.json()

query(input_data)

{'message': 0.75}

In [None]:
query('snowboarder mankind telemarker humanity room')

In [None]:
query('person man telemarker humanity room')

{'message': 0.87}

In [None]:
query('person woman telemarker humanity room')

{'message': 0.89}

In [None]:
query('person woman couple humanity room')

{'message': 0.88}

In [None]:
query('person woman telemarker mankind room')

{'message': 0.9}

In [None]:
query('person woman telemarker mankind all')

{'message': 0.89}

In [None]:
query('person woman telemarker mankind house')

{'message': 0.89}

In [None]:
query('person woman telemarker mankind room')

{'message': 0.9}

In [None]:
query('person woman person man person')

{'message': 0.9}

In [None]:
query('person man person woman person')

{'message': 0.9}

In [None]:
query('person woman human man people')

{'message': 0.9}

In [None]:
! pip install gensim
! pip install nltk



In [None]:
import gensim.downloader as api
from nltk.corpus import words
import nltk

# Downloading the word2vec word embeddings
word2vec_model = api.load("word2vec-google-news-300")  # you can choose another model if you prefer

# Downloading the words corpus
nltk.download('words')
english_words = words.words()

# Now you can use `word2vec_model` to get the embeddings and `english_words` to get the English words




[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.


In [None]:
# Rebuild `common_words` from the Brown corpus after the fresh environment setup.
import nltk
from nltk.corpus import brown

nltk.download('brown')
freq_dist = nltk.FreqDist(w.lower() for w in brown.words())
common_words = [word for word, count in freq_dist.most_common(100_000)]  # request up to 100,000 tokens; the recorded run yielded 49,815

print(len(common_words))

[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.


49815


In [None]:
import json
import requests

class QueryManager:
    """Client for the semantle2 scoring endpoint that caches every score it receives."""

    def __init__(self):
        # Maps the space-joined phrase to the score returned for it.
        self.queried_words = {}

    def query(self, words_list):
        """Return the score of a phrase, asking the server only on a cache miss.

        Args:
            words_list: Iterable of words; they are joined with single spaces to
                form both the request payload and the cache key.

        Returns:
            The value of the ``"message"`` field of the reply (the score).

        Raises:
            ValueError: Re-raised after logging when the reply is not valid JSON.
        """
        words_key = ' '.join(words_list)  # Convert list to string to use as a key
        if words_key in self.queried_words:
            return self.queried_words[words_key]

        response = requests.post('http://semantle2.advml.com/score', json={'data': words_key})
        try:
            response_data = response.json()
        except ValueError:
            # The JSON decoding errors of json, simplejson and requests all derive
            # from ValueError, so catching it avoids depending on a JSONDecodeError
            # name that this cell never imports.
            print("An error occurred while trying to decode the JSON response.")
            print(words_list)
            print(response)
            print(response.text)
            raise

        # Surface the full reply whenever the server includes a flag.
        if 'flag' in response_data:
            print(response.text)

        score = response_data["message"]
        print(f"query: {words_key} {score}")
        self.queried_words[words_key] = score
        return score

    def get_max_score_word(self):
        """Return ``(phrase, score)`` for the best cached phrase, or ``(None, None)`` if the cache is empty."""
        if not self.queried_words:
            return None, None
        max_key = max(self.queried_words, key=lambda k: self.queried_words[k])
        return max_key, self.queried_words[max_key]

# Create a fresh manager with an empty score cache.
manager = QueryManager()

# Optional warm start (currently disabled): uncomment to restore a cache that an
# earlier session saved to data.json.
# with open('data.json', 'r', encoding='utf-8') as file:
#     manager.queried_words = json.load(file)

# With the block above commented out, the cache starts empty and is filled by the
# queries below.


manager.query(["dog", "forest", "hat", "space", "wind"])
manager.query(["dog", "jogging", "table", "earth", "wind"])
manager.query(['dogs', 'snowboarding_mountain_biking', 'dining_room', 'mankind', 'wind'])
manager.query(['dogs', 'snowboarding_mountain_biking', 'dining_room', 'mankind', 'gale'])
manager.query('person woman telemarker mankind room'.split())
manager.query('person woman telemarker mankind all'.split())
manager.query('man woman man camera camera'.split())
# Same phrase again: answered from the cache, so no request is sent and nothing is logged.
manager.query('man woman man camera camera'.split())


max_combination, max_score = manager.get_max_score_word()
print(f"The combination with the maximum score is '{max_combination}' with a score of {max_score}")


query: dog forest hat space wind 0.74
query: dog jogging table earth wind 0.76
query: dogs snowboarding_mountain_biking dining_room mankind wind 0.83
query: dogs snowboarding_mountain_biking dining_room mankind gale 0.83
query: person woman telemarker mankind room 0.9
query: person woman telemarker mankind all 0.89
query: man woman man camera camera 0.96
The combination with the maximum score is 'man woman man camera camera' with a score of 0.96


In [None]:
import numpy as np
import random
from json.decoder import JSONDecodeError



def softmax(scores, temperature=1.0):
    """Convert a 1-D sequence of scores into a probability distribution.

    Args:
        scores: 1-D sequence of numeric scores.
        temperature: Divisor applied to the scores before exponentiation.

    Returns:
        A NumPy array with one probability per score, summing to 1
        (an empty array when ``scores`` is empty).
    """
    scaled = np.asarray(scores, dtype=float) / temperature
    if scaled.size == 0:
        return scaled
    # Shifting by the maximum leaves the result mathematically unchanged but keeps
    # np.exp from overflowing to inf (and the ratio from turning into nan).
    exp_scores = np.exp(scaled - scaled.max())
    return exp_scores / exp_scores.sum()

def intelligent_sampling(similar_words, word_scores, temperature=1.0):
    """Draw one candidate at random, weighted by the softmax of its score.

    Args:
        similar_words: Candidate words, aligned index-for-index with ``word_scores``.
        word_scores: Score of each candidate.
        temperature: Temperature forwarded to ``softmax``.

    Returns:
        The element of ``similar_words`` at the sampled index.
    """
    weights = softmax(word_scores, temperature)
    positions = range(len(similar_words))
    picked = np.random.choice(positions, p=weights)
    return similar_words[picked]

def optimize_word_closest(word2vec_model, start_words, n_steps=5, n_words=20, n_words_random=10, temperature=0.5):
    """Search for a higher-scoring five-word phrase by swapping one word at a time.

    In every step each position of the current phrase is visited in turn. The word
    there is replaced by every candidate (embedding neighbours plus random words
    from the global ``common_words``), each resulting phrase is scored through
    the global ``manager``, and one candidate is then drawn with softmax weights.
    The drawn phrase becomes the best-so-far only when its score is strictly
    higher than the best score; the best-so-far phrase seeds the next step.

    Args:
        word2vec_model: Object exposing ``most_similar(word, topn=...)`` that
            yields ``(word, similarity)`` pairs.
        start_words: List of exactly five words to start from.
        n_steps: Number of passes over the five positions.
        n_words: Half the number of neighbours requested per position
            (``topn`` is ``n_words * 2``).
        n_words_random: Number of random words sampled per position.
        temperature: Softmax temperature used when drawing the replacement.

    Returns:
        The best phrase found, as a list of words, or an empty list if no
        candidate was ever scored.

    Raises:
        ValueError: If ``start_words`` is not a list of exactly five words.
        JSONDecodeError: Re-raised after logging when a score reply cannot be decoded.
    """
    if not isinstance(start_words, list) or len(start_words) != 5:
        raise ValueError("start_words must be a list of exactly five words.")

    current_words = start_words[:]
    best_score = -float('inf')
    best_combination = []

    all_words = common_words

    for step in range(n_steps):
        # current_words is only replaced after the inner loop, so every position
        # in this step is varied against the same starting phrase.
        for i, word in enumerate(current_words):
            # Ask for the nearest neighbours of the word at this position.
            try:
                all_similar_words = word2vec_model.most_similar(word, topn=n_words * 2)
            except KeyError:
                # A random corpus word picked in an earlier step may be unknown to
                # the embedding model; continue with random candidates only.
                all_similar_words = []
            similar_words = [w for w, _ in all_similar_words if w not in current_words]

            # Get n_words_random random words from the corpus
            random_words = random.sample(all_words, n_words_random)
            random_words = [w for w in random_words if w not in current_words]

            # Combine similar and random words
            candidate_words = similar_words + random_words

            # Gather scores for candidate words to enable intelligent sampling
            word_scores = []
            for candidate_word in candidate_words:
                combined_words = current_words[:]
                combined_words[i] = candidate_word
                combined_words_str = ' '.join(combined_words)
                if combined_words_str in manager.queried_words:
                    word_scores.append(manager.queried_words[combined_words_str])
                else:
                    try:
                        score = manager.query(combined_words)
                    except JSONDecodeError:
                        # Show which phrase produced the undecodable reply, then propagate.
                        print("An error occurred while trying to decode the JSON response.")
                        print(combined_words)
                        raise
                    word_scores.append(score)

            # Nothing to sample from for this position: move on to the next word
            if not word_scores:
                continue

            # Select a word using intelligent sampling
            selected_word = intelligent_sampling(candidate_words, word_scores, temperature=temperature)
            new_combination = current_words[:]
            new_combination[i] = selected_word
            score = manager.query(new_combination)

            # Update the best score and combination found so far (strict improvement only)
            if score > best_score:
                best_score = score
                best_combination = new_combination[:]
                print(f"Step {step+1}, Word {i+1}: New combination is '{new_combination}' with score {score}")

        current_words = best_combination
        # best_combination is still empty only when no candidate has been scored at all.
        if not best_combination:
            print(f"No improvement found in step {step+1}. Ending optimization.")
            break

    return best_combination

# Objects this cell expects to exist already in the kernel:
# - word2vec_model: embedding model; the optimizer only calls its most_similar method
# - manager: QueryManager instance providing query() and get_max_score_word()
# - common_words: word list the optimizer samples its random candidates from

# Earlier seeds, kept commented out for reference:
# sw = ['dogs', 'snowboarding_mountain_biking', 'dining_room', 'mankind', 'gale']
# sw = ["human", "person", "snowman", "woman", "snow"]
# sw = 'snowboarder mankind telemarkers human_beings diningroom'.split()
sw = 'person woman telemarker mankind room'.split()

optimized_combination = optimize_word_closest(
    word2vec_model, start_words=sw, n_steps=100, n_words=100, n_words_random=100
)
print(f"Optimized combination: '{optimized_combination}'")

# Report the best phrase across everything the manager has cached, not just this run.
max_combination, max_score = manager.get_max_score_word()
print(f"The combination with the maximum score is '{max_combination}' with a score of {max_score}")


In [None]:
query('person woman human man people')

{'message': 0.9}

In [None]:
import numpy as np
import random
from json.decoder import JSONDecodeError

def optimize_word_closest(word2vec_model, start_words, n_steps=5, n_words=20, n_words_random=10):
    """Greedy search for a higher-scoring five-word phrase, one position at a time.

    The start phrase is scored first. In every step each position is visited in
    turn: the word there is replaced by every candidate (embedding neighbours plus
    random words from the global ``common_words``), each phrase is scored through
    the global ``manager``, and the top candidate replaces the best-so-far when it
    beats the best score. If a step ends without any change, the phrase is moved
    anyway to the top candidate of the last visited position so the search does
    not stall; this may lower the tracked best score.

    Args:
        word2vec_model: Object exposing ``most_similar(word, topn=...)`` that
            yields ``(word, similarity)`` pairs.
        start_words: List of exactly five words to start from.
        n_steps: Number of passes over the five positions.
        n_words: Half the number of neighbours requested per position
            (``topn`` is ``n_words * 2``).
        n_words_random: Number of random words sampled per position.

    Returns:
        The phrase the search ended on, as a list of words. Because of the forced
        moves this is not necessarily the highest-scoring phrase that was queried.

    Raises:
        ValueError: If ``start_words`` is not a list of exactly five words.
        JSONDecodeError: Re-raised after logging when a score reply cannot be decoded.
    """
    if not isinstance(start_words, list) or len(start_words) != 5:
        raise ValueError("start_words must be a list of exactly five words.")

    current_words = start_words[:]
    best_score = manager.query(start_words)
    print(f"starting score: {best_score}")
    best_combination = start_words[:]

    all_words = common_words

    for step in range(n_steps):
        for i, word in enumerate(current_words):
            # Ask for the nearest neighbours of the word at this position.
            try:
                all_similar_words = word2vec_model.most_similar(word, topn=n_words * 2)
            except KeyError:
                # A random corpus word adopted in an earlier step may be unknown to
                # the embedding model; continue with random candidates only.
                all_similar_words = []
            similar_words = [w for w, _ in all_similar_words if w not in current_words]

            # Get n_words_random random words from the corpus
            random_words = random.sample(all_words, n_words_random)
            random_words = [w for w in random_words if w not in current_words]

            # Combine similar and random words
            candidate_words = similar_words + random_words

            # Gather scores for candidate words
            word_scores = []
            for candidate_word in candidate_words:
                combined_words = current_words[:]
                combined_words[i] = candidate_word
                try:
                    score = manager.query(combined_words)
                    word_scores.append(score)
                except JSONDecodeError:
                    # Show which phrase produced the undecodable reply, then propagate.
                    print("An error occurred while trying to decode the JSON response.")
                    print(combined_words)
                    raise

            # Find the word with the highest score among candidates
            if word_scores:
                max_index = word_scores.index(max(word_scores))
                if word_scores[max_index] > best_score:
                    print("!!!!!!! IMPROVED SCORE !!!!!!!!!!!!!")
                    best_score = word_scores[max_index]
                    best_combination = current_words[:]
                    best_combination[i] = candidate_words[max_index]
                    print(f"Step {step+1}, Word {i+1}: New combination is '{best_combination}' with score {best_score}")

        # No position improved the phrase in this step: move to the top candidate of
        # the last visited position anyway (i, candidate_words and word_scores still
        # hold that position's values). Skipped when that position had no candidates,
        # where max() would raise on the empty list.
        if current_words == best_combination and word_scores:
            max_index = word_scores.index(max(word_scores))
            best_score = word_scores[max_index]
            best_combination = current_words[:]
            best_combination[i] = candidate_words[max_index]
            print(f"Step {step+1}, Word {i+1}: No Better Word Found but let us update combination is '{best_combination}' with score {best_score}")

        current_words = best_combination

    return current_words

# Objects this cell expects to exist already in the kernel:
# - word2vec_model: embedding model; the optimizer only calls its most_similar method
# - manager: QueryManager instance providing query() and get_max_score_word()
# - common_words: word list the optimizer samples its random candidates from

sw = 'person woman telemarker mankind room'.split()

# Alternative seed, kept commented out for reference:
# sw = 'person woman human man people'.split()

optimized_combination = optimize_word_closest(
    word2vec_model, start_words=sw, n_steps=100, n_words=100, n_words_random=100
)
print(f"Optimized combination: '{optimized_combination}'")

# Report the best phrase across everything the manager has cached, not just this run.
max_combination, max_score = manager.get_max_score_word()
print(f"The combination with the maximum score is '{max_combination}' with a score of {max_score}")


In [None]:
import numpy as np
import random
from json.decoder import JSONDecodeError

def optimize_word_closest(word2vec_model, start_words, n_steps=5, n_words=20, n_words_random=10):
    """Greedy search for a five-word phrase that scores higher with ``manager.query``.

    In every step each position of the current phrase is tried with
    replacement words: word2vec neighbours of the word at that position
    (lower-cased and restricted to ``common_words``) plus random words drawn
    from ``common_words``.  The single best strictly-improving replacement
    found during the step is adopted.  When no replacement improves the score,
    the best-scoring candidate of the *last* position is adopted anyway (even
    if it scores lower) so that the search keeps moving.

    Relies on two notebook globals: ``manager`` (its ``query`` method is called
    with a list of words) and ``common_words`` (list of allowed words).

    Args:
        word2vec_model: object whose ``most_similar(word, topn=...)`` returns
            ``(word, similarity)`` pairs.  A ``KeyError`` from it (word not in
            the model) is treated as "no neighbours".
        start_words: list of exactly five words to start from (not modified).
        n_steps: number of search steps.
        n_words: ``2 * n_words`` neighbours are requested per position; every
            neighbour that survives the filtering is tried.
        n_words_random: number of random words drawn from ``common_words`` per
            position; capped at ``len(common_words)``.

    Returns:
        The phrase held after the last step, as a list of five words.  This is
        not necessarily the best-scoring phrase seen, because a non-improving
        replacement may be accepted at the end of a step.

    Raises:
        ValueError: if ``start_words`` is not a list of exactly five words.
        JSONDecodeError: re-raised from ``manager.query`` after printing the
            phrase that triggered it.
    """
    if not isinstance(start_words, list) or len(start_words) != 5:
        raise ValueError("start_words must be a list of exactly five words.")

    current_words = start_words[:]
    best_score = manager.query(start_words)
    print(f"starting score: {best_score}")
    if not isinstance(best_score, (int, float)):
        # manager.query can hand back an error string instead of a number;
        # treat that as "no score yet" so any numeric candidate beats it.
        best_score = float("-inf")
    best_combination = start_words[:]

    all_words = common_words
    set_common_words = set(common_words)
    # random.sample raises ValueError when asked for more items than exist.
    n_random = max(0, min(n_words_random, len(all_words)))

    for step in range(n_steps):
        # (score, word) pairs of the most recently evaluated position; used by
        # the no-improvement fallback after the position loop.
        scored = []

        for i, word in enumerate(current_words):
            excluded = set(current_words)

            # Neighbours of the current word; a word unknown to the model
            # (e.g. a random pick from common_words) simply has none.
            try:
                all_similar_words = word2vec_model.most_similar(word, topn=n_words * 2)
            except KeyError:
                all_similar_words = []

            candidate_words = []
            seen = set()
            for w, _ in all_similar_words:
                lowered = w.lower()
                # Skip words already in the phrase (before or after
                # lower-casing) and case variants that were already added.
                if w in excluded or lowered in excluded or lowered in seen:
                    continue
                if lowered in set_common_words:
                    seen.add(lowered)
                    candidate_words.append(lowered)

            # Add random words from the corpus for exploration.
            for w in random.sample(all_words, n_random):
                if w not in excluded and w not in seen:
                    seen.add(w)
                    candidate_words.append(w)

            # Score every candidate in position i, keeping score and word
            # together so that skipped candidates cannot misalign them.
            scored = []
            for candidate_word in candidate_words:
                combined_words = current_words[:]
                combined_words[i] = candidate_word
                try:
                    score = manager.query(combined_words)
                except JSONDecodeError:
                    print("An error occurred while trying to decode the JSON response.")
                    print(combined_words)
                    raise
                # Non-numeric results (error strings) cannot be ranked.
                if isinstance(score, (int, float)):
                    scored.append((score, candidate_word))

            # Adopt the best candidate of this position if it beats the best so far.
            if scored:
                top_score, top_word = max(scored, key=lambda pair: pair[0])
                if top_score > best_score:
                    print("!!!!!!! IMPROVED SCORE !!!!!!!!!!!!!")
                    best_score = top_score
                    best_combination = current_words[:]
                    best_combination[i] = top_word
                    print(f"Step {step+1}, Word {i+1}: New combination is '{best_combination}' with score {best_score}")

        # Nothing improved in this step: move to the best candidate of the last
        # position anyway.  With no scored candidates the phrase stays as is.
        if current_words == best_combination and scored:
            top_score, top_word = max(scored, key=lambda pair: pair[0])
            best_score = top_score
            best_combination = current_words[:]
            best_combination[i] = top_word
            print(f"Step {step+1}, Word {i+1}: No Better Word Found but let us update combination is '{best_combination}' with score {best_score}")

        current_words = best_combination

    return current_words

# Objects this cell expects from earlier cells:
# - word2vec_model: a word2vec model object with a most_similar method
# - manager: an instance of QueryManager with an appropriate query method

# Starting phrases tried previously (kept for reference; only the last
# assignment was ever used by the call below):
# sw = 'person woman telemarker mankind room'.split()
# sw = 'person woman human man people'.split()
# sw = ['woman', 'camera', 'human', 'man', 'people']
# sw = ['woman', 'woman', 'man', 'camera', 'camera']
sw = ['person', 'woman', 'man', 'camera', 'camera']

optimized_combination = optimize_word_closest(
    word2vec_model, start_words=sw, n_steps=3, n_words=500, n_words_random=1
)
print(f"Optimized combination: '{optimized_combination}'")

# Ask the manager for its maximum-score combination and print it.
max_combination, max_score = manager.get_max_score_word()
print(f"The combination with the maximum score is '{max_combination}' with a score of {max_score}")


starting score: 0.96
query: whosoever woman man camera camera 0.9
query: student woman man camera camera 0.93
query: enquirer woman man camera camera 0.91
query: she woman man camera camera 0.95
query: camper woman man camera camera 0.91
query: shooter woman man camera camera 0.91
query: citizens woman man camera camera 0.92
query: complainant woman man camera camera 0.9
query: or woman man camera camera 0.93
query: player woman man camera camera 0.95
query: driver woman man camera camera 0.91
query: type woman man camera camera 0.92
query: everybody woman man camera camera 0.92
query: hunter woman man camera camera 0.89
query: shopper woman man camera camera 0.91
query: truthfully woman man camera camera 0.91
query: caring woman man camera camera 0.92
query: seeker woman man camera camera 0.9
query: day woman man camera camera 0.92
query: how woman man camera camera 0.92
query: assailant woman man camera camera 0.9
query: villager woman man camera camera 0.91
query: member woman man c

In [None]:

# Inspect the top-200 most_similar results for "person" (raw model vocabulary, before any filtering).
word2vec_model.most_similar("person", topn=200)

[('someone', 0.6657356023788452),
 ('persons', 0.5559711456298828),
 ('woman', 0.5470173358917236),
 ('somebody', 0.5459041595458984),
 ('peson', 0.5421414375305176),
 ('man', 0.5342026352882385),
 ('people', 0.5083409547805786),
 ('anyone', 0.5061744451522827),
 ('guy', 0.48752474784851074),
 ('Someone', 0.473005473613739),
 ('individuals', 0.47029492259025574),
 ('foreigner', 0.46837544441223145),
 ('aperson', 0.4661904275417328),
 ('citizen', 0.4597834348678589),
 ('child', 0.4595995843410492),
 ('perpetrator', 0.45697149634361267),
 ('somone', 0.4560295641422272),
 ('anybody', 0.4523891806602478),
 ('degrating', 0.448263555765152),
 ('businessperson', 0.4365586042404175),
 ('applicant', 0.43481793999671936),
 ('perosn', 0.4329134523868561),
 ('conservatee', 0.43027427792549133),
 ('Anybody', 0.42944034934043884),
 ('sweetest_kindest', 0.42599958181381226),
 ('Lt._Duhamell', 0.42578843235969543),
 ('gentleman', 0.42577871680259705),
 ('DEAR_PERPLEXED', 0.42446368932724),
 ('referers

In [None]:
# Membership check: is "whomever" part of the common_words list?
"whomever" in common_words

False

In [None]:
# Ask the manager for its maximum-score combination and print it.
max_combination, max_score = manager.get_max_score_word()
print(f"The combination with the maximum score is '{max_combination}' with a score of {max_score}")


The combination with the maximum score is 'man woman man camera camera' with a score of 0.96


In [None]:
# Phrases tried earlier in this cell (kept for reference):
# sw = ['woman', 'woman', 'woman', 'man', 'camera']
# sw = ['woman', 'woman', 'man', 'camera', 'camera']
# sw = ['person', 'woman', 'man', 'camera', 'cameras']

# manager.query must be given the list of words. Passing an already-joined
# string made it space-separate every character (the phrase went out as
# "m a n   w o m a n ..."), so a different input was scored.
sw = ['man', 'woman', 'man', 'camera', 'camera']

manager.query(sw)

query: m a n   w o m a n   m a n   c a m e r a   c a m e r a 0.86


0.86

In [None]:

# Query the scoring endpoint directly (without going through manager.query)
# to see the raw reply for this phrase.
words_list = ['man', 'woman', 'man', 'camera', 'camera']
words_key = ' '.join(words_list)  # the phrase is sent as one space-separated string

# A timeout keeps the cell from blocking forever if the server never answers.
response = requests.post(
    'http://semantle2.advml.com/score',
    json={'data': words_key},
    timeout=30,
)
print(response.text)
response_data = response.json()

print(response_data)


{"message": 0.96}
{'message': 0.96}


In [None]:
# Score a phrase that uses raw word2vec-style tokens (underscore compound, capitalised plural).
sw = ['person', 'woman', 'man', 'tape_recorder', 'Cameras']

manager.query(sw)

query: person woman man tape_recorder Cameras 0.93


0.93

In [None]:
# Probe a phrase starting with "deutsch"; in the recorded run manager.query
# returned an error string for it instead of a numeric score.
sw = ["deutsch", "woman", "man", "camera", "camera"]

manager.query(sw)

'Failed to process input'

In [None]:
# Send the "deutsch" phrase straight to the scoring endpoint to see the raw reply.
# input_data is bound once, directly to the space-separated string that is sent.
input_data = " ".join(["deutsch", "woman", "man", "camera", "camera"])

# A timeout keeps the cell from blocking forever if the server never answers.
response = requests.post(
    'http://semantle2.advml.com/score',
    json={'data': input_data},
    timeout=30,
)
print(response.text)
response_data = response.json()
print(response_data)

# Disabled follow-up steps (presumably copied from the manager's query method):
# if 'flag' in response_data:
#     print(response.text)

# score = response_data["message"]
# print(f"query: {words_key} {score}")
# self.queried_words[words_key] = score
# return score

{"message": 0.9}
{'message': 0.9}


In [None]:
# Show the exact string that was sent to the endpoint in the previous cell.
input_data

'deutsch woman man camera camera'

In [None]:
# Show the manager's record of queried phrases and their scores.
manager.queried_words

{'dog forest hat space wind': 0.74,
 'dog jogging table earth wind': 0.76,
 'dogs snowboarding_mountain_biking dining_room mankind wind': 0.83,
 'dogs snowboarding_mountain_biking dining_room mankind gale': 0.83,
 'person woman telemarker mankind room': 0.9,
 'person woman telemarker mankind all': 0.89,
 'someone woman telemarker mankind room': 0.85,
 'persons woman telemarker mankind room': 0.88,
 'somebody woman telemarker mankind room': 0.87,
 'peson woman telemarker mankind room': 0.88,
 'man woman telemarker mankind room': 0.89,
 'people woman telemarker mankind room': 0.88,
 'anyone woman telemarker mankind room': 0.82,
 'guy woman telemarker mankind room': 0.88,
 'Someone woman telemarker mankind room': 0.85,
 'individuals woman telemarker mankind room': 0.87,
 'foreigner woman telemarker mankind room': 0.84,
 'aperson woman telemarker mankind room': 0.87,
 'citizen woman telemarker mankind room': 0.86,
 'child woman telemarker mankind room': 0.88,
 'perpetrator woman telemarker

In [None]:
import nltk
from nltk.corpus import brown

# Fetch the Brown corpus (reported as already up-to-date when it is present).
nltk.download('brown')

# Case-insensitive token frequencies over the whole corpus.
freq_dist = nltk.FreqDist(map(str.lower, brown.words()))

# Keep the tokens of the 10,000 most frequent entries, most frequent first.
common_words = [entry[0] for entry in freq_dist.most_common(10_000)]

print(len(common_words))


[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Package brown is already up-to-date!


10000


In [None]:
# Rebuild the word list from the existing frequency table: the tokens of the
# 10,000 most frequent entries, most frequent first.
common_words = [entry[0] for entry in freq_dist.most_common(10_000)]

print(len(common_words))


10000


In [None]:
# Score the four-word base phrase on its own (no fifth word).
manager.query(['person', 'woman', 'man', 'camera'])

0.96

In [None]:
# Try every common word as a fifth word appended to the four-word base phrase
# and collect the phrases that score at least as well as the base phrase alone.
curr = ['person', 'woman', 'man', 'camera']
best_score = manager.query(curr)

new_best_score = []
new_best_phrase = []

for common_word in common_words:
    # Build the candidate from curr so the base phrase is defined in one place.
    target = curr + [common_word]
    score = manager.query(target)

    # manager.query can return an error string such as 'Failed to process input';
    # comparing that with a number would raise TypeError, so skip such results.
    if not isinstance(score, (int, float)):
        continue

    if score >= best_score:
        print(f"{target} {score}")
        print()

        new_best_score.append(score)
        new_best_phrase.append(target.copy())

print(new_best_score)
print(new_best_phrase)


In [None]:
# Show the last phrase assigned to `target` by the loop in the previous cell.
target

['person', 'woman', 'man', 'camera', 'television']