In [2]:
import torch
import torch.nn as nn
import nltk
from nltk.corpus import wordnet as wn
import json
from itertools import combinations

nltk.download('wordnet')


[nltk_data] Downloading package wordnet to /home/auschra/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
# word form -> (morphology, orthography, phonology and multi word expressions)
#       properties of the word and characters themselvies. ie. silent letters, sounds like, pre/suffix

# PhonologicalEncoder() To-do: phonemes, (phonology)

# OrthographicEncoder() To-do: graphemes, (orthography)

# MorphologicalEncoder() To-do: morphemes, (morphology)

# LexicalEncoder() To-do: lemmas, (lexicon)

# SemanticEncoder() done: synonyms, hyponyms/hypernyms,(homographs)
        # To-do:  meronyms/holonyms, polysemy (maybe?)

# SyntacticEncoder() To-do: word order, word class, (morphology)

# PragmaticEncoder() To-do: implicature, presupposition, (conversational implicature)

# DiscourseEncoder() To-do: coherence, cohesion, (anaphora)

# WorldEncoder() To-do: encyclopedic, (associations)

# Combinatorial() To-do: combinations



In [None]:
prompt = """Solve today’s NYT Connections game. Here are the instructions for how to play this game:
Find groups of four items that share something in common.
Category Examples:
FISH: Bass, Flounder, Salmon, Trout
FIRE ___: Ant, Drill, Island, Opal
Categories will always be more specific than
‘5-LETTER-WORDS’, ‘NAMES’, or ‘VERBS.’
Example 1:
Words: [‘DART’, ‘HEM’, ‘PLEAT’, ‘SEAM’,
‘CAN’, ‘CURE’, ‘DRY’, ‘FREEZE’, ‘BITE’,
‘EDGE’, ‘PUNCH’, ‘SPICE’, ‘CONDO’, ‘HAW’,
‘HERO’, ‘LOO’]
Groupings:
1. Things to sew: [‘DART’, ‘HEM’, ‘PLEAT’,
‘SEAM’]
2. Ways to preserve food: [‘CAN’, ‘CURE’,
‘DRY’, ‘FREEZE’]
3. Sharp quality: [‘BITE’, ‘EDGE’, ‘PUNCH’,
‘SPICE’]
4. Birds minus last letter: [‘CONDO’, ‘HAW’,
‘HERO’, ‘LOO’]
Example 2:
Words: [1COLLECTIVE’, ‘COMMON’, ‘JOINT’,
‘MUTUAL’, ‘CLEAR’, ‘DRAIN’, ‘EMPTY’,
‘FLUSH’, ‘CIGARETTE’, ‘PENCIL’, ‘TICKET’,
‘TOE’, ‘AMERICAN’, ‘FEVER’, ‘LUCID’,
‘PIPE’]
Groupings:
1. Shared: [‘COLLECTIVE’, ‘COMMON’,
‘JOINT’, ‘MUTUAL’]
2. Rid of contents: [‘CLEAR’, ‘DRAIN’,
‘EMPTY’, ‘FLUSH’]
3. Associated with “stub”: [‘CIGARETTE’,
‘PENCIL’, ‘TICKET’, ‘TOE’]
4. __ Dream: [‘AMERICAN’, ‘FEVER’, ‘LU-
CID’, ‘PIPE’])
Example 3:
Words: [‘HANGAR’, ‘RUNWAY’, ‘TARMAC’,
‘TERMINAL’, ‘ACTION’, ‘CLAIM’, ‘COM-
PLAINT’, ‘LAWSUIT’, ‘BEANBAG’, ‘CLUB’,
‘RING’, ‘TORCH’, ‘FOXGLOVE’, ‘GUMSHOE’,
‘TURNCOAT’, ‘WINDSOCK’]
Groupings:
1. Parts of an airport: [‘HANGAR’, ‘RUNWAY’,
‘TARMAC’, ‘TERMINAL’]
2. Legal terms: [‘ACTION’, ‘CLAIM’, ‘COM-
PLAINT’, ‘LAWSUIT’]
3. Things a juggler juggles: [‘BEANBAG’,
‘CLUB’, ‘RING’, ‘TORCH’]
4. Words ending in clothing: [‘FOXGLOVE’,
‘GUMSHOE’, ‘TURNCOAT’, ‘WIND-
SOCK’]
Categories share commonalities:
• There are 4 categories of 4 words each
• Every word will be in only 1 category
• One word will never be in two categories
• As the category number increases, the connec-
tions between the words and their category
become more obscure. Category 1 is the most
easy and intuitive and Category 4 is the hard-
est
• There may be a red herrings (words that seems
to belong together but actually are in separate
categories)
• Category 4 often contains compound words
with a common prefix or suffix word
• A few other common categories include word
and letter patterns, pop culture clues (such as
music and movie titles) and fill-in-the-blank
phrases
You will be given a new example (Example 4) with
today’s list of words. First explain your reason
for each category and then give your final answer
following the structure below (Replace Category 1,
2, 3, 4 with their names instead)
Groupings:
Category1: [word1, word2, word3, word4]
Category2: [word5, word6, word7, word8]
Category3: [word9, word10, word11, word12]
Category4: [word13, word14, word15, word16]
Remember that the same word cannot be re-
peated across multiple categories, and you need
to output 4 categories with 4 distinct words each.
Also do not make up words not in the list. This is
the most important rule. Please obey
Example 4:
Words : [InsertGame]
Grouping"""

In [None]:
# example 
# Words: [‘DART’, ‘HEM’, ‘PLEAT’, ‘SEAM’, ‘CAN’, ‘CURE’, ‘DRY’, ‘FREEZE’, ‘BITE’, ‘EDGE’, ‘PUNCH’, ‘SPICE’, ‘CONDO’, ‘HAW’, ‘HERO’, ‘LOO’]
# Groupings:
# 1. Things to sew: [‘DART’, ‘HEM’, ‘PLEAT’, ‘SEAM’]
# 2. Ways to preserve food: [‘CAN’, ‘CURE’, ‘DRY’, ‘FREEZE’]
# 3. Sharp quality: [‘BITE’, ‘EDGE’, ‘PUNCH’, ‘SPICE’]
# 4. Birds minus last letter: [‘CONDO’, ‘HAW’, ‘HERO’, ‘LOO’]


In [124]:
# semantics class

class Semantics():
    def __init__(self):
        super().__init__()
        self.min_similarity = 0.0
        self.history = []
        self.words = []
        self.groups = []
        self.entries = 0
        self.correct = 0 

    @property
    def accuracy(self):
        return (self.correct / self.entries) if self.entries > 0 else 0
        
    def load_history(self, path):
        with open(path , 'r') as f:
            self.history = json.load(f)

    # synonyms / homonym
    def get_synsets(self, word):                                
        return wn.synsets(word)
    
    # find the best synonyms for each word to be compared
    def get_best_pair_similarity(self, word1, word2):           
        syns1 = self.get_synsets(word1)
        syns2 = self.get_synsets(word2)
        highest_similarity = 0
        
        for syn1 in syns1:
            for syn2 in syns2:
                similarity = syn1.wup_similarity(syn2)
                if similarity and similarity > highest_similarity:
                    highest_similarity = similarity
                    
        return highest_similarity if highest_similarity >= self.min_similarity else None
        
     # find best group of 4 
    def find_best_word_groups(self, words, group_size=4):          
        if len(words) < group_size:
            return []
            
        # find cossim between words in list
        similarities = {}
        for word1, word2 in combinations(words, 2):
            sim = self.get_best_pair_similarity(word1, word2)
            if sim:
                similarities[(word1, word2)] = sim
                
        # find group of 4 with best average similarity
        best_group = None
        best_group_score = 0
        
        for word_group in combinations(words, group_size):
            group_pairs = list(combinations(word_group, 2))
            if all(pair in similarities for pair in group_pairs):
                group_score = sum(similarities[pair] for pair in group_pairs) / len(group_pairs)
                if group_score > best_group_score:
                    best_group_score = group_score
                    best_group = word_group
                    
        # if cossim not found, likely not synset for that word 
        if not best_group:
            best_group = tuple(words[:group_size])

        else:
            pass
            
        # recursively find best group with remaining words
        remaining_words = [w for w in words if w not in best_group]
        return [best_group] + self.find_best_word_groups(remaining_words, group_size)

            
    # compare predicted with actual
    def compare_groups(self, predicted_groups, actual_groups):
        if not predicted_groups:
            return 0
            
        # convert everything to lower case
        actual_group_sets = [set(word.lower() for word in group['members']) for group in actual_groups]
        predicted_group_sets = [set(word.lower() for word in group) for group in predicted_groups]
        
        correct_groups = 0
        matched_actual_groups = set()
        
        # check pred vs actual, and that not already in matched
        for pred_group in predicted_group_sets:
            for i, actual_group in enumerate(actual_group_sets):
                if i not in matched_actual_groups and pred_group == actual_group:
                    print(f"correct: {pred_group}")
                    correct_groups += 1
                    matched_actual_groups.add(i)
                    break

        return correct_groups

    # process single entry
    def process(self, entry):
        self.groups = entry['answers']
        self.words = [word.lower() for group in self.groups for word in group['members']]
        print("\n words:", self.words)
        
        word_groups = self.find_best_word_groups(self.words)
        correct_groups = self.compare_groups(word_groups, self.groups)
        
        # update metrics
        self.correct += correct_groups
        self.entries += len(self.groups)  

        # running metrics per entry
        print(f'Correct groups in this entry: {correct_groups} out of {len(self.groups)}')
        print('Predicted:', word_groups)
        print('Actual:', [[word.lower() for word in group['members']] for group in self.groups])
        
        return word_groups, self.accuracy
    
    # process whole history
    def process_all_entries(self):
        print("\nProcessing all history entries...")
        all_results = []
        
        for i, entry in enumerate(self.history):
            print(f"\nEntry {i+1}/{len(self.history)}")
            word_groups, current_accuracy = self.process(entry)
            all_results.append({
                'entry_index': i,
                'predicted_groups': word_groups,
                'actual_groups': [[word.lower() for word in group['members']] for group in entry['answers']],
                'correct_groups': self.correct - sum(result.get('correct_groups', 0) for result in all_results),
                'accuracy_so_far': current_accuracy
            })
            
        # final stats
        print("\nFinal Statistics:")
        print(f"Total entries processed: {len(self.history)}")
        print(f"Total groups processed: {self.entries}")
        print(f"Total correct groups: {self.correct}")
        print(f"Final accuracy: {self.accuracy:.2%}")
        
        return all_results

In [None]:
sem = Semantics()
sem.load_history('datasets/history.json')
all_results = sem.process_all_entries()

# all results 
stats = False
if stats:
    for result in all_results:
        print(f"\nEntry {result['entry_index']}:")
        print(f"Correct groups: {result['correct_groups']}")
        print(f"Running accuracy: {result['accuracy_so_far']:.2%}")
        print("Predicted:", result['predicted_groups'])
        print("Actual:", result['actual_groups'])

In [43]:
# phonology test
from nltk.corpus import cmudict
nltk.download('cmudict')
cmu = cmudict.dict()

for entry in history[:1]:
    groups = entry['answers']       # entry is a single game
    words = [word for group in groups for word in group['members']]    # get all words 
    words = [word.lower() for word in words]
    for word in words:
        phoneme = cmu[']

print(phoneme)



[nltk_data] Downloading package cmudict to /home/auschra/nltk_data...
[nltk_data]   Unzipping corpora/cmudict.zip.


[['HH', 'AH0', 'L', 'OW1'], ['HH', 'EH0', 'L', 'OW1']]


In [None]:

class Phonology():
    def __init__(self):
        super().__init__()


In [63]:
word = 'racecar'
print(word == word[::-1])

True


In [70]:
class Orthography(): # anagram, palindrome, alliteration
    def __init__(self):
        super().__init__()

# anagram (single use in archive lol)
for entry in history:
    groups = entry['answers']       # entry is a single game
    words = [word for group in groups for word in group['members']]    # get all words 
    words = [word.lower() for word in words]

    an_count = 0
    angrams = []
    palindrone_count = 0
    palindrome = []
    # never this simple
    # alliteration = []

    for i, word in enumerate(words):
        for j, other_word in enumerate(words):
            if i == j:  # skip same word 
                continue
            if sorted(word) == sorted(other_word):
                an_count += 1
                angrams.append(other_word)
            elif word == word[::-1]:
                palindrone_count += 1
                palindrome.append(word)
            else:
                pass

        if an_count == 4:
            print(f"Anagram found in group: {entry['id']} {angrams}")

        if palindrone_count == 2:
            print(f"Palindrome found in group: {entry['id']} {palindrome}")

    an_count = 0
    palindrone_count = 0



Anagram found in group: 94 ['open', 'peon', 'pone', 'nepo']


In [None]:
class PatternMatcher():
    def __init__(self):
        super().__init__()


class Morphology():
    def __init__(self):
        super().__init__()



class MultiWord():
    def __init__(self):
        super().__init__()

class Encyclopedic():
    def __init__(self):
        super().__init__()

class Association():
    def __init__(self):
        super().__init__()

class Combinations():
    def __init__(self):
        super().__init__()


In [None]:
class ConnectionsEncoder(nn.Module):
    def __init__(self, embd=768):
        super().__init__()
            self.model = 'Llama-3-1-8B'
            self.semantic_encoder = Semantics()
            self.pattern_matcher = PatternMatcher()
            self.morphology = Morphology()
            self.orthography = Orthography()
            self.phonology = Phonology()
            self.multi_word = MultiWord()
            self.encyclopedic = Encyclopedic()
            self.association = Association()
            self.combinations = Combinations()


    def relationships(self, words):

        prompt="""
        Consider these words and their semantic relationships.
        Look for:
        1. Direct category membership
        2. Metaphorical connections
        3. Word play or double meanings
        4. Context-dependent relationships
        """

    return self.combine_evidence(semantic_embeddings, patterns)

    # beam search to find optimal groupings
    # should 1 shot the problem, don't fall for red herrings

    def combine_evidence(self, semantic_embeddings, patterns):
        # combine all evidence
        return groupings
    



In [None]:
class BeamSearch(nn.Module):
    def __init__(self):
        super().__init__()
        self.encoder = ConnectionsEncoder()

    # beam search to find optimal groupings
