In [1]:
import pandas as pd
from pathlib import Path

from src.config import SCOPES
from src.utils.list import flatten_list_of_lists
from src.scoring.google_sheets import GoogleSheets

In [2]:
# Build the GoogleSheets client from the token and credentials files kept in
# ../../secrets_vault (paths are relative to the notebook's working directory).
sheets = GoogleSheets(
    path_to_token=Path("../../secrets_vault/token.json"),
    path_to_credentials=Path("../../secrets_vault/credentials.json"),
    scopes=SCOPES,
)

In [3]:
# Fetch every table through the GoogleSheets helper. The cells below index the
# result like a dict keyed by table name.
all_tables = sheets.get_all_tables()

In [4]:
# Show which table names were loaded.
all_tables.keys()

dict_keys(['competitive_intelligence', 'themes', 'market_intelligence', 'personalities'])

In [5]:
# Inspect the market-intelligence table: per the output, each key maps to a
# list of spellings/aliases.
all_tables['market_intelligence']

{'OpenAI': ['OpenAI', 'Open AI'],
 'Anthropic': ['Anthropic'],
 'Cohere': ['Cohere'],
 'AI21': ['AI21 Labs', 'AI 21 Labs'],
 'Mistral AI': ['Mistral AI', 'Mistral'],
 'Scale': ['Scale', 'Scale AI'],
 'Google': ['Google', 'GCP', 'Google Cloud Platform', 'Deepmind'],
 'Microsoft': ['Microsoft', 'Azure'],
 'Amazon': ['Amazon', 'AWS'],
 'NVIDIA': ['NVIDIA', 'NVDA'],
 'Meta': ['Meta',
  'Facebook',
  'Whatsapp',
  'Instragram',
  'FAIR',
  'Facebook AI Research'],
 'HF': ['HuggingFace', 'Hugging Face'],
 'Twitter': ['Twitter', 'X', 'X.AI', 'Grok', 'Tesla']}

In [6]:
# Inspect the themes table: per the output, each theme name maps to the list of
# terms that the matcher cells further down search for.
all_tables['themes']

{'AI': ['AI',
  'Artificial Intelligence',
  'AGI',
  'Artificial General Intelligence',
  'A.I'],
 'GenAI': ['GenAI', 'Gen AI', 'Generative AI', 'GenerativeAI'],
 'Model': ['LLM',
  'Large Language Model',
  'Foundational Model',
  'Foundational Language Model',
  'Language Model',
  'Transformer'],
 'Agents': ['autonomous agent',
  'AI agent',
  'RL agent',
  'reinforcement learning agent'],
 'Conversational AI': ['conversational AI', 'chatbot'],
 'Computer Vision': ['computer Vision', 'CV'],
 'Autonomous Vehicle': ['autonomous vehicle',
  'self-driving car',
  'autonomous driving'],
 'RAG': ['RAG',
  'Retrieval Augmented Generation',
  'Knowledge management',
  'RAG Graph',
  'Knowledge Search',
  'GraphRAG',
  'Graph RAG',
  'AI Search',
  'semantic search',
  'vector similarity'],
 'Fine-tuning': ['fine-tuning',
  'Fine-tune',
  'SFT',
  'LoRA',
  'finetune',
  'finetuning'],
 'Post-training': ['post-training',
  'RLHF',
  'Constitutional AI',
  'RL',
  'Reinforcement Learning',
 

In [7]:
import spacy
from spacy.matcher import PhraseMatcher
from spacy.tokens import Doc

In [10]:
# Look up the list of aliases stored for a single entry of the table.
all_tables['market_intelligence']["OpenAI"]

['OpenAI', 'Open AI']

In [12]:
# Sample text that mentions the "AI" theme terms in several surface forms:
# upper case, runs of spaces, a line break inside a phrase, plurals and "A.I".
test_text = (
    "SAM ALTMAN SAYS ARTIFICIAL GENERAL    INTELLIGENCE IS ON THE HORIZON.\nSam Altman, "
    "CEO of OpenAI, stated at the DealBook Summit that artificial  general\n\rintelligence might impact everyday life less significantly than expected. "
    "He also loves LLM-as-a-judges, but not as a judge, and he thinks A.I with a dot in-between is absolute dope! Actually he wants to see many AIs."
    " Even though you can't beat artificial intelligences, plain and simple."
)
print(test_text)

# Run the full pipeline on the raw, un-normalised text.
nlp = spacy.load("en_core_web_sm")
doc = nlp(test_text)

# attr="LOWER" makes the phrase comparison case-insensitive.
matcher = PhraseMatcher(nlp.vocab, attr="LOWER")
terms = all_tables["themes"]["AI"]
print(terms)

# Patterns are built with nlp.make_doc only, to speed things up.
patterns = list(map(nlp.make_doc, terms))
matcher.add("TerminologyList", patterns)

# Each match is a (match_id, start, end) triple of token offsets into `doc`.
matches = matcher(doc)
print(f"Found {len(matches)} matche(s)")
for match_id, start, end in matches:
    span = doc[start:end]
    print(span.text)

SAM ALTMAN SAYS ARTIFICIAL GENERAL    INTELLIGENCE IS ON THE HORIZON.
Sam Altman, CEO of OpenAI, stated at the DealBook Summit that artificial  general
intelligence might impact everyday life less significantly than expected. He also loves LLM-as-a-judges, but not as a judge, and he thinks A.I with a dot in-between is absolute dope! Actually he wants to see many AIs. Even though you can't beat artificial intelligences, plain and simple.
['AI', 'Artificial Intelligence', 'AGI', 'Artificial General Intelligence', 'A.I']
Found 1 matche(s)
A.I


In [13]:
# Build a whitespace-normalised, lemmatised copy of the sample text. The matcher
# cells below search `new_doc` (lemmas) and use the resulting token offsets to
# slice `doc2` (original wording).

# Collapse every run of whitespace (spaces, \n, \r) into a single space.
test_text_split = " ".join(test_text.split())

# Reuse the `nlp` pipeline loaded by the previous cell (which also defines
# `test_text`); loading the model a second time only costs time.
doc2 = nlp(test_text_split)
lemmas = [token.lemma_ for token in doc2]
lemmas_text = " ".join(lemmas)

# Build the lemma Doc directly from the lemma list instead of re-tokenising
# `lemmas_text`. This guarantees exactly one `new_doc` token per `doc2` token,
# so offsets matched on `new_doc` always point at the right tokens of `doc2`,
# and it avoids a second full pipeline run. Every token except the last is
# followed by a space, so the printed text is identical to `lemmas_text`.
last_index = len(lemmas) - 1
new_doc = Doc(
    nlp.vocab,
    words=lemmas,
    spaces=[index < last_index for index in range(len(lemmas))],
)
print(new_doc)

SAM ALTMAN say ARTIFICIAL GENERAL INTELLIGENCE be on the HORIZON . Sam Altman , CEO of OpenAI , state at the DealBook Summit that artificial general intelligence might impact everyday life less significantly than expect . he also love LLM - as - a - judge , but not as a judge , and he think A.I with a dot in - between be absolute dope ! actually he want to see many ai . even though you can not beat artificial intelligence , plain and simple .


In [15]:
# Repeat the "AI" theme search, this time on the lemmatised document.
matcher = PhraseMatcher(nlp.vocab, attr="LOWER")
terms = all_tables["themes"]["AI"]
print(terms)

# Patterns are built with nlp.make_doc only, to speed things up.
patterns = list(map(nlp.make_doc, terms))
matcher.add("TerminologyList", patterns)

matches = matcher(new_doc)
print(f"Found {len(matches)} matche(s)")
for match_id, start, end in matches:
    # Offsets come from `new_doc` but are used to slice `doc2`, so the original
    # wording is printed.
    # NOTE(review): this assumes both docs have the same token boundaries.
    span = doc2[start:end]
    print(span.text)

['AI', 'Artificial Intelligence', 'AGI', 'Artificial General Intelligence', 'A.I']
Found 5 matche(s)
ARTIFICIAL GENERAL INTELLIGENCE
artificial general intelligence
A.I
AIs
artificial intelligences


In [16]:
# Search the lemmatised document for the "Evaluation" theme terms and print each
# hit together with a few tokens of left-hand context from the original text.
CONTEXT_TOKENS = 3  # number of tokens shown before each match

matcher = PhraseMatcher(nlp.vocab, attr="LOWER")
terms = all_tables['themes']["Evaluation"]
print(terms)
# Only run nlp.make_doc to speed things up
patterns = [nlp.make_doc(text) for text in terms]
matcher.add("TerminologyList", patterns)

matches = matcher(new_doc)
print(f"Found {len(matches)} matche(s)")
for match_id, start, end in matches:
    # Clamp the context window at the beginning of the document. A negative
    # slice start is interpreted relative to the END of the Doc, so a match
    # within the first CONTEXT_TOKENS tokens would otherwise print a wrong
    # (typically empty) span.
    # NOTE(review): offsets come from `new_doc` but slice `doc2`; this assumes
    # both docs have the same token boundaries.
    context_start = max(start - CONTEXT_TOKENS, 0)
    span = doc2[context_start:end]
    print(span.text)

['evaluation', 'eval', 'leaderboard', 'benchmark', 'metrics', 'as-a-Judge']
Found 1 matche(s)
loves LLM-as-a-judges
