In [1]:
import pandas as pd
from pathlib import Path

from src.config import SCOPES
from src.utils.list import flatten_list_of_lists
from src.scoring.google_sheets import GoogleSheets

In [2]:
# Build the Google Sheets client from the token and credentials files
# stored under ../../secrets_vault, using the project-wide SCOPES.
sheets = GoogleSheets(
    path_to_token=Path("../../secrets_vault/token.json"),
    path_to_credentials=Path("../../secrets_vault/credentials.json"),
    scopes=SCOPES,
)

In [3]:
# Pull every table exposed by the GoogleSheets client into one mapping keyed by table name.
all_tables = sheets.get_all_tables()

In [4]:
# List the names of the tables that were loaded.
all_tables.keys()

dict_keys(['competitive_intelligence', 'themes', 'market_intelligence', 'personalities'])

In [5]:
# Inspect the market-intelligence table: each canonical company key maps to its known spellings/aliases.
# NOTE(review): the saved output below is a plain dict, while later cells call
# .set_index('key') on this same table as if it were a DataFrame — the outputs may come
# from different versions of get_all_tables(); confirm with Restart & Run All.
all_tables['market_intelligence']

{'OpenAI': ['OpenAI', 'Open AI'],
 'Anthropic': ['Anthropic'],
 'Cohere': ['Cohere'],
 'AI21': ['AI21 Labs', 'AI 21 Labs'],
 'Mistral AI': ['Mistral AI', 'Mistral'],
 'Scale': ['Scale', 'Scale AI'],
 'Google': ['Google',
  'GCP',
  'Google Cloud Platform',
  'Deepmind',
  'Google Deepmind'],
 'Microsoft': ['Microsoft', 'Azure'],
 'Amazon': ['Amazon', 'AWS'],
 'NVIDIA': ['NVIDIA', 'NVDA'],
 'Meta': ['Meta',
  'Facebook',
  'Whatsapp',
  'Instragram',
  'FAIR',
  'Facebook AI Research'],
 'HF': ['HuggingFace', 'Hugging Face'],
 'Twitter': ['Twitter', 'X', 'X.AI', 'Grok', 'Tesla']}

In [6]:
# Inspect the themes table: each theme key maps to the list of terms that should trigger it.
all_tables['themes']

{'AI': ['AI',
  'Artificial Intelligence',
  'AGI',
  'Artificial General Intelligence',
  'A.I'],
 'GenAI': ['GenAI', 'Gen AI', 'Generative AI', 'GenerativeAI'],
 'Model': ['LLM',
  'Large Language Model',
  'Foundational Model',
  'Foundational Language Model',
  'Language Model',
  'Transformer'],
 'Agents': ['autonomous agent',
  'AI agent',
  'RL agent',
  'reinforcement learning agent'],
 'Conversational AI': ['conversational AI', 'chatbot'],
 'Computer Vision': ['computer Vision', 'CV'],
 'Autonomous Vehicle': ['autonomous vehicle',
  'self-driving car',
  'autonomous driving'],
 'RAG': ['RAG',
  'Retrieval Augmented Generation',
  'Knowledge management',
  'RAG Graph',
  'Knowledge Search',
  'GraphRAG',
  'Graph RAG',
  'AI Search',
  'semantic search',
  'vector similarity'],
 'Fine-tuning': ['fine-tuning',
  'Fine-tune',
  'SFT',
  'LoRA',
  'finetune',
  'finetuning'],
 'Post-training': ['post-training',
  'RLHF',
  'Constitutional AI',
  'RL',
  'Reinforcement Learning',
 

In [7]:
!pip install spacy

Collecting spacy
  Downloading spacy-3.8.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (27 kB)
Collecting spacy-legacy<3.1.0,>=3.0.11 (from spacy)
  Downloading spacy_legacy-3.0.12-py2.py3-none-any.whl.metadata (2.8 kB)
Collecting spacy-loggers<2.0.0,>=1.0.0 (from spacy)
  Downloading spacy_loggers-1.0.5-py3-none-any.whl.metadata (23 kB)
Collecting murmurhash<1.1.0,>=0.28.0 (from spacy)
  Downloading murmurhash-1.0.11-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (2.0 kB)
Collecting cymem<2.1.0,>=2.0.2 (from spacy)
  Downloading cymem-2.0.10-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.4 kB)
Collecting preshed<3.1.0,>=3.0.2 (from spacy)
  Downloading preshed-3.0.9-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (2.2 kB)
Collecting thinc<8.4.0,>=8.3.0 (from spacy)
  Downloading thinc-8.3.3-cp311-cp311-manylinux_2_17_x86

In [18]:
# Sample headline + body containing irregular whitespace (a double space and a "\n\r"
# in the middle of a phrase) and mixed casing, used to exercise the matchers below.
test_text = (
    "SAM ALTMAN SAYS ARTIFICIAL GENERAL INTELLIGENCE IS ON THE HORIZON\n"
    "Sam Altman, CEO of OpenAI, stated at the DealBook Summit that Artificial  General\n\r"
    "Intelligence might impact everyday life less significantly than expected. "
    "He also loves LLM-as-a-Judge, but not as a judge."
)
print(test_text)

SAM ALTMAN SAYS ARTIFICIAL GENERAL INTELLIGENCE IS ON THE HORIZON
Sam Altman, CEO of OpenAI, stated at the DealBook Summit that Artificial  General
Intelligence might impact everyday life less significantly than expected. He also loves LLM-as-a-Judge, but not as a judge.


In [9]:
!spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m13.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: en-core-web-sm
Successfully installed en-core-web-sm-3.8.0
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [10]:
import spacy
from spacy.matcher import PhraseMatcher

In [11]:
# Load the small English pipeline and run it over the sample text.
nlp = spacy.load("en_core_web_sm")
doc = nlp(test_text)

In [12]:
# Print every named entity the pipeline found in `doc`, together with its
# character offsets in the text and its entity label.
for ent in doc.ents:
    print(ent.text, ent.start_char, ent.end_char, ent.label_)

SAM ALTMAN 0 10 ORG
HORIZON 60 67 ORG
Sam Altman 68 78 PERSON
OpenAI 87 93 ORG
the DealBook Summit 105 124 ORG
LLM 239 242 ORG


In [13]:
# Show the raw sample text again.
print(test_text)

SAM ALTMAN SAYS ARTIFICIAL   GENERAL INTELLIGENCE IS ON THE HORIZON
Sam Altman, CEO of OpenAI, stated at the DealBook Summit that Artificial  General
Intelligence might impact everyday life less significantly than expected. He also loves LLM-as-a-Judge, but not as a judge.


In [14]:
# Aliases recorded for OpenAI: index the table by 'key', take the "OpenAI" row,
# drop the empty cells and return the remaining values as an array.
all_tables['market_intelligence'].set_index('key').loc["OpenAI"].dropna().values

array(['OpenAI', 'Open AI'], dtype=object)

In [15]:
# Phrase matcher that compares on each token's lowercase form (attr="LOWER").
matcher = PhraseMatcher(nlp.vocab, attr="LOWER")

# Search terms: the non-empty aliases stored in the "OpenAI" row.
openai_row = all_tables['market_intelligence'].set_index('key').loc["OpenAI"]
terms = list(openai_row.dropna().values)

# Patterns are built with nlp.make_doc rather than the full pipeline to keep this fast.
patterns = list(map(nlp.make_doc, terms))
matcher.add("TerminologyList", patterns)

# Each match is (match_id, start_token, end_token); print the matched text from `doc`.
matches = matcher(doc)
for match_id, tok_start, tok_end in matches:
    print(doc[tok_start:tok_end].text)

OpenAI


In [98]:
# Harder sample: irregular whitespace inside phrases, plural forms ("AIs",
# "intelligences") and the dotted spelling "A.I".
test_text = (
    "SAM ALTMAN SAYS ARTIFICIAL GENERAL    INTELLIGENCE IS ON THE HORIZON.\nSam Altman, "
    "CEO of OpenAI, stated at the DealBook Summit that artificial  general\n\rintelligence might impact everyday life less significantly than expected. "
    "He also loves LLM-as-a-judges, but not as a judge, and he thinks A.I with a dot in-between is absolute dope! Actually he wants to see many AIs."
    " Even though you can't beat artificial intelligences, plain and simple."
)
print(test_text)

nlp = spacy.load("en_core_web_sm")
doc = nlp(test_text)

# Baseline: lowercase phrase matching directly on the raw text. In the saved run this
# only finds "A.I".
matcher = PhraseMatcher(nlp.vocab, attr="LOWER")

# Search terms: the non-empty cells of the "AI" theme row, ignoring the "qualifying" column.
ai_row = all_tables['themes'].drop(columns="qualifying").set_index('key').loc["AI"]
terms = list(ai_row.dropna().values)
print(terms)

# Patterns are built with nlp.make_doc rather than the full pipeline to keep this fast.
patterns = list(map(nlp.make_doc, terms))
matcher.add("TerminologyList", patterns)

matches = matcher(doc)
print(f"Found {len(matches)} matche(s)")
for match_id, tok_start, tok_end in matches:
    print(doc[tok_start:tok_end].text)

SAM ALTMAN SAYS ARTIFICIAL GENERAL    INTELLIGENCE IS ON THE HORIZON.
Sam Altman, CEO of OpenAI, stated at the DealBook Summit that artificial  general
intelligence might impact everyday life less significantly than expected. He also loves LLM-as-a-judges, but not as a judge, and he thinks A.I with a dot in-between is absolute dope! Actually he wants to see many AIs. Even though you can't beat artificial intelligences, plain and simple.
['AI', 'Artificial Intelligence', 'AGI', 'Artificial General Intelligence', 'A.I']
Found 1 matche(s)
A.I


In [99]:
# Collapse every run of whitespace (spaces, "\n", "\r") into a single space so that
# multi-word phrases are no longer broken up by stray whitespace.
test_text_split = ' '.join(test_text.split())

# Reuse the pipeline that is already loaded instead of reloading the model here.
doc2 = nlp(test_text_split)

# Reduce each token to its lemma so inflected forms (e.g. plurals) match the base terms.
lemmas = [token.lemma_ for token in doc2]
lemmas_text = " ".join(lemmas)

# Build the lemma Doc directly from the lemma list instead of re-tokenizing lemmas_text:
# this guarantees exactly one new_doc token per doc2 token, so match offsets found on
# new_doc can safely be used to slice doc2 in the following cells. Re-running the
# tokenizer on the joined lemma string gives no such guarantee.
# Every token is followed by a space except the last, so new_doc.text == lemmas_text.
spaces = [i < len(lemmas) - 1 for i in range(len(lemmas))]
new_doc = spacy.tokens.Doc(nlp.vocab, words=lemmas, spaces=spaces)
print(new_doc)

SAM ALTMAN say ARTIFICIAL GENERAL INTELLIGENCE be on the HORIZON . Sam Altman , CEO of OpenAI , state at the DealBook Summit that artificial general intelligence might impact everyday life less significantly than expect . he also love LLM - as - a - judge , but not as a judge , and he think A.I with a dot in - between be absolute dope ! actually he want to see many ai . even though you can not beat artificial intelligence , plain and simple .


In [104]:
# Lowercase phrase matching for the "AI" theme, this time on the lemmatized document.
matcher = PhraseMatcher(nlp.vocab, attr="LOWER")

# Search terms: the non-empty cells of the "AI" theme row, ignoring the "qualifying" column.
terms = list(all_tables['themes'].drop(columns="qualifying").set_index('key').loc["AI"].dropna().values)
print(terms)

# Only run nlp.make_doc to speed things up
patterns = [nlp.make_doc(text) for text in terms]
matcher.add("TerminologyList", patterns)

matches = matcher(new_doc)

# Match offsets index into new_doc, but the original wording is read from doc2, so the
# two documents must be token-aligned. Equal length is a cheap sanity check that fails
# loudly instead of silently printing the wrong tokens.
assert len(new_doc) == len(doc2), (
    "new_doc and doc2 have different token counts; match offsets from new_doc "
    "cannot be used to slice doc2"
)

print(f"Found {len(matches)} matche(s)")
for match_id, start, end in matches:
    # Show the original (non-lemmatized) text covered by the match.
    span = doc2[start:end]
    print(span.text)

['AI', 'Artificial Intelligence', 'AGI', 'Artificial General Intelligence', 'A.I']
Found 5 matche(s)
ARTIFICIAL GENERAL INTELLIGENCE
artificial general intelligence
A.I
AIs
artificial intelligences


In [105]:
# Lowercase phrase matching for the "Evaluation" theme on the lemmatized document,
# printing each hit with a few tokens of leading context.
matcher = PhraseMatcher(nlp.vocab, attr="LOWER")

# Search terms: the non-empty cells of the "Evaluation" theme row, ignoring the "qualifying" column.
terms = list(all_tables['themes'].drop(columns="qualifying").set_index('key').loc["Evaluation"].dropna().values)
print(terms)

# Only run nlp.make_doc to speed things up
patterns = [nlp.make_doc(text) for text in terms]
matcher.add("TerminologyList", patterns)

matches = matcher(new_doc)

# Match offsets index into new_doc, but the original wording is read from doc2, so the
# two documents must be token-aligned. Equal length is a cheap sanity check that fails
# loudly instead of silently printing the wrong tokens.
assert len(new_doc) == len(doc2), (
    "new_doc and doc2 have different token counts; match offsets from new_doc "
    "cannot be used to slice doc2"
)

# Number of tokens of left-hand context to show before each match.
context_tokens = 3

print(f"Found {len(matches)} matche(s)")
for match_id, start, end in matches:
    # Clamp at 0: a negative slice start would be interpreted relative to the end of
    # the document and produce a wrong span for matches near the beginning.
    span = doc2[max(start - context_tokens, 0):end]
    print(span.text)

['evaluation', 'eval', 'leaderboard', 'benchmark', 'metrics', 'as-a-Judge']
Found 1 matche(s)
loves LLM-as-a-judges
