In [1]:
%load_ext autoreload
%autoreload 2

In [106]:
import pandas as pd
import numpy as np
import tqdm
import statistics
import sys
import ast
import logging
import matplotlib.pyplot as plt
#import seaborn as sns
from tqdm.auto import tqdm
from typing import List, Dict
from data.make_data import get_links, is_article, save_links, extract_links, get_date_from_url, extract_links_main

sys.path.append('../hot_links')
# TODO Something like this in dataset.py
# from hot_links.config import PROCESSED_DATA_DIR, RAW_DATA_DIR

from hot_links.vector_db_utils import *
from eval_utils import *

from openai import OpenAI

# OpenAI SDK client built with no explicit arguments.
# NOTE(review): presumably picks up credentials from the environment — confirm.
client = OpenAI()

# Seed NumPy's global random state so any sampling below is repeatable.
np.random.seed(41)

# Append (filemode='a') log records at DEBUG level and above to out.log.
logging.basicConfig(filename='out.log', filemode='a', level=logging.DEBUG)
logger = logging.getLogger(__name__)

In [102]:
# Name of the vector store this notebook works against.
vs_name = 'hot_links'

# Look the store up by name; a falsy result is treated as "does not exist yet".
vector_store_id = get_vector_store_id(vs_name = vs_name)

if not vector_store_id:
    print('Creating...')
    # Pass the name as a logging argument: the previous message had no f-prefix,
    # so the literal text "{vs_name}" was written to the log.
    logger.info('Creating vs %s', vs_name)
    vector_store_id = create_vs(vs_name)

logger.info('Set up %s', vs_name)

In [4]:
# Seed articles (assumes the snowballing step has not been run yet).
# The 'links' column is stored as text in the CSV, so it is parsed back into
# Python objects (a list of dicts per article) as part of loading.
df_seed = (
    pd.read_csv('../data/processed/seed.csv')
    .assign(links=lambda frame: frame['links'].map(ast.literal_eval))
)

## Mean snippet length could be useful

In [75]:
# Average anchor-text length, counted in single-space-separated tokens,
# over every link of every seed article.
words_per_anchor = df_seed['links'].map(
    lambda entries: [len(entry['link'].split(' ')) for entry in entries]
)
words_per_anchor.explode().mean()

np.float64(5.116363182838613)

In [10]:
# Articles collected by the first snowball pass; 'links' is parsed from its
# text form into Python objects while loading.
df_snowball = (
    pd.read_csv('../data/processed/snowball_1.csv')
    .assign(links=lambda frame: frame['links'].map(ast.literal_eval))
)

In [11]:
# Preview the first rows of the snowball frame.
df_snowball.head()

Unnamed: 0.1,Unnamed: 0,url,date,links,content
0,0,https://www.theguardian.com/australia-news/201...,1439769600,[{'href': 'https://www.theguardian.com/austral...,The Senate has blocked the passage of a bill t...
1,1,https://www.theguardian.com/technology/2016/fe...,1455840000,[{'href': 'https://www.theguardian.com/technol...,The FBI accused Apple of prioritizing its publ...
2,2,https://www.theguardian.com/commentisfree/2016...,1454630400,[],The Zika outbreak has now been declared a glob...
3,3,https://www.theguardian.com/sport/2015/dec/01/...,1448928000,[],"The embattled IAAF president, Sebastian Coe, h..."
4,4,https://www.theguardian.com/sport/2016/jan/18/...,1453075200,[],"Before a ball was struck, the 2016 Australian ..."


## Starting from an article and a linked document, find the snippet (anchor text) in the article where the link should be inserted

In [64]:
# Use the seed article at position 5 as the worked example.
test_article = df_seed.iloc[5]

In [65]:
# Split the article's link records into two parallel lists:
#   test_article_links    - target href with '.txt' appended
#   test_article_snippets - the anchor text used for that link
test_article_links = []
test_article_snippets = []
for entry in test_article['links']:
    test_article_links.append(entry['href'] + '.txt')
for entry in test_article['links']:
    test_article_snippets.append(entry['link'])

In [66]:
# Position (0-based) of the link/snippet pair inspected in the cells below.
n_test_snippet = 1

In [67]:
# Show the '.txt'-suffixed target of the selected link.
test_article_links[n_test_snippet]

'https://www.theguardian.com/sport/2015/dec/01/sebastian-coe-damien-collins-humility-iaaf.txt'

In [68]:
# Show the anchor text recorded for the selected link.
test_article_snippets[n_test_snippet]

'Sebastian Coe regarding doping'

In [16]:
# Full (untruncated) URLs of the first snowball rows, as a NumPy array.
df_snowball['url'].head().to_numpy()

array(['https://www.theguardian.com/australia-news/2015/aug/17/push-to-bring-back-abcc-building-industry-watchdog-blocked-by-senate',
       'https://www.theguardian.com/technology/2016/feb/19/fbi-apple-san-bernardino-shooter-court-order-iphone',
       'https://www.theguardian.com/commentisfree/2016/feb/05/the-zika-outbreak-share-your-stories-views-and-experiences',
       'https://www.theguardian.com/sport/2015/dec/01/sebastian-coe-damien-collins-humility-iaaf',
       'https://www.theguardian.com/sport/2016/jan/18/tennis-match-fixing-claims-authorities-absolutely-reject-any-cover-up'],
      dtype=object)

In [27]:
# Body text of the snowball article targeted by the selected link.
# Index with n_test_snippet rather than a hard-coded 1 so this lookup stays in
# sync with the link and snippet displayed above when another link is chosen.
# Raises IndexError if the link has no matching row in df_snowball.
selected_link = test_article_links[n_test_snippet]
link_content = df_snowball.loc[df_snowball['url'] + '.txt' == selected_link, 'content'].iloc[0]

In [60]:
# Prompt asking the model which span of document 1 should carry the link to
# document 2. The template is assembled from adjacent string literals; the two
# '{:s}' slots receive the article body and the linked document's text.
query_template = (
    'I want to link to document 2 from document 1. '
    'Give me the most appropriate substring from document 1 to link to document 2. '
    'Return the anchor text only\n '
    '--- \n Document 1 \n -----\n {:s}'
    '\n '
    '--- \n Document 2 \n -----\n {:s}'
)
query = query_template.format(test_article['bodyContent'], link_content)

In [32]:
# print (rather than a bare expression) so the '\n' escapes render as line breaks.
print(query)

I want to link to document 2 from document 1. Give me the most appropriate substring from document 1 to link to document 2.
 --- 
 Document 1 
 -----
 Giles Clarke, the England and Wales Cricket Board president, will be summoned to answer questions by the Commons culture, media and sport select committee over his role in the controversial “Big Three” takeover of the International Cricket Council. Clarke, the former ECB chairman, was central to the reforms in early 2014 that led to India, England and Australia taking greater control of cricket’s governing body and allocating themselves 52% of revenues generated by international events. With the select committee having spoken to Greg Dyke, the FA chairman, over Fifa corruption, the athletics chief Sebastian Coe regarding doping and Chris Kermode of the ATP on the subject of match-fixing in tennis, it will now turn its attention to cricket’s governance.
“The committee has decided to look into the conduct of the ECB in relation to the gove

In [61]:
# Send the whole prompt as a single input string to the Responses API.
request_kwargs = {"model": "gpt-4.1", "input": query}
response = client.responses.create(**request_kwargs)

In [62]:
# Anchor text proposed by the model.
# output_text is the SDK's aggregated text accessor; indexing output[0].content[0]
# breaks whenever the first output item is not a plain text message.
response.output_text

'athletics chief Sebastian Coe regarding doping'

In [None]:
# Check whether the selected link's target is itself one of the seed articles.
# Uses test_article_links[n_test_snippet] instead of `link`: that name is only
# bound by the loop in a later cell, so this cell failed with NameError on a
# fresh top-to-bottom run.
df_seed[df_seed['webUrl'] + '.txt' == test_article_links[n_test_snippet]]

Unnamed: 0.1,Unnamed: 0,article_id,sectionName,webTitle,webUrl,bodyContent,webPublicationDate,id,links


In [None]:
# Prompt template asking the model which span of document 1 should carry the
# link to document 2. Built from adjacent string literals instead of backslash
# continuations inside one literal, so the indentation of this source code is
# no longer embedded in the prompt text.
ANCHOR_PROMPT = (
    'I want to link to document 2 from document 1. '
    'Give me the most appropriate substring from document 1 to link to document 2. '
    'The average number of words in the substring is 5. '
    'Return the anchor text only\n '
    '--- \n Document 1 \n -----\n {:s}'
    '\n '
    '--- \n Document 2 \n -----\n {:s}'
)


def _usable_text(value):
    """Return `value` if it is a non-empty string, otherwise None.

    Guards against non-string cell values (e.g. a float NaN for a missing
    field), which are truthy and would make '{:s}'.format raise ValueError.
    """
    return value if isinstance(value, str) and value else None


test_article = df_seed.iloc[6,:]

test_article_links = [d['href']+'.txt' for d in test_article['links']]
test_article_snippets = [d['link'] for d in test_article['links']]

# For every link in the article: find the linked document's text (snowball set
# first, then the seed set), ask the model for an anchor text, and print it next
# to the anchor text the article actually used.
for link, snippet in zip(test_article_links, test_article_snippets):
    print('Link: {:s}\n>>>>>>> {:s}'.format(link, snippet))

    # Remove only the trailing '.txt' appended above; split('.txt')[0] would
    # truncate at the first '.txt' occurring anywhere in the URL.
    link_url = link[:-len('.txt')]

    link_content = None

    # Compare against the bare URL so the column is not re-concatenated with
    # '.txt' on every iteration.
    results = df_snowball[df_snowball['url'] == link_url]
    if results.shape[0] == 0:
        print('No results found in snowball\n')
        # TODO find out why some links are missing
        # Hiding in df_seed? Seems yes
    else:
        link_content = _usable_text(results.iloc[0, :].content)

    # Fall back to the seed set when the snowball set has no usable text.
    if not link_content:
        results = df_seed[df_seed['webUrl'] == link_url]

        if results.shape[0] == 0:
            print('No results found in seed\n')
        else:
            print('Found in seed')
            link_content = _usable_text(results.iloc[0, :].bodyContent)

    if link_content:
        query = ANCHOR_PROMPT.format(test_article['bodyContent'], link_content)

        response = client.responses.create(
            model="gpt-4o",
            input=query,
        )

        # output_text is the SDK's aggregated text accessor; it does not depend
        # on the first output item being a plain text message.
        print('<<<<<<', response.output_text)

        # TODO test to see if snippet is a substring of response
    print('\n')

Link: https://www.theguardian.com/politics/2016/feb/18/jeremy-corbyn-david-cameron-migrant-benefit-brake-ineffectual-eu.txt
>>>>>>> saying it is irrelevant
<<<<<< "the theatrical sideshow"


Link: https://www.theguardian.com/politics/2016/feb/24/tories-legal-status-david-cameron-eu-deal-conservatives-michael-gove-european-court.txt
>>>>>>> question the legal status of Cameron’s deal
No results found in snowball

Found in seed
<<<<<< "[Michael] Gove, the justice secretary and leave campaigner"




In [136]:
# Inspect `link`: the loop variable left over from the last iteration of the loop above.
link

'https://www.theguardian.com/politics/2016/feb/24/tories-legal-status-david-cameron-eu-deal-conservatives-michael-gove-european-court.txt'

In [137]:
# The same link with everything from the first '.txt' onwards removed.
link.split('.txt')[0]

'https://www.theguardian.com/politics/2016/feb/24/tories-legal-status-david-cameron-eu-deal-conservatives-michael-gove-european-court'

In [138]:
# Coerce webUrl to str before the equality lookup in the next cell.
# NOTE(review): this overwrites the column on the shared df_seed frame, so earlier
# cells see the converted column if re-run; presumably missing values turn into
# the string 'nan' — confirm.
df_seed['webUrl'] = df_seed['webUrl'].astype(str)

In [139]:
# Seed rows whose webUrl equals the last loop link without its '.txt' suffix.
link_url = link.split('.txt')[0]
df_seed[df_seed['webUrl'].eq(link_url)]

Unnamed: 0.1,Unnamed: 0,article_id,sectionName,webTitle,webUrl,bodyContent,webPublicationDate,id,links
1488,3430,politics/2016/feb/24/tories-legal-status-david...,Politics,EU referendum: Tories in open warfare over leg...,https://www.theguardian.com/politics/2016/feb/...,Open warfare has broken out in Conservative ra...,2016-02-25 07:36:26+00:00,3431,[{'href': 'https://www.theguardian.com/politic...


In [111]:
# Run check_uploaded_file for the last loop link (without '.txt') against this vector store.
# NOTE(review): check_uploaded_file arrives via a star import; judging by the recorded
# output it reports on both the uploaded file and its vector-store entry — confirm.
check_uploaded_file(vector_store_id,link.split('.txt')[0])

Testing: https://www.theguardian.com/australia-news/2016/feb/29/coalition-rules-out-changes-to-below-the-line-senate-voting
File found in cloud
FileObject(id='file-2xWhmCCSvFxWe9rC5rzo8d', bytes=5223, created_at=1757057756, filename='https://www.theguardian.com/australia-news/2016/feb/29/coalition-rules-out-changes-to-below-the-line-senate-voting.txt', object='file', purpose='assistants', status='processed', expires_at=None, status_details=None)
Found file in VS
VectorStoreFile(id='file-2xWhmCCSvFxWe9rC5rzo8d', created_at=1757057757, last_error=None, object='vector_store.file', status='completed', usage_bytes=7171, vector_store_id='vs_68b9f6481fb0819190be4981a98c48dd', attributes={'date': 1456726097.0, 'generation': 0.0, 'filename': 'https://www.theguardian.com/australia-news/2016/feb/29/coalition-rules-out-changes-to-below-the-line-senate-voting'}, chunking_strategy=StaticFileChunkingStrategyObject(static=StaticFileChunkingStrategy(chunk_overlap_tokens=400, max_chunk_size_tokens=800),

In [113]:
# How many snowball rows match the last loop link (url + '.txt' == link)?
in_snowball = (df_snowball['url'] + '.txt').eq(link)
results = df_snowball[in_snowball]
results.shape


(0, 5)

In [116]:
# How many seed rows match the last loop link (webUrl + '.txt' == link)?
in_seed = (df_seed['webUrl'] + '.txt').eq(link)
results = df_seed[in_seed]
results.shape


(1, 9)