In [1]:
%load_ext autoreload
%autoreload 2

## Overview

Create a training file for supervised fine-tuning, following the [OpenAI supervised fine-tuning guide](https://platform.openai.com/docs/guides/supervised-fine-tuning). The file consists of query/response pairs: the query is the prompt shown below, and the response is the correct anchor-text snippet.

query = 'I want to link to document 2 from document 1. Give me the most appropriate substring from document 1 to link to document 2. Return the anchor text only\n \
--- \n Document 1 \n -----\n {:s}'  '\n \
--- \n Document 2 \n -----\n {:s}'.format(test_article['bodyContent'], link_content)

In [2]:
import pandas as pd
import numpy as np
import tqdm
import re
import math
import statistics
import sys
import string
import ast
import logging
import matplotlib.pyplot as plt
from fuzzywuzzy import fuzz
from tqdm.auto import tqdm
from typing import List, Tuple
from data.make_data import get_links, is_article, save_links, extract_links, get_date_from_url, extract_links_main

sys.path.append('../hot_links')
# TODO Something like this in dataset.py
# from hot_links.config import PROCESSED_DATA_DIR, RAW_DATA_DIR

from hot_links.vector_db_utils import *
from hot_links.eval_utils import *

from openai import OpenAI

# API client built with no explicit arguments
# (presumably picks up its credentials from the environment — TODO confirm).
client = OpenAI()

# Seed NumPy's legacy global RNG so any np.random-based sampling is repeatable.
np.random.seed(41)

# Send DEBUG-and-above records to out.log; filemode='a' appends, so entries
# from earlier runs are kept rather than truncated.
logging.basicConfig(filename='out.log', filemode='a', level=logging.DEBUG)
logger = logging.getLogger(__name__)

  from .autonotebook import tqdm as notebook_tqdm
[32m2026-01-26 08:56:07.365[0m | [1mINFO    [0m | [36mhot_links.config[0m:[36m<module>[0m:[36m11[0m - [1mPROJ_ROOT path is: /workspaces/hot-links[0m


In [3]:
# Seed articles. NOTE: assumes snowballing has not been run yet.
# The CSV stores each row's `links` list as its string repr, so parse it back
# into real Python objects (literal_eval only accepts literals, never code).
df_seed = pd.read_csv('../data/processed/seed.csv').assign(
    links=lambda frame: frame['links'].apply(ast.literal_eval)
)

In [4]:
# First snowball round: the articles that the seed articles link out to.
# `links` is stored as a string repr in the CSV, so parse it back into objects.
df_snowball = pd.read_csv('../data/processed/snowball_1.csv').assign(
    links=lambda frame: frame['links'].apply(ast.literal_eval)
)

In [5]:
df_seed.head()

Unnamed: 0.1,Unnamed: 0,article_id,sectionName,webTitle,webUrl,bodyContent,webPublicationDate,id,links
0,1942,australia-news/2016/mar/01/electoral-commissio...,Australia news,New Senate voting rules could be ready for Jul...,https://www.theguardian.com/australia-news/201...,The Australian Electoral Commission (AEC) says...,2016-02-29 23:49:14+00:00,1943,[{'href': 'https://www.theguardian.com/austral...
1,1943,politics/2016/feb/29/civil-service-chief-defen...,Politics,Iain Duncan Smith asks civil servants to ignor...,https://www.theguardian.com/politics/2016/feb/...,Iain Duncan Smith has asked civil servants in ...,2016-02-29 23:42:19+00:00,1944,[{'href': 'https://www.theguardian.com/politic...
2,1944,technology/2016/feb/29/apple-fbi-case-drug-dea...,Technology,Apple case: judge rejects FBI request for acce...,https://www.theguardian.com/technology/2016/fe...,A federal judge on Monday rejected an FBI requ...,2016-02-29 23:30:51+00:00,1945,[{'href': 'https://www.theguardian.com/technol...
3,1945,world/2016/feb/29/zika-virus-scientists-eviden...,World news,Zika virus: scientists present strong evidence...,https://www.theguardian.com/world/2016/feb/29/...,Scientists have amassed the strongest evidence...,2016-02-29 23:30:21+00:00,1946,[{'href': 'https://www.theguardian.com/comment...
4,1946,music/2016/feb/29/adele-review-a-lesson-in-con...,Music,Adele review – a lesson in control from the en...,https://www.theguardian.com/music/2016/feb/29/...,Three songs into the first date of her world t...,2016-02-29 23:12:50+00:00,1947,[]


In [25]:
test_tuning_article = df_seed.iloc[0]

In [26]:
test_tuning_article

Unnamed: 0                                                         1942
article_id            australia-news/2016/mar/01/electoral-commissio...
sectionName                                              Australia news
webTitle              New Senate voting rules could be ready for Jul...
webUrl                https://www.theguardian.com/australia-news/201...
bodyContent           The Australian Electoral Commission (AEC) says...
webPublicationDate                            2016-02-29 23:49:14+00:00
id                                                                 1943
links                 [{'href': 'https://www.theguardian.com/austral...
Name: 0, dtype: object

In [27]:
# Every entry in the article's `links` list is a dict holding the target URL
# under 'href' and the anchor text used for it under 'link'.
# Target URLs get a '.txt' suffix appended.
test_tuning_article_links = [
    link_info['href'] + '.txt'
    for link_info in test_tuning_article['links']
]
# Anchor snippets, in the same order as the links above.
test_tuning_article_snippets = [
    link_info['link']
    for link_info in test_tuning_article['links']
]


In [28]:
test_tuning_article_links

['https://www.theguardian.com/australia-news/2015/aug/17/push-to-bring-back-abcc-building-industry-watchdog-blocked-by-senate.txt',
 'https://www.theguardian.com/australia-news/2016/feb/29/coalition-rules-out-changes-to-below-the-line-senate-voting.txt']

In [29]:
test_tuning_article_snippets

['rejected by the Senate in August',
 'continue to require voters to number every box']

In [18]:
df_snowball.url.head().values

array(['https://www.theguardian.com/australia-news/2015/aug/17/push-to-bring-back-abcc-building-industry-watchdog-blocked-by-senate',
       'https://www.theguardian.com/technology/2016/feb/19/fbi-apple-san-bernardino-shooter-court-order-iphone',
       'https://www.theguardian.com/commentisfree/2016/feb/05/the-zika-outbreak-share-your-stories-views-and-experiences',
       'https://www.theguardian.com/sport/2015/dec/01/sebastian-coe-damien-collins-humility-iaaf',
       'https://www.theguardian.com/sport/2016/jan/18/tennis-match-fixing-claims-authorities-absolutely-reject-any-cover-up'],
      dtype=object)

In [31]:
# Body text of the first linked article: take the first snowball row whose
# '.txt'-suffixed URL equals the first link of the test article.
link_content = df_snowball.loc[
    (df_snowball['url'] + '.txt').eq(test_tuning_article_links[0])
].iloc[0].content

In [20]:
link_content

'The Senate has blocked the passage of a bill to reintroduce the Howard-era building industry watchdog, in a development overshadowed by the dispute over the future of the royal commission into trade unions.The bill failed on Monday because the Senate vote was tied at 33 in favour and 33 against. The government continued, however, to push ahead with a separate bill to toughen penalties for union officials who breach their duties.The Senate has previously blocked the latter bill, known as the registered organisation legislation, and if the upper house does so a second time it would create a trigger for a potential double-dissolution election.Senators debated the registered organisations bill on Monday evening and a vote was expected to occur some time on Tuesday.The government’s efforts to put the spotlight on union conduct – which it had wanted to use to apply pressure on the Labor party – have been undermined by the scrutiny of the former high court judge heading the royal commission,

In [21]:
def make_fine_tuning_json(doc_1: str, doc_2: str, snippet: str) -> dict:
    """Build one supervised fine-tuning example in chat-message format.

    The user message asks for the most appropriate anchor text in ``doc_1``
    for a link to ``doc_2``; the assistant message is the known-correct
    ``snippet``.

    Args:
        doc_1: Text of the document the link is placed in.
        doc_2: Text of the document being linked to.
        snippet: The correct anchor text, used verbatim as the assistant reply.

    Returns:
        ``{'messages': [user_message, assistant_message]}``, where each
        message is a dict with ``'role'`` and ``'content'`` keys.

    Raises:
        ValueError or TypeError: raised by ``str.format`` when ``doc_1`` or
            ``doc_2`` is not a string (for example a float NaN), because the
            ``{:s}`` format spec only accepts strings.
    """
    # The template is assembled from adjacent string literals instead of
    # backslash continuations inside one literal: a continuation keeps the
    # next source line's leading indentation as part of the string, which
    # would make the prompt depend on how deeply this code is indented.
    prompt = (
        'I want to link to document 2 from document 1. Give me the most '
        'appropriate substring from document 1 to link to document 2. '
        'Return the anchor text only\n'
        ' --- \n Document 1 \n -----\n {:s}'
        '\n'
        ' --- \n Document 2 \n -----\n {:s}'
    ).format(doc_1, doc_2)

    return {
        'messages': [
            {'role': 'user', 'content': prompt},
            {'role': 'assistant', 'content': snippet},
        ]
    }

In [22]:
make_fine_tuning_json(test_tuning_article['bodyContent'], link_content, test_tuning_article_snippets[0])

{'messages': [{'role': 'user',
   'content': 'I want to link to document 2 from document 1. Give me the most appropriate substring from document 1 to link to document 2. Return the anchor text only\n     --- \n Document 1 \n -----\n The Australian Electoral Commission (AEC) says it could implement new Senate voting rules within 100 days, clearing the way for the system to be implemented in time for a July double-dissolution election. On Tuesday the AEC commissioner, Tom Rodgers, told a truncated inquiry into the voting overhaul the “three-month clock” would begin as soon as legislation was passed but “the AEC stands ready to deliver an election whenever the government call it with the legislation that’s in force at the time”. “If I get less time or resources, internally that’s not going to be a pretty look but we will deliver a successful election,” Rodgers told the joint select committee on electoral matters.\nThe government, backed by the Greens and Nick Xenophon, is proposing to abo