In [18]:
import openai
import pandas as pd 
import numpy as np 
import sys
sys.path.insert(0, '../make_source_label_hierarchy/')
import os
# Alternative key file used previously: '/Users/spangher/.openai-my-project-key.txt'
# Read the key inside a context manager so the file handle is closed right away
# instead of being left open until garbage collection.
with open('/Users/spangher/.openai-bloomberg-project-key.txt') as key_file:
    os.environ['OPENAI_API_KEY'] = key_file.read().strip()
# Created after the environment variable is set; the OpenAI client picks up
# OPENAI_API_KEY from the environment when no key is passed explicitly.
client = openai.OpenAI()
def call_openai_completions(prompt, model_name='gpt-4o', max_tokens=1000):
    """
    Send a single-turn prompt to the OpenAI chat completions API.

    Args:
    prompt (str): The user message to send to the API.
    model_name (str): The name of the chat model to use.
    max_tokens (int): The maximum number of tokens to generate.

    Returns:
    str: The text content of the first choice in the API response.
    """
    response = client.chat.completions.create(
        model=model_name,
        messages=[
            {"role": "system", "content": "You are a helpful journalist assistant."},
            {"role": "user", "content": prompt}
        ],
        # This argument used to be accepted but never forwarded, so the
        # documented cap was silently ignored.
        max_tokens=max_tokens,
    )
    return response.choices[0].message.content

# Prompt for collapsing the raw sentence-level labels of one cluster into a single
# label and description. `{labels}` is filled via str.format with newline-joined labels.
LEAF_NODE_PROMPT = """
You are a helpful assistant. I will give you a set of discourse labels and descriptions for sentences in news articles, in the format:
"Label": Description.
Please summarize the labels and the descriptions into a single label and description. Make the label 2-3 words, max.
If all the labels are the same, just copy the label. Ignore any topical information. Here are the labels:

```{labels}```

Be descriptive but not too broad. Please return just one label and one description. 
Make it in this format: "Label": Description
"""

In [10]:
import pandas as pd 
from sentence_transformers import SentenceTransformer
import random

In [11]:
import os 
os.environ['TOKENIZERS_PARALLELISM'] = 'false'

In [13]:
model_path = '../scripts/make_source_label_hierarchy/trained-model'
model = SentenceTransformer(model_path)

In [14]:
discourse_df = pd.read_csv('../data/all_extracted_discourse_with_clusters.csv.gz', index_col=0)

In [19]:
# Spot-check the leaf prompt on a single cluster before running it over all of them.
labels = (
    discourse_df
        .loc[lambda df: df['cluster'] == 3]
        .drop_duplicates()
        .pipe(lambda df: df.sample(min(len(df), 50)))['discourse_label']
        .str.strip()
        # Missing labels are NaN floats and would make '\n'.join raise a TypeError;
        # the loop over all clusters drops them too.
        .dropna()
)
call_openai_completions(prompt=LEAF_NODE_PROMPT.format(labels='\n'.join(labels)))

'"Main Event": This sentence directly describes a primary event, focusing on its conclusion as the central point of the document.'

# Generate labels for the clusters

In [30]:
from tqdm.auto import tqdm
# One (cluster_id, Series of raw labels) pair per fine-grained cluster.
groups = list(discourse_df.groupby('cluster')['discourse_label'])
child_node_summaries = {}
for cluster_id, g in tqdm(groups):
    # Strip surrounding whitespace so the prompt matches the single-cluster spot check:
    # raw labels can carry leading newlines, which would otherwise show up as
    # blank lines between labels in the joined prompt.
    g = g.dropna().str.strip()
    # At most 50 labels per cluster go into the prompt.
    label_sample = g.sample(min(len(g), 50))
    summary_label = call_openai_completions(prompt=LEAF_NODE_PROMPT.format(labels='\n'.join(label_sample)))
    child_node_summaries[cluster_id] = summary_label

  0%|          | 0/1024 [00:00<?, ?it/s]

In [31]:
child_node_summaries_s = pd.Series(child_node_summaries)
child_node_summaries_s.to_csv('../data/initial_cluster_labels_2.csv')

In [32]:
pwd

'/Users/spangher/Projects/usc-research/news-discourse-students/notebooks'

# Cluster the clusters

In [35]:
bins = [5, 7, 9, 11, 13, 15, 20, 25, 30]

In [36]:
from sklearn.cluster import KMeans
import numpy as np

cluster_centers = np.load('../data/cluster_centroids.npy')

In [37]:
# Prompt for merging already-summarized cluster labels into one parent label.
# Same wording as LEAF_NODE_PROMPT, except `{labels}` is not wrapped in a code fence.
MIDDLE_TREE_PROMPT = """
You are a helpful assistant. I will give you a set of discourse labels and descriptions for sentences in news articles, in the format:
"Label": Description.
Please summarize the labels and the descriptions into a single label and description. Make the label 2-3 words, max.
If all the labels are the same, just copy the label. Ignore any topical information. Here are the labels:

{labels}

Be descriptive but not too broad. Please return just one label and one description. 
Make it in this format: "Label": Description
"""

In [38]:
sample_weights = discourse_df['cluster'].value_counts().sort_index()
initial_node_summaries_df = child_node_summaries_s.to_frame('cluster_summary')
initial_node_summaries_df['sample_num'] = sample_weights

In [39]:
# Seed shared by KMeans and the per-group sampling so the hierarchy can be regenerated.
HIERARCHY_SEED = 0

# Maps schema size -> {merged cluster id: (summary text, summed sample_num)}.
all_cluster_summaries = {}
for b in tqdm(bins):
    kmeans = KMeans(n_clusters=b, random_state=HIERARCHY_SEED)
    # NOTE(review): the predicted ids are assigned positionally, which assumes the rows of
    # `cluster_centers` are in the same order as `initial_node_summaries_df` — confirm.
    initial_node_summaries_df[f'cluster_{b}'] = kmeans.fit_predict(cluster_centers, sample_weight=sample_weights)
    # Loop-local names: the original reused `groups` and `child_node_summaries`, clobbering
    # the leaf-level results, so re-running the cell that saves them would write this
    # level's tuples instead.
    merged_groups = list(initial_node_summaries_df.groupby(f'cluster_{b}')['cluster_summary'])
    level_summaries = {}
    for cluster_id, g in tqdm(merged_groups):
        g = g.dropna()
        label_sample = g.sample(min(len(g), 50), random_state=HIERARCHY_SEED)
        summary_label = call_openai_completions(prompt=MIDDLE_TREE_PROMPT.format(labels='\n'.join(label_sample)))
        # Total number of original rows covered by this merged cluster.
        num_orig_items = initial_node_summaries_df.loc[lambda df: df[f'cluster_{b}'] == cluster_id]['sample_num'].sum()
        level_summaries[cluster_id] = (summary_label, num_orig_items)
    all_cluster_summaries[b] = level_summaries

  0%|          | 0/9 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

  0%|          | 0/11 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

  0%|          | 0/15 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/25 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

In [49]:
idx_to_focus = 11
# One row per merged cluster id: the summary text and the summed row count,
# ordered from the largest cluster to the smallest.
_summary_table = pd.DataFrame(all_cluster_summaries[idx_to_focus]).transpose()
_summary_table = _summary_table.rename(columns={0: 'Description', 1: 'Count'})
clustered_labels = _summary_table.sort_values(by='Count', ascending=False)

In [50]:
clustered_labels

Unnamed: 0,Description,Count
1,"""Background Information"": This label encompass...",40269
6,"""Specific Detail"": This label encompasses sent...",29996
2,"""Quote"": A direct statement from a source prov...",26557
3,"""Main Event"": This label encompasses sentences...",22933
10,"""Reflection"": This description provides insigh...",22649
0,"""Transition"": This sentence signals a shift in...",16033
4,"""Introduction"": This sentence introduces the m...",11366
5,"""Future Outlook"": This category encompasses se...",9901
9,"""Conclusion"": This sentence serves as a final ...",6523
7,"""Counterpoint"": This sentence provides an oppo...",5501


In [56]:
# Attach every sentence's merged-cluster ids, then the summary and count
# of the schema size selected by `idx_to_focus`.
_level_columns = ['cluster_5', 'cluster_11', 'cluster_13', 'cluster_20']
_with_levels = pd.merge(
    discourse_df,
    initial_node_summaries_df[_level_columns],
    left_on='cluster',
    right_index=True,
)
_with_levels = _with_levels.drop(columns="cluster")
data_to_label = pd.merge(
    _with_levels,
    clustered_labels,
    left_on=f'cluster_{idx_to_focus}',
    right_index=True,
)

# Make Few-Shot Examples

In [117]:
import numpy as np 

# Prompt asking the model to pick 4 representative examples for one label.
# Filled via .format(label=..., definition=..., examples=...).
# NOTE(review): the text promises "notes for each one", but the cell that fills this
# prompt only passes sentences formatted as "Sentence: ```...```" — confirm whether
# notes were meant to be included.
FEW_SHOT_PROMPT = """I am trying to find good examples to use for demonstrating a label.
Here is the label: {label}. The definition for the label is: {definition}.

Here are a large set of examples I have, alone with notes for each one:
[Start Examples]
{examples}
[End Examples]

Some examples are bad. Please choose 4 examples that best represent this label. Try to pick diverse ones. 
Return the examples and the notes, and copy them fully. 
Return as a json. Be careful to format the quotes correctly.
"""
# Follow-up prompt sent when a reply cannot be parsed; `{r}` is the unparseable reply text.
CORRECT_JSON_PROMPT = '''Here is a poorly formatted json. Please correct it: 

{r}

Return just the corrected JSON. Be careful about getting the quote structure right. Don't change anything about the content.
'''

In [158]:
import glob
files = glob.glob('../data/article_text/*')
article_text_df = pd.concat(list(map(lambda x: pd.read_json(x, lines=True), files)))

import spacy
nlp = spacy.load('en_core_web_lg')

_spacy_model = None
def load_spacy_model():
    """Return the cached spaCy pipeline used for sentence splitting, building it on first use.

    On the first call, "en_core_web_lg" is loaded, every active pipe except
    'senter' is disabled, and a 'sentencizer' pipe is added. Later calls
    return the same object.
    """
    global _spacy_model
    if _spacy_model is not None:
        return _spacy_model
    import spacy
    # Bind the global immediately so it refers to the loaded pipeline
    # while the pipeline is still being configured.
    _spacy_model = pipeline = spacy.load("en_core_web_lg")
    unwanted = [name for name in pipeline.pipe_names if name != 'senter']
    pipeline.disable_pipes(*unwanted)
    pipeline.add_pipe('sentencizer')
    return _spacy_model


def sentencize_text_column(s):
    """Split every text in the Series `s` into sentences.

    Returns a list with one entry per element of `s`; each entry is the list of
    sentence strings the pipeline produced for that text. Progress is shown with tqdm.
    """
    pipeline = load_spacy_model()
    docs = pipeline.pipe(s.tolist(), batch_size=100, n_process=1)
    return [[str(sent) for sent in doc.sents] for doc in tqdm(docs, total=len(s))]

# Remove backticks and surrounding whitespace from each article's text, then split it
# into sentences (one list of sentence strings per row).
article_text_df['sentences'] = article_text_df['response'].str.replace('`', '').str.strip().pipe(sentencize_text_column)

# Keep only article keys whose number of distinct labelled sentence keys equals the number
# of sentences produced above. The part of a labelled key before '__' is the article key.
# Keys present on only one side get NaN from the column-wise concat and fail the equality.
valid_sentence_parses = pd.concat([
    (data_to_label['key']
         .drop_duplicates()
         .str.split('__')
         .str.get(0)
         .str.strip()
         .value_counts()
   ).to_frame('discourse_sent_counts'),
    article_text_df.explode('sentences')['key'].value_counts().to_frame('orig_sent_counts')  
], axis=1).loc[lambda df: df['discourse_sent_counts'] == df['orig_sent_counts']]

# article_text = pd.read_csv('../data/batch_article_text.csv', index_col=0)
# NOTE(review): this re-reads the same file already loaded as `discourse_df`, and
# `article_text` is not referenced again in the visible cells — confirm it is still needed.
article_text = pd.read_csv('../data/all_extracted_discourse_with_clusters.csv.gz', index_col=0)

# For the validated articles, rebuild per-sentence keys as '<article key>__<sentence position>'
# and join them to the labelled rows on that key; the full article text is kept as 'document'.
data_to_label_with_text = (
    article_text_df
         .loc[lambda df: df['key'].isin(valid_sentence_parses.index)]
         .explode('sentences')
         .assign(sent_idx=lambda df: df.groupby('key').cumcount())
         .assign(key=lambda df: df['key'] + '__' + df['sent_idx'].astype(str))
         [['key', 'sentences', 'response']]
         .rename(columns={'response':'document'})
         .merge(data_to_label, on='key')
)

  0%|          | 0/7240 [00:00<?, ?it/s]

In [None]:
import json
import ast 
import re

# Cap on generate-then-repair rounds per label, so a persistently malformed reply
# cannot loop (and bill) forever.
MAX_FEW_SHOT_ATTEMPTS = 5
# Upper bound on candidate sentences shown to the model for each label.
N_CANDIDATE_EXAMPLES = 20


def _clean_model_reply(reply):
    """Strip markdown fences from a model reply without touching its content.

    A language tag is removed only when it directly follows an opening fence; removing
    the words 'python'/'json' everywhere would corrupt sentences that contain them.
    Backticks are still removed throughout, as before.
    """
    text = re.sub(r'```(?:json|python)\b', '', reply.strip())
    return text.replace('`', '').strip()


def _parse_model_reply(text):
    """Parse `text` as JSON, falling back to a Python literal; raise ValueError if both fail."""
    try:
        # The prompt asks for JSON; literal_eval alone rejects true/false/null.
        return json.loads(text)
    except json.JSONDecodeError:
        pass
    try:
        return ast.literal_eval(text)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError) as e:
        raise ValueError(f'unparseable model reply: {e}') from e


for idx_to_focus_on in tqdm([5, 11, 13, 20]):
    # NOTE(review): keyed by keyword only, so two merged clusters that share a keyword
    # overwrite each other — confirm this is acceptable.
    examples_to_write = {}
    label_df = (
        pd.Series(all_cluster_summaries[idx_to_focus_on])
             .str.get(0)
             # n=1: split at the first colon only, so definitions containing ':' stay whole.
             .str.split(':', n=1, expand=True)
             .rename(columns={0:'keyword', 1:'definition'})
             .assign(keyword=lambda df: df['keyword'].str.strip('```').str.replace('"', ''))
    )
    example_df = (
        data_to_label_with_text
         [['sentences', f'cluster_{idx_to_focus_on}']]
         .merge(label_df, left_on=f'cluster_{idx_to_focus_on}',right_index=True)
    )

    for (keyword, definition), examples in example_df.groupby(['keyword', 'definition'])[['sentences']]:
        # Small groups have fewer rows than the cap; asking for more than exist raises ValueError.
        examples = examples.sample(min(len(examples), N_CANDIDATE_EXAMPLES))
        # Built with concatenation: a backslash inside an f-string expression is a
        # SyntaxError before Python 3.12.
        formatted_examples = [
            'Sentence: ```' + sentence.replace('\n', ' ').strip() + '```'
            for sentence in examples['sentences']
        ]
        examples_to_choose_from = '\n\n'.join(formatted_examples)
        prompt = FEW_SHOT_PROMPT.format(label=keyword, definition=definition, examples=examples_to_choose_from)

        for _attempt in range(MAX_FEW_SHOT_ATTEMPTS):
            r = _clean_model_reply(call_openai_completions(prompt))
            try:
                r_parsed = _parse_model_reply(r)
                break
            except ValueError:
                # Ask the model to repair its own output and parse the *repaired* text;
                # the original discarded it and re-parsed the same broken string.
                corrected_json = call_openai_completions(prompt=CORRECT_JSON_PROMPT.format(r=r))
                r = _clean_model_reply(corrected_json)
                try:
                    r_parsed = _parse_model_reply(r)
                    break
                except ValueError as e:
                    print(f'failed {e}: {r}')
        else:
            # Every attempt failed: skip this label rather than storing a stale value.
            print(f'giving up on {keyword!r} after {MAX_FEW_SHOT_ATTEMPTS} attempts')
            continue
        examples_to_write[keyword] = r_parsed

    with open(f'../scripts/schema_labeling/few_shot_examples/examples_{idx_to_focus_on}.json', 'w') as f:
        json.dump(examples_to_write, f)

In [130]:
ls ../scripts/schema_labeling/few_shot_examples/

examples_11.json  examples_13.json  examples_20.json  examples_5.json


# Label Sentences

In [138]:
DEFINITIONS_PROMPT = """I will give you a news article and a single sentence from that article.
Your goal is to assign a label to that sentence with a general discourse role that best describes it's purpose in the overall script.

Choose from this list:

{discourse_labels}

Do NOT return any labels NOT in that list. Here are some shortened examples:

```{examples}```

Now it's your turn. Here is a news article:

```{full_document}```

What discourse role is this sentence in it serving?

Sentence: ```{sentence}```
Answer:"""

In [None]:
quote_labels = label_df.loc[lambda df: df['keyword'].str.lower().str.contains('quote')]['keyword'].tolist()
examples.drop(quote_labels)

In [318]:
text_df = data_to_label_with_text.copy()

In [319]:
USE_QUOTES = False
# Set to True to also write each schema's prompts to disk.
WRITE_PROMPT_FILES = False

# text_df = article_text_df.explode('sentences')[['key', 'response', 'sentences']].rename(columns={'response': 'document'})
text_df = data_to_label_with_text.copy()

import json  # used below; imported here so this cell does not depend on the few-shot cell having run
import jsonlines
for idx_to_focus_on in tqdm([5, 11, 13, 20]):
    label_df = (
        pd.Series(all_cluster_summaries[idx_to_focus_on])
             .str.get(0)
             # n=1: split at the first colon only, so definitions containing ':' stay whole.
             .str.split(':', n=1, expand=True)
             .rename(columns={0:'keyword', 1: 'definition'})
             .assign(keyword=lambda df: df['keyword'].str.strip('```').str.replace('"', ''))
    )
    if not USE_QUOTES:
        # Remove every label whose keyword mentions "quote" from the schema.
        quote_labels = label_df.loc[lambda df: df['keyword'].str.lower().str.contains('quote')]['keyword'].tolist()
        label_df = label_df.loc[lambda df: ~df['keyword'].isin(quote_labels)]

    definitions_block = '* ' + '\n* '.join(label_df.pipe(lambda df: '"' + df['keyword'] + '": ' + df['definition']))
    examples_path = f'../scripts/schema_labeling/few_shot_examples/examples_{idx_to_focus_on}.json'
    # Context manager so the file handle is closed instead of leaked.
    with open(examples_path) as examples_file:
        examples = json.load(examples_file)
    # Saved replies are either a list of examples or a dict wrapping one under 'examples'.
    # Keep the first two per label and shuffle the order.
    examples = (
        pd.Series(examples)
             .apply(lambda x: x['examples'] if isinstance(x, dict) else x)
             .apply(lambda x: x[:2]).explode().sample(frac=1)
    )
    if not USE_QUOTES:
        # errors='ignore': a quote label without saved examples is simply absent from the index.
        examples = examples.drop(quote_labels, errors='ignore')

    examples_str = []
    for k, v in examples.items():
        sentence = v.get('Sentence') or v.get('sentence')
        assert sentence is not None

        # NOTE(review): the indentation inside this literal is part of the prompt text;
        # it is left unchanged so generated prompts stay identical.
        examples_str.append(
            f'''Sentence: ```{sentence}```
            Answer: {k}
            '''
        )
    examples_str = '\n'.join(examples_str)

    prompt_col = f'prompt_{idx_to_focus_on}'
    all_prompts = []
    for _, row in text_df.iterrows():
        prompt = DEFINITIONS_PROMPT.format(
            full_document=row['document'],
            sentence=row['sentences'],
            discourse_labels=definitions_block,
            examples=examples_str,
        )
        all_prompts.append({
            'key': row['key'],
            prompt_col: prompt,
            'use_quotes': USE_QUOTES
        })

    # Explicit columns keep the frame well-formed even when there are no rows.
    prompt_df = pd.DataFrame(all_prompts, columns=['key', prompt_col, 'use_quotes'])
    # Prompts were built row-by-row from text_df, so attach them by position. The original
    # merge used implicit join keys and multiplied rows whenever a key occurred more than once.
    text_df[prompt_col] = prompt_df[prompt_col].to_numpy()
    text_df['use_quotes'] = USE_QUOTES
    # text_df = text_df.merge(
        # label_df.rename(columns=lambda x: f'{x}_{idx_to_focus_on}'), right_index=True, left_on=f'cluster_{idx_to_focus_on}'
    # )

    if WRITE_PROMPT_FILES:
        path = f'../scripts/schema_labeling/final_scripts_to_label/prompts_for_{idx_to_focus_on}'
        os.makedirs(path, exist_ok=True)
        prompt_df.to_csv(
            f'{path}/final_prompt_df__schema_size_{idx_to_focus_on}.csv.gz',
            compression='gzip'
        )

  0%|          | 0/4 [00:00<?, ?it/s]

In [257]:
cols = [
    'key', 
    'sentences',
    'keyword_5', 
    'keyword_11',
    'keyword_13',
    'keyword_20'
]
prompt_header_cols = [
    'key',
    'sentences',
    'document',
]
prompt_cols = [
    'prompt_5',
    'prompt_11',
    'prompt_13',
    'prompt_20'
]
all_prompt_cols = prompt_header_cols + prompt_cols

In [258]:
short_articles_to_sample = (
    text_df['key'].str.split('__').str.get(0)
         .value_counts()
         .loc[lambda s: s > 15]
         .loc[lambda s: s < 40]
         .index
)

In [320]:
# REDO=True draws a fresh sample of 5 articles from `short_articles_to_sample` and keeps
# all of their sentences; the draw is unseeded, so it differs on every run.
# REDO=False reuses the keys already present in the index of `output_df`.
# NOTE(review): `output_df` is only created in a later cell, so the REDO=False branch
# fails on a fresh kernel, and it expects `output_df` to still be indexed by key.
REDO = False
if REDO:
    sample_to_prompt = (
        text_df
            .loc[lambda df: df['key'].str.split('__').str.get(0).isin(short_articles_to_sample.to_series().sample(5))]
            [all_prompt_cols]
    )
else:
    sample_to_prompt = text_df.loc[lambda df: df['key'].isin(output_df.index)][all_prompt_cols]

In [324]:
# output_df.loc[lambda df: df['prompt_11__with_quote_False'].isnull()]
# data_to_label_with_text.loc[lambda df: df['key'].str.contains('/2023/04/13/usc-has-a-new-dps-chief/')]

In [331]:
sample_to_prompt['key'].iloc[0]

'/2022/11/03/not-underestimating-cal-is-the-focus-of-wednesdays-practice/__0'

In [333]:
sample_to_prompt = sample_to_prompt.loc[lambda df: df['key'].isin(output_df.loc[lambda df: df['prompt_11__with_quote_False'].isnull()].index)]

In [304]:
# output = list(filter(lambda x: 'with_quote' not in x['k'], output))

In [334]:
# `output` accumulates responses across runs of this cell so an interrupted run can be
# resumed. Create it on first use; the original left the initialisation commented out,
# which raises NameError on a fresh kernel.
if 'output' not in globals():
    output = []
for _, row in tqdm(sample_to_prompt.iterrows(), total=len(sample_to_prompt)):
    # One API call per (sentence, schema size) prompt.
    for k, p in row[prompt_cols].items():
        output.append({
            'key': row['key'],
            'k': f'{k}__with_quote_{USE_QUOTES}',
            'response': call_openai_completions(p)
        })

  0%|          | 0/29 [00:00<?, ?it/s]

In [362]:
output_df = pd.DataFrame(output)
# `output` is appended to across runs; keep only the newest response per (key, prompt),
# because pivot raises ValueError on duplicate index/column pairs.
output_df = output_df.drop_duplicates(subset=['key', 'k'], keep='last')
output_df = output_df.pivot(index='key', columns='k', values='response')#[prompt_cols]

# Order the response columns: every variant other than with_quote_False first,
# then the with_quote_False ones.
prompt_response_cols = [c for c in output_df.columns if any(p in c for p in prompt_cols)]
prompt_cols_w_wo_quote = (
    [c for c in prompt_response_cols if 'with_quote_False' not in c] +
    [c for c in prompt_response_cols if 'with_quote_False' in c]
)
output_df = output_df[prompt_cols_w_wo_quote]
output_df = output_df.reset_index()
# rsplit with n=1 always yields exactly two parts, even if the URL itself contains '__'.
output_df[['url', 'sent_idx']] = output_df['key'].str.rsplit('__', n=1, expand=True)
output_df['sent_idx']= output_df['sent_idx'].astype(int)
output_df= output_df.sort_values(['url', 'sent_idx'])
# Join key stated explicitly instead of relying on whichever columns happen to overlap.
output_df = output_df.merge(data_to_label_with_text[['key', 'sentences']], on='key')
# NOTE(review): from here on `output_df` has a positional index and no 'key' column, while
# the cells that filter with `output_df.index` expect the key-indexed pivot.
output_df = output_df.drop(columns='key').reset_index(drop=True)

In [363]:
output_df.to_csv('cache/2024-11-05__prompts-to-evaluate.csv')

In [400]:
# Imported here because this cell comes before the one that imports pyperclip,
# so a top-to-bottom run would otherwise raise NameError.
import pyperclip

# Copy the raw labels of one article, in sentence order, to the clipboard as
# tab-separated text (the label is split on ':' into separate columns).
pyperclip.copy(text_df[['key', 'discourse_label']]
 .assign(url=lambda df: df['key'].str.split('__').str.get(0))
 .assign(sent_idx=lambda df: df['key'].str.split('__').str.get(1).astype(int))
 .sort_values(['url', 'sent_idx'])
 .loc[lambda df: df['url'] == '/2023/02/15/rhode-island-school-of-design-omits-itself-from-us-news-world-report-rankings/']
 ['discourse_label']
 .str.replace('"', '')
 .str.strip()
 .str.split(':', expand=True)
 .to_csv(index=False, sep='\t')
)

In [370]:
idx_to_focus_on

20

In [375]:
# Keyword/definition table for the 13-label schema.
label_df = (
    pd.Series(all_cluster_summaries[13])
         .str.get(0)
         # n=1: split at the first colon only, so definitions containing ':' are not
         # spread over extra columns and truncated.
         .str.split(':', n=1, expand=True)
         .rename(columns={0:'keyword', 1: 'definition'})
         .assign(keyword=lambda df: df['keyword'].str.strip('```').str.replace('"', ''))
)

In [376]:
import pyperclip
pyperclip.copy(label_df.to_csv(sep='\t'))

# Try with the results we have

In [113]:
example_df

Unnamed: 0,key,sentences,discourse_label,cluster_20,keyword,definition
0,/2023/02/09/the-march-toward-march-red-hot-ind...,The March Toward March: Red-hot Indiana may re...,"\n\n""Title"": This sentence serves as the titl...",9,Introduction,This sentence introduces the main topic of th...
1,/2023/02/09/the-march-toward-march-red-hot-ind...,\n\nIndiana forward Trayce Jackson-Davis (23) ...,"\n\n""Error"": This sentence was misparsed from...",12,Introduction,Sentences that introduce and provide context ...
2,/2023/02/09/the-march-toward-march-red-hot-ind...,\n\nIf you look up the word “underachieving” i...,"\n\n""Introduction"": This sentence introduces ...",9,Introduction,This sentence introduces the main topic of th...
3,/2023/02/09/the-march-toward-march-red-hot-ind...,"The Hoosiers, a ‘blue blood’ of college basket...","\n\n""Background Information"": This sentence p...",19,Background Information,This label encompasses sentences that provide...
4,/2023/02/09/the-march-toward-march-red-hot-ind...,Last year in their appearance at the Big Dance...,"\n\n""Background Information"": This sentence p...",19,Background Information,This label encompasses sentences that provide...
...,...,...,...,...,...,...
44905,/2023/02/09/organized-collective-of-students-f...,\n\nThe occupational therapy professional fiel...,"\n\n""Background Information"": This sentence p...",19,Background Information,This label encompasses sentences that provide...
44906,/2023/02/09/organized-collective-of-students-f...,\n\n“I have grandparents who have been mistrea...,"\n\n""Personal Motivation"": This sentence prov...",11,Personal Reflection,This label encompasses sentences that provide...
44907,/2023/02/09/organized-collective-of-students-f...,"Clark said. ""","\n\n""Error"": This sentence was misparsed from...",12,Introduction,Sentences that introduce and provide context ...
44908,/2023/02/09/organized-collective-of-students-f...,That’s why I went into the field in the first ...,"\n\n""Personal Motivation"": This sentence expl...",11,Personal Reflection,This label encompasses sentences that provide...
