In [1]:
import pandas as pd 
import json

In [2]:
# Load the train/test query files. Each file is opened in a `with` block so the
# handle is closed deterministically (the bare `json.load(open(...))` form leaks it),
# and the encoding is pinned to UTF-8 so decoding does not depend on the platform default.
with open('../data/v2_queries/query_train_prompt1_v2.json', encoding='utf-8') as train_file:
    train_queries = json.load(train_file)
with open('../data/v2_queries/query_test_prompt1_v2.json', encoding='utf-8') as test_file:
    test_queries = json.load(test_file)

# Tag every record with the split it came from before stacking the two frames.
train_query_df = pd.DataFrame(train_queries)
train_query_df['split'] = 'train'
test_query_df = pd.DataFrame(test_queries)
test_query_df['split'] = 'test'

# NOTE: the original row labels of each frame are kept (no ignore_index), so the
# combined index may contain repeated labels.
full_query_df = pd.concat([train_query_df, test_query_df])

In [3]:
# Load the parsed narrative annotations for both splits. `with` guarantees the file
# handles are closed, and UTF-8 is requested explicitly rather than relying on the
# platform default encoding.
with open('../data/v2_narr_parsed/v2_test_set_narr.json', encoding='utf-8') as test_narr_file:
    test_narr = json.load(test_narr_file)
with open('../data/v2_narr_parsed/v2_train_set_narr.json', encoding='utf-8') as train_narr_file:
    train_narr = json.load(train_narr_file)

In [4]:
# Stack the train and test narrative records into one frame.
narr_df = pd.concat([pd.DataFrame(train_narr), pd.DataFrame(test_narr)])

# One row per (article, source): explode the `sources` list, drop rows containing
# any missing value, renumber the rows, then spread each source record into its own
# columns next to the article url.
source_narr_df = (
    narr_df
    .explode('sources')
    .dropna()
    .reset_index(drop=True)
    .pipe(
        lambda exploded: pd.concat(
            [exploded[['url']], pd.DataFrame(exploded['sources'].tolist())],
            axis=1,
        )
    )
)

In [5]:
# Peek at the first flattened source record as a plain dict to check the column layout.
source_narr_df.iloc[0].to_dict()

{'url': 'www.fox61.com/article/news/nation-world/steps-to-limit-info-facebook-collects-about-you-and-shares-to-advertisers/507-07ea653e-e06c-4da3-935f-84a315fa1f68',
 'Name': 'Jonathan S. Weissman',
 'Original name': ['Jonathan S. Weissman'],
 'Narrative function': "Provides expert opinion on the risks of Facebook's data collection practices.",
 'Perspective': 'Skeptical',
 'Centrality': 'Medium',
 'Justification': "Weissman's expert opinion provides a critical perspective on Facebook's data collection practices, highlighting the risks to users' privacy. His perspective is skeptical because he expresses concerns about the potential harm of Facebook's practices."}

In [None]:
# Load the cached per-source table with its cluster description columns; the first
# CSV column is used as the row index. Later cells depend on this variable, so this
# cell must be run before them on a fresh kernel.
source_df_with_clusters = pd.read_csv('cache/2024-09-19__source-df-with-all-clusters.csv', index_col=0)

# Old Work

In [9]:
# Show one example source (as a dict) from the second distinct leaf-1 cluster
# description, after de-duplicating on the narrative-function text.
(
    source_df_with_clusters
    .pipe(
        lambda frame: frame[
            frame['cluster_descript_leaf_1'].eq(frame['cluster_descript_leaf_1'].unique()[1])
        ]
    )
    .drop_duplicates(subset='Narrative Function')
    .iloc[0]
    .to_dict()
)

{'url': 'www.theatlantic.com/sponsored/qualcomm-2016/the-space-within/768/',
 'Name': 'Researchers',
 'Original Name': 'researchers',
 'Narrative Function': '"Authority": This source is used to provide information about the Framingham Heart Study and its findings.',
 'cluster_descript_leaf_1': '"Expert Endorsement": These sources are used to provide authoritative and credible information, data, and guidance on various aspects of the COVID-19 pandemic, including vaccines, testing, public health guidelines, and pandemic control measures.',
 'cluster_descript_leaf_2': '"Authoritative Source": These sources provide credible, expert-backed information and validations that lend legitimacy and reliability across various topics, enhancing the trust and accuracy of the articles they support.',
 'cluster_descript_leaf_3': '"Credibility Anchor": These sources provide verified, expert-backed information that enhances the trustworthiness and reliability of the content they support.',
 'cluster_desc

In [7]:
# Frequency of each leaf-4 cluster description across all sources.
source_df_with_clusters['cluster_descript_leaf_4'].value_counts()  # append .head(2) to preview only the top entries

cluster_descript_leaf_4
"Core Narrative Sources": These sources provide key insights, credibility, context, and diverse perspectives crucial for developing, substantiating, and advancing the main narrative of the article.                                                                                           148915
"Extensive Context"\n\n"These sources provide in-depth background, integral details, and thorough narrative foundations that enhance the understanding of various topics and events, ensuring comprehensive insight."                                                                            70762
"Insightful Narratives": These sources offer detailed, informative, and contextual explanations with comprehensive data and innovative solutions, enhancing reader understanding and decision-making across various subjects.                                                                    55396
"Nuanced Discourse": These sources offer comparative analysis, opposing viewpoints, and cri

In [124]:
# source_narr_df[['url', 'Name', 'Perspective', 'Centrality']].merge(source_df_with_clusters, on=['url', 'Name'])

In [9]:
# Attach each article's query and train/test split to every source row.
# The join key and join type are spelled out instead of relying on pandas' default
# of joining on every column name the two frames happen to share: if either frame
# later gains a same-named column, an implicit merge would silently change its keys.
# Sources whose url has no query are dropped (inner join).
source_df_with_clusters_and_queries = source_df_with_clusters.merge(
    full_query_df,
    on='url',
    how='inner',
)

In [15]:
# Inspect the first merged row to confirm the query and split columns were attached.
source_df_with_clusters_and_queries.iloc[0]

url                        blog.cleveland.com/metro/2011/01/light_snow_ca...
Name                                                                  Police
Original Name                                                 Police, police
Narrative Function                                            Primary Source
cluster_descript_leaf_1                                       Primary Source
cluster_descript_leaf_2                                       Primary Source
cluster_descript_leaf_3                             Primary Narrative Source
cluster_descript_leaf_4                               Core Narrative Sources
cluster_descript_leaf_5                                  Credible Narratives
query                      What is the impact of a light snowfall on traf...
split                                                                   test
Name: 0, dtype: object

In [10]:
# Label columns whose values look like `"Title": long description` (or a title
# followed by a blank line and a description); only the bare title is kept.
to_process = [
    'Narrative Function',
    'cluster_descript_leaf_1',
    'cluster_descript_leaf_2',
    'cluster_descript_leaf_3',
    'cluster_descript_leaf_4',
    'cluster_descript_leaf_5',
]

# Per column: keep the text before the first ':', then before the first blank line,
# trim surrounding whitespace, and finally remove the double quotes.
source_df_with_clusters_and_queries[to_process] = source_df_with_clusters_and_queries[to_process].apply(
    lambda labels: (
        labels
        .str.split(':').str.get(0)
        .str.split('\n\n').str.get(0)
        .str.strip()
        .str.replace('"', '')
    )
)

In [11]:
# Per-article histogram of the broadest (leaf-5) source category: one row per url,
# one column per category, each cell the number of sources with that label
# (NaN where an article has no source of that category).
broad_clusters_categories = (
    source_df_with_clusters_and_queries['cluster_descript_leaf_5']
    .groupby(source_df_with_clusters_and_queries['url'])
    .value_counts()
    .unstack()
)

In [None]:
## classification problem:

## input: initial query — initial summary of the story 
## predict: multilabel output of the different source-categories


## process —
##    start with your query
##    the planner tells you what KINDS of sources you need
##    execute query 1
##       you determine that it filled the "Central Figure" source category
##       the planner tells you that you still need an "Enriched Narratives" source
##    this helps you formulate/execute a better follow-up (interleaved) query
## -> can we use an HMM or a graph neural network to predict this?

## alternative: sample directly from the distribution of discourse labels?
##    slightly more advanced: use k-means to group stories into 5-6 clusters of story types,
##    then try to assign each story to one of those clusters

In [192]:
# Show the first training query record (url, query, split) as a plain dict.
train_query_df.iloc[0].to_dict()

{'url': 'www.vice.com/en/article/jg8743/facebook-spending-dollar50m-rese-to-not-ruin-metaverse-like-it-ruined-the-real-world',
 'query': 'Can a company like Facebook, with a history of contributing to real-world problems, be trusted to responsibly develop a virtual world like the metaverse?',
 'split': 'train'}

In [190]:
# Rank articles by how many of their sources are labelled 'Enriched Narratives', highest first.
broad_clusters_categories.sort_values('Enriched Narratives', ascending=False)

cluster_descript_leaf_5,Central Figure,Contextual Narratives,Credible Narratives,Critical Analysis,Enriched Narratives,User Interaction
url,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
www.fox23.com/news/trending/teacher-appreciation-week-2021-deals-freebies-educators/SOQPCHBO3JAC3MXNVHGY3IFYHM/,,,2.0,,24.0,
www.chron.com/life/pets/article/New-website-guesses-what-dog-breed-you-are-6842859.php,,,,,24.0,
www.usatoday.com/story/travel/flights/todayinthesky/2017/09/11/irma-snarling-flights-five-major-airline-hubs-all-once/653483001/,,,1.0,,22.0,
www.theguardian.com/technology/2014/feb/12/10-things-to-know-about-lifelogging,,,2.0,,21.0,
www.cbsnews.com/miami/news/a-guide-to-veterans-day-2020-deals-and-freebies/,,,1.0,,21.0,
...,...,...,...,...,...,...
wwwrm1.ansa.it/pressrelease/canale_salute_benessere/index.shtml,,,1.0,,,
y108.cbslocal.com/2014/07/23/luke-bryan-to-honor-u-s-troops-with-free-tickets-to-chicago-concert/,1.0,,1.0,,,
youpix.virgula.uol.com.br/app/os-114-aplicativos-mais-uteis-pra-voce-baixar-agora/,1.0,1.0,11.0,1.0,,
yourvoicematters.news.blog/,,,10.0,,,


In [175]:
# Build the multilabel classification table: binarise the per-article category counts
# (1.0 if the article has at least one source of that category, else 0.0) and attach
# the article's query and split by matching the url index against full_query_df['url'].
training_data = (
    broad_clusters_categories
    .fillna(0)
    .gt(0)
    .astype(float)
    .merge(full_query_df, how='left', left_index=True, right_on='url')
)
# Alternative kept for reference (not applied): row-normalise the counts instead of binarising them —
# .pipe(lambda df: df.divide(df.sum(axis=1), axis=0))#.sort_values('Contextual Narratives', ascending=False))

# Train BERT Classifier

In [182]:
# Persist the multilabel training table to CSV; the DataFrame index is written
# as the first, unnamed column (pandas default).
training_data.to_csv('../narrative_function/bert_training_data.csv')