## Imports and Reading Data

In [1]:
import sys
import os
# Working directory of the running kernel; the paths below are built relative to it.
cwd = os.getcwd()
print(cwd)
# Put the sibling "narratives" directory at the front of the module search path.
nwd = os.path.join(cwd, "..", "narratives")
sys.path.insert(0, nwd)

C:\narrative-maps\tests


In [2]:
import pandas as pd
import numpy as np
import re
import networkx as nx
import itertools  
from math import log, exp, sqrt
import matplotlib.pyplot as plt
from ast import literal_eval
import json
import plotly.express as px
import umap
import hdbscan
from sklearn.metrics.pairwise import cosine_similarity
from math import log, exp, pi, sqrt, ceil
from scipy.spatial import distance

## Reading Data

In [3]:
def clean_publication(graph_df):
    """Replace the values of ``graph_df['publication']`` with short source codes.

    Each value is looked up in a fixed table of host names / outlet names;
    values that are not in the table (including missing values) become
    ``'default'``.

    Args:
        graph_df: DataFrame with a ``publication`` column. The column is
            overwritten in place.

    Returns:
        The same DataFrame object that was passed in.
    """
    short_codes = {
        'www.bbc.com': 'bbc',
        'www.aljazeera.com': 'ajz',
        'www.nytimes.com': 'nyt',
        'www.theguardian.com': 'tgn',
        'edition.cnn.com': 'cnn',
        'www.cnn.com': 'cnn',
        'Reuters': 'reu',
        'www.reuters.com': 'reu',
        'Business Insider': 'bin',
        'CNN': 'cnn',
        'The Hill': 'hill',
        'The New York Times': 'nyt',
        'CNBC': 'cnbc',
        'Breitbart': 'brb',
        'www.breitbart.com': 'brb',
        'Fox News': 'fox',
        'www.foxnews.com': 'fox',
        'CIA': 'cia',
        'FBI': 'fbi',
        'Army CID': 'cid',
        'INS': 'ins',
        'Sanctioned Intercepts': 'sai',
        'NSA': 'nsa',
        'abcnews.go.com': 'abc',
        'apnews.com': 'apn',
        'www.firstpost.com': 'fpost',
        'theconversation.com': 'tcon',
        'nypost.com': 'nyp',
        'newscomworld.com': 'nwc',
        'havanatimes.org': 'hvt',
        'www.nbcnews.com': 'nbc',
        'www.local10.com': 'l10',
        'www.trtworld.com': 'trt',
        'www.washingtonpost.com': 'wapo',
        'www.xinhuanet.com': 'xinhua',
    }
    # Unknown publications map to NaN first, then collapse to 'default'.
    looked_up = graph_df['publication'].map(short_codes)
    graph_df['publication'] = looked_up.fillna('default')
    return graph_df

In [4]:
def read_query(dataset, start_date=None, end_date=None, partial=False):
    """Load ``<dataset>.csv`` from the ``data`` directory next to the working directory.

    The file is read with pandas, its ``publication`` column is normalised by
    ``clean_publication``, its ``date`` column is parsed to datetimes, rows are
    optionally filtered by date, and an ``id`` column holding the row position
    as a string is added.

    Args:
        dataset: Name of the CSV file without the ``.csv`` extension.
        start_date: Optional inclusive lower bound, formatted ``YYYY-MM-DD``.
        end_date: Optional inclusive upper bound, formatted ``YYYY-MM-DD``.
        partial: When True, the ``embed`` column is not loaded and no embedding
            parsing is performed.

    Returns:
        pandas.DataFrame: The loaded rows with a fresh 0..n-1 index. The index
        the rows had before the reset is kept in an ``index`` column.

    Raises:
        ValueError / SyntaxError: from ``literal_eval`` if an embedding cell
            cannot be parsed as a Python literal.
    """
    data_file = str(dataset) + '.csv'
    data_file_path = os.path.join(cwd, "..", "data", data_file)
    if partial:
        # A set gives an exact column-name match. The previous
        # `x not in 'embed'` was a substring test on a string and would also
        # drop columns named e.g. 'e', 'em' or 'bed'.
        columns_to_skip = {'embed'}
        query = pd.read_csv(data_file_path, usecols=lambda col: col not in columns_to_skip)
    else:
        query = pd.read_csv(data_file_path)
    query = clean_publication(query)

    # The datetime format is inferred by pandas (this might bring issues
    # depending on the data!), e.g. 8/22/2019 12:15. The deprecated
    # `infer_datetime_format` flag is no longer passed.
    query['date'] = pd.to_datetime(query['date'])

    if start_date is not None:
        query = query[query['date'] >= pd.to_datetime(start_date, format='%Y-%m-%d')]

    if end_date is not None:
        query = query[query['date'] <= pd.to_datetime(end_date, format='%Y-%m-%d')]

    if not partial:
        # Rewrite the stored text into a Python list literal: runs of spaces
        # become commas, then a stray comma right after '[' or right before ']'
        # is removed. Presumably the CSV holds space-separated array dumps --
        # TODO confirm against the data files.
        embed_text = (
            query['embed']
            .replace(r'( )+', ',', regex=True)
            .replace(r'\[,', '[', regex=True)
            .replace(r',]', ']', regex=True)
        )
        # `assign` builds a new frame instead of writing into a filtered slice.
        query = query.assign(
            embed=embed_text.map(lambda text: np.array(literal_eval(text)))
        )

        if 'cluster_vec' in query.columns:  # Predefined clusters!
            query = query.assign(
                cluster_vec=query['cluster_vec'].map(lambda text: np.array(literal_eval(text)))
            )

    # Keep the pre-filter row labels in an 'index' column, as before.
    query = query.reset_index()
    query['id'] = [str(position) for position in query.index]

    return query

In [5]:
dataset = "cv"
query = read_query(dataset)

In [6]:
query

Unnamed: 0,index,id,title,url,date,publication,full_text,embed
0,0,0,China pneumonia outbreak: Mystery virus probed...,https://www.bbc.com/news/world-asia-china-5098...,2020-01-03,bbc,Image copyright Getty Images Image caption The...,"[-0.0233086105, 0.0509259216, 0.00989757478, 0..."
1,1,1,China pneumonia: Sars ruled out as dozens fall...,https://www.bbc.com/news/world-asia-china-5100...,2020-01-05,bbc,Image copyright Getty Images Image caption Hon...,"[0.015340873, 0.0619768389, 0.0247640405, 0.05..."
2,2,2,China mystery illness: travellers checked as o...,https://www.theguardian.com/world/2020/jan/07/...,2020-01-07,tgn,Pneumonia-like illness in Wuhan is unclear in ...,"[0.0163851921, 0.035141822, 0.0828021616, 0.09..."
3,3,3,China reports first death from mysterious outb...,https://www.aljazeera.com/news/2020/01/china-r...,2020-01-10,ajz,A 61-year-old man has died from pneumonia in t...,"[-0.0423171073, 0.0609870628, 0.00887921918, 0..."
4,4,4,Japan confirms first case of coronavirus infec...,https://www.aljazeera.com/news/2020/01/japan-c...,2020-01-15,ajz,Japan has confirmed the first case of infectio...,"[-0.00389743759, 0.0557561405, 0.0307840351, 0..."
5,5,5,Coronavirus: more cases and second death repor...,https://www.theguardian.com/world/2020/jan/17/...,2020-01-17,tgn,More cases of coronavirus have been confirmed ...,"[-0.00912287459, 0.0402604192, 0.00620971294, ..."
6,6,6,CDC to screen at three US airports for signs o...,https://edition.cnn.com/2020/01/17/health/wuha...,2020-01-17,cnn,(CNN) More than 100 staffers from the US Cente...,"[0.0704639703, -0.0273922719, -0.00230043917, ..."
7,7,7,Vaccine for new Chinese coronavirus in the works,https://edition.cnn.com/2020/01/20/health/coro...,2020-01-20,cnn,(CNN) The National Institutes of Health is wor...,"[-0.00837997813, 0.00753747113, 0.0238223914, ..."
8,8,8,China confirms human-to-human transmission of ...,https://www.aljazeera.com/news/2020/01/china-c...,2020-01-20,ajz,Human-to-human transmission of a new coronavir...,"[-0.0341011547, 0.0514796823, 0.0138446391, 0...."
9,9,9,New China virus: Cases triple as infection spr...,https://www.bbc.com/news/world-asia-china-5117...,2020-01-20,bbc,Media playback is unsupported on your device M...,"[-0.0224215426, 0.002624318, 0.010948102, 0.01..."


## Defining Chat

In [7]:
# Chat
import os
import openai

In [8]:
# Chatbot setup - Source: https://github.com/plotly/dash-sample-apps/blob/main/apps/dash-gpt3-chatbot/app.py
# Authentication: the key is taken from the OPENAI_API_KEY environment variable.
openai.api_key = os.getenv("OPENAI_API_KEY")
# Never print the key itself: cell output is saved with the notebook and would
# leak the secret. Report only whether a key was found.
print("OPENAI_API_KEY loaded:", bool(openai.api_key))
# Description: used as the system message of every chat request.
description = """
You are an AI model designed to assist intelligence analysts, journalists, and fact checkers in narrative sensemaking tasks.
You will be handling a model called narrative maps.
You will be helping users analyze the events and stories contained in the data.
The users will provide you with specific queries about storylines, events, or connections between events.
"""

[REDACTED: API key removed from the saved cell output - revoke and rotate this key]


## Generating Narrative Maps with ChatGPT (Ex 1)

In [9]:
def gpt(model_input):
    """Send ``model_input`` to the chat model and return the text of its reply.

    The request uses the module-level ``description`` as the system message and
    ``model_input`` as the single user message.

    Args:
        model_input: Prompt text for the user message.

    Returns:
        The ``content`` of the message in the first returned choice.
    """
    conversation = [
        {"role": "system", "content": description},
        {'role': 'user', 'content': model_input},
    ]
    completion = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=conversation,
        max_tokens=2000,
        temperature=0.9,
    )
    first_choice = completion['choices'][0]
    return first_choice['message']['content']

In [10]:
# Build two texts from the rows of `query`:
#   event_list - one "<index>: <date> - <title> " line per row (fed to the prompts)
#   gv_format  - one '<index> [label = "<title>"]' line per row
event_lines = []
label_lines = []
for index, row in query.iterrows():
    title = row['title']
    event_lines.append(str(index) + ": " + str(row['date']) + " - " + title + " \n")
    label_lines.append(str(index) + ' [label = "' + title + '"]\n')
event_list = "".join(event_lines)
gv_format = "".join(label_lines)
print(gv_format)

0 [label = "China pneumonia outbreak: Mystery virus probed in Wuhan"]
1 [label = "China pneumonia: Sars ruled out as dozens fall ill in Wuhan"]
2 [label = "China mystery illness: travellers checked as officials fear lunar new year could spread bug"]
3 [label = "China reports first death from mysterious outbreak in Wuhan"]
4 [label = "Japan confirms first case of coronavirus infection"]
5 [label = "Coronavirus: more cases and second death reported in China"]
6 [label = "CDC to screen at three US airports for signs of new virus from China"]
7 [label = "Vaccine for new Chinese coronavirus in the works"]
8 [label = "China confirms human-to-human transmission of new coronavirus"]
9 [label = "New China virus: Cases triple as infection spreads to Beijing and Shanghai"]
10 [label = "Coronavirus: health officials announce first known US case"]
11 [label = "The Test a Deadly Coronavirus Outbreak Poses to China's Leadership"]
13 [label = "Coronavirus: China advises against travel to Wuhan as deat

In [14]:
# Prompt asking for an (X, Y, cluster) triple per document.
# The leading spaces on the continuation lines are inside the string literal,
# so they are part of the text that is sent.
prompt = """Your goal is to generate a two dimensional projection of documents for visualization purposes.
            You will be given a series of text documents.
            You must assign documents to storylines of a narrative.
            Your output must contain a list of all the documents, their coordinates X and Y, and their cluster assignment in the format (X,Y,C).
            Here is the list of documents:
         """
# Append the event listing built earlier in the notebook.
prompt += event_list

In [12]:
response = gpt(prompt)

In [13]:
print(response)

Based on the documents provided, I have identified two main storylines:
1. Outbreak of a new virus in Wuhan, China
2. Global spread and impact of the virus

Here is the list of documents with their assigned coordinates (X,Y) and storyline cluster (C) in the format (X,Y,C):
    
0: (-4,-2,1)
1: (-3,-1,1)
2: (-2,-3,1)
3: (-1,-2,1)
4: (2,-4,2)
5: (1,-3,2)
6: (0,-2,2)
7: (1,-1,1)
8: (0,-1,1)
9: (2,-1,1)
10: (3,2,2)
11: (2,1,2)
12: (3,1,1)
13: (1,0,1)
14: (0,0,1)
15: (-1,3,1)
16: (0,3,1)
17: (1,3,1)
18: (2,3,1)
19: (3,-3,2)
20: (4,-2,2)
21: (4,-1,2)
22: (4,0,2)
23: (3,3,2)
24: (4,2,2)
25: (5,-1,1)
26: (6,-3,2)
27: (5,-2,2)
28: (5,-1,2)
29: (7,-2,2)
30: (7,-1,2)
31: (6,-1,2)
32: (8,-3,2)
33: (3,-4,1)
34: (8,-2,2)
35: (8,-1,2)
36: (9,-2,2)
37: (9,-1,2)
38: (10,-2,2)
39: (10,-1,2)

Note: X and Y coordinates are arbitrary and do not hold any specific meaning. The clusters are assigned based on the identified storylines.


In [197]:
# Prompt asking for a narrative map with a fixed main-storyline length of 4.
# The leading spaces on the continuation lines are inside the string literal,
# so they are part of the text that is sent.
prompt = """Your goal is to generate a narrative map.
            A narrative map is defined as a directed acyclic graph where nodes are events and edges represent connections between events.
            Each document represents an event of the narrative.
            You will be given a series of text documents.
            Not all documents are relevant, you may discard as many as you want and only keep the important ones.
            Provide names for the different storylines of your narrative map.
            Select one story to be the main storyline.
            The main storyline should have a length of 4.
            The other storylines should be of similar length.
            Your output must contain a list of all the edges of the narrative map in the form X->Y, where X and Y represent the number of the event.
            Here is the list of documents:
         """
# Append the event listing built earlier in the notebook.
prompt += event_list

In [28]:
response = gpt(prompt)

In [29]:
print(response) # This is good.

I have identified the following events as important for constructing the narrative map:

0, 1, 3, 5, 8, 10, 18, 20, 23, 25, 27, 28, 29, 31, 32, 35, 38, 39

I have grouped these events into three storylines:

Storyline 1: Outbreak in Wuhan
0: 2020-01-03 00:00:00 - China pneumonia outbreak: Mystery virus probed in Wuhan 
1: 2020-01-05 00:00:00 - China pneumonia: Sars ruled out as dozens fall ill in Wuhan 
3: 2020-01-10 00:00:00 - China reports first death from mysterious outbreak in Wuhan 
5: 2020-01-17 00:00:00 - Coronavirus: more cases and second death reported in China 
8: 2020-01-20 00:00:00 - China confirms human-to-human transmission of new coronavirus 
18: 2020-01-23 00:00:00 - Coronavirus: panic and anger in Wuhan as China orders city into lockdown 
20: 2020-01-24 00:00:00 - China expands coronavirus outbreak lockdown to 56 million people 
23: 2020-01-25 00:00:00 - China's Omnivorous Markets Are in the Eye of a Lethal Outbreak Once Again 
25: 2020-01-26 00:00:00 - China says coro

In [34]:
def narrative_map_prompt(K, mincover, event_list):
    """Build the narrative-map prompt for a main storyline of length ``K``.

    Args:
        K: Value inserted as the requested main-storyline length.
        mincover: Value inserted in front of ``%`` as the minimum topic coverage.
        event_list: Text listing the documents; appended unchanged after the
            instructions.

    Returns:
        str: The instruction text followed by ``event_list``.
    """
    instruction_lines = [
        "Your goal is to generate a narrative map.",
        "A narrative map is defined as a directed acyclic graph where nodes are events and edges represent connections between events.",
        "Each document represents an event of the narrative.",
        "You will be given a series of text documents.",
        "Not all documents are relevant, you may discard as many as you want and only keep the important ones.",
        "Provide names for the different storylines of your narrative map.",
        "Select one story to be the main storyline.",
        f"The main storyline should have a length of {K}.",
        "The other storylines should be of similar length.",
        f"You should cover at least {mincover}% of the topics in the data set.",
        "Your output must contain a list of all the edges of the narrative map in the form X->Y, where X and Y represent the number of the event.",
        "Here is the list of documents:",
    ]
    # Layout of the emitted text: every line after the first is indented by 12
    # spaces, and the instructions end with a newline plus 9 spaces.
    line_break = "\n" + " " * 12
    header = line_break.join(instruction_lines) + "\n" + " " * 9
    return header + event_list

### Generating maps with different parameters

In [38]:
prompt = narrative_map_prompt(K=4, mincover=50, event_list=event_list)
response = gpt(prompt)

In [39]:
print(response)

I have analyzed the given documents and identified the following storylines:

1. Outbreak in Wuhan
2. Global spread of the virus
3. Response and measures to control the outbreak
4. Economic and social impact of the outbreak

I have selected "Outbreak in Wuhan" as the main storyline, which includes the following events:

0->1->3->8

The other storylines are of similar length and cover at least 50% of the topics in the data set. The edges for these storylines are:

Global spread of the virus:
4->5->10->11

Response and measures to control the outbreak:
6->7->9->12

Economic and social impact of the outbreak:
23->24->27->31

Note: The numbering of the events corresponds to their order in the given set of documents.


In [40]:
prompt = narrative_map_prompt(K=8, mincover=80, event_list=event_list)
response = gpt(prompt)

In [41]:
print(response) # Issues of this one, has duplicate edges.

Main storyline: 

0->1->3->8->16->17->18->20

Name: The Outbreak in Wuhan

Other storylines:

1) 5->8->18->20->28

Name: Global Impact

2) 9->10->11->12->21->22->23

Name: Calls for Transparency

3) 2->4->14->15->16->17


Edges of the narrative map:

0->1
1->3
3->8
8->16
16->17
17->18
18->20
5->8
8->18
18->20
20->28
9->10
10->11
11->12
12->21
21->22
22->23
2->4
4->14
14->15
15->16
16->17


In [42]:
prompt = narrative_map_prompt(K=5, mincover=10, event_list=event_list)
response = gpt(prompt)

In [43]:
print(response) # Wrong size.

After analyzing the documents, we have identified two major storylines: 

1. The emergence and spread of the coronavirus outbreak in China 
2. The global impact and response to the outbreak 

We have selected the first storyline as the main storyline, due to the higher number of relevant documents and events. 

Here is the narrative map with the list of edges: 

Main Storyline: Emergence and spread of coronavirus outbreak in China 

0->1
1->2
2->3
3->8
8->9
9->10
10->11
11->12
12->13
13->14
14->15
15->16
16->17
17->18
18->20
20->22
22->23
23->24
24->25
25->32
32->33
33->35
35->36
36->38
38->39

Secondary Storyline: Global impact and response to the outbreak

4->5
5->6
6->7
7->29
29->30
30->31
31->32
32->37
37->38
38->39


In [44]:
prompt = narrative_map_prompt(K=3, mincover=0, event_list=event_list) # No cover?
response = gpt(prompt)

In [45]:
print(response) # No inter story connections.

Main storyline: 

1. 0->1->2->3: China pneumonia outbreak in Wuhan is probed as dozens fall ill; officials fear lunar new year could spread the bug; China reports the first death from the mysterious outbreak in Wuhan. 

Other storylines: 

1. 4->5->8: Japan confirms first case of coronavirus; more cases and the second death were reported in China; China confirms the human-to-human transmission of new coronavirus. 
2. 6->10->11->15: CDC screens at three US airports for signs of new virus from China; health officials announce the first known US case; Deadly Coronavirus outbreak poses a test to China's Leadership; Chinese state media downplays coronavirus as Xi strikes a positive tone. 
4. 16->17->18->20->25: Lockdown measures rise across Hubei province; panic and anger in Wuhan as China orders city into lockdown; ten cities locked down and Beijing festivities scrapped; China expands the coronavirus outbreak lockdown to 56 million people; China says coronavirus can spread before symptoms 

### Generating maps with inter story and single start

In [49]:
def narrative_map_prompt_interstory(K, mincover, start, event_list):
    """Build the narrative-map prompt that also asks for a start event and cross-storyline edges.

    Args:
        K: Value inserted as the requested main-storyline length.
        mincover: Value inserted in front of ``%`` as the minimum topic coverage.
        start: Value inserted as the event the map must start with.
        event_list: Text listing the documents; appended unchanged after the
            instructions.

    Returns:
        str: The instruction text followed by ``event_list``.
    """
    instruction_lines = [
        "Your goal is to generate a narrative map.",
        "A narrative map is defined as a directed acyclic graph where nodes are events and edges represent connections between events.",
        "Each document represents an event of the narrative.",
        "You will be given a series of text documents.",
        "Not all documents are relevant, you may discard as many as you want and only keep the important ones.",
        "Provide names for the different storylines of your narrative map.",
        "Select one story to be the main storyline.",
        f"The main storyline should have a length of {K}.",
        "The other storylines should be of similar length.",
        f"You should cover at least {mincover}% of the topics in the data set.",
        f"Your narrative map must start with event {start}.",
        "There must be edges that connect the different storylines.",
        "Your output must contain a list of all the edges of the narrative map in the form X->Y, where X and Y represent the number of the event.",
        "Here is the list of documents:",
    ]
    # Layout of the emitted text: every line after the first is indented by 12
    # spaces, and the instructions end with a newline plus 9 spaces.
    line_break = "\n" + " " * 12
    header = line_break.join(instruction_lines) + "\n" + " " * 9
    return header + event_list

In [50]:
### Generating maps with different parameters
prompt = narrative_map_prompt_interstory(K=4, mincover=10, start=0, event_list=event_list)
response = gpt(prompt)

In [51]:
print(response) # No single start!

After analyzing the documents, we have identified three storylines:
- Storyline 1: Outbreak in Wuhan and spread to other countries.
- Storyline 2: Measures taken by China and the impact on citizens.
- Storyline 3: Global response and economic impact.

For our main storyline, we choose "Storyline 1: Outbreak in Wuhan and spread to other countries".

Selected documents:

0: 2020-01-03 00:00:00 - China pneumonia outbreak: Mystery virus probed in Wuhan 
3: 2020-01-10 00:00:00 - China reports first death from mysterious outbreak in Wuhan 
5: 2020-01-17 00:00:00 - Coronavirus: more cases and second death reported in China 
8: 2020-01-20 00:00:00 - China confirms human-to-human transmission of new coronavirus 
10: 2020-01-21 00:00:00 - Coronavirus: health officials announce first known US case 
13: 2020-01-22 00:00:00 - Coronavirus: China advises against travel to Wuhan as deaths surge 
14: 2020-01-22 00:00:00 - Life inside ground zero of Wuhan coronavirus outbreak 
16: 2020-01-23 00:00:00 - 

### Parseable Prompt

In [150]:
def narrative_map_prompt_interstory_topics(K, mincover, start, event_list):
    """Build the narrative-map prompt that requests a sectioned SL / MS / ED answer.

    Args:
        K: Value inserted as the number of events in the main storyline.
        mincover: Value inserted in front of ``%`` as the minimum topic coverage.
        start: Value inserted as the event the map must start with.
        event_list: Text listing the events; appended unchanged after the
            instructions.

    Returns:
        str: The instruction text followed by ``event_list``.
    """
    instruction_lines = [
        "Instructions: Your goal is to generate a narrative map.",
        "A narrative map is defined as a connected Directed Acyclic Graph.",
        "Nodes of this graph are events and edges represent connections between events.",
        "You will be given a series of events to build the narrative map.",
        "Events must be discarded if they are irrelevant.",
        "Events without edges must be discarded.",
        "Organize events into storylines.",
        "Provide names for the different storylines of your narrative map.",
        "Select one story to be the main storyline.",
        "There can be edges between events of the different storylines.",
        "Consider the following constraints:",
        f"The main storyline must contain {K} events.",
        "Side stories should not be too large.",
        "If two storylines are sequential, they must be merged.",
        f"You should cover at least {mincover}% of the topics in the data set.",
        f"Your narrative map must start with event {start}.",
        "Edges must be in chronological order. There can be no cycles.",
        "Output format: Your output must contain the following sections and nothing else: SL, MS, and ED.",
        "Everything should be separated by line breaks.",
        # The trailing space on the next line is part of the original prompt text.
        "SL is a list of the identified storylines, with the associated event indices in curly braces. ",
        "Storyline names start with the identifier 'subgraph cluster_' followed by the number of the storyline.",
        "MS identifies the main storyline from the previous list.",
        "ED is a list of all the edges of the narrative map in the form X->Y, where X and Y represent the number of the event.",
        "Here is the list of events:",
    ]
    # Layout of the emitted text: every line after the first is indented by 12
    # spaces, and the instructions end with a newline plus 9 spaces.
    line_break = "\n" + " " * 12
    header = line_break.join(instruction_lines) + "\n" + " " * 9
    return header + event_list

In [151]:
prompt = narrative_map_prompt_interstory_topics(K=6, mincover=50, start=0, event_list=event_list)

In [152]:
response = gpt(prompt)

In [153]:
print(response) # This one is pretty good!

SL: 
subgraph cluster_0 {0}
subgraph cluster_1 {1,2,3}
subgraph cluster_2 {4,5}
subgraph cluster_3 {6,7,8,9,10}
subgraph cluster_4 {11,12,13,14}
subgraph cluster_5 {15,16,17,18}
subgraph cluster_6 {19,20,21,22,23,24}
subgraph cluster_7 {25,26,27,28,29,30,31,32,33,34,35,36}
subgraph cluster_8 {37,38,39}

MS: subgraph cluster_1

ED:
0->1
1->2
2->3
1->4
4->5
0->6
6->7
7->8
8->9
9->10
2->11
11->12
12->13
13->14
3->15
15->16
16->17
17->18
5->18
18->19
19->20
20->21
21->22
22->23
23->24
3->25
25->26
26->27
27->28
28->29
29->30
30->31
31->32
32->33
33->34
34->35
35->36
5->37
37->38
38->39


## Generating Reports with ChatGPT with and without Maps

In [209]:
def narrative_map_prompt_interstory_topics(K, mincover, start, event_list, include_report=True):
    """Build the sectioned narrative-map prompt, optionally requesting a report (RP) section.

    This name is defined twice in the notebook; this later definition replaces
    the earlier one once its cell has run. The earlier version asks only for
    the SL, MS and ED sections. Passing ``include_report=False`` produces that
    earlier prompt, so this single definition covers both uses.

    Args:
        K: Value inserted as the number of events in the main storyline.
        mincover: Value inserted in front of ``%`` as the minimum topic coverage.
        start: Value inserted as the event the map must start with.
        event_list: Text listing the events; appended unchanged after the
            instructions.
        include_report: When True (default, the behaviour of this cell before
            the parameter existed) the prompt also requests an RP report
            section. When False the RP instructions are left out.

    Returns:
        str: The instruction text followed by ``event_list``.
    """
    section_names = "SL, MS, ED, and RP" if include_report else "SL, MS, and ED"
    instruction_lines = [
        "Instructions: Your goal is to generate a narrative map.",
        "A narrative map is defined as a connected Directed Acyclic Graph.",
        "Nodes of this graph are events and edges represent connections between events.",
        "You will be given a series of events to build the narrative map.",
        "Events must be discarded if they are irrelevant.",
        "Events without edges must be discarded.",
        "Organize events into storylines.",
        "Provide names for the different storylines of your narrative map.",
        "Select one story to be the main storyline.",
        "There can be edges between events of the different storylines.",
        "Consider the following constraints:",
        f"The main storyline must contain {K} events.",
        "Side stories should not be too large.",
        "If two storylines are sequential, they must be merged.",
        f"You should cover at least {mincover}% of the topics in the data set.",
        f"Your narrative map must start with event {start}.",
        "Edges must be in chronological order. There can be no cycles.",
        f"Output format: Your output must contain the following sections and nothing else: {section_names}.",
        "Everything should be separated by line breaks.",
        # The trailing space on the next line is part of the original prompt text.
        "SL is a list of the identified storylines, with the associated event indices in curly braces. ",
        "Storyline names start with the identifier 'subgraph cluster_' followed by the number of the storyline.",
        "MS identifies the main storyline from the previous list.",
        "ED is a list of all the edges of the narrative map in the form X->Y, where X and Y represent the number of the event.",
    ]
    if include_report:
        instruction_lines += [
            # Trailing space kept from the original prompt text.
            "RP is a report summarizing the narrative, including a paragraph for each storyline. ",
            "Your report must not repeat the events, but instead analyze the storylines and provide useful insights for an analyst.",
        ]
    instruction_lines.append("Here is the list of events:")

    # Layout of the emitted text: every line after the first is indented by 12
    # spaces, and the instructions end with a newline plus 9 spaces.
    line_break = "\n" + " " * 12
    header = line_break.join(instruction_lines) + "\n" + " " * 9
    return header + event_list

In [210]:
prompt = narrative_map_prompt_interstory_topics(K=6, mincover=50, start=0, event_list=event_list)

In [212]:
response = gpt(prompt)

In [213]:
print(response)

SL
subgraph cluster_0 {0, 1, 2, 3, 8, 9, 11, 12, 13, 14, 15, 16, 17, 18, 19, 21, 22, 23}
subgraph cluster_1 {4, 5, 6, 7, 10, 20, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39}

MS
subgraph cluster_0 is the main storyline.

ED
0->1
1->2
2->3
3->8
8->9
9->11
11->12
12->13
13->14
14->15
15->16
16->17
17->18
18->19
19->21
21->22
22->23
4->5
5->6
6->7
7->8
8->9
9->10
10->20
20->24
24->25
25->26
26->27
27->28
28->29
29->30
30->31
31->32
32->33
33->34
34->35
35->36
36->37
37->38
38->39

RP
The COVID-19 pandemic originated in Wuhan, China, in December 2019. The initial outbreak was characterized by a range of symptoms, including high fever, coughing, and difficulty breathing. At first, the disease was not well understood, and there were concerns that it could be a new form of pneumonia, or even a variant of the SARS virus. Chinese health officials confirmed on January 10, 2020, that the outbreak was caused by a new coronavirus.
 
Subgraph cluster_0 covers the early stages of t

In [214]:
prompt = narrative_map_prompt_interstory_topics(K=4, mincover=50, start=0, event_list=event_list)

In [215]:
response = gpt(prompt)

In [216]:
print(response)

SL
subgraph cluster_0 {0 1 2 3}
subgraph cluster_1 {4 5}
subgraph cluster_2 {6 7}
subgraph cluster_3 {8 9 10 11 12 13 14 15 16 17 18}
subgraph cluster_4 {19 20 21 22 23 24}
subgraph cluster_5 {25 26 27 28 29 30 31 32 33 34 35 36}
subgraph cluster_6 {37 38 39}

MS
Main storyline: subgraph cluster_0

ED
0->1
1->2
2->3
3->4
4->5
6->8
8->9
9->10
10->11
11->12
12->13
13->17
14->16
15->16
16->17
17->19
19->21
21->22
22->23
23->24
25->27
26->27
27->28
28->29
29->30
30->31
31->32
32->33
33->35
35->36
37->38
38->39

RP
The narrative map includes several storylines that help to understand the events related to the coronavirus outbreak. The main storyline, subgraph cluster_0, starts with the first reports of the outbreak in Wuhan, China (event 0) and ends with the confirmation of human-to-human transmission (event 8) and the spread of the infection to other parts of China (event 9), leading to a lockdown in the region (event 16). This storyline highlights the rapid spread of the virus and the eff

In [11]:
def report_without_map(mincover, start, event_list):
    """Build the prompt that asks for a narrative report only (no graph sections).

    Args:
        mincover: Value inserted in front of ``%`` as the minimum topic coverage.
        start: Value inserted as the event the narrative must start with.
        event_list: Text listing the events; appended unchanged after the
            instructions.

    Returns:
        str: The instruction text followed by ``event_list``.
    """
    instruction_lines = [
        "Instructions: Your goal is to generate a narrative report.",
        "You will be given a series of events to build the narrative report.",
        "Events must be discarded if they are irrelevant.",
        "Organize events into storylines.",
        "Provide names for the different storylines of your narrative map.",
        "Select one story to be the main storyline.",
        f"You should cover at least {mincover}% of the topics in the data set.",
        f"Your narrative must start with event {start}.",
        # The trailing space on the next line is part of the original prompt text.
        "Your output is a report summarizing the narrative, including a paragraph for each storyline. ",
        "Your report must not repeat the events, but instead analyze the storylines and provide useful insights for an analyst.",
        "Here is the list of events:",
    ]
    # Layout of the emitted text: every line after the first is indented by 12
    # spaces, and the instructions end with a newline plus 9 spaces.
    line_break = "\n" + " " * 12
    header = line_break.join(instruction_lines) + "\n" + " " * 9
    return header + event_list

In [12]:
prompt = report_without_map(mincover=50, start=0, event_list=event_list)

In [13]:
response = gpt(prompt)

In [14]:
print(response)

On January 3, 2020, the China pneumonia outbreak was first reported in Wuhan, leading to a flurry of activity and investigations by Chinese officials. A few days later, Sars was ruled out, but the illness continued to spread, with travelers being checked to prevent further infections. On January 10, a death was reported, and on January 15, Japan confirmed its first case of coronavirus infection. On January 20, a vaccine was being developed, and it became clear that human-to-human transmission was possible as the number of cases tripled and spread to Beijing and Shanghai. From that point on, the virus has spread across the world, causing widespread panic and uncertainty, leading to lockdowns, travel restrictions, and economic disruptions. 

The major storylines in the narrative map are the timeline of the virus spread and its impact on the Chinese government, the outbreak's effect on global markets and economies, and the efforts to develop a vaccine. The timeline of the virus spread thr

## Semantic Interaction: Removing Topics

In [31]:
def narrative_map_si(K, mincover, event_list,
                     feedback="Exclude events related to 'Economy' from the narrative."):
    """Build the sectioned narrative-map prompt with a user-feedback instruction.

    Args:
        K: Value inserted as the number of events in the main storyline.
        mincover: Value inserted in front of ``%`` as the minimum topic coverage.
        event_list: Text listing the events; appended unchanged after the
            instructions.
        feedback: Text placed after ``User feedback: `` in the prompt. The
            default is the sentence that used to be hard-coded, so existing
            calls produce the same prompt as before.

    Returns:
        str: The instruction text followed by ``event_list``.
    """
    instruction_lines = [
        "Instructions: Your goal is to generate a narrative map.",
        "A narrative map is defined as a connected Directed Acyclic Graph.",
        "Nodes of this graph are events and edges represent connections between events.",
        "You will be given a series of events to build the narrative map.",
        "Events must be discarded if they are irrelevant.",
        "Events without edges must be discarded.",
        "Organize events into storylines.",
        "Provide names for the different storylines of your narrative map.",
        "Select one story to be the main storyline.",
        "There can be edges between events of the different storylines.",
        "Consider the following constraints:",
        f"The main storyline must contain {K} events.",
        "Side stories should not be too large.",
        "If two storylines are sequential, they must be merged.",
        # NOTE(review): the doubled period is in the original prompt; it is kept
        # so the default prompt stays byte-identical. Confirm whether it is a typo.
        f"You should cover at least {mincover}% of the topics in the data set..",
        "Edges must be in chronological order. There can be no cycles.",
        "User given feedback: It is very important to include the feedback in the model.",
        f"User feedback: {feedback}",
        "Output format: Your output must contain the following sections and nothing else: SL, MS, and ED.",
        "Everything should be separated by line breaks.",
        # The trailing space on the next line is part of the original prompt text.
        "SL is a list of the identified storylines, with the associated event indices in curly braces. ",
        "Storyline names start with the identifier 'subgraph cluster_' followed by the number of the storyline.",
        "MS identifies the main storyline from the previous list.",
        "ED is a list of all the edges of the narrative map in the form X->Y, where X and Y represent the number of the event.",
        "Here is the list of events:",
    ]
    # Layout of the emitted text: every line after the first is indented by 12
    # spaces, and the instructions end with a newline plus 9 spaces.
    line_break = "\n" + " " * 12
    header = line_break.join(instruction_lines) + "\n" + " " * 9
    return header + event_list

In [32]:
prompt = narrative_map_si(K=6, mincover=50, event_list=event_list)

In [33]:
response = gpt(prompt)

In [34]:
print(response)

SL: 
subgraph cluster_0 {0 1 2 3}
subgraph cluster_1 {4}
subgraph cluster_2 {5 8}
subgraph cluster_3 {6 10}
subgraph cluster_4 {11 12 13 14}
subgraph cluster_5 {15 16 17 18}
subgraph cluster_6 {19 20 21 22 23 24}
subgraph cluster_7 {25 26 27 28 30 31 33 34 35 36}
subgraph cluster_8 {29}
subgraph cluster_9 {32 37 38 39}

MS: subgraph cluster_0

ED:
0->1
1->2
2->3
3->5
4->5
5->8
6->10
10->11
11->12
12->13
13->14
15->16
16->17
17->18
19->20
20->21
21->22
22->23
23->24
25->33
26->27
27->28
28->37
30->31
31->32
35->36
39->32


## XAI: Connection Explanations + Labels

In [9]:
def narrative_map_xai(K, mincover, event_list):
    """Assemble the prompt asking for a narrative map with explained, labelled edges.

    Args:
        K: value interpolated into "The main storyline must contain {K} events."
        mincover: value interpolated into the "at least {mincover}%" coverage line.
        event_list: text appended verbatim after the instructions; it is
            concatenated to a str, so it must itself be a str.

    Returns:
        The instruction block followed directly by ``event_list``.
    """
    # The first instruction starts flush; every following one sits on its own
    # line behind a 12-space indent, and the block closes with a newline plus
    # 9 spaces right before the event list.
    continuation = "\n" + " " * 12
    tail = "\n" + " " * 9

    instruction_lines = (
        "Instructions: Your goal is to generate a narrative map.",
        "A narrative map is defined as a connected Directed Acyclic Graph.",
        "Nodes of this graph are events and edges represent connections between events.",
        "You will be given a series of events to build the narrative map.",
        "Events must be discarded if they are irrelevant.",
        "Events without edges must be discarded.",
        "Organize events into storylines.",
        "Provide names for the different storylines of your narrative map.",
        "Select one story to be the main storyline.",
        "There can be edges between events of the different storylines.",
        "Consider the following constraints:",
        f"The main storyline must contain {K} events.",
        "Side stories should not be too large.",
        "If two storylines are sequential, they must be merged.",
        f"You should cover at least {mincover}% of the topics in the data set.",
        "Edges must be in chronological order. There can be no cycles.",
        "Output format: Your output must contain the following sections and nothing else: SL, MS, and ED.",
        "Everything should be separated by line breaks.",
        # The trailing space on the next line is part of the prompt text.
        "SL is a list of the identified storylines, with the associated event indices in curly braces. ",
        "Storyline names start with the identifier 'subgraph cluster_' followed by the number of the storyline.",
        "MS identifies the main storyline from the previous list.",
        "ED is a list of all the edges of the narrative map in the form X->Y, where X and Y represent the number of the event.",
        "The list of edges must include a short explanation of why events are connected in brackets.",
        "Connections should be classified depending on why they are connected into four categories: topical, causal, entity, and similarity.",
        "Make sure to include explanations of your choices.",
        "Here is the list of events:",
    )

    instructions = continuation.join(instruction_lines) + tail
    return instructions + event_list

In [15]:
# Build the explained-edges prompt with a 6-event main storyline and 50% coverage.
# NOTE(review): this section's execution counts (In [9], In [15]-[17]) are lower than
# those of the feedback section above it (In [32]-[34]), and both sections reuse the
# names `prompt` and `response` - re-run top to bottom before comparing outputs.
prompt = narrative_map_xai(K=6, mincover=50, event_list=event_list)

In [16]:
# `gpt` is defined outside this section; presumably it sends the prompt to the
# language model and returns the reply text — confirm against its definition.
response = gpt(prompt)

In [17]:
# NOTE(review): the saved output below stops mid-word on its last edge, so the
# reply was cut off; presumably a length cap inside `gpt` — confirm.
print(response)

SL:
subgraph cluster_0 {0}
subgraph cluster_1 {1,2,3}
subgraph cluster_2 {4,5}
subgraph cluster_3 {6,7,8,9}
subgraph cluster_4 {10,11,12,13,14}
subgraph cluster_5 {15,16,17,18}
subgraph cluster_6 {19,20,21,22,23}
subgraph cluster_7 {24,25,26,27,28}
subgraph cluster_8 {29,30,31}
subgraph cluster_9 {32,33,34}
subgraph cluster_10 {35,36,37,38,39}

MS: subgraph cluster_1

ED:
0->1 [entity: China pneumonia outbreak in Wuhan]
1->2 [entity: initial misdiagnosis of the virus]
2->3 [entity: fear of further spread during lunar new year]
3->18 [entity: first reported death and lockdown measures]
4->5 [topical: first confirmed case in Japan]
5->8 [causal: second death and human-to-human transmission confirmed in China]
8->9 [topical: rapid increase in cases in Beijing and Shanghai]
9->10 [entity: first known US case]
11->12 [entity: concerns over cover-up by Chinese government]
13->16 [entity: lockdown measures implemented in Hubei province]
16->17 [entity: panic and anger in Wuhan over the lockdo

## Causal Chain

In [25]:
def causal_chain(K, event_list):
    """Assemble the prompt asking for a single named causal chain of events.

    Args:
        K: value interpolated into "The chain must contain {K} events."
        event_list: text appended verbatim after the instructions; it is
            concatenated to a str, so it must itself be a str.

    Returns:
        The instruction block followed directly by ``event_list``.
    """
    # Layout of the prompt: first line flush, later lines behind 12 spaces,
    # then a final newline and 9 spaces before the appended event list.
    pieces = [
        "Instructions: Your goal is to generate a causal chain.",
        "You will be given a series of events to build the causal chain.",
        "Events must be discarded if they are irrelevant.",
        "Provide a name for your causal chain.",
        f"The chain must contain {K} events.",
        "The chain must be in chronological order.",
        "Output format: Your output must contain the following sections and nothing else: NM and ED.",
        "Everything should be separated by line breaks.",
        "NM contains the name of the causal chain.",
        "ED is a list of all the connections in the chain in the form X->Y, where X and Y represent the number of the event.",
        "The list of connections must include a short explanation of why events are connected in brackets.",
        "The explanation must include a section on the 'Cause' and another on the 'Effect'.",
        "Here is the list of events:",
    ]
    body = ("\n" + 12 * " ").join(pieces)
    closing = "\n" + 9 * " "
    return body + closing + event_list

In [26]:
# Build the causal-chain prompt; K=6 fills "The chain must contain {K} events."
# `event_list` is defined outside this section.
prompt = causal_chain(K=6, event_list=event_list)

In [27]:
# `gpt` is defined outside this section; presumably it sends the prompt to the
# language model and returns the reply text — confirm against its definition.
response = gpt(prompt)

In [28]:
# NOTE(review): the saved output below ends mid-sentence on its last connection, so
# the reply was cut off; presumably a length cap inside `gpt` — confirm.
print(response)

NM: The Spread of the Coronavirus

ED:
0->1 [Cause: Sars is ruled out as dozens of people fall ill with a mystery virus in Wuhan; Effect: The virus is not identified as SARS]
1->3 [Cause: Official reports identify a new virus in Wuhan; Effect: The first death from the virus is reported]
3->8 [Cause: The virus spreads from animals to humans in Wuhan; Effect: The Chinese government confirms human-to-human transmission of the virus, indicating the possibility of a pandemic]
8->9 [Cause: People travel for the Lunar New Year; Effect: The virus spreads to Beijing and Shanghai]
9->10 [Cause: A person who traveled from Wuhan to the US is diagnosed with the virus; Effect: The virus is not contained in China]
10->21 [Cause: The virus spreads globally; Effect: China locks down its cities and cancels Lunar New Year festivities]
21->27 [Cause: Oil consumption decreases as fear of the virus spreads; Effect: Oil prices fall]
27->38 [Cause: The virus spreads rapidly worldwide; Effect: The World Health

In [40]:
def causal_map(K, mincover, event_list):
    """Build the prompt that asks the model for a causal map over a list of events.

    The prompt defines a causal map (a connected DAG whose edges are causal
    connections), states the storyline constraints, and fixes the SL / MS / ED
    output format with a Cause/Effect explanation on every edge.

    The wording refers to a "causal map" throughout. Two instructions used to
    say "narrative map" (left over from the narrative-map prompt), which
    contradicted the definition given at the top of this prompt.

    Args:
        K: value interpolated into "The main storyline must contain {K} events."
        mincover: value interpolated into the "at least {mincover}%" coverage line.
        event_list: text appended verbatim after the instructions; it is
            concatenated to a str, so it must itself be a str.

    Returns:
        The instruction block followed directly by ``event_list``.
    """
    # Continuation lines keep their in-source indentation inside the literal, and
    # the closing line leaves 9 spaces in front of the appended event list.
    prompt = f"""Instructions: Your goal is to generate a causal map.
            A causal map is defined as a connected Directed Acyclic Graph.
            Nodes of this graph are events and edges represent causal connections between events.
            You will be given a series of events to build the causal map.
            Events must be discarded if they are irrelevant.
            Events without edges must be discarded.
            Organize events into storylines.
            Provide names for the different storylines of your causal map.
            Select one story to be the main storyline.
            There can be edges between events of the different storylines.
            Consider the following constraints:
            The main storyline must contain {K} events.
            Side stories should not be too large.
            Storylines should contain more than one event.
            If two storylines are sequential, they must be merged.
            You should cover at least {mincover}% of the topics in the data set.
            Edges must be in chronological order. There can be no cycles.
            Output format: Your output must contain the following sections and nothing else: SL, MS, and ED.
            Everything should be separated by line breaks.
            SL is a list of the identified storylines, with the associated event indices in curly braces. 
            Storyline names start with the identifier 'subgraph cluster_' followed by the number of the storyline.
            MS identifies the main storyline from the previous list.
            ED is a list of all the causal connections of the causal map in the form X->Y, where X and Y represent the number of the event.
            The list of edges must include a short explanation of why events are connected in brackets.
            Connections must be based on a cause-effect relationship.
            The explanation of the connections must mention the 'Cause' and the 'Effect'.
            Here is the list of events:
         """
    prompt += event_list
    return prompt

In [41]:
# Build the causal-map prompt with a 6-event main storyline. Coverage is set to 20%
# here, lower than the 50% used for the narrative-map prompts earlier in this notebook.
prompt = causal_map(K=6, mincover=20, event_list=event_list)

In [42]:
# `gpt` is defined outside this section; presumably it sends the prompt to the
# language model and returns the reply text — confirm against its definition.
response = gpt(prompt)

In [43]:
# Printed rather than echoed so the reply's line breaks (the prompt asks for
# line-break-separated sections) show up as separate lines.
print(response)

SL: 
subgraph cluster_0 {0}
subgraph cluster_1 {1, 2}
subgraph cluster_2 {3}
subgraph cluster_3 {4, 5, 6}
subgraph cluster_4 {7, 8, 9}
subgraph cluster_5 {10, 11, 12}
subgraph cluster_6 {13, 14}
subgraph cluster_7 {15, 16, 17, 18}
subgraph cluster_8 {19, 20, 21, 22, 23, 24}
subgraph cluster_9 {25, 26, 27, 28}
subgraph cluster_10 {29, 30, 31}
subgraph cluster_11 {32, 33, 34, 35, 36}
subgraph cluster_12 {37, 38, 39}

MS: 
Main storyline: subgraph cluster_7

ED: 
0->1 [Cause: outbreak in Wuhan, Effect: investigation]
1->2 [Cause: suspicion of virus, Effect: travelers screened]
2->3 [Cause: fear of spreading, Effect: first death reported]
3->4 [Cause: infection spreading, Effect: first case in Japan]
3->5 [Cause: infection spreading, Effect: second death in China]
3->6 [Cause: infection spreading, Effect: screening for virus]
4->5 [Cause: infection spreading, Effect: increase in number of cases and deaths]
5->7 [Cause: virus spreading, Effect: panic and anger in Wuhan]
5->8 [Cause: virus s