In [62]:
import json
import os

import openai
import pandas as pd

In [63]:
# Load the labeled sample and append an empty column that will receive the
# model's raw validation response for each tweet.
df = pd.read_csv("labeled_managed_cwh_sample_200_04-18.csv").assign(
    gpt_validation_ambiguity=None
)
df.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,conversation_id,created_at,author_id,lang,text,source,id,retweet_count,...,user_listed_count,user_location,advice,caution,resource,remediation,other impact,service_interruption,weather_information,gpt_validation_ambiguity
0,0,0,1465843812979384320,2021-12-01T00:42:53.000Z,28612106,en,Remember to write a list of important contact ...,Hootsuite Inc.,1465843812979384320,4,...,722.0,,True,False,True,False,False,False,False,
1,1,1,1466107083493945350,2021-12-01T18:09:02.000Z,1022139761421561856,en,It's #BCStorm Season ~ Do you own property alo...,Sprout Social,1466107083493945350,1,...,7.0,"Parksville, British Columbia",False,False,True,False,False,False,False,
2,2,2,1466132501961789456,2021-12-01T19:50:02.000Z,1468394269,en,During an emergency having a working cell phon...,Hootsuite Inc.,1466132501961789456,2,...,26.0,Langley BC,True,False,False,False,False,False,False,
3,3,3,1466138826133450755,2021-12-01T20:15:10.000Z,3064610619,en,We are pleased to announce that MetNotes are n...,Hootsuite Inc.,1466138826133450755,2,...,45.0,"Newfoundland and Labrador, Canada",False,False,True,False,False,False,True,
4,4,4,1466154334274523136,2021-12-01T21:16:47.000Z,41618221,en,⚠️UPDATE #BCHwy99 - OPEN to essential travel o...,TweetDeck,1466154334274523136,17,...,1422.0,British Columbia,False,True,True,False,False,True,True,


In [64]:
def create_prompt(text: str, label_json: str) -> str:
    """Build the label-validation prompt for a single tweet.

    Args:
        text: The tweet text; it is embedded between triple-backtick delimiters.
        label_json: The tweet's current labels, already serialized as a JSON
            string; it is embedded verbatim after the category definitions.

    Returns:
        The full prompt string, which asks the model to rate every label as
        "correct", "incorrect" or "ambiguous" and to justify each rating.
    """
    # A trailing backslash inside the literal joins the next line onto the
    # current one, so the wrapped category descriptions reach the model as
    # single lines.
    prompt = f"""
The following triple backtick delimited tweet was posted during a Canadian Weather Event (CWE). 
```
{text}
```

The tweet was labeled according to the following categories, \
relevant to Weather-Affected People (WAPs):
- "weather information": tweets which contain meteorological information \
    such as weather warnings, information about precipitation/temperatures, \
    weather forecasts, etc.
- "caution": tweets which inform WAPs about some risk-carrying situation. \
    Tweets which *only* contain weather information should be labeled with \
    "weather information" but not "caution".
- "advice": tweets which go beyond informing about a risk-carrying situation, \
    and suggest or indicate some action or behavioral change useful to WAPs \
    in the context of the CWE.
- "resource": tweets which provide a concrete external resource that WAPs can \
    leverage to improve their response to the CWE (e.g. improving \
    preparedness/wellbeing, or minimizing risk/impact).
- "service interruption": tweets which inform WAPs about CWE-caused interruptions \
    to services (e.g. road closures, public transit interruptions, power outages, \
    closures of businesses, schools and other services). 
- "remediation": tweets about responses to the CWE which remediate its impact \
    (e.g. snowplowing, shelters, volunteering, etc).
- "other impact": tweets which inform about impacts of the CWE beyond service \
    interruptions (e.g. accidents, injuries, damages). This label is not mutually \
    exclusive with e.g. "caution" if the impact informs WAPs about \
    risk-carrying situations.


JSON of labels (with each category as a key, and "true" or \
"false" as a value based on how applicable it is to the tweet):
{label_json}

Your task is to evaluate the label values in the previous JSON, which may be incorrect or ambiguous.
Specifically, you should identify and explain any errors (i.e. incorrect label values) \
or ambiguities (i.e. label values which are unclear or could be interpreted in multiple ways). 

To this end, output a nested JSON object with the same keys as the previous JSON. 
For each key, output a tuple with two entries. First, a string in ["correct", "incorrect", "ambiguous"] \
indicating your evaluation of the provided label value. Second, a string explanation or justification of your evaluation. 

"""
    return prompt

In [65]:
# Never commit API keys to a notebook: read the key from the environment.
# Indexing os.environ raises KeyError immediately when OPENAI_API_KEY is unset,
# instead of failing later on the first API call.
# NOTE(review): a literal key was previously committed on this line; it should
# be treated as leaked and revoked.
openai.api_key = os.environ["OPENAI_API_KEY"]

In [66]:
from tenacity import (
    retry,
    stop_after_attempt,
    wait_random_exponential,
)  # for exponential backoff

@retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(6))
def get_completion(prompt, model="gpt-3.5-turbo"):
    """Send `prompt` as a single user message and return the reply text.

    The tenacity decorator retries the call with randomized exponential
    backoff and gives up after six attempts.

    Args:
        prompt: Content of the one user message sent to the model.
        model: Name of the chat model to query.

    Returns:
        The "content" of the first choice's message.
    """
    completion = openai.ChatCompletion.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        temperature=0,  # degree of randomness of the model's output
    )
    first_choice = completion.choices[0]
    return first_choice.message["content"]

In [67]:
def get_label_json(row):
    """Serialize one row's category labels as an indented JSON string.

    Args:
        row: Mapping-like object indexable by the label column names.

    Returns:
        A JSON object string (indent=2) keyed by the category names used in
        the prompt; two keys use a space where the column name has an
        underscore.
    """
    # (prompt label, column name) pairs, in the order the keys are emitted.
    label_columns = (
        ("advice", "advice"),
        ("caution", "caution"),
        ("resource", "resource"),
        ("remediation", "remediation"),
        ("other impact", "other impact"),
        ("service interruption", "service_interruption"),
        ("weather information", "weather_information"),
    )
    labels = {label: row[column] for label, column in label_columns}
    return json.dumps(labels, indent=2)

In [68]:
# Ask the model to validate each tweet's labels and store the raw response in
# the 'gpt_validation_ambiguity' column.
for i, row in df.iterrows():
    # Skip rows that already hold a response so an interrupted run can resume.
    # pd.isna treats both None and NaN as missing; the previous `is not None`
    # test would have treated a NaN placeholder as "already done" and skipped
    # the row.
    if not pd.isna(row['gpt_validation_ambiguity']):
        continue
    text = row['text']
    label_json = get_label_json(row)
    prompt = create_prompt(text, label_json)
    response = get_completion(prompt)
    df.loc[i, 'gpt_validation_ambiguity'] = response
    print(f"Tweet {i+1} of {len(df)}")
    print(f"Tweet: {text}")
    print(f"Response: {response}")
    print("")


Tweet 1 of 200
Tweet: Remember to write a list of important contact numbers in your outage plan and emergency kit. Here are some more tips: https://t.co/Csh1uXODwH #BCStorm https://t.co/kRhdODSRhY
Response: {
  "advice": ["correct", "The tweet provides advice on writing a list of important contact numbers in the outage plan and emergency kit."],
  "caution": ["incorrect", "The tweet does not inform about any risk-carrying situation."],
  "resource": ["correct", "The tweet provides a resource with additional tips on preparedness."],
  "remediation": ["incorrect", "The tweet does not mention any response or action to remediate the impact of the CWE."],
  "other impact": ["correct", "The tweet does not inform about any impacts beyond service interruptions."],
  "service interruption": ["incorrect", "The tweet does not inform about any service interruptions."],
  "weather information": ["incorrect", "The tweet does not contain any meteorological information."]
}

Tweet 2 of 200
Tweet: It's

In [78]:
from pathlib import Path
import json


# Re-serialize every stored response as one compact JSON document per line.
# A trailing comma directly before the closing brace is removed first so the
# response parses as JSON.
jsonl_body = "\n".join(
    json.dumps(json.loads(raw_response.replace(",\n}", "\n}")))
    for raw_response in df['gpt_validation_ambiguity']
)
# Kept as the cell's last expression so the written character count is shown.
Path("validation_ambiguity.jsonl").write_text(jsonl_body)

180958

In [76]:
# Scratch check: remove a trailing comma before the closing brace of the raw
# response held in `l`, then parse it as JSON.
# NOTE(review): `l` is not assigned in any visible cell, so this cell relies on
# leftover kernel state; define it explicitly or delete the cell.
json.loads(l.replace(",\n}", "\n}"))

{'advice': ['correct',
  'The tweet does not provide any advice or suggest any action to WAPs. It only provides information about the weather event.'],
 'caution': ['correct',
  'The tweet does not inform WAPs about any risk-carrying situation. It only provides information about the weather event.'],
 'resource': ['correct',
  'The tweet does not provide any external resource for WAPs to leverage. It only provides information about the weather event.'],
 'remediation': ['correct',
  'The tweet does not mention any response or remediation to the CWE. It only provides information about the weather event.'],
 'other impact': ['correct',
  'The tweet does not inform about any impacts of the CWE beyond service interruptions. It only provides information about the weather event.'],
 'service interruption': ['correct',
  'The tweet does not inform about any CWE-caused interruptions to services. It only provides information about the weather event.'],
 'weather information': ['correct',
  'The

In [75]:
# Scratch cell: display the value currently bound to `l`.
# NOTE(review): `l` is not assigned in any visible cell, so this cell relies on
# leftover kernel state; define it explicitly or delete the cell.
l

'{\n  "advice": ["correct", "The tweet does not provide any advice or suggest any action to WAPs. It only provides information about the weather event."],\n  "caution": ["correct", "The tweet does not inform WAPs about any risk-carrying situation. It only provides information about the weather event."],\n  "resource": ["correct", "The tweet does not provide any external resource for WAPs to leverage. It only provides information about the weather event."],\n  "remediation": ["correct", "The tweet does not mention any response or remediation to the CWE. It only provides information about the weather event."],\n  "other impact": ["correct", "The tweet does not inform about any impacts of the CWE beyond service interruptions. It only provides information about the weather event."],\n  "service interruption": ["correct", "The tweet does not inform about any CWE-caused interruptions to services. It only provides information about the weather event."],\n  "weather information": ["correct", "

In [25]:
from collections import defaultdict

# Count, per label, how many completions carry a truthy value for it.
# NOTE(review): `completions` is not defined in any visible cell.
d = defaultdict(int)
for completion in completions:
    parsed_labels = json.loads(completion)
    for label in (name for name, flag in parsed_labels.items() if flag):
        d[label] += 1

print(json.dumps(d, indent=2))

{
  "advice": 52,
  "resource": 60,
  "weather information": 110,
  "caution": 46,
  "service interruption": 38,
  "remediation": 24,
  "other impact": 13
}


In [27]:
# Overwrite the label columns with the values found in the completions.
# Each completion is parsed once here instead of once per column.
# NOTE(review): `completions` is not defined in any visible cell.
parsed_completions = [json.loads(c) for c in completions]

# JSON key in a completion -> DataFrame column that receives it.
label_to_column = {
    'advice': 'advice',
    'caution': 'caution',
    'resource': 'resource',
    'remediation': 'remediation',
    'other impact': 'other impact',
    'service interruption': 'service_interruption',
    'weather information': 'weather_information',
}
for label, column in label_to_column.items():
    df[column] = [parsed[label] for parsed in parsed_completions]

In [28]:
# Preview the first rows of the frame to check the label columns.
df.head()

Unnamed: 0.1,Unnamed: 0,conversation_id,created_at,author_id,lang,text,source,id,retweet_count,reply_count,...,user_tweet_count,user_listed_count,user_location,advice,caution,resource,remediation,other impact,service_interruption,weather_information
0,0,1465843812979384320,2021-12-01T00:42:53.000Z,28612106,en,Remember to write a list of important contact ...,Hootsuite Inc.,1465843812979384320,4,0,...,54622.0,722.0,,True,False,True,False,False,False,False
1,1,1466107083493945350,2021-12-01T18:09:02.000Z,1022139761421561856,en,It's #BCStorm Season ~ Do you own property alo...,Sprout Social,1466107083493945350,1,0,...,3872.0,7.0,"Parksville, British Columbia",False,False,True,False,False,False,False
2,2,1466132501961789456,2021-12-01T19:50:02.000Z,1468394269,en,During an emergency having a working cell phon...,Hootsuite Inc.,1466132501961789456,2,0,...,5425.0,26.0,Langley BC,True,False,False,False,False,False,False
3,3,1466138826133450755,2021-12-01T20:15:10.000Z,3064610619,en,We are pleased to announce that MetNotes are n...,Hootsuite Inc.,1466138826133450755,2,1,...,329.0,45.0,"Newfoundland and Labrador, Canada",False,False,True,False,False,False,True
4,4,1466154334274523136,2021-12-01T21:16:47.000Z,41618221,en,⚠️UPDATE #BCHwy99 - OPEN to essential travel o...,TweetDeck,1466154334274523136,17,0,...,155379.0,1422.0,British Columbia,False,True,True,False,False,True,True


In [29]:
# Save the labeled frame. index=False keeps the row index out of the file;
# writing it is what adds another "Unnamed: 0" column on every save/reload
# round trip, as seen in the column headers of the previews above.
df.to_csv("managed_cwh_sample_200_04-18_labeled.csv", index=False)