In [1]:
%matplotlib inline

In [2]:
import glob
import gzip
import json

import numpy as np
import pandas as pd

In [3]:
# Run identifier: interpolated into the input CSV file names, the output file
# name, and the run tag written on every output line.
run_title = "baseline"

In [4]:
# File names of this run's prediction tables; both embed run_title.
category_csv_path = "trec2019b_test_results_run_%s_multi.csv" % run_title
priority_csv_path = "trec2019b_test_results_priority_run_%s.csv" % run_title

# tweet_id is parsed as a 64-bit integer in both frames.
category_df = pd.read_csv(category_csv_path, dtype={"tweet_id": np.int64})
priority_df = pd.read_csv(priority_csv_path, dtype={"tweet_id": np.int64})

In [5]:
# Maps each short column name to its "<Group>-<Name>" form. Entries are
# generated group by group, so insertion order is: CallToAction, Other,
# Report, Request.
header_map = {
    short_name: group + "-" + short_name
    for group, short_names in (
        ("CallToAction", ("Donations", "MovePeople", "Volunteer")),
        ("Other", (
            "Advice",
            "ContextualInformation",
            "Discussion",
            "Irrelevant",
            "Sentiment",
        )),
        ("Report", (
            "CleanUp",
            "EmergingThreats",
            "Factoid",
            "FirstPartyObservation",
            "Hashtags",
            "Location",
            "MultimediaShare",
            "News",
            "NewSubEvent",
            "Official",
            "OriginalEvent",
            "ServiceAvailable",
            "ThirdPartyObservation",
            "Weather",
        )),
        ("Request", ("GoodsServices", "InformationWanted", "SearchAndRescue")),
    )
    for short_name in short_names
}

In [6]:
# Swap short column names for their header_map form; a column with no entry
# in header_map keeps its current name.
category_df.columns = category_df.columns.map(
    lambda name: header_map.get(name, name)
)

In [7]:
# NOTE: despite its name, this dict is keyed by the <dataset> text and stores
# the matching <num> text as the value.
topic_num_to_id_map = {}


def _tag_text(tagged_line):
    """Return the text between the first '>' and the next '<' of a line."""
    after_open_tag = tagged_line.partition(">")[2]
    return after_open_tag.partition("<")[0]


with open("../data/TRECIS-2019-B-Test.topics", "r") as in_file:
    topic_num = topic_id = ""

    for line in in_file:
        if line.strip() == "</top>":
            # End of a topic entry: record the most recently seen pair.
            topic_num_to_id_map[topic_id] = topic_num
        elif line.startswith("<num>"):
            topic_num = _tag_text(line)
        elif line.startswith("<dataset>"):
            topic_id = _tag_text(line)

In [8]:
# Show the parsed mapping (keys are <dataset> values, values are <num> values).
topic_num_to_id_map

{'albertaWildfires2019': 'TRECIS-CTIT-H-Test-029',
 'cycloneKenneth2019': 'TRECIS-CTIT-H-Test-030',
 'philippinesEarthquake2019': 'TRECIS-CTIT-H-Test-031',
 'coloradoStemShooting2019': 'TRECIS-CTIT-H-Test-032',
 'southAfricaFloods2019': 'TRECIS-CTIT-H-Test-033',
 'sandiegoSynagogueShooting2019': 'TRECIS-CTIT-H-Test-034'}

In [9]:
# tweet id -> value looked up in topic_num_to_id_map for the tweet's "topic"
tweet_id_to_topic_map = {}
# tweet id -> 1-based position of the tweet inside the archive it was read from
tweet_id_to_count_map = {}

for file_path in glob.iglob("../data/2019/*2019*B-test*.json.gz"):
    print(file_path)
    with gzip.open(file_path, "rb") as in_file:
        # Each line is one JSON document; numbering restarts at 1 per file.
        for position, raw_line in enumerate(in_file, start=1):
            tweet_entry = json.loads(raw_line.decode("utf8"))

            tweet_id = np.int64(tweet_entry["allProperties"]["id"])
            tweet_topic = tweet_entry["topic"]

            tweet_id_to_topic_map[tweet_id] = topic_num_to_id_map[tweet_topic]
            tweet_id_to_count_map[tweet_id] = position

../data/2019/trecis2019-B-test.coloradoStemShooting2019.json.gz
../data/2019/trecis2019-B-test.southAfricaFloods2019.json.gz
../data/2019/trecis2019-B-test.cycloneKenneth2019.json.gz
../data/2019/trecis2019-B-test.philippinesEarthquake2019.json.gz
../data/2019/trecis2019-B-test.sandiegoSynagogueShooting2019.json.gz
../data/2019/trecis2019-B-test.albertaWildfires2019.json.gz


In [10]:
# Number of distinct tweet ids read from the archives vs. number of rows in
# the priority predictions.
len(tweet_id_to_count_map), priority_df.shape[0]

(13916, 15000)

In [11]:
# Tweet ids present in the archives but absent from the priority predictions;
# an empty set means every archived tweet has a priority row.
set(tweet_id_to_count_map.keys()).difference(priority_df["tweet_id"])

set()

In [12]:
# Every column name of category_df except "tweet_id", in column order.
cols_list = category_df.columns[category_df.columns != "tweet_id"].tolist()

def row2labels(row, cols=None):
    """Return the names of the columns whose value in ``row`` is non-zero.

    Parameters
    ----------
    row : mapping-like
        Anything indexable by column name (e.g. a pandas Series or a dict).
    cols : iterable of str, optional
        Column names to inspect, in output order. When omitted, the
        module-level ``cols_list`` is used, which is the original behaviour.

    Returns
    -------
    list
        The entries of ``cols`` for which ``row[col] != 0``, in order.
    """
    # Only fall back to the global when no explicit list is given, so an
    # explicitly passed empty list is respected.
    if cols is None:
        cols = cols_list

    return [col for col in cols if row[col] != 0]

In [13]:
# Build the per-tweet lookups without DataFrame.iterrows(): iterrows() packs
# each row into a single-dtype Series, so a single float column would turn the
# 64-bit tweet_id into a float and corrupt the dictionary keys. itertuples()
# and column-wise zip keep each column's own dtype.
category_columns = list(category_df.columns)

# tweet_id -> list of label column names with a non-zero value
category_map = {}
for row_values in category_df.itertuples(index=False, name=None):
    record = dict(zip(category_columns, row_values))
    # A repeated tweet_id is overwritten by its last row.
    category_map[record["tweet_id"]] = row2labels(record)

# tweet_id -> predicted priority value
priority_map = dict(zip(priority_df["tweet_id"], priority_df["priority"]))

In [14]:
# Attach one derived column per lookup table, each keyed by tweet_id.
# dict.get yields None for a tweet_id that is missing from a table.
for column_name, lookup in (
    ("inc_id", tweet_id_to_topic_map),
    ("count", tweet_id_to_count_map),
    ("priority", priority_map),
    ("label", category_map),
):
    category_df[column_name] = category_df["tweet_id"].apply(lookup.get)

In [15]:
# Score written to the run file for each predicted priority value (1 or 0).
priority_scorer = {
    1 : 0.75,
    0 : 0.25
}

run_tag = "nyu-smapp_%s_multi" % run_title

# Write one tab-separated line per distinct tweet_id (first occurrence kept):
# inc_id, the literal "Q0", tweet_id, count, priority score, labels, run tag.
with open("nyu-smapp_run_%s_multi.csv" % run_title, "w") as out_file:
    for row in category_df.drop_duplicates(subset="tweet_id").itertuples():
        content = [
            row.inc_id,
            "Q0",
            row.tweet_id,
            row.count,
            priority_scorer[row.priority],
            # json.dumps yields a valid JSON array even if a label contains a
            # quote character (str() plus quote replacement would not);
            # ensure_ascii=False leaves non-ASCII characters unescaped.
            json.dumps(row.label, ensure_ascii=False),
            run_tag,
        ]
        out_file.write("\t".join(map(str, content)) + "\n")