In [286]:
import pandas as pd
from collections import Counter
import numpy as np
import datetime

In [3]:
# Input locations: the storm-assignment table and the full article table.
STORMS_PATH  = "/shared/3/projects/newsDiffusion/data/processed/stormDfs/20000_90_storms.tsv"
ARTICLES_PATH = "/shared/3/projects/newsDiffusion/data/processed/fullDataWith20000.tsv"

# Name of the storm-ID column; also the prefix of the boolean flag column added below.
SCOL = "20000_90"

# Earliest date kept in the analysis (our first local date).
FIRST_LOCAL_DATE = pd.Timestamp("2020-04-01")

stormsDf = pd.read_csv(STORMS_PATH, sep="\t")
artDf = pd.read_csv(ARTICLES_PATH, sep="\t")

# Both files carry an "Unnamed: 0" column that is not used downstream
# (presumably a row index saved by an earlier to_csv -- TODO confirm).
artDf = artDf.drop(columns=["Unnamed: 0"])
stormsDf = stormsDf.drop(columns=["Unnamed: 0"])

# Flag every row of the storm table so storm membership survives the merge.
stormsDf[SCOL + "_storm"] = True

# Left merge on "key" keeps every article; articles with no match in stormsDf
# get NaN in the flag column. SCOL is dropped from the storm side before merging,
# so the merged SCOL column (used later) comes from artDf.
merged = pd.merge(artDf, stormsDf.drop(columns=[SCOL]), on="key", how="left")

# Parse the dates first, then apply the cutoff as a real datetime comparison
# rather than comparing raw strings.
merged["date"] = pd.to_datetime(merged["date"])

#important: remove any dates before our first local date
merged = merged[merged["date"] >= FIRST_LOCAL_DATE]

In [7]:
# Keep only the articles flagged as storm members. Rows without a match in the
# left merge hold NaN in the flag, which does not equal True, so they are dropped.
stormDf = merged.loc[merged["20000_90_storm"].eq(True)]

# Sanity check: number of distinct storm IDs (expected: 98).
# dropna=False so a missing ID would be counted, as with len(unique()).
stormDf["20000_90"].nunique(dropna=False)

In [20]:
# Publishable view of the storm articles: pick the columns to release and
# give the two identifier columns clearer names.
pStormDf = stormDf.loc[:, [
    "key", "date", "title", "content", "url", "fips",
    "source", "description", "onlinesince", "rank",
    "state", "city", "lon", "lat", "county",
    "total_population", "white_pct", "black_pct", "hispanic_pct",
    "nonwhite_pct", "foreignborn_pct", "female_pct",
    "age29andunder_pct", "age65andolder_pct",
    "median_hh_inc", "clf_unemploy_pct",
    "lesshs_pct", "lesscollege_pct",
    "lesshs_whites_pct", "lesscollege_whites_pct",
    "rural_pct", "ruralurban_cc",
    "predrt_0", "predrt_12", "predrt_3",
    "national", "id", "author",
    "published", "published_utc", "collection_utc",
    "20000_90",
]].rename(columns={"key": "articleID", "20000_90": "stormID"})

# Write the cleaned storm table out as a TSV for the public (GitHub) release.
pStormDf.to_csv(
    "/shared/3/projects/newsDiffusion/data/processed/pubData/20000_90_stormData.tsv",
    sep="\t",
)

In [291]:
# Build the per-storm table: one row per stormID, where each remaining column
# holds the list of that storm's values (article IDs, national flags, dates).
stormTab = (
    pStormDf[["stormID", "articleID", "national", "date"]]
    .groupby("stormID")
    .agg(list)
)

# First and last publication date observed within each storm.
stormTab["startDay"] = stormTab["date"].map(min)
stormTab["endDay"] = stormTab["date"].map(max)

In [292]:
# Inclusive duration: a storm that starts and ends on the same day lasts 1 day.
stormTab["duration"] = stormTab["endDay"] - stormTab["startDay"] + pd.Timedelta(days=1)

In [293]:
# Number of articles in each storm.
stormTab["articleCount"] = stormTab["articleID"].map(len)

# Date that occurs most often among the storm's articles.
stormTab["peakDay"] = stormTab["date"].map(
    lambda days: Counter(days).most_common(1)[0][0]
)

# Mean of the storm's "national" values, expressed as a percentage.
stormTab["nationalPercent"] = stormTab["national"].map(
    lambda flags: 100 * np.mean(flags)
)

# Keep only the summary columns, ordered chronologically by storm start.
stormTab = stormTab[
    ["startDay", "peakDay", "duration", "articleCount", "nationalPercent"]
].sort_values("startDay")

In [295]:
# Hand-written description for each storm, one entry per row of stormTab.
# The list is attached to stormTab by position, so the order here must match
# stormTab's row order (sorted by startDay).
# NOTE(review): several entries contain misspellings (e.g. "involvment",
# "Isreali", "ellection", "governer"). They are left untouched here because
# these strings are written verbatim to the exported TSV / LaTeX table.
stormDescriptions = [
    "Boris Johnson's contraction and recovery from COVID-19",
    "Minnesota Gov. Walz gives COVID-19 update",
    "Joe Biden accused of sexual assault",
    "Dallas salon owner jailed for violating COVID restrictions",
    "Mike Pence criticized for not wearing mask to Mayo Clinic",
    "Adam Schiff and 'Russiagate' transcripts",
    "Response to Trump taking hydroxychloroquine",
    "Court trials of officers in George Floyd's murder",
    "Decline and subsequent rise of COVID cases in Minnesota",
    "Minneapolis bans police chokeholds in response to George Floyd's murder",
    "Support for defunding,abolishing police in Minneapolis",
    "Renaming of army bases named after Confederate leaders",
    "Trump attempts to prevent Bolton from publishing his memoir",
    "COVID-19 updates in Minnesota, North Dakota",
    "Mississippi to remove Confederate emblem from flag",
    "Top Manhattan prosecutor fired by Trump",
    "Bubba wallace finds noose in garage; subsequent coverage",
    "Trump pushes states to reopen their schools",
    "Death of 'Glee' actress Naya Rivera",
    "Trump commutes Roger Stone's sentence; ensuing backlash",
    "Major retailers update mask policies",
    "Trumps says he will ban TikTok",
    "Joe Biden selects Kamala Harris as V.P.",
    "Postmaster general's involvment in mail-in voting controversy",
    "TikTok's owner partners with Oracle rather than Microsoft",
    "Controversy over presidential debate after Trump contracts COVID-19",
    "Ruth Bader Ginsburg dies; Trump picks Amy Coney Barrett to replace",
    "Vice-presidential debate 2020",
    "Hurricane Delta",
    "Controversy surrounding potential for Biden to pack the Supreme Court",
    "Amy Coney Barrett's confirmation hearings and confirmation",
    "Controversy surrounding extension of ballot counting period in Minnesota",
    "Biden wins 2020 election",
    "Trump's challenge to 2020 election results in Pennsylvania",
    "Michigan certifies election results, sealing Biden's win",
    "Biden wins Wisconsin recount",
    "SCOTUS rejects Trump and Texas's attempt to overturn election results",
    "Electoral College casts votes",
    "US request to extradite Julian Assange is blocked",
    "Impeachment attempt after January 6",
    "Parler removed from app store",
    "Marjorie Taylor Green ousted from her committees",
    "Coronavirus updates in California; Deaths decline",
    "Horoscopes referencing celebrities",
    "Trump's second impeachment trial",
    "Andrew Cuomo accused of harassment",
    "Trial of Derek Chauvin",
    "Cargo ship blocks Suez Canal",
    "Arkansas bans trans healthcare for youth",
    "Concern over blood clots after Johnson and Johnson vaccine",
    "Liz Cheney ousted from house leadership role",
    "Biden's infrastructure bill",
    "Hacking interferes with Colonial Pipeline",
    "Netanyahu ousted by Isreali coalition",
    "Biden and Putin meet in Geneva",
    "Biden, senate make Juneteenth a federal holiday",
    "Derek Chauvin Sentenced",
    "Extreme heatwave hits Pacific Northwest",
    "Deal reached on Biden's infrastructure bill",
    "Trump organization charged with tax crimes",
    "Tropical Storm Elsa",
    "Miami death toll climbs after Condo collapses",
    "Infrastructure bill passes through senate",
    "Facebook and Biden clash over COVID-19 misinformation",
    "Biden seeks extension for eviction moratorium",
    "Massive California wildfire",
    "Cuomo resigns due to sexual harassment allegations",
    "Wildfires in Greece",
    "Texas, Florida schools clash with governments over mask mandates",
    "Controversy over Biden's Afghanistan withdrawal deadline",
    "COVID-19 updates in Minnesota",
    "Wildfires approach Lake Tahoe",
    "Hurricane Ida",
    "Gavin Newsome wins recall ellection",
    "Controversy over General Mark Milley's communication with China",
    "Wildfires threaten to destroy California's sequoias",
    "Official bodies approve COVID booster",
    "Minnesota COVID-19 updates",
    "Tight governer race in Virginia",
    "Controversy, political response to Texas abortion law",
    "House votes to hold Steve Bannon in contempt over Jan. 6",
    "Chicago Sky wins first WNBA title",
    "Official bodies approve mixing COVID vaccines and boosters",
    "Official bodies approve COVID vaccine for children 5-11",
    "Alec Baldwin kills Halyna Hutchins on set",
    "Judge refuses Trump's request to block Jan. 6 records",
    "Court case of Kyle Rittenhouse",
    "'Unite the Right' trial developments and verdict",
    "Lauren Boebert makes anti-Muslim comments, apologizes",
    "Duante Wright manslaughter trial",
    "CNN fires Chris Cuomo for helping brother",
    "death of Bob Dole",
    "Horoscopes featuring celebrity names",
    "House votes to hold Mark Meadows in contempt",
    "Deadly tornadoes in Kentucky, southeastern US",
    "Coverage of Omicron variant",
    "CDC shortens COVID isolation window",
    "Airlines cancel flights due to COVID staffing shortages",
]

In [244]:
#for getting descriptions
# Storm IDs in stormTab row order, so position i here lines up with
# stormDescriptions[i].
idList = list(stormTab.index)

# Position of the storm to inspect.
stormIndex = 94
# Upper bound on how many titles to show for the storm.
SAMPLE_SIZE = 10
# Fixed seed so re-running the cell shows the same titles;
# set to None to draw a fresh sample on every run.
SAMPLE_SEED = 0

print(stormDescriptions[stormIndex])

stormArticles = pStormDf.loc[pStormDf["stormID"] == idList[stormIndex]]

# Cap the sample at the storm's size: DataFrame.sample raises ValueError when
# asked for more rows than exist (with the default replace=False).
list(
    stormArticles
    .sample(min(SAMPLE_SIZE, len(stormArticles)), random_state=SAMPLE_SEED)
    .sort_values("date")["title"]
)

Deadly tornadoes in Kentucky, southeastern US


['Kentucky hardest hit as storms leave dozens dead in 5 states — VIDEO',
 '100 FEARED DEAD',
 'Kentucky hardest hit as storms leave dozens dead in 5 states',
 'Dozens feared dead as tornadoes tear through Kentucky, southeastern US',
 'Kentucky governor says storms may have killed at least 70 people',
 "'Worst in long time'...",
 'Kentucky Tornado Toll In Dozens; Less Than Feared At Candle Factory',
 'A stretch of more than 250 miles might have been hit by one violent, long-track tornado',
 'Thousands without heat, water after tornadoes kill dozens',
 'At Least 64 People Confirmed Dead in Kentucky After Barrage of Tornadoes']

In [305]:
# Store the duration as text; every other column keeps its dtype.
stormTab = stormTab.astype({"duration": str})

In [306]:
# Attach the hand-written descriptions. The list is matched to rows purely by
# position, so it relies on stormTab still being in its startDay-sorted order.
stormTab = stormTab.assign(description=stormDescriptions)

In [307]:
# Export the per-storm summary table (index = stormID) as a tab-separated file.
stormTab.to_csv(
    path_or_buf="/shared/3/projects/newsDiffusion/data/processed/pubData/stormDescriptions.tsv",
    sep="\t",
)

In [309]:
# Render the per-storm table to LaTeX for the paper draft:
#   - nationalPercent is shown with one decimal place
#   - startDay / peakDay are shown as e.g. "Apr 01, 20" (two-digit year)
#   - hide() is called with its defaults (per the pandas docs this hides the index)
(
    stormTab.style
    .format({
        "nationalPercent": "{:.1f}",
        "startDay": "{:%b %d, %y}",
        "peakDay": "{:%b %d, %y}",
    })
    .hide()
    .to_latex("/home/blitt/projects/localNews/reports/figures/drafts/stormsTable.tex")
)