In [5]:
import pandas as pd
import altair as alt

In [6]:
# Run the scraping script inside this kernel so the briefings are saved as a CSV.
# NOTE(review): presumably this writes the file that the next cell reads — confirm against scrape_briefings.py.
%run scrape_briefings.py

In [7]:
# Load the scraped briefings CSV into a DataFrame and preview it.
raw_briefings_path = '../data/all_briefings.csv'
briefings_df = pd.read_csv(raw_briefings_path)
briefings_df

Unnamed: 0,date,timestamp,speaker,text
0,2020-02-26,05:39,Donald Trump,Thank you very much everybody. Thank you very ...
1,2020-02-26,06:59,Donald Trump,A lot of people thought we shouldn’t have done...
2,2020-02-26,07:51,Donald Trump,We have a total of 15. We took in some from Ja...
3,2020-02-26,09:58,Donald Trump,China you know about. Where it started. I spok...
4,2020-02-26,10:52,Donald Trump,"We’re bringing in a specialist, a very highly ..."
...,...,...,...,...
9677,2020-04-27,01:01:23,Reporters,[crosstalk 00:13:23].
9678,2020-04-27,01:01:24,Donald Trump,"Let’s do one more. Please, in the back."
9679,2020-04-27,01:01:26,Speaker 16,If an American president loses more Americans ...
9680,2020-04-27,01:01:36,Donald Trump,"So yeah, we’ve lost a lot of people, but if yo..."


### Checking for null values:

In [8]:
# Show the rows whose transcript text is missing.
missing_text = briefings_df['text'].isna()
briefings_df.loc[missing_text]

Unnamed: 0,date,timestamp,speaker,text
7217,2020-04-13,16:03,Donald Trump,
8339,2020-04-19,23:03,Andrew Cuomo,
9181,2020-04-22,01:15:03,Dr. Birx,


Manually comparing with the full transcripts and videos, we can see these correspond to the speaker being cut off or uttering something inaudible. Let's simply drop these three rows.

In [9]:
# Keep only rows that have transcript text, then renumber the index from 0.
has_text = briefings_df['text'].notna()
briefings_df = briefings_df.loc[has_text].reset_index(drop=True)

### Cleaning up speaker names

In [10]:
# Paragraph counts for the 50 most frequent speaker labels (before cleanup).
speaker_counts = briefings_df['speaker'].value_counts()
speaker_counts.head(50)

Donald Trump              4013
Mike Pence                 751
Dr. Birx                   244
Dr. Fauci                  237
John                       161
Deborah Birx               101
Admiral Giroir              94
Dr. Deborah Birx            94
Speaker 8                   93
Speaker 7                   92
Speaker 5                   88
Steve Mnuchin               84
Speaker 11                  82
Speaker 12                  82
Speaker 22                  79
Speaker 3                   78
Speaker 9                   76
Speaker 10                  76
Speaker 6                   73
Speaker 4                   73
Speaker 13                  72
Speaker 14                  68
Speaker 2                   65
Speaker 19                  64
Speaker 15                  61
Speaker 16                  60
Mike Pompeo                 59
Reporter                    58
Jeff                        56
Vice President Pence        53
Speaker 18                  52
Speaker 23                  52
Speaker 

Looking at the output above and inspecting the data in more detail we can see there's opportunity to:
- Consolidate numbered unnamed speakers and reporters, since numbering is not consistent across briefings
- Note specific recurring reporters who are referred to by first name
- Standardize the name used for other important speakers who have multiple spellings/titles/variations present

In [11]:
# Standardise speaker labels with regex substitutions.
#
# The result is assigned back to the column instead of calling
# `briefings_df['speaker'].replace(..., inplace=True)`: an in-place method on a
# column selection is chained assignment, which pandas warns about and which
# stops updating `briefings_df` once Copy-on-Write is enabled.
#
# Patterns ending in `\Z` only match when the first name is the last thing in
# the label (e.g. 'Steve' matches, 'Steve Mnuchin' does not).
#
# Run this cell once per load: output labels such as 'Jeff (Reporter)'
# themselves match `.*Report.*`, so re-running it on already-cleaned data
# would relabel them as 'Unnamed (Reporter)'.
speaker_patterns = {
    # principal speakers appearing under several titles/variations
    r'.*Trump.*': 'Donald Trump',
    r'.*Pence.*': 'Mike Pence',
    r'.*Fauci.*': 'Dr. Anthony Fauci',
    r'.*Birx.*': 'Dr. Deborah Birx',
    r'.*Berks.*': 'Dr. Deborah Birx',  # presumably a transcript misspelling of Birx
    r'.*Pompeo.*': 'Mike Pompeo',
    # unnamed / numbered speakers (numbering is not consistent across briefings)
    r'.*Report.*': 'Unnamed (Reporter)',
    r'.*Audience Member.*': 'Unnamed',
    r'.*Speaker .*': 'Unnamed',
    # recurring reporters referred to by first name only
    r'.*Jeff\Z': 'Jeff (Reporter)',
    r'.*John\Z': 'John (Reporter)',
    r'.*Peter\Z': 'Peter (Reporter)',
    r'.*Jim\Z': 'Jim (Reporter)',
    r'.*Steve\Z': 'Steve (Reporter)',
    # other officials
    r'.*Pete\Z': 'Pete Gaynor',
    r'.*Novarro.*': 'Peter Navarro',  # presumably a transcript misspelling of Navarro
    r'.*Surgeon General.*': 'Jerome Adams',
    r'.*Giroir.*': 'Brett Giroir',
    r'.*Polowczyk.*': 'John Polowczyk',
    r'.*Verma.*': 'Seema Verma',
    r'.*Azar.*': 'Alex Azar',
    # NOTE(review): this first name is usually spelled "Stephen"; label kept
    # as-is so anything keyed on the existing value is unaffected — confirm
    # before changing.
    r'.*Hahn.*': 'Dr. Steven Hahn',
    r'.*Mnuchin.*': 'Steve Mnuchin',
}
briefings_df['speaker'] = briefings_df['speaker'].replace(regex=speaker_patterns)

In [12]:
# Paragraph counts for the 25 most frequent speakers after cleanup.
top_speaker_counts = briefings_df['speaker'].value_counts().head(25)
top_speaker_counts

Donald Trump          4091
Unnamed               2038
Mike Pence             849
Dr. Deborah Birx       491
Dr. Anthony Fauci      317
Unnamed (Reporter)     215
John (Reporter)        162
Steve Mnuchin          136
Brett Giroir           100
Mike Pompeo             81
Dr. Steven Hahn         64
Alex Azar               63
Jeff (Reporter)         56
Seema Verma             56
John Polowczyk          49
Jerome Adams            48
Jim (Reporter)          47
Steve (Reporter)        46
Peter Navarro           35
Peter (Reporter)        33
Kelly                   23
Jared Kushner           23
General Semonite        23
Mark Esper              22
Bill Bryan              19
Name: speaker, dtype: int64

In [13]:
# Write the cleaned DataFrame to disk, omitting the index.
cleaned_briefings_path = "../data/cleaned_briefings.csv"
briefings_df.to_csv(cleaned_briefings_path, index=False)