In [14]:
#importing necessary modules - pandas, nltk, regex, spacy, and RNG
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
import re
import random
import spacy
from spacy import displacy
from collections import Counter
import en_core_web_sm

nlp = en_core_web_sm.load()
nltk.download("punkt")

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/adenweiser/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [15]:
# Setting up notebook to display multiple outputs in one cell
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [16]:
# Reading in and processing the gg2013 tweets file
tweets = pd.read_json('gg2013.json')

#takes in a tweet and removes the retweet and any hyperlinks
def clean_tweet(tweet_text):
    retweet_re = "^[rR][tT] @[a-zA-Z0-9_]*: "
    hyperlink_re = "http://[a-zA-Z0-9./-]*"
    return re.sub(retweet_re, "", re.sub(hyperlink_re, "", tweet_text))

#read in tweets and make them nice and clean
for i in range(0, len(tweets)):    
    tweets.at[i, 'text'] = clean_tweet(tweets.loc[i]['text'])
tweets

# Ceremony Name
ceremony_name = "Golden Globes 2013"

Unnamed: 0,text,user,id,timestamp_ms
0,JLo's dress! #eredcarpet #GoldenGlobes,"{'screen_name': 'Dozaaa_xo', 'id': 557374298}",290620657987887104,2013-01-14 00:45:38
1,What's making Sofia Vergara's boobs stay like ...,"{'screen_name': 'theAmberShow', 'id': 14648726}",290620657887219713,2013-01-14 00:45:38
2,Kerry Washington is EVERYTHING. Dying over her...,"{'screen_name': 'SweetyPW', 'id': 35498686}",290620657828524032,2013-01-14 00:45:38
3,Anne Hathaway has got me living. #GoldenGlobes,"{'screen_name': '_NicoleEdwards', 'id': 144430...",290620657799159809,2013-01-14 00:45:38
4,Jennifer Lopez's lace dress? Thoughts? #Golden...,"{'screen_name': 'lolaogunnaike', 'id': 134953223}",290620657778188288,2013-01-14 00:45:38
...,...,...,...,...
174638,I was sad that Mandy Patinkin didn't win #Gold...,"{'screen_name': 'dana1204', 'id': 18091543}",290675889379876864,2013-01-14 04:25:07
174639,Jennifer Lawrence aceptando su premio #GoldenG...,"{'screen_name': 'IamTrisEverdeen', 'id': 55126...",290675889128230914,2013-01-14 04:25:07
174640,"Golden Globes, lots of fashion messes...but gl...","{'screen_name': 'Dpharmakis23', 'id': 852045842}",290675893024747523,2013-01-14 04:25:08
174641,Did they have mug shots at the golden globes?!...,"{'screen_name': 'reynaramirez22', 'id': 22732662}",290675888763314178,2013-01-14 04:25:08


In [18]:
tweets_with_host = [] #list of strings containing the word "host", "hosts", or "hosting"
    
for i in range(0, len(tweets)):
    tweet_text = tweets.loc[i]['text']
    if re.search("host(s*)", tweet_text.lower()):
        tweets_with_host.append(tweet_text)

['Looking forward to watching Tina Fey and Amy Poehler host the #GoldenGlobes',
 "It's our hosts Tina Fey and Amy Poehler! #goldenglobes #redcarpet ",
 "Tonight's dual hosting duties represent the culmination of a decade of Amy and Tina partnerships.  #GoldenGlobes",
 'My green suede tuxedo pinching a bit here at the Velvet Rope Awards honoring best in crowd control. Topo Gigio + I hosting #GoldenGlobes',
 'Tina Fey &amp; Amy Poehler Talk #GoldenGlobes Hosting, Drinking Game  ',
 '@goldenglobes Best choice for host ever.  Nice job GG people.',
 '“We’re going to keep things loose,” said Amy Poehler of her and co-host Tina Fey’s plan for the evening. #GoldenGlobes ',
 '#GoldenGlobes hosts Tina Fey, Amy Poehler show off matching "husband and wife" outfits on red carpet ',
 '#GoldenGlobes hosts Tina Fey, Amy Poehler show off matching "husband and wife" outfits on red carpet ',
 "Tonight's dual hosting duties represent the culmination of a decade of Amy and Tina partnerships.  #GoldenGlobes

In [25]:
entities = {}
for tweet in tweets_with_host:
    parsed_tweet = nlp(tweet)
    for entity in parsed_tweet.ents:
        if entity.label_ == "PERSON":
            person = entity.text
            if person in entities:
                entities[person] += 1
            else:
                entities[person] = 1
entities

{'Tina Fey': 527,
 'Amy Poehler': 614,
 'Amy': 518,
 'GoldenGlobes': 1522,
 'Tina Fey’s': 2,
 'Frederic J. Brown': 3,
 'Rolonda Watts': 1,
 'Dalia MacPhee': 1,
 'Jay Leno': 6,
 'Kelly': 1,
 "Amy Poehler's": 15,
 "Jay Leno's": 1,
 '\ue106\ue106\ue106': 1,
 'Amy Pohler': 20,
 'Tina Fey + Amy Poehler': 11,
 'SGS': 1,
 'Ryan Seacrest': 2,
 'TinaFey': 12,
 'Golden Globes': 28,
 "Giuliana Rancic's": 1,
 'Golden': 1,
 'iloveamypoehler': 1,
 "Glenn Close's": 1,
 'Amy Polher': 1,
 'Norberry': 1,
 'Tina': 144,
 "Tina Fey's": 3,
 'Amy Poehler - @BostonCollege': 3,
 'Amy Poeler': 3,
 'Amy Pholer &': 1,
 'Tiny Fay': 1,
 'Amy hostin': 1,
 'omg tina': 1,
 'Jamas': 1,
 'gustado tina': 1,
 'Bill Murray': 3,
 'Globes': 6,
 'tinafey': 1,
 'Tina Faye': 1,
 'Ricky Gervais': 15,
 'Tina Fey&amp;Amy Poehler': 1,
 'Tina Fey RT': 1,
 'Oscar': 6,
 'Hathaway': 2,
 'Franko': 1,
 'LoveEm': 1,
 'James Franco': 5,
 'Queen Amy': 1,
 'Queen Tina': 1,
 'god': 1,
 'James Cameron': 8,
 'Omg Amy Poehler': 2,
 'Tina Fay': 4

<re.Match object; span=(0, 4), match='host'>