In [1]:
#importing necessary modules - pandas, nltk, regex, spacy, and RNG
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
import re
import random
import spacy
from spacy import displacy
from collections import Counter
import en_core_web_sm

# Load spaCy's small English pipeline; `nlp` is called on tweet text further down
# to obtain named entities (`.ents`) and part-of-speech tags (`.pos_`).
nlp = en_core_web_sm.load()
# Fetch NLTK's "punkt" tokenizer data if it is not already present locally.
# NOTE(review): no NLTK tokenizer call is visible in this part of the notebook - confirm it is still needed.
nltk.download("punkt")

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/adenweiser/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
# Setting up notebook to display multiple outputs in one cell
from IPython.core.interactiveshell import InteractiveShell
# With "all", a bare expression anywhere in a cell (e.g. `tweets` in the loading cell)
# is displayed, not only the last expression of the cell.
InteractiveShell.ast_node_interactivity = "all"

In [3]:
# Reading in the gg2013 tweets file (path is relative to the notebook's working directory).
# The 'text' column is cleaned further down in this cell.
tweets = pd.read_json('gg2013.json')

#takes in a tweet and removes any hashtags and hyperlinks (the "RT @user: " prefix is kept)
def clean_tweet(tweet_text):
    """Return ``tweet_text`` with hashtags and http/https hyperlinks removed.

    The leading retweet marker ("RT @user: ") is left in place: the cell that
    builds ``tweets_with_host`` uses the ``^[Rr][Tt]`` prefix to skip retweets,
    so stripping it here would let retweets through that filter.

    Args:
        tweet_text: raw tweet text (str).

    Returns:
        The text with every ``#hashtag`` and every ``http://`` / ``https://``
        link deleted. Surrounding whitespace is not collapsed.
    """
    # "https?" also covers secure links; the original pattern only matched "http://".
    hyperlink_re = "https?://[a-zA-Z0-9./-]*"
    hashtag_re = "#[a-zA-Z0-9_]+"
    without_hashtags = re.sub(hashtag_re, "", tweet_text)
    return re.sub(hyperlink_re, "", without_hashtags)

#read in tweets and make them nice and clean
# Clean the whole 'text' column in one pass. Series.map calls clean_tweet on each
# value, which avoids building a full row Series per tweet (tweets.loc[i]) and
# does not depend on the frame having index labels 0..n-1.
tweets['text'] = tweets['text'].map(clean_tweet)
tweets

# Ceremony Name
ceremony_name = "Golden Globes 2013"

Unnamed: 0,text,user,id,timestamp_ms
0,JLo's dress!,"{'screen_name': 'Dozaaa_xo', 'id': 557374298}",290620657987887104,2013-01-14 00:45:38
1,What's making Sofia Vergara's boobs stay like ...,"{'screen_name': 'theAmberShow', 'id': 14648726}",290620657887219713,2013-01-14 00:45:38
2,RT @FabSugar: Kerry Washington is EVERYTHING. ...,"{'screen_name': 'SweetyPW', 'id': 35498686}",290620657828524032,2013-01-14 00:45:38
3,Anne Hathaway has got me living.,"{'screen_name': '_NicoleEdwards', 'id': 144430...",290620657799159809,2013-01-14 00:45:38
4,Jennifer Lopez's lace dress? Thoughts?,"{'screen_name': 'lolaogunnaike', 'id': 134953223}",290620657778188288,2013-01-14 00:45:38
...,...,...,...,...
174638,RT @authorViviAnna: I was sad that Mandy Patin...,"{'screen_name': 'dana1204', 'id': 18091543}",290675889379876864,2013-01-14 04:25:07
174639,RT @_ItzelMartinez_: Jennifer Lawrence aceptan...,"{'screen_name': 'IamTrisEverdeen', 'id': 55126...",290675889128230914,2013-01-14 04:25:07
174640,"Golden Globes, lots of fashion messes...but gl...","{'screen_name': 'Dpharmakis23', 'id': 852045842}",290675893024747523,2013-01-14 04:25:08
174641,Did they have mug shots at the golden globes?!...,"{'screen_name': 'reynaramirez22', 'id': 22732662}",290675888763314178,2013-01-14 04:25:08


In [5]:
# Texts of all non-retweet tweets whose lower-cased text contains the substring
# "host". This covers "host", "hosts" and "hosting", but also any other word
# containing those letters (e.g. "ghost"), exactly as the original
# "host(s*)" search did.
retweet_prefix_re = re.compile("^[Rr][Tt]")

# Iterate the 'text' column directly instead of tweets.loc[i]['text']: same values,
# without materializing one row Series per tweet.
tweets_with_host = [
    tweet_text
    for tweet_text in tweets['text']
    if "host" in tweet_text.lower() and not retweet_prefix_re.search(tweet_text)
]

In [None]:
#TAG ENTITIES
# Run every host-related tweet through the spaCy pipeline, keep the parsed docs
# for the singular/plural analysis below, and tally how often each PERSON
# entity string appears.
entities_counts = {} #dictionary with key: entity name and value: number of appearances in tweets
entities_clusters = {} #dictionary with key: entity name and value: name of "representative" entity
parsed_tweets = []

# Characters an entity is allowed to start with. The hyphen sits last in the
# class so it is a literal '-'; written as "+-_" it formed a range from '+' to
# '_' that also let through , : ; < = > ? [ \ ] ^.
# re.match only anchors at the start, so this checks the leading character(s),
# not the whole entity (multi-word names contain spaces and must still pass).
person_name_re = re.compile("[a-zA-Z0-9.'’+_@/-]+")

for tweet in tweets_with_host:
    parsed_tweet = nlp(tweet)
    parsed_tweets.append(parsed_tweet)
    for entity in parsed_tweet.ents:
        if entity.label_ != "PERSON" or not person_name_re.match(entity.text):
            continue
        person = entity.text
        entities_counts[person] = entities_counts.get(person, 0) + 1
        # every entity starts out as the representative of its own cluster
        entities_clusters.setdefault(person, person)

#CLUSTER ENTITIES
# Two entities are considered the same person when the full name of one is
# exactly one of the space-separated tokens of the other (e.g. "Tina" and
# "Tina Fey"). The less-mentioned entity joins the cluster of the more-mentioned
# one, and its count is then folded into that cluster's representative.

def _find_cluster_root(entity, clusters, counts):
    """Follow ``clusters`` pointers from ``entity`` to its final representative.

    A representative normally points at itself. If the pointers ever form a
    longer loop, the most-mentioned member of the loop (ties broken by name)
    is chosen, so every member resolves to the same representative.
    """
    visited = []
    current = entity
    while current not in visited:
        visited.append(current)
        current = clusters[current]
    loop_members = visited[visited.index(current):]
    return max(loop_members, key=lambda name: (counts[name], name))


for entity_a in entities_clusters:
    entity_a_tokens = entity_a.split(" ")
    for entity_b in entities_clusters:
        # An entity must not be clustered with itself. Before this guard every
        # single-token entity matched itself, was queued for removal and had
        # its own count added to itself (doubling it) in the merge step.
        if entity_a == entity_b:
            continue
        # only fold the less popular entity (b) into the more popular one (a)
        if entities_counts[entity_a] < entities_counts[entity_b]:
            continue
        # Plain string comparison instead of re.search(token, other): entity
        # text is arbitrary tweet text, so using it as a regex pattern can
        # mis-match ('.') or raise re.error ('(', '+', ...). For literal text
        # the original "token at least as long as the other name and found
        # inside it" test is the same as "token equals the other name".
        if entity_b in entity_a_tokens or entity_a in entity_b.split(" "):
            # last matching entity_a wins, as before
            entities_clusters[entity_b] = entities_clusters[entity_a]

# Flatten chains (b -> a -> c becomes b -> c) so that every entity points at an
# entity that is its own representative. Without this, merging could look up a
# representative whose count had already been popped and raise KeyError.
entities_clusters = {
    entity: _find_cluster_root(entity, entities_clusters, entities_counts)
    for entity in entities_clusters
}
entities_to_remove = {
    entity for entity, leader in entities_clusters.items() if leader != entity
}

#combine counts in the entity_counts dictionary
for non_rep in entities_to_remove:
    entities_counts[entities_clusters[non_rep]] += entities_counts.pop(non_rep)
    

{"Jennifer Lopez's": 21,
 'Hugh Jackman': 18,
 'Salmon Fishing': 3,
 'Kerry Washington': 67,
 'Miu Miu': 8,
 'Natalie Morales': 2,
 'Awkward Savannah': 1,
 "Kerry Washington's": 17,
 'Jlo': 21,
 "Jennifer Lawrence's": 26,
 'Ben Affleck': 66,
 'George Clooney': 45,
 'Helen Mirren': 11,
 'Drag': 1,
 'Jennifer Lopez': 87,
 'Anjelica Huston': 1,
 "Sienna Miller's": 6,
 'Jennifer Lawrence': 187,
 'Rachel Weisz': 3,
 'Kristen Wiig': 28,
 'Anne Hathaway': 151,
 "Helen Mirren's": 3,
 'Yuck': 5,
 "Anne Hathaway's": 35,
 'Kate Hudson': 193,
 'Marion Cotillard': 17,
 'Ewan McGregor': 4,
 'Sienna Miller': 21,
 "Looooove Anne Hathaway's": 1,
 'Audrey': 1,
 'Ricky Gervais': 4,
 'Sofia Vergara': 28,
 "Kate Hudson's": 61,
 'Tina Fey': 179,
 'Amy Poehler': 65,
 'Matt Lauer': 4,
 'Lea Michele': 51,
 'Jessica Chastain': 63,
 'Meryl Streep': 4,
 'Paranorman': 1,
 'Nicole Kidmans': 1,
 'Anne Hathaways': 2,
 'Kristin Wiig': 8,
 'Leonardo Dicaprio': 4,
 'Nicole Kidman': 37,
 'Harry Winston': 2,
 'Halle Berry

entity a token: Jennifer
entity a token: Lopez's
entity b token:Jennifer
entity b token:Lopez's
entity a token: Jennifer
entity a token: Lopez's
entity b token:Hugh
entity b token:Jackman
entity a token: Jennifer
entity a token: Lopez's
entity b token:Salmon
entity b token:Fishing
entity a token: Jennifer
entity a token: Lopez's
entity b token:Miu
entity b token:Miu
entity a token: Jennifer
entity a token: Lopez's
entity b token:Natalie
entity b token:Morales
entity a token: Jennifer
entity a token: Lopez's
entity b token:Awkward
entity b token:Savannah
entity a token: Jennifer
entity a token: Lopez's
entity b token:Kerry
entity b token:Washington's
entity a token: Jennifer
entity a token: Lopez's
entity b token:Jlo
entity a token: Jennifer
entity a token: Lopez's
entity b token:Helen
entity b token:Mirren
entity a token: Jennifer
entity a token: Lopez's
entity b token:Drag
entity a token: Jennifer
entity a token: Lopez's
entity b token:Anjelica
entity b token:Huston
entity a token: Je

In [80]:
#find out if there is one host or multiple hosts
# Each parsed tweet is classified by the first token that is a form of the word
# "host" and is tagged VERB or NOUN:
#   verb "host"  -> plural subject ("they host")   -> vote for multiple hosts
#   verb "hosts" -> singular subject ("she hosts") -> vote for a single host
#   noun "hosts" -> vote for multiple hosts
#   noun "host"  -> vote for a single host
# Other forms ("hosting", "hosted") still classify the tweet as verb/noun but
# cast no vote.

host_as_verb = [] #list of all tweets with host/hosts/hosting as a verb
host_as_noun = [] #list of all tweets with host/hosts as a noun
host_singular = 0 #count of all tweets implying a single host
host_plural = 0 #count of all tweets implying more than one host

# Match the whole token, not a substring: with a plain "host" search, tokens
# such as "ghost" or "hostess" classified the tweet and stopped the scan before
# the actual host word was reached.
host_token_re = re.compile("host(s|ing|ed)?")

for tweet in parsed_tweets:
    for token in tweet:
        word = token.text.lower()
        if not host_token_re.fullmatch(word):
            continue
        if token.pos_ == "VERB":
            host_as_verb.append(tweet)
            if word == "host":
                host_plural += 1
            elif word == "hosts":
                host_singular += 1
            break
        elif token.pos_ == "NOUN":
            host_as_noun.append(tweet)
            if word == "hosts":
                host_plural += 1
            elif word == "host":
                host_singular += 1
            break
        # any other part of speech: keep scanning the rest of the tweet

# Pick the host(s) from the entity mention counts, using the singular/plural
# vote above to decide whether to report one name or several.
if host_singular >= host_plural:
    # Single host: the most-mentioned entity. ">=" means that on a tie the
    # entity appearing later in entities_counts wins.
    likely_host = ""
    likely_host_count = 0
    for candidate, mentions in entities_counts.items():
        if mentions >= likely_host_count:
            likely_host = candidate
            likely_host_count = mentions
    print("The host is " + likely_host)
else:
    vote_distribution = list(entities_counts.values())
    # guard the empty case: no PERSON entities found means no average to compute
    average_count = sum(vote_distribution) / len(vote_distribution) if vote_distribution else 0
    # Rank every candidate by mentions (stable sort keeps dictionary order on ties).
    ranked_candidates = sorted(entities_counts, key=lambda e: entities_counts[e], reverse=True)
    above_average = sum(1 for e in ranked_candidates if entities_counts[e] >= average_count)
    # Keep the above-average candidates, but never fewer than the top two: this
    # branch only runs when the tweets imply more than one host, and a single
    # dominant name used to leave just one entry in the list.
    likely_hosts = ranked_candidates[:max(above_average, 2)]
    #must be at least 2 hosts - start cutoffs at index 1
    # Cut the list at the first place where the next candidate has less than
    # half the mentions of the current one.
    for i in range(1, len(likely_hosts) - 1):
        if entities_counts[likely_hosts[i + 1]] / entities_counts[likely_hosts[i]] < 0.5:
            likely_hosts = likely_hosts[:i+1]
            break
    print("The hosts are " + str(likely_hosts))

The hosts are ['Amy Poehler', 'Tina Fey']
