In [23]:
#importing necessary modules - pandas, nltk, regex, spacy, and RNG
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
import re
import math
import random
import spacy
from spacy import displacy
from spacytextblob.spacytextblob import SpacyTextBlob
from collections import Counter
import en_core_web_sm

nlp = en_core_web_sm.load()
nltk.download("punkt")
nlp.add_pipe('spacytextblob')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/adenweiser/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

<spacytextblob.spacytextblob.SpacyTextBlob at 0x7f87bbfe4370>

In [21]:
# Setting up notebook to display multiple outputs in one cell
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [18]:
# Reading in and processing the gg2013 tweets file
tweets = pd.read_json('gg2013.json')

#takes in a tweet and removes the retweet and any hyperlinks
def clean_tweet(tweet_text):
    retweet_re = "^[rR][tT] @[a-zA-Z0-9_]*: "
    hyperlink_re = "http://[a-zA-Z0-9./-]*"
    hashtag_re = "#[a-zA-Z0-9_]+"
    return re.sub(hyperlink_re, "", re.sub(hashtag_re, "", tweet_text))

#read in tweets and make them nice and clean
for i in range(0, len(tweets)): 
    cleaned_tweet = clean_tweet(tweets.loc[i]['text'])
    tweets.at[i, 'text'] = cleaned_tweet
tweets

# Ceremony Name
ceremony_name = "Golden Globes 2013"

Unnamed: 0,text,user,id,timestamp_ms
0,JLo's dress!,"{'screen_name': 'Dozaaa_xo', 'id': 557374298}",290620657987887104,2013-01-14 00:45:38
1,What's making Sofia Vergara's boobs stay like ...,"{'screen_name': 'theAmberShow', 'id': 14648726}",290620657887219713,2013-01-14 00:45:38
2,RT @FabSugar: Kerry Washington is EVERYTHING. ...,"{'screen_name': 'SweetyPW', 'id': 35498686}",290620657828524032,2013-01-14 00:45:38
3,Anne Hathaway has got me living.,"{'screen_name': '_NicoleEdwards', 'id': 144430...",290620657799159809,2013-01-14 00:45:38
4,Jennifer Lopez's lace dress? Thoughts?,"{'screen_name': 'lolaogunnaike', 'id': 134953223}",290620657778188288,2013-01-14 00:45:38
...,...,...,...,...
174638,RT @authorViviAnna: I was sad that Mandy Patin...,"{'screen_name': 'dana1204', 'id': 18091543}",290675889379876864,2013-01-14 04:25:07
174639,RT @_ItzelMartinez_: Jennifer Lawrence aceptan...,"{'screen_name': 'IamTrisEverdeen', 'id': 55126...",290675889128230914,2013-01-14 04:25:07
174640,"Golden Globes, lots of fashion messes...but gl...","{'screen_name': 'Dpharmakis23', 'id': 852045842}",290675893024747523,2013-01-14 04:25:08
174641,Did they have mug shots at the golden globes?!...,"{'screen_name': 'reynaramirez22', 'id': 22732662}",290675888763314178,2013-01-14 04:25:08


In [4]:
tweets_with_fashion_mention = [] #list of strings containing fashion-related-words
    
def is_fashion_relevant(tweet_text):
    fashion_keywords = ["dress", "outfit", "fashion", "red carpet", "suit", "look", "jewelry", "accessor[y|ies]"]
    for keyword in fashion_keywords:
        if re.search(keyword, tweet_text.lower()):
            return True
    return False

for i in range(0, len(tweets)):
    tweet_text = tweets.loc[i]['text']
    if is_fashion_relevant(tweet_text) and not re.search("^[Rr][Tt]", tweet_text):
        tweets_with_fashion_mention.append(tweet_text)

In [22]:
#TAG ENTITIES
entities_counts = {} #dictionary with key: entity name and value: number of appearances in tweets
entities_clusters = {} #dictionary with key: entity name and value: name of "representative" entity
entities_polarities = {}
parsed_tweets = []
for tweet in tweets_with_fashion_mention:
    parsed_tweet = nlp(tweet)
    parsed_tweets.append(parsed_tweet)
    for entity in parsed_tweet.ents:
        if entity.label_ == "PERSON" and re.match("[a-zA-Z0-9\.'’+-_@/]+", entity.text) and not re.search("\(|\)", entity.text):
            person = entity.text
            if person in entities_clusters:
                entities_polarities[person] = (entities_counts[person] * entities_polarities[person] + parsed_tweet._.blob.polarity) / (entities_counts[person] + 1) 
                entities_counts[person] += 1
            else:
                entities_counts[person] = 1
                entities_polarities[person] = parsed_tweet._.blob.polarity
                entities_clusters[person] = person
#entities_counts

entities_to_remove = set()
for entity_a in entities_clusters:
    #print("\n\n\n ENTITY A: " + entity_a)
    for entity_b in entities_clusters:
        #print("\n ENTITY B: " + entity_b + "\n_____________\n")
        if entities_counts[entity_a] >= entities_counts[entity_b]:
            #entity a is more popular than entity b
            entity_a_tokens = entity_a.split(" ")
            entity_b_tokens = entity_b.split(" ")
            for entity_a_token in entity_a_tokens:
                #print("entity a token: " + entity_a_token)
                if len(entity_a_token) >= len(entity_b) and re.search(entity_a_token, entity_b):
                    #entities are a match! cluster them accordingly
                    #print("clustering: " + entity_b + " in group led by " + entities_clusters[entity_a] + "\n")
                    entities_clusters[entity_b] = entities_clusters[entity_a]
                    entities_to_remove.add(entity_b)
                    break
            for entity_b_token in entity_b_tokens:
                #print("entity b token:" + entity_b_token)
                if len(entity_b_token) >= len(entity_a) and re.search(entity_b_token, entity_a):
                    #entities are a match! cluster them accordingly
                    #print("clustering: " + entity_b + " in group led by " + entities_clusters[entity_a] + "\n")
                    entities_clusters[entity_b] = entities_clusters[entity_a]
                    entities_to_remove.add(entity_b)
                    break
#entities_clusters
#entities_to_remove

#combine counts in the entity_counts dictionary
for non_rep in entities_to_remove:
    if entities_clusters[non_rep] in entities_counts:
        entities_polarities[entities_clusters[non_rep]] = (entities_polarities[entities_clusters[non_rep]] * entities_counts[entities_clusters[non_rep]] + entities_polarities[non_rep] * entities_counts[non_rep])/(entities_counts[entities_clusters[non_rep]] + entities_counts[non_rep])
        entities_counts[entities_clusters[non_rep]] += entities_counts.pop(non_rep)

#entities_polarities
    

{"Jennifer Lopez's": 0.24627976190476192,
 'Hugh Jackman': 0.38830236892736897,
 'Salmon Fishing': 0.01666666666666668,
 'Kerry Washington': 0.4572348451196399,
 'Miu Miu': 0.47812499999999997,
 'Natalie Morales': -0.07499999999999996,
 'Awkward Savannah': -0.39999999999999997,
 "Kerry Washington's": 0.3799251700680272,
 'Jlo': 0.19905222505668935,
 "Jennifer Lawrence's": 0.19928951257076255,
 'Ben Affleck': 0.23850413212159854,
 'George Clooney': 0.31414007092198576,
 'Helen Mirren': 0.36868686868686873,
 'Drag': -0.1,
 'Jennifer Lopez': 0.27201160309996525,
 'Anjelica Huston': 0.0,
 "Sienna Miller's": 0.145,
 'Jennifer Lawrence': 0.2953335634644126,
 'Rachel Weisz': 0.5285714285714286,
 'Kristen Wiig': 0.3849396789965987,
 'Anne Hathaway': 0.34287831926774615,
 "Helen Mirren's": -0.1611111111111111,
 'Yuck': -0.122,
 "Anne Hathaway's": 0.21363510101010108,
 'Kate Hudson': 0.45681283948851964,
 'Marion Cotillard': 0.28396012050653596,
 'Ewan McGregor': 0.3255208333333333,
 'Sienna Mil

In [39]:
vote_distribution = list(entities_counts.values())
average_count = sum(vote_distribution)/len(vote_distribution)
fashion_icons = []
for candidate in entities_counts:
    if entities_counts[candidate] >= average_count:
        fashion_icons.append(candidate)
fashion_icons = sorted(fashion_icons, key = lambda e : entities_counts[e] * entities_polarities[e], reverse = True)
fashion_scores = {}
for icon in fashion_icons:
    fashion_scores[icon] = entities_counts[icon] * entities_polarities[icon]
best_dressed = []
worst_dressed = []
for i in range(0, len(fashion_icons) - 1):
            if fashion_scores[fashion_icons[i + 1]] / fashion_scores[fashion_icons[i]] < 0.75:
                best_dressed = fashion_icons[:i+1]
                break
for i in range(1, len(fashion_icons) - 1):
            if fashion_scores[fashion_icons[-1 * (i + 1)]] >= 0 or fashion_scores[fashion_icons[-1 * (i + 1)]] / fashion_scores[fashion_icons[-1 * i]] < 0.75:
                worst_dressed = fashion_icons[-1*(i+1):]
                break
print("the top " + str(len(best_dressed)) + " fashion icons were: " + str(best_dressed))
print("the top " + str(len(worst_dressed)) + " fashion disasters were: " + str(worst_dressed))

the top 4 fashion icons were: ['Amy', 'Tina Fey', 'Kate Hudson', 'Adele']
the top 3 fashion disasters were: ['Sienna Miller', 'Mel Gibson', 'Glenn Close']


In [35]:
fashion_scores

{'Amy': 138.72201163419916,
 'Tina Fey': 114.93413115530304,
 'Kate Hudson': 88.1648780212843,
 'Adele': 84.66838474025975,
 'Jessica Alba': 57.9738371155754,
 'Jennifer Lawrence': 55.22737636784515,
 '@msleamichele': 55.18151041666667,
 'Anne Hathaway': 51.77462620942967,
 'Taylor Swift': 37.11029784451661,
 'Jodie Foster': 36.42613095238101,
 'Jennifer Garner': 34.572478693181814,
 'Claire Danes': 32.46197691197693,
 'Kerry Washington': 30.634734623015873,
 'Jennifer Lopez': 23.665009469696976,
 'Ben Affleck': 23.373404947916658,
 'Golden Globes': 22.13968712421839,
 'Lucy Liu': 21.080729166666668,
 "Lucy Liu's": 21.067832341269842,
 "Kate Hudson's": 20.843497925685426,
 'Jessica Chastain': 20.21923363095238,
 'Eva Longoria': 19.537554112554112,
 'Julianne Moore': 19.132291666666674,
 'Bradley Cooper': 18.94805555555556,
 'Lea Michele': 16.629375000000003,
 'Kristen Bell': 15.183194444444444,
 'Bill Clinton': 15.107436305014428,
 'George Clooney': 14.76458333333333,
 'Julia Roberts':