In [1]:
# Importing necessary modules
import pandas as pd
import nltk
from nltk.cluster import KMeansClusterer
import re
import random
from heapq import nlargest
from gensim.models import Word2Vec
import numpy as np

In [2]:
# Setting up notebook to display multiple outputs in one cell
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [3]:
# Reading in and processing the gg2013 tweets file
tweets = pd.read_json('gg2013.json')

# Subsetting to tweets that are not retweets
no_retweets = []
for j in range(0, len(tweets)):
    text = tweets.loc[j]['text']
    if not re.search("^RT", text):
        no_retweets.append(text.lower())

no_retweets_df = pd.DataFrame({'text': no_retweets})
no_retweets_df

Unnamed: 0,text
0,jlo's dress! #eredcarpet #goldenglobes
1,what's making sofia vergara's boobs stay like ...
2,anne hathaway has got me living. #goldenglobes
3,jennifer lopez's lace dress? thoughts? #golden...
4,podrán criticar a #adele de su moda y su maniq...
...,...
105785,thank god anne hathaway and hugh jackman won f...
105786,ben affleck celebrates his win backstage. #gol...
105787,"golden globes, lots of fashion messes...but gl..."
105788,did they have mug shots at the golden globes?!...


In [13]:
# Searching tweets for the specified regular expression
possible_award_tweets = [] # List of possible tweets mentioning awards
hashtags = {} # Dictionary of hashtags used in tweets
for j in range(0, len(no_retweets_df)):
    text = no_retweets_df.loc[j]['text']
    if re.search("wins best", text) or re.search("nominated for best", text):
        possible_award_tweets.append(text) # Save a tweet if it contains "wins best" or "nominated for best"
        matches = re.finditer("#[a-z0-9]+\s{1}|#[a-z0-9]+$", text) # Among the extracted tweets, pull out and save all the unique hashtags
        for hashtag in matches:
            ht = re.sub("#", "", hashtag.group().strip())
            if ht not in hashtags:
                hashtags[ht] = 1
            else:
                hashtags[ht] += 1

# Keeping the top 10 hashtags
hashtags_list = nlargest(10, hashtags, key = hashtags.get)

# Extracting all possible words of length 4-20 from the top 10 unique hashtags
word_count = 0
position_count = 0
word_length = 4
extracted_word_list = []
while word_count < len(hashtags_list):
    while word_length <= 20:
        while position_count <= len(hashtags_list[word_count]) - word_length:
            extracted_word = ""
            for i in range(0, word_length):
                extracted_word += hashtags_list[word_count][position_count + i]
            extracted_word_list.append(extracted_word)
            position_count += 1
        word_length += 1
        position_count = 0
    word_count += 1
    position_count = 0
    word_length = 4

# Splitting the possible tweets mentioning awards on best, and keeping only the right-hand side
possible_award_names = []
for award in possible_award_tweets:
    split_award = award.split("best")
    split_award_rhs = split_award[1]
    split_award_rhs = "best" + split_award_rhs # Appending 'best' back to the beginning of the right-hand side
    # Cleaning up the possible award names by taking some extra stuff out
    split_award_rhs = re.sub(" for .*", "", split_award_rhs) # Deleting everything that falls after the word "for", which is usually the winner name
    split_award_rhs = re.sub("#.*", "", split_award_rhs) # Deleting everything that falls after a hashtag, since these are usually after the award names
    split_award_rhs = re.sub("@.*", "", split_award_rhs) # Deleting everything that falls after an @
    split_award_rhs = re.sub("\.|!.*", "", split_award_rhs) # Deleting everything that falls after a period or exclamation point, since these are after award names
    split_award_rhs = re.sub(" at .*", "", split_award_rhs) # Deleting everything that falls after "at ", since that is usually followed by the ceremony name
    split_award_rhs = re.sub("http.*", "", split_award_rhs) # Deleting all web addresses
    split_award_rhs = re.sub("congrat.*", "", split_award_rhs) # Deleting everything following "congrats" or "congratulations"
    for word in reversed(extracted_word_list): # Deleting all possible words from the top 10 unique hashtags
        split_award_rhs = re.sub(word, "", split_award_rhs)    
    split_award_rhs = re.sub(",", "-", split_award_rhs) # Changing all commas to dashes
    split_award_rhs = re.sub("-$", "", split_award_rhs) # Deleting all dashes that end lines
    split_award_rhs = re.sub(r'([a-z])(-)', r'\g<1> \g<2>', split_award_rhs) # Adding a space in between a character and dash
    split_award_rhs = re.sub(r'(-)([a-z])', r'\g<1> \g<2>', split_award_rhs) # Adding a space in between a dash and character
    split_award_rhs = re.sub(" +", " ", split_award_rhs) # Changing all multiple spaces to single spaces
    split_award_rhs = split_award_rhs.strip() # Deleting leading and trailing spaces
    split_award_rhs_list = split_award_rhs.split(" ") # Splitting the possible awards names into separate words, generating a list of lists
    possible_award_names.append(split_award_rhs_list)

# Generating word embeddings for each word contained in the possible award names using Word2Vec
embeddings = Word2Vec(possible_award_names, min_count = 1)

# Defining a function that averages the word embeddings for each word comprising each unique possible award name
def create_award_embedding(possible_award, embeddings):
    award_embedding = []
    award_word_number = 0
    for word in possible_award:
        if award_word_number == 0:
            award_embedding = embeddings.wv[word]
        else:
            award_embedding = np.add(award_embedding, embeddings.wv[word])
        award_word_number += 1
    return np.asarray(award_embedding) / award_word_number

# Generating the award-level embeddings for each unique possible award name
award_embeddings = []
for possible_award in possible_award_names:
    award_embeddings.append(create_award_embedding(possible_award, embeddings))

# Clustering the award-level embeddings into 25 clusters using K-means clustering
kmeans = KMeansClusterer(25, distance = nltk.cluster.util.cosine_distance, repeats = 10, avoid_empty_clusters = True)
award_clusters = kmeans.cluster(award_embeddings, assign_clusters = True)

# Joining together the individual words of the possible award names to generate one phrase per award
possible_award_names_clusters = []
for award in possible_award_names:
    full_award_name = ' '.join(award)
    possible_award_names_clusters.append(full_award_name)

# Linking in the assigned award clusters to each award name
award_clusters_dict = {}
for i in range(len(award_clusters)):
    if award_clusters[i] not in award_clusters_dict:
        award_clusters_dict[award_clusters[i]] = []
    award_clusters_dict[award_clusters[i]].append(possible_award_names_clusters[i])

print(award_clusters_dict)    
# NEXT COUNT AWARD NAME OCCURRENCES, AND PICK THE MOST FREQUENT ONE FROM EACH CLUSTER

# Counting the occurrences of all possible award names
# possible_award_names_count = {}
# for award in possible_award_names:
#     if award != "best": # Possible award string is not only the word "best"
#         if award not in possible_award_names_count:
#             possible_award_names_count[award] = 1
#         else:
#             possible_award_names_count[award] += 1
# print(possible_award_names_count)

# Keeping the top 25 mentioned award names
# awards = nlargest(25, possible_award_names_count, key = possible_award_names_count.get)
# print(awards)

{23: ['best actor', 'best actor', 'best actor', 'best actor', 'best supportin actor', 'best actor', 'best actor', 'best actor', 'best actor', 'best actor', 'best actor', 'best actor', 'best actor', 'best actor', 'best actor', 'best actor', 'best actor', 'best actor', 'best actor', 'best actor', 'best actor', 'best actor', 'best actor', 'best actor', 'best actor', 'best actor', "best actor i'm shocked", 'best actor', 'best actor', 'best actor', 'best actor', 'best actor', 'best actor', 'best actor'], 18: ['best movie', "best dressed i'm obsessed with that dress", 'best minies', 'best life', 'best picture?', 'best score', 'best score poor cloud atlas', 'best original score', 'best score', 'best original score', 'best soundtrack score', 'best original score', 'best song:-)&gt;&lt;', 'best score?', 'best speech so far', 'best', 'best animation', 'best song:', 'best pic', 'best actress comedy/musical first win of the night', 'best', 'best screenplay: 2013', 'best screenplaylast won', 'best 

In [15]:
print(possible_award_names_clusters)
print(award_clusters)

print(len(possible_award_names_clusters))
print(len(award_clusters))

print(award_clusters_dict[21])

['best actor', 'best movie', "best dressed i'm obsessed with that dress", 'best supporting actor', 'best supporting actor', 'best supporting actor', 'best supporting', 'best supporting actor', 'best supporting actor', 'best supporting actor', 'best supporting actor', 'best supporting actor did not think that was gonna happen', 'best supporting actor', 'best suppporting actor', 'best supporting actor', 'best supporting actor', "best supporting actor haven't seen yet but i can already guarantee this is definitely deserved", 'best supporting actor', 'best supporting actor - motion picture', 'best supporting actor', 'best supporting actor', 'best supporting actor', 'best supporting actor', 'best supporting actress in a minies', 'best supporting actor', 'best supporting actor', 'best director', 'best supporting actor in unchained\nmaggie smith wins', "best supporting actress - and i'm 1- for -1 (a pretty obvious pick though - so i'm not getting cocky yet)", 'best supporting actor', 'best su

In [5]:
# Adding the award name to the 'answers' DataFrame
answers = pd.DataFrame({'award': awards})
answers

Unnamed: 0,award
0,best director
1,best song
2,best supporting actor
3,best actor
4,best supporting actress
5,best actress
6,best original song
7,best picture
8,best screenplay
9,best picture - drama
