In [1]:
# Importing necessary modules
import pandas as pd
import nltk
import re
import random

In [2]:
# Setting up notebook to display multiple outputs in one cell
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [3]:
# Reading in the answers file, keeping only awards
answers = pd.read_csv('answers.csv', usecols = ['award'])
answers

Unnamed: 0,award
0,best screenplay - motion picture
1,best director - motion picture
2,best performance by an actress in a television...
3,best foreign language film
4,best performance by an actor in a supporting r...
5,best performance by an actress in a supporting...
6,best motion picture - comedy or musical
7,best performance by an actress in a motion pic...
8,best mini-series or motion picture made for te...
9,best original score - motion picture


In [4]:
# Reading in and processing the gg2013 tweets file
tweets = pd.read_json('gg2013.json')
for i in range(0, len(tweets)):    
    tweets.at[i, 'text'] = re.sub("^RT @[a-zA-Z0-9_]*: ", "", tweets.loc[i]['text']).lower()
tweets

Unnamed: 0,text,user,id,timestamp_ms
0,jlo's dress! #eredcarpet #goldenglobes,"{'screen_name': 'Dozaaa_xo', 'id': 557374298}",290620657987887104,2013-01-14 00:45:38
1,what's making sofia vergara's boobs stay like ...,"{'screen_name': 'theAmberShow', 'id': 14648726}",290620657887219713,2013-01-14 00:45:38
2,kerry washington is everything. dying over her...,"{'screen_name': 'SweetyPW', 'id': 35498686}",290620657828524032,2013-01-14 00:45:38
3,anne hathaway has got me living. #goldenglobes,"{'screen_name': '_NicoleEdwards', 'id': 144430...",290620657799159809,2013-01-14 00:45:38
4,jennifer lopez's lace dress? thoughts? #golden...,"{'screen_name': 'lolaogunnaike', 'id': 134953223}",290620657778188288,2013-01-14 00:45:38
...,...,...,...,...
174638,i was sad that mandy patinkin didn't win #gold...,"{'screen_name': 'dana1204', 'id': 18091543}",290675889379876864,2013-01-14 04:25:07
174639,jennifer lawrence aceptando su premio #goldeng...,"{'screen_name': 'IamTrisEverdeen', 'id': 55126...",290675889128230914,2013-01-14 04:25:07
174640,"golden globes, lots of fashion messes...but gl...","{'screen_name': 'Dpharmakis23', 'id': 852045842}",290675893024747523,2013-01-14 04:25:08
174641,did they have mug shots at the golden globes?!...,"{'screen_name': 'reynaramirez22', 'id': 22732662}",290675888763314178,2013-01-14 04:25:08


In [5]:
# Assigning answer information to variables - eventually we will have to infer this information ourselves
award_name = answers.loc[0]['award']
nominees = answers.loc[0]['nominees']
nominees = nominees.replace('[', '')
nominees = nominees.replace(']', '')
nominees = nominees.split(',')
nominees = [nominee.strip() for nominee in nominees]

shortened_award_name = re.sub(" - .*", "", award_name)

shortened_award_name
nominees

'best screenplay'

['zero dark thirty',
 'lincoln',
 'silver linings playbook',
 'argo',
 'django unchained']

In [6]:
# Searching tweets for the specified regular expression
match_count = {}
for j in range(0, len(tweets)):
    text = tweets.loc[j]['text']
    for i in range(0, len(nominees)):
        regex = nominees[i] + " wins " + shortened_award_name
        match = re.search(regex, text)
        if match:
            if nominees[i] not in match_count:
                match_count[nominees[i]] = 1
            else:
                match_count[nominees[i]] += 1
print(match_count)

{'django unchained': 2}


In [7]:
award_list_split = [] # List of list of keywords in each award split on spaces
award_list_unsplit = [] # List of all of the award names in a single string
nominees_list = [] # List of list of nominees by award

for i in range(0, len(answers)):
    # Add award name and all words in the award to respective lists
    award_name = answers.loc[i]['award']
    award_list_unsplit.append(award_name)
    award_name_list = award_name.split(" ")
    award_list_split.append(award_name_list)

    # Create list of nominees for each award
    nominees = answers.loc[i]['nominees']
    nominees = nominees.replace('[', '')
    nominees = nominees.replace(']', '')
    nominees = nominees.split(',')
    nominees = [nominee.strip() for nominee in nominees]
    nominees_list.append(nominees)

stop_words = ["a", "an", "by", "or", "with", "in", "-", "best", "award", "for", "b."]

award_list_split_updated = [] # Taking out stopwords from award_list_split, produce list of list of keywords
for award in award_list_split:
    award_updated = []
    for word in award:
        if word not in stop_words:
            award_updated.append(word)
    award_list_split_updated.append(award_updated)

# PRINT BELOW TO DEBUG
#award_list_split_updated
#award_list_unsplit
#nominees_list

# Dictionary mapping [award name (unsplit) -> [nominee -> mention count]]
match_count_dict = {}

# Adding award names as keys to the dictionary
for award_name in award_list_unsplit:
    match_count_dict[award_name] = {}

# PRINT BELOW TO DEBUG
#match_count_dict

# Go through each tweet and try and find each nominee, with max mentions indicating the winner
for j in range(0, len(tweets)):
    tweet_list = tweets.loc[j]['text'].split("wins")
    if len(tweet_list) == 2:
        # Tweet has the word "wins"
        tweet_nominees = tweet_list[0] # Left side of the word wins, assumed to contain the name of nominees
        tweet_award = tweet_list[1] # Right side of the word wins, assumed to contain the name of awards
        award_similarities = [] # Metric trying to figure out how similar right side of the word wins is to each award
        curr_award_number = 0

        # Try to identify the award based on the text of tweet_award
        for award in award_list_split_updated: # Loop through awards to get individual lists of keywords
            award_similarities.append(0) # Start the tally at 0
            for word in award: # Look for each of the keywords in the award
                if re.search(word, tweet_award):
                    award_similarities[curr_award_number] += 1 # Add one to the tally because the tweet has the keyword
                    #print("\n THE WORD IS " + word + "\n")
                    #print("\n THE TWEET RHS IS " + tweet_award + "\n")
            curr_award_number += 1

        # At least one award was relevant to the tweet
        if sum(award_similarities) != 0:
            #print(award_similarities)
            # Reset award number count and figure out the index of the award with the max similarity
            curr_award_number = 0 # Reset curr_award_number
            likely_award_number = 0
            likely_award_max = -1
            for award_similarity in award_similarities:
                if award_similarity > likely_award_max:
                    likely_award_max = award_similarity
                    likely_award_number = curr_award_number
                elif award_similarity == likely_award_max: # Handle tie cases
                    if random.randint(0, 1) == 1:
                        likely_award_number = curr_award_number
                curr_award_number += 1

            # Try to identify the nominee based on the nominees for the most likely award
            for nominee in nominees_list[likely_award_number]:
                if re.search(nominee, tweet_nominees):
                    # Nominee name shows up on left side of word wins
                    full_award_name = award_list_unsplit[likely_award_number]
                    #full_award_name
                    if nominee not in match_count_dict[full_award_name]:
                        match_count_dict[full_award_name][nominee] = 1
                    else:
                        match_count_dict[full_award_name][nominee] += 1
    else:
        # Word "wins" not in tweet
        tweet_nominees = ""
        tweet_award = ""
print(match_count_dict)

# Find the nominee winner based on the "votes"
winners = []
for award_key in match_count_dict.keys():
    award_votes = match_count_dict[award_key]
    nominee_winner = ""
    max_votes = 0
    for nominee in award_votes.keys():
        if award_votes[nominee] > max_votes:
            max_votes = award_votes[nominee]
            nominee_winner = nominee
    winners.append(nominee_winner)

for i in range(0, len(award_list_unsplit)):
    print(winners[i] + " wins " + award_list_unsplit[i])

# Adding the winners to the 'answers' DataFrame
answers['winner'] = winners
answers

{'best screenplay - motion picture': {'django unchained': 17}, 'best director - motion picture': {'quentin tarantino': 3, 'ben affleck': 198}, 'best performance by an actress in a television series - comedy or musical': {'lena dunham': 36, 'amy poehler': 1}, 'best foreign language film': {'amour': 41}, 'best performance by an actor in a supporting role in a motion picture': {'christoph waltz': 61}, 'best performance by an actress in a supporting role in a series, mini-series or motion picture made for television': {'maggie smith': 13}, 'best motion picture - comedy or musical': {'les miserables': 47}, 'best performance by an actress in a motion picture - comedy or musical': {'jennifer lawrence': 70}, 'best mini-series or motion picture made for television': {'game change': 1}, 'best original score - motion picture': {'life of pi': 15, 'argo': 1}, 'best performance by an actress in a television series - drama': {'claire danes': 62}, 'best performance by an actress in a motion picture - 

Unnamed: 0,award,nominees,winner
0,best screenplay - motion picture,"[zero dark thirty, lincoln, silver linings pla...",django unchained
1,best director - motion picture,"[kathryn bigelow, ang lee, steven spielberg, q...",ben affleck
2,best performance by an actress in a television...,"[zooey deschanel, tina fey, julia louis-dreyfu...",lena dunham
3,best foreign language film,"[the intouchables, kon tiki, a royal affair, r...",amour
4,best performance by an actor in a supporting r...,"[alan arkin, leonardo dicaprio, philip seymour...",christoph waltz
5,best performance by an actress in a supporting...,"[hayden panettiere, archie panjabi, sarah paul...",maggie smith
6,best motion picture - comedy or musical,"[the best exotic marigold hotel, moonrise king...",les miserables
7,best performance by an actress in a motion pic...,"[emily blunt, judi dench, maggie smith, meryl ...",jennifer lawrence
8,best mini-series or motion picture made for te...,"[the girl, hatfields & mccoys, the hour, polit...",game change
9,best original score - motion picture,"[argo, anna karenina, cloud atlas, lincoln, li...",life of pi
