In [2]:
# Importing necessary modules
import pandas as pd
import nltk
import re
import random

In [3]:
# Setting up notebook to display multiple outputs in one cell
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [4]:
# Reading in the answers file, keeping only awards and nominees
answers = pd.read_csv('answers.csv', usecols = ['award', 'nominees'])
answers

Unnamed: 0,award,nominees
0,best screenplay - motion picture,"[zero dark thirty, lincoln, silver linings pla..."
1,best director - motion picture,"[kathryn bigelow, ang lee, steven spielberg, q..."
2,best performance by an actress in a television...,"[zooey deschanel, tina fey, julia louis-dreyfu..."
3,best foreign language film,"[the intouchables, kon tiki, a royal affair, r..."
4,best performance by an actor in a supporting r...,"[alan arkin, leonardo dicaprio, philip seymour..."
5,best performance by an actress in a supporting...,"[hayden panettiere, archie panjabi, sarah paul..."
6,best motion picture - comedy or musical,"[the best exotic marigold hotel, moonrise king..."
7,best performance by an actress in a motion pic...,"[emily blunt, judi dench, maggie smith, meryl ..."
8,best mini-series or motion picture made for te...,"[the girl, hatfields & mccoys, the hour, polit..."
9,best original score - motion picture,"[argo, anna karenina, cloud atlas, lincoln, li..."


In [5]:
# Reading in and processing the gg2013 tweets file
tweets = pd.read_json('gg2013.json')
for i in range(0, len(tweets)):    
    tweets.at[i, 'text'] = re.sub("^RT @[a-zA-Z0-9_]*: ", "", tweets.loc[i]['text']).lower()
tweets

Unnamed: 0,text,user,id,timestamp_ms
0,jlo's dress! #eredcarpet #goldenglobes,"{'screen_name': 'Dozaaa_xo', 'id': 557374298}",290620657987887104,2013-01-14 00:45:38
1,what's making sofia vergara's boobs stay like ...,"{'screen_name': 'theAmberShow', 'id': 14648726}",290620657887219713,2013-01-14 00:45:38
2,kerry washington is everything. dying over her...,"{'screen_name': 'SweetyPW', 'id': 35498686}",290620657828524032,2013-01-14 00:45:38
3,anne hathaway has got me living. #goldenglobes,"{'screen_name': '_NicoleEdwards', 'id': 144430...",290620657799159809,2013-01-14 00:45:38
4,jennifer lopez's lace dress? thoughts? #golden...,"{'screen_name': 'lolaogunnaike', 'id': 134953223}",290620657778188288,2013-01-14 00:45:38
...,...,...,...,...
174638,i was sad that mandy patinkin didn't win #gold...,"{'screen_name': 'dana1204', 'id': 18091543}",290675889379876864,2013-01-14 04:25:07
174639,jennifer lawrence aceptando su premio #goldeng...,"{'screen_name': 'IamTrisEverdeen', 'id': 55126...",290675889128230914,2013-01-14 04:25:07
174640,"golden globes, lots of fashion messes...but gl...","{'screen_name': 'Dpharmakis23', 'id': 852045842}",290675893024747523,2013-01-14 04:25:08
174641,did they have mug shots at the golden globes?!...,"{'screen_name': 'reynaramirez22', 'id': 22732662}",290675888763314178,2013-01-14 04:25:08


In [6]:
# Assigning answer information to variables - eventually we will have to infer this information ourselves
award_name = answers.loc[0]['award']
nominees = answers.loc[0]['nominees']
nominees = nominees.replace('[', '')
nominees = nominees.replace(']', '')
nominees = nominees.split(',')
nominees = [nominee.strip() for nominee in nominees]

shortened_award_name = re.sub(" - .*", "", award_name)

shortened_award_name
nominees

'best screenplay'

['zero dark thirty',
 'lincoln',
 'silver linings playbook',
 'argo',
 'django unchained']

In [8]:
# Searching tweets for the specified regular expression
match_count = {}
for j in range(0, len(tweets)):
    text = tweets.loc[j]['text']
    for i in range(0, len(nominees)):
        regex = nominees[i] + " wins " + shortened_award_name
        match = re.search(regex, text)
        if match:
            if nominees[i] not in match_count:
                match_count[nominees[i]] = 1
            else:
                match_count[nominees[i]] += 1
print(match_count)

{'django unchained': 2}


In [38]:

award_list_split = [] #list of list of keywords in each award split on spaces
award_list_unsplit = [] #list of all of the award names in a single string
nominees_list = [] #list of list of nominees by award

for i in range(0, len(answers)):
    #add award name and all words in the award to respective lists
    award_name = answers.loc[i]['award']
    award_list_unsplit.append(award_name)
    award_name_list = award_name.split(" ")
    award_list_split.append(award_name_list)
    
    #create list of nominees for each award
    nominees = answers.loc[i]['nominees']
    nominees = nominees.replace('[', '')
    nominees = nominees.replace(']', '')
    nominees = nominees.split(',')
    nominees = [nominee.strip() for nominee in nominees]
    nominees_list.append(nominees)

stop_words = ["a", "an", "by", "or", "with", "in", "-", "best", "award", "for", "b."]

award_list_split_updated = [] #taking out stopwords from award_list_split, produce list of list of keywords
for award in award_list_split:
    award_updated = []
    for word in award:
        if word not in stop_words:
            award_updated.append(word)
    award_list_split_updated.append(award_updated)

#PRINT BELOW TO DEBUG
#award_list_split_updated
#award_list_unsplit
#nominees_list

#dictionary mapping [award name (unsplit) -> [nominee -> mention count]]
match_count_list = {}

#adding award names as keys to the dictionary
for award_name in award_list_unsplit:
    match_count_list[award_name] = {}
    
#PRINT BELOW TO DEBUG
#match_count_list
    
#go through each tweet and try and find each nominee
for j in range(0, len(tweets)):
    tweet_list = tweets.loc[j]['text'].split("wins")
    if len(tweet_list) == 2:
        #tweet has the word "wins"
        tweet_nominees = tweet_list[0] #left side of the word wins
        tweet_award = tweet_list[1] #right side
        award_similarities = [] #metric trying to figure out how similar right side is to each award
        curr_award_number = 0 
        
        #try to identify the award based on the text
        for award in award_list_split_updated: #loop through awards to get individual lists of keywords
            award_similarities.append(0) #start the tally at 0
            for word in award: #look for each of the keywords in the award
                if re.search(word, tweet_award):
                    award_similarities[curr_award_number] += 1 #add one to the tally because the tweet has the keyword
                    #print("\n THE WORD IS " + word + "\n")
                    #print("\n THE TWEET RHS IS " + tweet_award + "\n")
            curr_award_number += 1
    
       #at least one award was relevant to the tweet
        if sum(award_similarities) != 0:
            print(award_similarities)
            #reset award number count and figure out the index of the award with the max similarity
            curr_award_number = 0 #reset curr_award_number
            likely_award_number = 0
            likely_award_max = -1
            for award_similarity in award_similarities:
                if award_similarity > likely_award_max:
                    likely_award_max = award_similarity
                    likely_award_number = curr_award_number
                elif award_similarity == likely_award_max: #HANDLE TIE CASES
                    if random.randint(0, 1) == 1:
                        likely_award_number = curr_award_number
                curr_award_number += 1


            #try to identify the nominee based on the nominees for the most likely award
            for nominee in nominees_list[likely_award_number]:
                if re.search(nominee, tweet_nominees):
                    #nominee name shows up on left side of word wins
                    full_award_name = award_list_unsplit[likely_award_number]
                    full_award_name
                    if nominee not in match_count_list[full_award_name]:
                        match_count_list[full_award_name][nominee] = 1
                    else:
                        match_count_list[full_award_name][nominee] += 1
            print(match_count_list)            
    else:
        #word "wins" not in tweet
        tweet_nominees = ""
        tweet_award = ""
        
#FIND THE CANDIDATE WINNER BASED ON THE "VOTES"
winners = []
for award_key in match_count_list.keys():
    award_votes = match_count_list[award_key]
    candidate_winner = ""
    max_votes = 0
    for nominee in award_votes.keys():
        if award_votes[nominee] > max_votes:
            max_votes = award_votes[nominee]
            candidate_winner = nominee
    winners.append(candidate_winner)
    
for i in range(0, len(award_list_unsplit)):
    print(winners[i] + " wins " + award_list_unsplit[i])

SyntaxError: invalid syntax (411313854.py, line 76)