In [1]:
# Importing necessary modules
import pandas as pd
import nltk
import re
import random

In [2]:
# Setting up notebook to display multiple outputs in one cell.
# With ast_node_interactivity set to "all", the value of every top-level
# expression in a cell is displayed, not just the last one.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [3]:
# Reading in the answers file, keeping only the 'award' and 'nominees' columns
# (any other columns in answers.csv are not loaded).
answers = pd.read_csv('answers.csv', usecols = ['award', 'nominees'])
# Bare expression so the notebook renders the loaded frame.
answers

Unnamed: 0,award,nominees
0,best screenplay - motion picture,"[zero dark thirty, lincoln, silver linings pla..."
1,best director - motion picture,"[kathryn bigelow, ang lee, steven spielberg, q..."
2,best performance by an actress in a television...,"[zooey deschanel, tina fey, julia louis-dreyfu..."
3,best foreign language film,"[the intouchables, kon tiki, a royal affair, r..."
4,best performance by an actor in a supporting r...,"[alan arkin, leonardo dicaprio, philip seymour..."
5,best performance by an actress in a supporting...,"[hayden panettiere, archie panjabi, sarah paul..."
6,best motion picture - comedy or musical,"[the best exotic marigold hotel, moonrise king..."
7,best performance by an actress in a motion pic...,"[emily blunt, judi dench, maggie smith, meryl ..."
8,best mini-series or motion picture made for te...,"[the girl, hatfields & mccoys, the hour, polit..."
9,best original score - motion picture,"[argo, anna karenina, cloud atlas, lincoln, li..."


In [4]:
# Reading in the gg2013 tweets file and keeping only tweets that are not retweets.
tweets = pd.read_json('gg2013.json')

# A tweet counts as a retweet when its text starts with "RT" (same rule as the
# former re.search("^RT", ...) check). The test is vectorized over the whole
# column instead of building a row Series for every tweet.
is_retweet = tweets['text'].str.startswith('RT')

# Lower-case the surviving texts so later matching against the (lower-case)
# award and nominee names is case-insensitive.
no_retweets = tweets.loc[~is_retweet, 'text'].str.lower().tolist()

# Rebuild a frame from the plain list so it gets a fresh 0..n-1 index and a
# single 'text' column.
no_retweets_df = pd.DataFrame({'text': no_retweets})
no_retweets_df

Unnamed: 0,text
0,jlo's dress! #eredcarpet #goldenglobes
1,what's making sofia vergara's boobs stay like ...
2,anne hathaway has got me living. #goldenglobes
3,jennifer lopez's lace dress? thoughts? #golden...
4,podrán criticar a #adele de su moda y su maniq...
...,...
105785,thank god anne hathaway and hugh jackman won f...
105786,ben affleck celebrates his win backstage. #gol...
105787,"golden globes, lots of fashion messes...but gl..."
105788,did they have mug shots at the golden globes?!...


In [5]:
# Infer the winner of each award from tweets of the form
# "<nominee ...> wins <award ...>": the text after "wins" is matched against
# award keywords, the text before it against that award's nominees, and the
# nominee with the most matching tweets is reported as the winner.

# Seed for the tie-breaking RNG so that Restart & Run All reproduces the same
# winners. Change the value to explore how sensitive the result is to ties.
TIE_BREAK_SEED = 0

award_list_split = []    # Per award: list of every space-separated word in its name
award_list_unsplit = []  # Per award: the full award name as a single string
nominees_list = []       # Per award: list of nominee names

# Iterate the two columns directly rather than building a row Series per index.
for award_name, nominees_raw in zip(answers['award'], answers['nominees']):
    award_list_unsplit.append(award_name)
    award_list_split.append(award_name.split(" "))

    # The nominees cell is a bracketed, comma-separated string.
    nominees = nominees_raw.replace('[', '').replace(']', '').split(',')
    # Drop empty names (e.g. from "[]"): an empty string is "found" in every
    # tweet and would collect bogus votes.
    nominees_list.append([name.strip() for name in nominees if name.strip()])

stop_words = ["a", "an", "by", "or", "with", "in", "-", "best", "award", "for", "b."]

# Award keywords with stop words removed. Empty tokens (from repeated spaces)
# are dropped as well, since an empty keyword matches every tweet.
award_list_split_updated = [
    [word for word in award_words if word and word not in stop_words]
    for award_words in award_list_split
]

# Dictionary mapping [award name (unsplit) -> [nominee -> mention count]]
match_count_dict = {award_name: {} for award_name in award_list_unsplit}

# Dedicated RNG instance: reproducible, and does not disturb the global
# `random` state used elsewhere.
tie_breaker = random.Random(TIE_BREAK_SEED)

# Go through each tweet and tally nominee mentions for the most likely award.
for tweet_text in no_retweets_df['text']:
    tweet_list = tweet_text.split("wins")
    if len(tweet_list) != 2:
        # Only tweets containing "wins" exactly once are considered.
        continue

    # Left of "wins" is assumed to name the nominee, right of it the award.
    tweet_nominees, tweet_award = tweet_list

    # Similarity of the right-hand side to each award = number of that award's
    # keywords it contains. Keywords are matched as literal substrings, not as
    # regular expressions, so punctuation in a keyword cannot change the match.
    award_similarities = [
        sum(1 for word in keywords if word in tweet_award)
        for keywords in award_list_split_updated
    ]

    best_similarity = max(award_similarities, default=0)
    if best_similarity == 0:
        # No award keyword appears in the tweet.
        continue

    # Pick uniformly among the awards sharing the top similarity, so no award
    # is favoured merely by its position in the list.
    tied_awards = [
        award_number
        for award_number, similarity in enumerate(award_similarities)
        if similarity == best_similarity
    ]
    likely_award_number = tie_breaker.choice(tied_awards)

    # Credit every nominee of that award whose name appears left of "wins"
    # (again a literal substring test).
    full_award_name = award_list_unsplit[likely_award_number]
    award_votes = match_count_dict[full_award_name]
    for nominee in nominees_list[likely_award_number]:
        if nominee in tweet_nominees:
            award_votes[nominee] = award_votes.get(nominee, 0) + 1
print(match_count_dict)

# Find the nominee winner based on the "votes". Iterating award_list_unsplit
# (not the dict keys) keeps `winners` aligned row-for-row with `answers`, even
# if an award name appears more than once. Awards with no votes get "".
winners = []
for award_name in award_list_unsplit:
    award_votes = match_count_dict[award_name]
    # max() returns the first nominee reaching the top count, i.e. the
    # earliest-seen nominee wins a vote tie.
    winners.append(max(award_votes, key=award_votes.get) if award_votes else "")

for winner, award_name in zip(winners, award_list_unsplit):
    print(winner + " wins " + award_name)

# Adding the winners to the 'answers' DataFrame
answers['winner'] = winners
answers

{'best screenplay - motion picture': {'django unchained': 7}, 'best director - motion picture': {'quentin tarantino': 1, 'ben affleck': 63}, 'best performance by an actress in a television series - comedy or musical': {'lena dunham': 18, 'amy poehler': 1}, 'best foreign language film': {'amour': 16}, 'best performance by an actor in a supporting role in a motion picture': {'christoph waltz': 23}, 'best performance by an actress in a supporting role in a series, mini-series or motion picture made for television': {'maggie smith': 2}, 'best motion picture - comedy or musical': {'les miserables': 9}, 'best performance by an actress in a motion picture - comedy or musical': {'jennifer lawrence': 15}, 'best mini-series or motion picture made for television': {}, 'best original score - motion picture': {'life of pi': 7, 'argo': 1}, 'best performance by an actress in a television series - drama': {'claire danes': 16}, 'best performance by an actress in a motion picture - drama': {'jessica cha

Unnamed: 0,award,nominees,winner
0,best screenplay - motion picture,"[zero dark thirty, lincoln, silver linings pla...",django unchained
1,best director - motion picture,"[kathryn bigelow, ang lee, steven spielberg, q...",ben affleck
2,best performance by an actress in a television...,"[zooey deschanel, tina fey, julia louis-dreyfu...",lena dunham
3,best foreign language film,"[the intouchables, kon tiki, a royal affair, r...",amour
4,best performance by an actor in a supporting r...,"[alan arkin, leonardo dicaprio, philip seymour...",christoph waltz
5,best performance by an actress in a supporting...,"[hayden panettiere, archie panjabi, sarah paul...",maggie smith
6,best motion picture - comedy or musical,"[the best exotic marigold hotel, moonrise king...",les miserables
7,best performance by an actress in a motion pic...,"[emily blunt, judi dench, maggie smith, meryl ...",jennifer lawrence
8,best mini-series or motion picture made for te...,"[the girl, hatfields & mccoys, the hour, polit...",
9,best original score - motion picture,"[argo, anna karenina, cloud atlas, lincoln, li...",life of pi
