In [1]:
# Importing necessary modules
import pandas as pd
import nltk
import re
import random
from heapq import nlargest

In [2]:
# Setting up notebook to display multiple outputs in one cell
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [3]:
# Reading in and processing the gg2013 tweets file
tweets = pd.read_json('gg2013.json')
#for i in range(0, len(tweets)):    
#    tweets.at[i, 'text'] = re.sub("^RT @[a-zA-Z0-9_]*: ", "", tweets.loc[i]['text']).lower()

# Subsetting to tweets that are not retweets
no_retweets = []
for j in range(0, len(tweets)):
    text = tweets.loc[j]['text']
    if not re.search("^RT", text):
        no_retweets.append(text.lower())

no_retweets_df = pd.DataFrame({'text': no_retweets})
no_retweets_df

Unnamed: 0,text
0,jlo's dress! #eredcarpet #goldenglobes
1,what's making sofia vergara's boobs stay like ...
2,anne hathaway has got me living. #goldenglobes
3,jennifer lopez's lace dress? thoughts? #golden...
4,podr√°n criticar a #adele de su moda y su maniq...
...,...
105785,thank god anne hathaway and hugh jackman won f...
105786,ben affleck celebrates his win backstage. #gol...
105787,"golden globes, lots of fashion messes...but gl..."
105788,did they have mug shots at the golden globes?!...


In [4]:
# Searching tweets for the specified regular expression
possible_award_tweets = [] # List of possible tweets mentioning awards
#for j in range(0, len(tweets)):
#    text = tweets.loc[j]['text']
for j in range(0, len(no_retweets_df)):
    text = no_retweets_df.loc[j]['text']
    if re.search("wins best", text) or re.search("nominated for best", text):
        possible_award_tweets.append(text) # Save a tweet if it contains "wins best" or "nominated for best"

# Splitting the possible tweets mentioning awards on best, and keeping only the right-hand side
possible_award_names = []
for award in possible_award_tweets:
    split_award = award.split("best")
    split_award_rhs = split_award[1]
    split_award_rhs = "best" + split_award_rhs # Appending 'best' back to the beginning of the right-hand side
    # Cleaning up the possible award names by taking some extra stuff out
    split_award_rhs = re.sub(" for .*", "", split_award_rhs) # Deleting everything that falls after the word "for", which is usually the winner name
    split_award_rhs = re.sub("#.*", "", split_award_rhs) # Deleting everything that falls after a hashtag, since these are usually after the award names
    split_award_rhs = re.sub("@.*", "", split_award_rhs) # Deleting everything that falls after an @
    split_award_rhs = re.sub("\.|!.*", "", split_award_rhs) # Deleting everything that falls after a period or exclamation point, since these are after award names
    split_award_rhs = re.sub(" at .*", "", split_award_rhs) # Deleting everything that falls after "at ", since that is usually followed by the ceremony name
    split_award_rhs = re.sub("http.*", "", split_award_rhs) # Deleting all web addresses
    split_award_rhs = re.sub("congrat.*", "", split_award_rhs) # Deleting everything following "congrats" or "congratulations"
    split_award_rhs = re.sub(",", "-", split_award_rhs) # Changing all commas to dashes
    split_award_rhs = re.sub("-$", "", split_award_rhs) # Deleting all dashes that end lines
    split_award_rhs = re.sub(r'([a-z])(-)', r'\g<1> \g<2>', split_award_rhs) # Adding a space in between a character and dash
    split_award_rhs = re.sub(r'(-)([a-z])', r'\g<1> \g<2>', split_award_rhs) # Adding a space in between a dash and character
    split_award_rhs = split_award_rhs.strip() # Deleting leading and trailing spaces
    possible_award_names.append(split_award_rhs)

# Counting the occurrences of all possible award names
possible_award_names_count = {}
for award in possible_award_names:
    if award != "best": # Possible award string is not only the word "best"
        if award not in possible_award_names_count:
            possible_award_names_count[award] = 1
        else:
            possible_award_names_count[award] += 1
print(possible_award_names_count)

# Keeping the top 25 mentioned award names
awards = nlargest(25, possible_award_names_count, key = possible_award_names_count.get)
print(awards)

{'best actor': 32, 'best movie': 1, "best dressed i'm obsessed with that dress": 1, 'best supporting actor': 26, 'best supporting': 1, 'best supporting actor did not think that was gonna happen': 1, 'best suppporting actor': 1, "best supporting actor haven't seen django yet but i can already guarantee this is definitely deserved": 1, 'best supporting actor - motion picture': 1, 'best supporting actress in a miniseries': 1, 'best director': 53, 'best supporting actor in django unchained\nmaggie smith wins': 1, "best supporting actress - and i'm 1- for -1 (a pretty obvious pick though - so i'm not getting cocky yet)": 1, 'best miniseries': 2, 'best mini - series or tv movie': 5, 'best miniseries or tv movie': 1, 'best tv movie?': 1, 'best mini - series or motion picture made': 1, 'best mini series/ tv movie': 1, 'best tv movie': 2, 'best mini - series': 2, 'best tv mini series/drama': 1, 'best tv movie/mini - series definitely not surprised': 1, 'best mini series or tv movie': 1, 'best m

In [5]:
# Adding the award name to the 'answers' DataFrame
answers = pd.DataFrame({'award': awards})
answers

Unnamed: 0,award
0,best director
1,best song
2,best actor
3,best supporting actress
4,best supporting actor
5,best actress
6,best original song
7,best picture
8,best screenplay
9,best picture - drama
