In [35]:
# Importing necessary modules
import spacy
import json
import re
import pandas as pd
import nltk
import wikipediaapi as wk # search for someone on wikipedia returning true when an entity exists
from heapq import nlargest
import time # start_time - time.time() test
from difflib import SequenceMatcher as smt # matching similarities between target and other entities

In [36]:
# Importing gg2013.json and returning as a list with duplicates removed
def load_json():
    with open("./gg2013.json") as f:
        json_obj = json.load(f)

    return list(set([ content["text"] for content in json_obj ]))

In [37]:
#Importing nltk for tokenization
def nltk_download():
    nltk.download('punkt')
    nltk.download('averaged_perceptron_tagger')

In [38]:
# Deleting http, amp, hashtag, mentioned tag, emojis, etc. using regex 
def cleansing_regex(str_list):

    '''
    Delete http, amp, hashtag, mentored tag, emoji, etc. using regex.
    :param str_list: List [ all text ]
    :return: List [ all text ] Some patterns have been deleted.
    '''

    http_pattern = re.compile("(\w+:\/\/\S+)")
    hash_pattern = re.compile("(#[A-Za-z0-9_]+)")

    amp_pattern = re.compile("&([0-9a-zA-Z]+)")
    tag_pattern = re.compile("(@[A-Za-z0-9_]+)")
    rt_pattern = re.compile("RT @[a-zA-Z0-9_]+: ")
    rt_pattern_2 = re.compile("RT")

    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoji
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags
                               u"\U00002702-\U000027B0"
                               u"\U000024C2-\U0001F251"
                               "]+", flags=re.UNICODE)

    return_txt_list = []

    for i in str_list:
        v1 = re.sub(http_pattern, "", i)
        v2 = re.sub(hash_pattern, "", v1)
        v3 = re.sub(amp_pattern, "", v2)
        v4 = re.sub(rt_pattern, "", v3)
        v5 = re.sub(tag_pattern, "", v4)
        v6 = re.sub(emoji_pattern, "", v5)
        v7 = re.sub(rt_pattern_2, "", v6)
        v8 = re.sub(r"[^a-zA-Z_ ]", "", v7)

        if len(v8) > 2:
            return_txt_list.append(v8.strip())

    return list(set(return_txt_list))

In [39]:
# Identifying keywords and finding sentences where a nominee could be associated with an award; 
# Keyword 1 indicates who could be a nominee and keyword 2 maps a potential award
def cleansing_keyword(str_list):
    keywords_1 = ["nominated", "nominee", "nominate", "nomination", "nominees", "choose", "chosen",
                  "designate", "recommend", "select", "named", "deserve", "elect", "delegated",
                  "assigned", "promoted", "presented"]
    # "win", "wins", "winner", "awarded"
    keywords_2 = ["best"]

    return [ txt for txt in str_list if any(keyword in txt.lower() for keyword in keywords_1)
             and any(keyword in txt.lower() for keyword in keywords_2)]

In [40]:
# Fixing unncessarily capitalized expressions to uppercase at the beginning of the word 
def cleansing_capitalize(str_list):
    ret_list = []

    for txt in str_list:
        rev_txt = " ".join([ i for i in txt.split(" ") if i != "" ])
        lower_txt = ""
        for pos in nltk.pos_tag(nltk.word_tokenize(rev_txt)):
            keyword = pos[0]
            if not ( pos[1].startswith("NN") ) and pos[0].isupper() and len(pos[0]) != 1:
                keyword = pos[0].lower()
            lower_txt += ( keyword + " " )
        ret_list.append(lower_txt.strip())

    return ret_list

In [41]:
# Extracting keywords that satisfy the criteria for person, organization, faciities, work of art, global and political entities
def cleansing_spacy_entity(str_list):
    
    # spacy.cli.download("en_core_web_sm")
    nlp = spacy.load("en_core_web_sm")
    nlp.max_length = 10000000
    ent_dict = dict([ (re.sub("[^a-zA-Z0-9\s]", "", str(x)) , x.label_ ) for x in nlp(str(str_list)).ents])

    name_list = []

    for k, v in ent_dict.items():
        k_lower = k.lower()
        if "best" not in k_lower and "win" not in k_lower :
            if ( len(k.split(" ")) < 4 ) and (v == "PERSON" or v == "ORG" or v == "FAC" or v == "WORK_OF_ART" or v == "GPE"):
               name_list.append(k_lower)

    name_list = list(set(name_list))

    return name_list

In [42]:
# Send wikipedia the cleansed list and bring in categories for the existing keyword 
def cleansing_wikipedia(str_list):

    wiki = wk.Wikipedia("en")

    wiki_list = []
    except_list = []

    for k in str_list:

        k_wiki = "_".join(str(k).title().split(" ")).strip()
        wiki_page = wiki.page(k_wiki)

        # Checks if a person exists on wikipedia or not returning true if they exist 
        if wiki_page.exists():
            time.sleep(0.3)
            try:
                w_cate = " ".join(list(wiki_page.categories.keys()))

            except:
                time.sleep(5)
                print("A ConnectionError (Read timed out)")
                except_list.append(k)

            else:
                if ("actor" in w_cate) or ("actress" in w_cate) or ("director" in w_cate) or ("films" in w_cate) or ("television" in w_cate):
                    wiki_list.append(k)

    if len(except_list) != 0:
        for k in except_list:
            k_wiki = "_".join(str(k).title().split(" ")).strip()
            wiki_page = wiki.page(k_wiki)
            w_cate = " ".join(list(wiki_page.categories.keys()))
            if "disambiguation pages" in w_cate:
                continue
            if ("actor" in w_cate) or ("actress" in w_cate) or ("director" in w_cate) or ("films" in w_cate) or (
                    "television" in w_cate):
                wiki_list.append(k)

    print(wiki_list)
    print(len(wiki_list))

    return wiki_list

In [43]:
# Counting the most relevant entities against the answer file - still testing 
def make_dict_count():

    key_list = list(set(pd.read_csv("./sample_str_6.csv")["Full_Text"].values.tolist()))
    df = pd.read_csv("./sample_str_2.csv").dropna(axis=0)
    dt = df["Full_Text"].values.tolist()

    # key_list = list(set(key_list))

    k_dict = {key:0 for key in key_list}

    for k in k_dict.keys():
        idx = df[df["Full_Text"].str.contains(k, case=False) & df["Full_Text"].str.contains("best", case=False)]
        # print(k, len(idx))
        k_dict[k] = len(idx)
    
    print(k_dict)

    smt_dict = {}
    key_list = []

    for k_1, v_1 in k_dict.items():
        ratio_dict = {}
        if k_1 not in smt_dict.keys():
            smt_dict[k_1] = v_1
            ratio_dict[k_1] = v_1
        for k_2, v_2 in k_dict.items():
            if k_2 not in smt_dict.keys():
                if smt(None, k_1, k_2).ratio() > 0.55: #0.55 the best ratio to separate original name from others 
                    smt_dict[k_2] = v_2
                    ratio_dict[k_2] = v_2
        if len(ratio_dict) != 0:
            key_list.append(nlargest(1, ratio_dict, key=ratio_dict.get)[0])

    k_dict = {key: 0 for key in key_list}

    for k in k_dict.keys():
        idx = df[df["Full_Text"].str.contains(k, case=False) & df["Full_Text"].str.contains("best", case=False)]
        # print(k, len(idx))
        k_dict[k] = len(idx)

    print("===============================================================")
    per = int( len(k_dict) * 0.75 ) #0.75 to count individuals who are in tweets most frequent 
    print(per)
    str_list = nlargest(per, k_dict, key = k_dict.get)
    print(str_list)

    #############################
    ans = pd.read_csv("./answers.csv")
    name_list = []

    for i in ans.index:
        named = ans.loc[i, "nominees"][1:-1].split(", ")
        for k in named:
            # if len(i.split(" ")) < 4 and len(i.split(" ")) > 1:  # Person name limit
            if len(k.split(" ")) < 4:
                name_list.append(k)
    pre_list = []

    for i in ans.index:
        named = ans.loc[i, "presenters"][1:-1].split(", ")
        for k in named:
            # if len(i.split(" ")) < 4 and len(i.split(" ")) > 1:  # Person name limit
            if len(k.split(" ")) < 4:
                pre_list.append(k)

    print(name_list)
    print(len(name_list))
    print("------ award name list (127) --------")
    #############################

    str_list = list(set(str_list) - set(pre_list))

    final_list = []
    for i in str_list:
        for k in name_list:
            if smt(None, i, k).ratio() > 0.65:
                final_list.append(i)
    final_list = list(set(final_list))

    # 30/43 - 0.75
    print(str_list)
    print(len(str_list))
    print(len(set(str_list).intersection(name_list)))

    print(final_list)
    print(len(final_list))
    print(len(set(final_list).intersection(name_list)))

In [44]:
def main():

    start_time = time.time()
    # nltk_download()
    ##############################################
    print("1 stage")
    str_list = load_json()
    str_list_2 = cleansing_regex(str_list)
    str_2 = pd.DataFrame(str_list_2, columns=["Full_Text"])
    str_2 = str_2.dropna(axis=0)
    str_2.to_csv("./sample_str_2.csv", index=False)
    ###############################################
    print("2 stage")
    str_list_3 = cleansing_keyword(str_list_2) # important !!!!
    str_3 = pd.DataFrame(str_list_3, columns=["Full_Text"])
    str_3 = str_3.dropna(axis=0)
    str_3.to_csv("./sample_str_3.csv", index=False)
    ###############################################
    print("3 stage")
    str_list_4 = cleansing_capitalize(str_list_3)
    str_4 = pd.DataFrame(str_list_4, columns=["Full_Text"])
    str_4 = str_4.dropna(axis=0)
    str_4.to_csv("./sample_str_4.csv", index=False)
    ###############################################
    print("4 stage")
    str_list_5 = cleansing_spacy_entity(str_list_4)
    str_5 = pd.DataFrame(str_list_5, columns=["Full_Text"])
    str_5 = str_5.dropna(axis=0)
    str_5.to_csv("./sample_str_5.csv", index=False)
    ###############################################
    print("5 stage")
    str_list_6 = cleansing_wikipedia(str_list_5)
    str_6 = pd.DataFrame(str_list_6, columns=["Full_Text"])
    str_6 = str_6.dropna(axis=0)
    str_6.to_csv("./sample_str_6.csv", index=False)
    ###############################################
    print(time.time() - start_time)


    ###############################################
    print("6 stage")

    make_dict_count()
    # print(time.time() - start_time)
    ###############################################

In [45]:
if __name__ == "__main__":
    main()

1 stage
2 stage
3 stage
4 stage
5 stage
['skyfall', 'kerry washington', 'ben afleck', 'the hurt locker', 'life of pi', 'tommy lee jones', 'benedict cumberbatch', 'golden globe', 'the golden globes', 'lea michele', 'the wire', 'jlo', 'les miserables', 'ron burgandy', 'golden globes', 'jennifer lawrence', 'damian lewis', 'modern family', 'tom hooper', 'keith urban', 'kristen wig', 'kate hudson', 'don cheadle', 'hugh jackman', 'quentin tarentino', 'si robertson', 'jodie foster', 'jon bon jovi', 'django unchained', 'amanda seyfried', 'joe wright', 'oscars', 'julia roberts', 'robert pattinson', 'huge jackman', 'sofia vergara', 'lena dunham', 'anne hathaway', 'halle berry', 'ang lee', 'hotel transylvania', 'george clooney', 'claire danes', 'leonardo dicaprio', 'tay tay', 'bill murray', 'tina fey', 'jessica lange', 'jessica lang', 'selena gomez', 'jessica chastain', 'kanye west', 'kristin wiig', 'golden globe award', 'inglorious basterds', 'jack black', 'julianne moore', 'tarantino', 'bryan c