In [12]:
import pandas as pd
from pandas.io.json import json_normalize
import spacy as sp
import json
import matplotlib.pyplot as plt
from collections import Counter
from sklearn.model_selection import train_test_split

In [24]:
# Load the pre-cleaned relevant / irrelevant article sets and tag each with a
# binary `relevance` target (1 = relevant, 0 = irrelevant).
rdata = pd.read_json('../../data/clean/relevantNewsNLTK4.json')
irdata = pd.read_json('../../data/clean/irrelevantNewsNLTK4.json')
rdata['relevance'] = 1
irdata['relevance'] = 0

# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat with ignore_index=True produces the same stacked frame with a
# fresh 0..n-1 index.
data = pd.concat([rdata, irdata], ignore_index=True)
data.head()

Unnamed: 0,content,headline,source,summary,uid,index,label,relevance
0,lowly milkshake weapon choice britons determin...,Milkshakes become weapon of choice in UK Europ...,Agence France Presse,Former UK Independence Party leader Nigel Fara...,a437ff48-104a-54bb-bff7-c7a736158524,0,1,1
1,anz race set bring biggest white collar job lo...,ANZ's first assault in the looming job armageddon,News Ltd.,ANZ has moved to the front of the race that’ s...,366c92af-8143-5ffa-8702-4f26bd22c8b6,1,1,1
2,jul 10 carnival cruise stateroom attendants ex...,Carnival Cruise Line to collect your used soap...,Tribune Content Agency,Jul. 10-- Carnival Cruise Line stateroom atten...,863096d4-48f0-5a7c-bee6-384a76d575ee,2,1,1
3,chennai rohit 543 saravanan 546 wickets standa...,Standard CC bags fourth title in a row [New In...,SyndiGate Media Inc.,CHENNAI: R Rohit and P Saravanan took five wic...,3e4d6490-4224-595e-be26-4cb249209b8f,3,1,1
4,donald trump nominee lead fish wildlife servic...,Revealed: Trump's Wildlife Service pick has ti...,Guardian,New revelations show she also has ties to the ...,9f3e248d-b040-5058-bdc9-61c4de59f02a,4,1,1


In [25]:
X = data.drop(['relevance'], axis=1) # the features we want to analyze
y = data['relevance'] # the labels, or answers, we want to test against

# Hold out 20% of the articles for testing; the remaining 80% is the training set.
# A fixed seed keeps the split (and therefore the feature files written later)
# identical across Restart & Run All.
SEED = 42
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=SEED)

In [26]:
# Load the spaCy pipeline once at module level; generate_hash and ner_extractor
# both call this shared `nlp` object rather than loading their own.
nlp = sp.load('en_core_web_sm')

In [27]:
def generate_hash(text):
    '''
    Input: text of a single article
    Output: dict mapping each entity label found by the module-level `nlp`
            pipeline to the number of entities carrying that label
    '''
    label_counts = {}
    for label in (ent.label_ for ent in nlp(text).ents):
        # dict.get supplies the 0 starting value the first time a label is seen
        label_counts[label] = label_counts.get(label, 0) + 1
    return label_counts

In [41]:
def generate_ner_vector(data, feature):
    '''
    Input: dataframe of news article data (must contain a 'uid' column), name of the
           text column to run entity recognition on
    Output: dataframe sharing `data`'s index, with 'uid' followed by one integer column
            per entity label; each cell is the count of that entity type in the article
    Notes: column entity definitions: https://spacy.io/api/annotation#named-entities
           The ordinal label is spelled 'ORDINAL' (as spaCy emits it). It was previously
           misspelled 'ORGINAL', which never matched, so that column was always 0.
    '''
    entity_labels = ['PERSON', 'NORP', 'FAC', 'ORG', 'GPE', 'LOC', 'PRODUCT', 'EVENT',
                     'WORK_OF_ART', 'LAW', 'LANGUAGE', 'DATE', 'TIME', 'PERCENT', 'MONEY',
                     'QUANTITY', 'ORDINAL', 'CARDINAL']

    # Collect one row of counts per article and build the frame once at the end,
    # instead of writing cell-by-cell into an object-dtype frame.
    rows = []
    for i, text in enumerate(data[feature]):
        if i % 1000 == 0:
            print('progress: ', i)
        counts = generate_hash(text)
        # Labels absent from this article count as 0
        rows.append([counts.get(label, 0) for label in entity_labels])

    df = pd.DataFrame(rows, index=data.index, columns=entity_labels, dtype='int64')
    # Rows were produced in `data` order, so uid is attached positionally
    # (robust even if the index contains duplicates).
    df.insert(0, 'uid', data['uid'].to_numpy())
    return df

In [43]:
# Build the training feature frame: one row per article, one column per entity label,
# each cell holding the count of that entity type in the article's "content" text.
# Quick single-row check: rdata_ner_df = generate_ner_vector(rdata.loc[[0]], "content")

train_ner = generate_ner_vector(X_train, "content")

progress:  0.0
progress:  0.1
progress:  0.2
progress:  0.3
progress:  0.4
progress:  0.5
progress:  0.6
progress:  0.7
progress:  0.8
progress:  0.9
progress:  1.0
progress:  1.1
progress:  1.2
progress:  1.3
progress:  1.4
progress:  1.5


In [44]:
# Attach the relevance target as column 'label'. Assigning a Series aligns on index;
# train_ner carries X_train's index, which y_train shares from the same split.
# NOTE(review): the source data also has its own 'label' column — confirm the reuse
# of that name for the relevance target is intended.
train_ner['label'] = y_train

In [46]:
# Same entity-count features for the held-out articles, with the relevance target
# attached as 'label' (index-aligned with y_test from the same split).
test_ner = generate_ner_vector(X_test, "content")
test_ner['label'] = y_test

progress:  0.0
progress:  0.1
progress:  0.2
progress:  0.3


In [47]:
# Persist the feature frames. With to_csv's default index=True the DataFrame index
# is written as an unnamed first column, so readers must pass index_col=0 (or will
# see an 'Unnamed: 0' column on load).
train_ner.to_csv('../../data/feature/train_ner.csv')
test_ner.to_csv('../../data/feature/test_ner.csv')

In [None]:
def ner_extractor(data, entity, column_name):
    '''
    Input: dataframe, entity label from spaCy (e.g. 'PERSON'), name of the text column
    Output: list with one element per row of `data` (in row order); each element is the
            list of entity texts in that row whose label equals `entity`
    Notes: progress is reported by row position rather than by index label, so it
           works for non-integer or shuffled indexes (e.g. after train_test_split).
    '''
    words_names = []
    for position, text in enumerate(data[column_name]):
        if position % 2000 == 0:
            print("index: {}".format(position))
        doc = nlp(text)
        words_names.append([ent.text for ent in doc.ents if ent.label_ == entity])
    return words_names

def top_x_common_entity(words_names, x):
    '''
    Input: list of per-article entity lists (as returned by ner_extractor), integer to
           specify top x number of words
    Output: none; prints a DataFrame of the x most common entities with their counts
    Notes: an entity is counted at most once per article, so the count is the number of
           articles mentioning it. De-duplication preserves first-seen order, so ties
           between equally common entities are ordered the same way on every run
           (a set's iteration order for strings varies with hash randomisation).
    '''
    names = [
        name
        for article_names in words_names
        for name in dict.fromkeys(article_names)  # order-preserving de-duplication
    ]
    names_count = Counter(names).most_common(x)
    print(pd.DataFrame(names_count))