In [11]:
import json
import pandas as pd

# Import reduce from functools
from functools import reduce

In [3]:
with open('data/data/news_data.json') as f:
      data2 = json.load(f)

In [4]:
new_cases = pd.DataFrame(data2['data'], columns=['id', 'title', 'summary', 
                                      'authors', 'tags', 
                                      'text', 'url', 'source',
                                      'created_at', 'updated_at',
                                      'author', 'date'])
new_cases.head(2)

Unnamed: 0,id,title,summary,authors,tags,text,url,source,created_at,updated_at,author,date
0,10813,"ZingBox aims for ‘Internet of Trusted Things’,...",Cybersecurity provider ZingBox has announced t...,,device\niot\nguardian\napproach\ndevices\nindu...,Cybersecurity provider ZingBox has announced t...,https://artificialintelligence-news.com/2017/0...,AInews,2020-02-05T17:08:34.343Z,2020-02-05T17:08:34.343Z,James Bourne,2017-04-25
1,10814,AI may help create more sustainable data centres,Enterprise data centre provider Aegis Data arg...,,data\ncentre\nnatural\nnew\ntechnology\nindust...,Enterprise data centre provider Aegis Data arg...,https://artificialintelligence-news.com/2017/0...,AInews,2020-02-05T17:08:34.355Z,2020-02-05T17:08:34.355Z,James Bourne,2017-04-25


In [5]:
new_df = new_cases[['title', 'summary', 'tags', 'text']]

In [6]:
industry = ['agriculture', 'automative', 'comsumer products', 
            'energy', 'finance', 'health care', 'manufacturaing', 
            'media', 'pharmaceuticals', 'public and social sector', 
            'telecom', 'transport, travel and logistics']

## 1) Dind topic with tf-idf

In [7]:
texts = new_df.text

### Cleaning

In [8]:
from wordcloud import STOPWORDS
english_stops = set(STOPWORDS)

import nltk
from nltk.corpus import wordnet

def get_wordnet_pos(word):
    """Map POS tag to first character lemmatize() accepts"""
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}

    return tag_dict.get(tag, wordnet.NOUN)

In [13]:
# Import WordNetLemmatizer
from nltk.stem import WordNetLemmatizer

from nltk.tokenize import word_tokenize

# Tokenize the article: tokens
tokens =[word_tokenize(article) for article in texts]
print("Total number of texts: {}".format(len(tokens)))

len_array = [len(token_array) for token_array in tokens]
# Use reduce() to apply a lambda function over stark: result
total_tokens = reduce(lambda item1, item2: item1+item2, len_array)
print("Total number of tokens: {}".format(total_tokens))

# Convert the tokens into lowercase: lower_tokens
lower_tokens = [[t.lower() for t in token] for token in tokens]

# Retain alphabetic words: alpha_only
alpha_only_list = [[t for t in lower_token if t.isalpha()] for lower_token in lower_tokens]

# Remove all stop words: no_stops
no_stops = [[t for t in alpha_only if t not in english_stops] for alpha_only in alpha_only_list]

len_array = [len(token_array) for token_array in no_stops]
# Use reduce() to apply a lambda function over stark: result
total_tokens = reduce(lambda item1, item2: item1+item2, len_array)
print("Total of words after removing stop words: {}".format(total_tokens))

# Instantiate the WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()

# Lemmatize all tokens into a new list: lemmatized
articles_lemmatized = [[wordnet_lemmatizer.lemmatize(t, get_wordnet_pos(t)) for t in no_stop] for no_stop in no_stops] 

Total number of texts: 1626
Total number of tokens: 1434310
Total of words after removing stop words: 694546


### Create dictionary including all words in dataset (694546 words).

In [14]:
# Import Dictionary
from gensim.corpora.dictionary import Dictionary

# Create a Dictionary from the articles: dictionary
dictionary = Dictionary(articles_lemmatized)

In [16]:
# Create a MmCorpus: corpus
corpus = [dictionary.doc2bow(article) for article in articles_lemmatized]

# How much text in corpus_sorted
print(len(corpus))

1626


### Word Frequency and Weights

In [17]:
from gensim.models.tfidfmodel import TfidfModel

# Create a new TfidfModel using the corpus: tfidf
tfidf = TfidfModel(corpus)

# Save the fench revolution document: doc
doc = corpus[0]
# Calculate the tfidf weights of doc: tfidf_weights
tfidf_weights = tfidf[doc]

# Sort the doc for word frequency: bow_doc
bow_doc = sorted(doc, key=lambda w: w[1], reverse=True)

# Print the top 10 words of the document alongside the count
for word_id, word_count in bow_doc[:10]:
    print(dictionary.get(word_id), word_count)

print('*********')

# Sort the weights from highest to lowest: sorted_tfidf_weights
sorted_tfidf_weights = sorted(tfidf_weights, key=lambda w: w[1], reverse=True)

# Print the top 10 weighted words
for term_id, weight in sorted_tfidf_weights[:10]:
    print(dictionary.get(term_id), weight)

device 9
iot 6
solution 5
expo 4
medical 4
zingbox 4
approach 3
data 3
guardian 3
new 3
*********
zingbox 0.5306773897793426
device 0.2869818491447389
guardian 0.2797327586089335
personality 0.20979313333215804
solution 0.16377898490507994
behaviour 0.15129020469990984
defend 0.15129020469990984
medical 0.14656096355385323
conceptualise 0.13266934744483566
enforces 0.13266934744483566


### Weights for all texts

In [20]:
tfidf_weights = [sorted(tfidf[doc], key=lambda w: w[1], reverse=True) for doc in corpus]

## 2) Name Entity Recognition

In [18]:
import nltk
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/becode/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /home/becode/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to /home/becode/nltk_data...
[nltk_data]   Package words is already up-to-date!


True

In [19]:
article2 = new_cases.text[0]

# Import spacy
import spacy

# Instantiate the English model: nlp
nlp = spacy.load('en_core_web_md')

# Create a new document: doc
doc = nlp(article2)

# Print all of the found entities and their labels
for ent in doc.ents:
    print(ent.label_, ent.text)

ORG ZingBox
PERSON IoT Guardian
ORDINAL first
PERCENT 99.9%
ORG Guardian
PRODUCT IoT Guardian
ORG Stanford University
ORG ZingBox
DATE zero-day
PERSON Jerry Marshall
ORG United Regional Health Care System
ORG ZingBox
PERCENT over 95%
PERCENT about 5%
ORG ZingBox
LOC Silicon Valley
GPE London
GPE Amsterdam
LAW the IoT Tech Expo
LAW Blockchain Expo and Cyber Security & Cloud Expo
CARDINAL one


### Use NER for extracting industry from text

In [21]:
import spacy
# Import the PhraseMatcher and initialize it
from spacy.matcher import Matcher

# Load the en_core_web_md model
nlp = spacy.load('en_core_web_md')

In [22]:
article3 = new_cases.text[4]

In [25]:
doc = nlp(article3)

matcher = Matcher(nlp.vocab)

# Create pattern Doc objects and add them to the matcher
# This is the faster version of: [nlp(country) for country in COUNTRIES]
patterns = list(nlp.pipe(industry))
matcher.add('INDUSTRY', None, *patterns)

# Call the matcher on the test document and print the result
matches = matcher(doc)
print([doc[start:end] for match_id, start, end in matches])

AttributeError: 'spacy.tokens.token.Token' object has no attribute 'items'