In [2]:
import json
import pandas as pd

# Import reduce from functools
from functools import reduce

In [3]:
with open('data/data/news_data.json') as f:
      data2 = json.load(f)

In [4]:
new_cases = pd.DataFrame(data2['data'], columns=['id', 'title', 'summary', 
                                      'authors', 'tags', 
                                      'text', 'url', 'source',
                                      'created_at', 'updated_at',
                                      'author', 'date'])
new_cases.head(2)

Unnamed: 0,id,title,summary,authors,tags,text,url,source,created_at,updated_at,author,date
0,10813,"ZingBox aims for ‘Internet of Trusted Things’,...",Cybersecurity provider ZingBox has announced t...,,device\niot\nguardian\napproach\ndevices\nindu...,Cybersecurity provider ZingBox has announced t...,https://artificialintelligence-news.com/2017/0...,AInews,2020-02-05T17:08:34.343Z,2020-02-05T17:08:34.343Z,James Bourne,2017-04-25
1,10814,AI may help create more sustainable data centres,Enterprise data centre provider Aegis Data arg...,,data\ncentre\nnatural\nnew\ntechnology\nindust...,Enterprise data centre provider Aegis Data arg...,https://artificialintelligence-news.com/2017/0...,AInews,2020-02-05T17:08:34.355Z,2020-02-05T17:08:34.355Z,James Bourne,2017-04-25


In [5]:
new_cases.title[20]

'Exscientia partners with GSK to further drug discovery through AI'

In [6]:
new_cases.text[20]

'UK-based drug design company Exscientia has announced it has entered into a strategic drug discovery partnership with GlaxoSmithKline (GSK).\n\nUnder this partnership, Exscientia will use its artificial intelligence (AI)-enabled platform, combined with the expertise of GSK, to discover novel and selective small molecules for up to 10 disease-related targets, nominated by GSK across multiple therapeutic areas. It will receive research payments from GSK in lieu of this.\n\nExscientia will also receive near-term lead and pre-clinical candidate milestones on achieving all of the objectives. If all 10 projects are advanced, Exscientia will receive a total of £33 million from GSK on achieving these milestones. It will also receive incentives to reduce the number of compounds required for synthesis and assay in order to achieve lead and candidate compound goals.\n\nExscientia will use technologies like its ‘big data’ resources that comprise medicinal chemistry and large-scale bio-assays, and

## 1) Dind topic with tf-idf

In [7]:
texts = new_cases.text

### Cleaning in Texts

In [8]:
from wordcloud import STOPWORDS
english_stops = set(STOPWORDS)

import nltk
from nltk.corpus import wordnet

def get_wordnet_pos(word):
    """Map POS tag to first character lemmatize() accepts"""
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}

    return tag_dict.get(tag, wordnet.NOUN)

In [9]:
# Import WordNetLemmatizer
from nltk.stem import WordNetLemmatizer

from nltk.tokenize import word_tokenize

# Tokenize the article: tokens
tokens =[word_tokenize(article) for article in texts]
print("Total number of texts: {}".format(len(tokens)))

len_array = [len(token_array) for token_array in tokens]
# Use reduce() to apply a lambda function over stark: result
total_tokens = reduce(lambda item1, item2: item1+item2, len_array)
print("Total number of tokens: {}".format(total_tokens))

# Convert the tokens into lowercase: lower_tokens
lower_tokens = [[t.lower() for t in token] for token in tokens]

# Retain alphabetic words: alpha_only
alpha_only_list = [[t for t in lower_token if t.isalpha()] for lower_token in lower_tokens]

# Remove all stop words: no_stops
no_stops = [[t for t in alpha_only if t not in english_stops] for alpha_only in alpha_only_list]

len_array = [len(token_array) for token_array in no_stops]
# Use reduce() to apply a lambda function over stark: result
total_tokens = reduce(lambda item1, item2: item1+item2, len_array)
print("Total of words after removing stop words: {}".format(total_tokens))

# Instantiate the WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()

# Lemmatize all tokens into a new list: lemmatized
articles_lemmatized = [[wordnet_lemmatizer.lemmatize(t, get_wordnet_pos(t)) for t in no_stop] for no_stop in no_stops] 

Total number of texts: 1626
Total number of tokens: 1434310
Total of words after removing stop words: 694546


### Create dictionary including all words in dataset (694546 words).

In [10]:
# Import Dictionary
from gensim.corpora.dictionary import Dictionary

# Create a Dictionary from the articles: dictionary
dictionary = Dictionary(articles_lemmatized)

In [11]:
# Create a MmCorpus: corpus
corpus = [dictionary.doc2bow(article) for article in articles_lemmatized]

# How much text in corpus_sorted
print(len(corpus))

1626


### Find Usecase with similarity method

In [76]:
doc1 = nlp("Clustering, Regression, logistic, Resource allocation")
doc2 = nlp("Dimensionality reduction, Search algorithms, Predictive analytics")
doc3 = nlp("Classification, Sorting, Predictive maintenance")
doc4 = nlp("Conventional neural networks, Merging, Hyper-personalization")
doc5 = nlp("Deep learning networks, Compression, Discover new trends/anomalies")
doc6 = nlp("Convolutional neural network, Graph algorithms, Forecasting")
doc7 = nlp("Recurrent neural network Linear and non-linear optimization, Price and product optimization")
doc8 = nlp("Deep belief networks, Signal processing, Convert unstructured data")
doc9 = nlp("Encryption, Triaging")
docX = nlp(new_cases.text[20])

# Get the similarity of doc1 and doc2
similarity = docX.similarity(doc1)
print(similarity)
similarity = docX.similarity(doc2)
print(similarity)
similarity = docX.similarity(doc3)
print(similarity)
similarity = docX.similarity(doc4)
print(similarity)
similarity = docX.similarity(doc5)
print(similarity)
similarity = docX.similarity(doc6)
print(similarity)
similarity = docX.similarity(doc7)
print(similarity)
similarity = docX.similarity(doc8)
print(similarity)
similarity = docX.similarity(doc9)
print(similarity)

0.6868798983943949
0.6651798996139505
0.640085929584783
0.6884786141765988
0.8165983805119333
0.602837601429253
0.7635216211151113
0.7737061411011231
0.443758571234073


### Topic Modelling with LDA for all Dataset

In [26]:
from gensim.models import LdaMulticore

In [67]:
lda_model =  LdaMulticore(corpus, 
                           num_topics = 8, 
                           id2word = dictionary,                                    
                           passes = 10,
                           workers = 2)

In [68]:
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.021*"ai" + 0.013*"expo" + 0.013*"data" + 0.012*"will" + 0.010*"technology" + 0.009*"s" + 0.007*"use" + 0.007*"new" + 0.006*"learn" + 0.006*"industry"
Topic: 1 
Words: 0.017*"mit" + 0.011*"s" + 0.010*"say" + 0.009*"system" + 0.008*"will" + 0.007*"intelligence" + 0.007*"science" + 0.007*"research" + 0.007*"work" + 0.007*"computer"
Topic: 2 
Words: 0.013*"learn" + 0.010*"human" + 0.009*"say" + 0.008*"system" + 0.008*"computer" + 0.008*"s" + 0.007*"robot" + 0.007*"machine" + 0.005*"work" + 0.005*"data"
Topic: 3 
Words: 0.018*"s" + 0.010*"use" + 0.009*"facial" + 0.009*"ai" + 0.009*"recognition" + 0.006*"say" + 0.006*"technology" + 0.006*"percent" + 0.006*"human" + 0.005*"algorithm"
Topic: 4 
Words: 0.014*"s" + 0.011*"say" + 0.010*"data" + 0.010*"model" + 0.009*"use" + 0.009*"learn" + 0.009*"system" + 0.008*"researcher" + 0.007*"image" + 0.005*"network"
Topic: 5 
Words: 0.029*"ai" + 0.021*"s" + 0.010*"google" + 0.009*"company" + 0.008*"say" + 0.007*"will" + 0.006*"use" + 0

### Topic modelling for only 21th article

In [23]:
parts = [new_cases.title[20], new_cases.tags[20], new_cases.text[20]]

In [21]:
tokens = [word_tokenize(article) for article in parts]

# Convert the tokens into lowercase: lower_tokens
lower_tokens = [[t.lower() for t in token] for token in tokens]

# Retain alphabetic words: alpha_only
alpha_only_list = [[t for t in lower_token if t.isalpha()] for lower_token in lower_tokens]

# Remove all stop words: no_stops
no_stops = [[t for t in alpha_only if t not in english_stops] for alpha_only in alpha_only_list]

# Lemmatize all tokens into a new list: lemmatized
parts_lemmatized = [[wordnet_lemmatizer.lemmatize(t, get_wordnet_pos(t)) for t in no_stop] for no_stop in no_stops] 

In [24]:
# Create a Dictionary from the articles: dictionary
dictionary_single = Dictionary(parts_lemmatized)

# Create a MmCorpus: corpus
corpus_single = [dictionary_single.doc2bow(part) for part in parts_lemmatized]

In [27]:
lda_model2 =  LdaMulticore(corpus_single, 
                           num_topics = 3, 
                           id2word = dictionary,                                    
                           passes = 10,
                           workers = 2)

In [29]:
for idx, topic in lda_model2.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.000*"activity" + 0.000*"address" + 0.000*"united" + 0.000*"algorithm" + 0.000*"announce" + 0.000*"avoid" + 0.000*"attend" + 0.000*"marshall" + 0.000*"across" + 0.000*"approach"
Topic: 1 
Words: 0.001*"address" + 0.001*"activity" + 0.001*"accuracy" + 0.001*"across" + 0.001*"united" + 0.001*"attend" + 0.001*"avoid" + 0.001*"algorithm" + 0.001*"announce" + 0.001*"approach"
Topic: 2 
Words: 0.000*"united" + 0.000*"activity" + 0.000*"announce" + 0.000*"address" + 0.000*"across" + 0.000*"accuracy" + 0.000*"marshall" + 0.000*"attend" + 0.000*"avoid" + 0.000*"big"


### Word Frequency and Weights with TF-IDF

In [17]:
from gensim.models.tfidfmodel import TfidfModel

# Create a new TfidfModel using the corpus: tfidf
tfidf = TfidfModel(corpus)

# Save the fench revolution document: doc
doc = corpus[0]
# Calculate the tfidf weights of doc: tfidf_weights
tfidf_weights = tfidf[doc]

# Sort the doc for word frequency: bow_doc
bow_doc = sorted(doc, key=lambda w: w[1], reverse=True)

# Print the top 10 words of the document alongside the count
for word_id, word_count in bow_doc[:10]:
    print(dictionary.get(word_id), word_count)

print('*********')

# Sort the weights from highest to lowest: sorted_tfidf_weights
sorted_tfidf_weights = sorted(tfidf_weights, key=lambda w: w[1], reverse=True)

# Print the top 10 weighted words
for term_id, weight in sorted_tfidf_weights[:10]:
    print(dictionary.get(term_id), weight)

device 9
iot 6
solution 5
expo 4
medical 4
zingbox 4
approach 3
data 3
guardian 3
new 3
*********
zingbox 0.5306773897793426
device 0.2869818491447389
guardian 0.2797327586089335
personality 0.20979313333215804
solution 0.16377898490507994
behaviour 0.15129020469990984
defend 0.15129020469990984
medical 0.14656096355385323
conceptualise 0.13266934744483566
enforces 0.13266934744483566


#### TF-idf eliminate "iot", it supposes it insignificant because all articles are about AI and Technology.

### Weights for all texts

In [20]:
tfidf_weights = [sorted(tfidf[doc], key=lambda w: w[1], reverse=True) for doc in corpus]

In [33]:
# In 9th text, Sort the weights from highest to lowest: 
for term_id, weight in tfidf_weights[8][:10]:
    print(dictionary.get(term_id), weight)

gartner 0.41867985190680573
prentice 0.2857483516069023
utility 0.2621374957321741
enterprise 0.18416514544491902
smart 0.13834487437279003
adecco 0.13422877693712948
tentative 0.13422877693712948
missive 0.1233368624905319
flexibly 0.11944936995691548
professional 0.11766214043801765


## 2) Name Entity Recognition

In [18]:
import nltk
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/becode/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /home/becode/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to /home/becode/nltk_data...
[nltk_data]   Package words is already up-to-date!


True

In [19]:
article2 = new_cases.text[0]

# Import spacy
import spacy

# Instantiate the English model: nlp
nlp = spacy.load('en_core_web_md')

# Create a new document: doc
doc = nlp(article2)

# Print all of the found entities and their labels
for ent in doc.ents:
    print(ent.label_, ent.text)

ORG ZingBox
PERSON IoT Guardian
ORDINAL first
PERCENT 99.9%
ORG Guardian
PRODUCT IoT Guardian
ORG Stanford University
ORG ZingBox
DATE zero-day
PERSON Jerry Marshall
ORG United Regional Health Care System
ORG ZingBox
PERCENT over 95%
PERCENT about 5%
ORG ZingBox
LOC Silicon Valley
GPE London
GPE Amsterdam
LAW the IoT Tech Expo
LAW Blockchain Expo and Cyber Security & Cloud Expo
CARDINAL one


### Use NER for extracting industry from text

In [39]:
industry = ['agriculture', 'automative','consumer products', 
            'energy','finance', 'healt care','media', 
            'public and social sector','telecom', 
            'transport, travel and logistics']          

In [36]:
import spacy
# Import the PhraseMatcher and initialize it
from spacy.matcher import Matcher
# Import the PhraseMatcher and initialize it
from spacy.matcher import PhraseMatcher

# Load the en_core_web_md model
nlp = spacy.load('en_core_web_md')

In [41]:
article4 = new_cases. text[4]

In [50]:
doc = nlp(article4)

matcher = PhraseMatcher(nlp.vocab)

# Create pattern Doc objects and add them to the matcher
# This is the faster version of: [nlp(country) for country in COUNTRIES]
patterns = list(nlp.pipe(industry))
matcher.add('INDUSTRY', None, *patterns)

# Call the matcher on the test document and print the result
matches_single = matcher(doc)
print([doc[start:end] for match_id, start, end in matches_single])

[energy]


In [45]:
alldocs = [nlp(article) for article in new_cases.text]

matcher = PhraseMatcher(nlp.vocab)

# Create pattern Doc objects and add them to the matcher
# This is the faster version of: [nlp(country) for country in COUNTRIES]
patterns = list(nlp.pipe(industry))
matcher.add('INDUSTRY', None, *patterns)

# Call the matcher on the test document and print the result
matches = [matcher(doc) for doc in alldocs]

In [52]:
matches_word = [[alldocs[i][start:end] for match_id, start, end in match] for i, match in enumerate(matches)]

In [61]:
number_empty=0

for match in matches_word:
    if len(match)==0:
        number_empty+=1

In [63]:
len(alldocs), number_empty

(1626, 1317)