In [1]:
import json
import pandas as pd
import numpy as np

# Import reduce from functools
from functools import reduce

In [None]:
industry_words = {
    'automative': ['automotive', 'taxi', 'wheel', 'fuel', 'car','drive','auto','selfdrive','vehicle','road','automobile'],
    'Manufacturing': ['cleantech', 'deindustrialization', 'prefabrication', 'manufacturing', 'vitrification', 'fabrication' 'R&D','quality','produce','goods','factory','equipment'],
    'Consumer Products' : ['product', 'price', 'goods', 'commerce', 'economic', 'customer','marketing','demand','inventory','supply'],
    'Finance' : ['bank', 'money', 'capitalization', 'interest', 'fund', 'finance', 'asset','risk','loan','credit','fraud'],
    'Agriculture' :['soil', 'grain', 'agriculture', 'field','farm','soil','weather','crop','grow','animal', 'food' , 'land'],
    'Energy' : ['renewable', 'sustainable', 'green', 'electricity', 'energy', 'power','mines','solar','light','metal','electric','carbon', 'electonic','wind','speed'],
    'Health Care' : ['Health', 'Care', 'emergency','doctor','wellness','patient','hospital', 'clinic','treatment','disease', 'medical','cancer'],
    'Pharmaceuticals' : ['dose', 'pillbox', 'tonic', 'tablet', 'placebo', 'medicate', 'hospital', 'Pharmaceutical', 'drug','diagnose', 'test','trial','medicine', 'vaccine'],
    'Public and Social sector' : ['social', 'law','crime','terrorism','policing','govern', 'public', 'infrastructure', 'education', 'tax', 'urban', 'life', 'job','enforcement', 'surveillance'],
    'Media' :['mainstream', 'publishing', 'medium', 'social','media','video','content','news','release','film', 'press', 'viral', 'game'],
    'Telecom' : ['location', 'station', 'host', 'telecom', 'mobile', 'voice','call','subscription','network','phone', 'broadcast', 'internet','communication' ,'modulation'],
    'Transport & Logistics' : ['transport' , 'logistic', 'mail','parcel','travel','route','planes','truck', 'shipping', 'mobility', 'movement']
}
for keys, value in industry_words.items():
    industry_words[keys] = " ".join(value)

In [2]:
with open('data/data/news_data.json') as f:
      data2 = json.load(f)

In [3]:
new_cases = pd.DataFrame(data2['data'], columns=['id', 'title', 'summary', 
                                      'authors', 'tags', 
                                      'text', 'url', 'source',
                                      'created_at', 'updated_at',
                                      'author', 'date'])
new_cases.head(2)

Unnamed: 0,id,title,summary,authors,tags,text,url,source,created_at,updated_at,author,date
0,10813,"ZingBox aims for ‘Internet of Trusted Things’,...",Cybersecurity provider ZingBox has announced t...,,device\niot\nguardian\napproach\ndevices\nindu...,Cybersecurity provider ZingBox has announced t...,https://artificialintelligence-news.com/2017/0...,AInews,2020-02-05T17:08:34.343Z,2020-02-05T17:08:34.343Z,James Bourne,2017-04-25
1,10814,AI may help create more sustainable data centres,Enterprise data centre provider Aegis Data arg...,,data\ncentre\nnatural\nnew\ntechnology\nindust...,Enterprise data centre provider Aegis Data arg...,https://artificialintelligence-news.com/2017/0...,AInews,2020-02-05T17:08:34.355Z,2020-02-05T17:08:34.355Z,James Bourne,2017-04-25


In [68]:
new_cases[new_cases.id==10829]

Unnamed: 0,id,title,summary,authors,tags,text,url,source,created_at,updated_at,author,date
16,10829,Here’s how AI can assist medical science in te...,Artificial intelligence (AI) and deep learning...,,diseases\nmedical\nai\nexpo,Artificial intelligence (AI) and deep learning...,https://artificialintelligence-news.com/2017/0...,AInews,2020-02-05T17:08:34.515Z,2020-02-05T17:08:34.515Z,James Bourne,2017-06-12


In [6]:
# Import spacy
import spacy

# Instantiate the English model: nlp
nlp = spacy.load('en_core_web_md')

In [7]:
texts = new_cases.text

In [8]:
from wordcloud import STOPWORDS
english_stops = set(STOPWORDS)

import nltk
from nltk.corpus import wordnet

def get_wordnet_pos(word):
    """Map POS tag to first character lemmatize() accepts"""
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}

    return tag_dict.get(tag, wordnet.NOUN)

## 1) Lower all words and Remove non-alpha, stopwords, no-noun or no-verb and Lemmatize all words

In [9]:
# Import WordNetLemmatizer
from nltk.stem import WordNetLemmatizer

from nltk.tokenize import word_tokenize

# Tokenize the article: tokens
tokens =[word_tokenize(article) for article in texts]
print("Total number of texts: {}".format(len(tokens)))

len_array = [len(token_array) for token_array in tokens]
# Use reduce() to apply a lambda function over stark: result
total_tokens = reduce(lambda item1, item2: item1+item2, len_array)
print("Total number of tokens: {}".format(total_tokens))

# Convert the tokens into lowercase: lower_tokens
lower_tokens = [[t.lower() for t in token] for token in tokens]

# Retain alphabetic words: alpha_only
alpha_only_list = [[t for t in lower_token if t.isalpha()] for lower_token in lower_tokens]

# Remove all stop words: no_stops
no_stops = [[t for t in alpha_only if t not in english_stops] for alpha_only in alpha_only_list]

len_array = [len(token_array) for token_array in no_stops]
# Use reduce() to apply a lambda function over stark: result
total_tokens = reduce(lambda item1, item2: item1+item2, len_array)
print("Total of words after removing stop words: {}".format(total_tokens))

# Instantiate the WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()

# Lemmatize all tokens into a new list: lemmatized
articles_lemmatized = [[wordnet_lemmatizer.lemmatize(t, get_wordnet_pos(t)) for t in no_stop if nltk.pos_tag([t])[0][1][0].upper()=='N' or nltk.pos_tag([t])[0][1][0].upper()=='V'] for no_stop in no_stops]

len_array = [len(article) for article in articles_lemmatized]
# Use reduce() to apply a lambda function over stark: result
total_tokens = reduce(lambda item1, item2: item1+item2, len_array)
print("Total of words after removing words except noun: {}".format(total_tokens))

Total number of texts: 1626
Total number of tokens: 1434310
Total of words after removing stop words: 694546
Total of words after removing words except noun: 478185


## 2) Remove organization, location nouns

In [None]:
for i in range(new_cases.shape[0]):
    # Create a new document: doc
    doc = nlp(new_cases.text[i])

    # Print all of the found entities and their labels
    for ent in doc.ents:
        if ent.label_=='ORG' or ent.label_=='GPE' or ent.label_=='LOC':
            words = ent.text.split()
            for word in words:
                word = word.lower()
                while word in articles_lemmatized[i]:
                    articles_lemmatized[i].remove(word) 

In [11]:
len_array = [len(article) for article in articles_lemmatized]
# Use reduce() to apply a lambda function over stark: result
total_tokens = reduce(lambda item1, item2: item1+item2, len_array)
print("Total of words after removing words in ORG, GPE, LOC: {}".format(total_tokens))

Total of words after removing words in ORG, GPE, LOC: 415776


## 3) Predict industry type with cleaned texts by using similarity method

### Convert list into string for each text

In [16]:
articles_string = [" ".join(article) for article in articles_lemmatized]
key_list = list(industry_words.keys())

### Create nlp object for each industry category

In [17]:
doc_industry_list = []
for key, value in industry_words.items():
        doc_industry_list.append(nlp(industry_words[key]))

### An example : 16th text

In [72]:
# Create nlp object for an cleaned article
doc_article = nlp(articles_string[16])
print(new_cases.title[16])
similarities = []
for doc_industry in doc_industry_list:
    similarity = doc_article.similarity(doc_industry)
    print(similarity)
    similarities.append(similarity)
max_value = max(similarities)
max_position = similarities.index(max_value)
industry_type = key_list[max_position]
print(industry_type)

Here’s how AI can assist medical science in telling a patient’s lifespan
0.502665953962633
0.560985686202927
0.6732316071532943
0.6527046354124848
0.6223831246228051
0.5929365454180491
0.7317095407518914
0.6686644554870524
0.7700925818314376
0.6989430444817823
0.6479598292617698
0.6451362461295347
Public and Social sector


### Predict industry type with similarity method along two nlp object for all texts in industry_type_list 

In [None]:
industry_type_list = []
for article in articles_string:
    # Create nlp object for an cleaned article
    doc_article = nlp(article)
    similarities = []
    for doc_industry in doc_industry_list:
        similarities.append(doc_article.similarity(doc_industry))
    max_value = max(similarities)
    max_position = similarities.index(max_value)
    industry_type = key_list[max_position]
    industry_type_list.append(industry_type)

## 4) Predict industry type with tf-idf words by using similarity method

### Create a dictionary

In [30]:
# Import Dictionary
from gensim.corpora.dictionary import Dictionary

# Create a Dictionary from the articles: dictionary
dictionary = Dictionary(articles_lemmatized)

# Create a MmCorpus: corpus
corpus = [dictionary.doc2bow(article) for article in articles_lemmatized]

# How much text in corpus_sorted
print(len(corpus))

1626


### Create tf-idf model

In [31]:
from gensim.models.tfidfmodel import TfidfModel

# Create a new TfidfModel using the corpus: tfidf
tfidf = TfidfModel(corpus)

tfidf_weights = [sorted(tfidf[doc], key=lambda w: w[1], reverse=True) for doc in corpus]

### Create nlp object for each industry category and key_list 

In [None]:
key_list = list(industry_words.keys())
doc_industry_list = []
for key, value in industry_words.items():
        doc_industry_list.append(nlp(industry_words[key]))

### An example : 16th text

In [70]:
article_string_single = [dictionary.get(term_id) for term_id, weight in tfidf_weights[16][:20]]
article_string_single = " ".join(article_string_single)
article_string_single

'lifespan adelaide surpasses disease ai prediction patient survey congestive emphysema expo job absence chronic organ image resident illness predict analysis'

In [71]:
doc_article = nlp(article_string_single)
similarities = []
for doc_industry in doc_industry_list:
    similarity = doc_article.similarity(doc_industry)
    print(similarity)
    similarities.append(similarity)
max_value = max(similarities)
max_position = similarities.index(max_value)
industry_type = key_list[max_position]
print(industry_type)

0.3804882291089601
0.4508796499406264
0.5061033561902708
0.531079364987485
0.5476590285682619
0.46029947173572866
0.7917154485345823
0.7025677445623351
0.6184250942701971
0.5402324152260866
0.48449173756379493
0.5036701114194474
Health Care


### Predict industry type with similarity method along two nlp object for all texts in industry_type_list

#### Select the most important first 20 words for each text a and convert list into string

In [76]:
articles_strings = [[dictionary.get(term_id) for term_id, weight in tfidf_weight[:20]] for tfidf_weight in tfidf_weights]
articles_strings = [" ".join(articles_string) for articles_string in articles_strings]

In [77]:
industry_type_list = []
for article in articles_string:
    # Create nlp object for an cleaned article
    doc_article = nlp(article)
    similarities = []
    for doc_industry in doc_industry_list:
        similarities.append(doc_article.similarity(doc_industry))
    max_value = max(similarities)
    max_position = similarities.index(max_value)
    industry_type = key_list[max_position]
    industry_type_list.append(industry_type)

In [79]:
len(industry_type_list)

1626