In [12]:
import numpy as np
import pandas as pd
import texthero as hero
import os
import nltk
from newspaper import Article
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import scattertext as st
import pytextrank
import spacy
from pprint import pprint
import datetime 

import matplotlib.pyplot as plt
%matplotlib inline


In [13]:
# Canonical names shared by several digest heading variants.
_CONCERNS = "Concerns & Hype"
_ADVANCES = "Advances & Business"

# Heading text as it appears in a digest -> canonical category name.
# The pair order is deliberate: the mapping is iterated key by key when
# digests are parsed, so insertion order must stay exactly as listed.
CATEGORY_MAPPINGS = dict([
    ("The Hype", _CONCERNS),
    ("The Panic", _CONCERNS),
    ("The good coverage", _ADVANCES),
    ("Expert Opinions & Discussion within the field",
     "Expert Opinions & Discussion within the field"),
    ("Explainers", "Explainers"),
    ("AI Advances", _ADVANCES),
    ("AI Worries", _CONCERNS),
    ("Advances & Business", _ADVANCES),
    ("Concerns & Hype", _CONCERNS),
    ("Analysis & Policy", "Analysis & Policy"),
    ("Mini Briefs", "Mini Briefs"),
])

In [14]:
def parse_file(file_name):
    """Parse one markdown digest file into a list of article rows.

    The file is scanned line by line. A line containing any key of
    ``CATEGORY_MAPPINGS`` switches the current category (when several keys
    match, the last one in mapping order wins). Once a category is set,
    every line containing both ``[`` and ``(`` is read as a markdown link
    ``[title](url)``; if the line also contains ``' - '`` the segment after
    the first ``' - '`` is taken as the excerpt. Links whose title has fewer
    than four space-separated words are skipped.

    Each URL is downloaded and analysed with ``newspaper.Article``. This is
    best-effort and all-or-nothing: if any step fails, the row falls back to
    no authors, no date, no keywords, empty text and the title as summary.

    Args:
        file_name: Path of the digest file to read (decoded as UTF-8).

    Returns:
        A list of rows, each a list of
        ``[category, title, date, url, excerpt, authors, keywords, summary, text]``.
    """
    articles = []
    current_category = None
    # Explicit encoding so the result does not depend on the platform default.
    with open(file_name, 'r', encoding='utf-8') as f:
        for line in f:
            # No break on purpose: the last matching key decides the category.
            for c in CATEGORY_MAPPINGS:
                if c in line:
                    current_category = CATEGORY_MAPPINGS[c]

            if not (current_category and '[' in line and '(' in line):
                continue

            title = line.split('[')[1].split(']')[0]
            url = line.split('(')[1].split(')')[0]
            # Very short link texts are not treated as article titles.
            if len(title.split(' ')) < 4:
                continue

            if ' - ' in line:
                excerpt = line.split(' - ')[1].strip()
            else:
                excerpt = ''

            article = Article(url)
            try:
                article.download()
                article.parse()
                authors = article.authors
                date = article.publish_date
                text = article.text
                article.nlp()
                keywords = article.keywords
                summary = article.summary
            except Exception:
                # Deliberate best-effort: an unreachable or unparsable article
                # still yields a row. `Exception` (rather than a bare except)
                # lets KeyboardInterrupt stop a long scraping run.
                authors = None
                date = None
                keywords = []
                text = ''
                summary = title

            articles.append([str(current_category),
                             title,
                             date,
                             url,
                             excerpt,
                             authors,
                             keywords,
                             summary,
                             text])
    return articles

In [None]:
# Load every digest in ./digests and collect its articles.
#   edition_date_mapping: edition number -> digest date
#   all_articles: rows of [edition, digest_date, category, title, ...]
#   category_counts: category -> number of articles
edition_date_mapping = {}
all_articles = []
category_counts = {}
# Sorted so the row order does not depend on the arbitrary os.listdir order.
for file_name in sorted(os.listdir('digests')):
    if '.md' not in file_name or 'year' in file_name:
        continue
    # File names are parsed as <year>-<month>-<day>-<edition>.<ext>.
    name_parts = file_name.split('.')[0].split('-')
    year, month, day, edition = (int(part) for part in name_parts[:4])
    edition_date = datetime.datetime(year, month, day)
    edition_date_mapping[edition] = edition_date
    articles = parse_file(os.path.join('digests', file_name))
    for article in articles:
        # Prepend so the row starts with [edition, digest_date, category, ...].
        article.insert(0, edition_date)
        article.insert(0, edition)
        # Count by category (index 2). The original guard tested index 0
        # (the edition), which reset every counter to 0 on each article.
        category = article[2]
        category_counts[category] = category_counts.get(category, 0) + 1
    all_articles += articles

In [None]:
# Sanity check: total number of parsed articles, then the per-category tally.
print(len(all_articles), category_counts, sep='\n')

In [None]:
# One row per article; the column names follow the positional layout of
# the lists stored in `all_articles`.
df = pd.DataFrame(
    all_articles,
    columns=[
        'edition',
        'digest_date',
        'category',
        'title',
        'article_date',
        'url',
        'excerpt',
        'authors',
        'keywords',
        'summary',
        'text',
    ],
)

In [None]:
# Distinct category labels, in order of first appearance.
categories = pd.unique(df['category'])

In [None]:
# Frequency of lowercased summary tokens (second-to-last field of each
# article row), conditioned on the article's category (index 2).
cfd = nltk.ConditionalFreqDist(
    (article[2], word.lower())
    for article in all_articles
    for word in nltk.tokenize.word_tokenize(article[-2]))
# Words to show as table columns.
modals = ['ai',
 'bias',
 'neural',
 'robot',
 'artificial',
 'facial']
# `tabulate` only reads keyword arguments, so the categories have to be
# passed as `conditions=`. Passed positionally they were silently ignored.
# Sorted to keep the row order of the default output.
cfd.tabulate(conditions=sorted(categories), samples=modals)

In [None]:
# Cumulative number of articles over time, ordered by digest date (index 1).
sorted_articles = sorted(all_articles, key=lambda x: x[1])
# Count from 1 so each point is the number of articles seen so far and the
# curve ends at len(all_articles); counting from 0 was off by one.
dicts = [
    {'Date': article[1], 'Article Count': i}
    for i, article in enumerate(sorted_articles, start=1)
]
df_plot = pd.DataFrame(dicts)
df_plot.plot(x='Date', y='Article Count')

In [None]:
# Rebind `cfd`: frequency of lowercased title tokens (row index 3),
# conditioned on the edition number (row index 0).
cfd = nltk.ConditionalFreqDist(
    (row[0], token.lower())
    for row in all_articles
    for token in nltk.tokenize.word_tokenize(row[3])
)

def plot_trends(words, synonyms=None):
    """Plot cumulative counts of the given words across editions.

    Reads the module-level ``cfd`` (per-edition word counts), ``df`` (for the
    list of editions) and ``edition_date_mapping`` (edition -> date).

    Args:
        words: Words to look up in each edition's counts.
        synonyms: Optional mapping ``word -> series name``. The counts of a
            word that appears as a key are added to the series named by its
            value instead of its own. Mappings are applied one level only
            and are not followed transitively. Keys that are not in
            ``words`` have no effect.

    Returns:
        None. Draws a line plot with one series per distinct series name.
    """
    # Avoid a shared mutable default argument.
    if synonyms is None:
        synonyms = {}

    # Series each requested word accumulates into.
    series_for = {word: synonyms.get(word, word) for word in words}
    # Seed totals from the series names, so a synonym target that is not
    # itself listed in `words` no longer raises KeyError.
    word_totals = {series: 0 for series in series_for.values()}

    dicts = []
    for x in sorted(df['edition'].unique()):
        word_counts = cfd[x]
        d = {'Date': edition_date_mapping[x]}
        for word in words:
            word_key = series_for[word]
            word_totals[word_key] += word_counts[word]
            # Running total so far; overwritten as further synonyms add in.
            d[word_key] = word_totals[word_key]
        dicts.append(d)
    df_plot = pd.DataFrame(dicts)
    df_plot.plot(x='Date')

In [None]:
# 'learned' and 'research' have to be in the word list, otherwise their
# synonym mappings are never applied. They are listed last so the series
# order in the plot is unchanged.
plot_trends(
    ['ai', 'robot', 'neural', 'deep', 'researchers', 'learning', 'artificial',
     'learned', 'research'],
    {'learned': 'learning', 'research': 'researchers'},
)

In [None]:
# Same as the previous plot without 'ai'. 'learned' and 'research' have to
# be in the word list, otherwise their synonym mappings are never applied.
plot_trends(
    ['robot', 'neural', 'deep', 'researchers', 'learning', 'artificial',
     'learned', 'research'],
    {'learned': 'learning', 'research': 'researchers'},
)

In [None]:
# Concern-related terms; each synonym is folded into one series.
words = ['biased', 'bias',
         'surveillance',  'facial',
         'coronavirus', 'covid', 'covid-19',
         'fake', 'deepfake', 'deepfakes',
         'military', 'weapon',
         'jobs', 'automation']
# Synonyms are applied one level only, so 'deepfakes' maps straight to
# 'fake'. Chaining it through 'deepfake' left a separate 'deepfake' series
# holding only the 'deepfakes' counts.
synonyms = {'covid-19': 'coronavirus',  'covid':'coronavirus',
            'deepfakes':'fake', 'deepfake': 'fake',
            'weapon': 'military',
            'bias': 'biased',
            'automation': 'jobs'}
plot_trends(words, synonyms)

In [None]:
# Concern-related terms without 'facial'; each synonym is folded into one
# series.
words = ['biased', 'bias',
         'surveillance',
         'coronavirus', 'covid', 'covid-19',
         'fake', 'deepfake', 'deepfakes',
         'military', 'weapon',
         'jobs', 'automation']
# Synonyms are applied one level only, so 'deepfakes' maps straight to
# 'fake'. Chaining it through 'deepfake' left a separate 'deepfake' series
# holding only the 'deepfakes' counts.
synonyms = {'covid-19': 'coronavirus',  'covid':'coronavirus',
            'deepfakes':'fake', 'deepfake': 'fake',
            'weapon': 'military',
            'bias': 'biased',
            'automation': 'jobs'}
plot_trends(words, synonyms)

In [None]:
# Application-area terms, grouped by the series they end up in.
words = [
    'image',
    'language',
    'robot',
    'bot',
    'coronavirus',
    'covid',
    'covid-19',
    'medical',
    'medicine',
    'diagnose',
    'predict',
    'climate',
]
# Variant spelling -> series whose running total it contributes to.
synonyms = {
    'covid-19': 'coronavirus',
    'covid': 'coronavirus',
    'diagnose': 'medical',
    'medicine': 'medical',
    'bot': 'robot',
}
plot_trends(words, synonyms)

In [None]:
# Organisation names; no synonyms, so each word gets its own series.
words = [
    'openai',
    'deepmind',
    'google',
    'microsoft',
    'amazon',
    'facebook',
    'stanford',
    'berkeley',
]
plot_trends(words)

In [None]:
# Titles: clean -> TF-IDF -> PCA, stored in df['pca'] and plotted coloured
# by category.
df['pca'] = hero.pca(hero.tfidf(hero.clean(df['title'])))
hero.scatterplot(df, 'pca', color='category', title="AI News")

In [None]:
# Summaries: clean -> TF-IDF -> PCA, overwriting df['pca'], then plotted
# coloured by category.
df['pca'] = hero.pca(hero.tfidf(hero.clean(df['summary'])))
hero.scatterplot(df, 'pca', color='category', title="AI News")

In [None]:
# Full article text: clean -> TF-IDF -> PCA, overwriting df['pca'], then
# plotted coloured by category.
df['pca'] = hero.pca(hero.tfidf(hero.clean(df['text'])))
hero.scatterplot(df, 'pca', color='category', title="AI News")

In [None]:
# Load the English pipeline by package name. The 'en' shortcut was removed
# in spaCy 3, so it is kept only as a fallback for older installs that have
# just the shortcut link.
try:
    nlp = spacy.load('en_core_web_sm')
except OSError:
    nlp = spacy.load('en')

# Scattertext corpus over the article text, grouped by category and
# compacted with AssociationCompactor(2500).
corpus = st.CorpusFromPandas(df,
                              category_col='category',
                              text_col='text',
                              nlp=nlp).build().compact(st.AssociationCompactor(2500))

In [None]:
# First 25 index entries of the scaled F-scores against the background.
print('Top common words:')
pprint(corpus.get_scaled_f_scores_vs_background().index[:25].tolist())

In [None]:
def make_cat_explorer(cat):
    """Write a scattertext explorer page comparing one category to the rest.

    Uses the module-level ``corpus``. The page is written, UTF-8 encoded, to
    ``scatterplots/<cat> Viz.html``.

    Args:
        cat: Category name; used as the scattertext category, as its display
            name and in the output file name.

    Returns:
        None.
    """
    html = st.produce_scattertext_explorer(corpus,
              category=cat,
              category_name=cat,
              not_category_name='Other',
              width_in_pixels=1000,)
    # Create the output directory so a fresh checkout does not fail here.
    os.makedirs('scatterplots', exist_ok=True)
    # The context manager closes (and flushes) the file; the original left
    # the handle open.
    with open("scatterplots/%s Viz.html"%cat, 'wb') as out_file:
        out_file.write(html.encode('utf-8'))

In [None]:
# Write one explorer page per category.
for category_name in categories:
    make_cat_explorer(category_name)

In [None]:
def make_word_cloud(cat=None):
    """Show a word cloud built from article summaries.

    Reads the module-level ``all_articles``. Each row's second-to-last field
    (the summary) is lowercased and split on whitespace.

    Args:
        cat: Category to restrict the cloud to, or ``None`` (default) to use
            every article.

    Returns:
        None. Displays the figure with matplotlib.
    """
    stopwords = set(STOPWORDS)

    # Collect the lowercased summary of every selected article.
    pieces = []
    for article in all_articles:
        # The category is at index 2. The original compared index 0 (the
        # edition number), so any non-None `cat` filtered out every article.
        if cat is not None and article[2] != cat:
            continue
        tokens = [token.lower() for token in article[-2].split()]
        pieces.append(" ".join(tokens) + " ")
    comment_words = "".join(pieces)

    wordcloud = WordCloud(width = 800, height = 800,
                    background_color ='white',
                    stopwords = stopwords,
                    min_font_size = 10).generate(comment_words)

    # plot the WordCloud image
    plt.figure(figsize = (8, 8), facecolor = None)
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.tight_layout(pad = 0)

    plt.show()

In [None]:
# Word cloud over all articles (cat defaults to None, i.e. no filter).
make_word_cloud()