In [25]:
import get_news
import pandas as pd

In [27]:
# Path to the pickled news snapshot that is loaded below.
new_path = './news_20191220.pkl'
# Alternative loader from the local get_news module, currently disabled:
#df = get_news.incrental_load('./news_20191220.pkl')

# Load the snapshot directly from the pickle file (the incremental loader above is not used).
# NOTE(review): unpickling can execute arbitrary code -- only load files from a trusted source.
df = pd.read_pickle(new_path)

In [28]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,timestamp,headline,link,content
0,"Last updated December 20, 2019 19:38:56 AEDT",‘ABSOLUTELY PETRIFIED’: Drug dealer sobs as he...,https://www.news.com.au/national/courts-law/te...,A terrified drug dealer has sobbed and called ...
1,"Last updated December 20, 2019 19:38:57 AEDT",'Do you even care?' Firey blasts ScoMo,https://www.news.com.au/technology/environment...,A fire station officer has posted an emotional...
2,"Last updated December 20, 2019 19:38:58 AEDT",Roads melting in extreme heat,https://www.news.com.au/technology/environment...,Roads in parts of South Australia are “bleedin...
3,"Last updated December 20, 2019 19:38:58 AEDT",‘Grossly immoral’: Christians lash Trump,https://www.news.com.au/finance/work/leaders/u...,A major Christian magazine has turned against ...
4,"Last updated December 20, 2019 19:38:58 AEDT",‘This is ridiculous’: Teen’s freakish act,https://www.news.com.au/sport/cricket/big-bash...,The Hobart Hurricanes looked like they were st...


## Stem/Tokenise

Tokenising converts sentences into "tokens", i.e. a list of words. The function below also **stems** the tokenised words and removes **stop words** from the tokens.

In [29]:
#%%
import nltk
import re
from nltk.tokenize import punkt
from nltk.stem import SnowballStemmer
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from collections import Counter

# Content text of one randomly sampled row; no random_state is passed, so it changes on every run.
sample = df.sample(1).content.iloc[0]
# One-letter pattern; stem_tokenise matches it at the start of each token,
# so only tokens beginning with an ASCII letter are kept.
keep = re.compile('[a-zA-Z]')

# Shared Porter stemmer instance used by stem_tokenise.
ps = PorterStemmer()

# Lazily populated cache of the English stop words (see _english_stop_words).
_STOP_WORDS = None


def _english_stop_words():
    """Return NLTK's English stop words as a frozenset, loading them only once."""
    global _STOP_WORDS
    if _STOP_WORDS is None:
        _STOP_WORDS = frozenset(stopwords.words('english'))
    return _STOP_WORDS


def stem_tokenise(corpus):
    """Tokenise a text and return its stemmed, stop-word-free tokens.

    The text is split with ``word_tokenize``. A token is kept only when its
    lower-cased form is not an English stop word and the token starts with an
    ASCII letter (the module-level ``keep`` pattern). Kept tokens are
    lower-cased and stemmed with the module-level Porter stemmer ``ps``.

    Parameters
    ----------
    corpus : str
        Raw text of a single document.

    Returns
    -------
    list of str
        Stemmed tokens in their original order.
    """
    # The stop-word list used to be reloaded and rebuilt into a set on every
    # call; this function is mapped over every document, so load it only once.
    stop_words = _english_stop_words()
    tokens = []
    for word in word_tokenize(corpus):
        lowered = word.lower()
        if lowered in stop_words or not keep.match(word):
            continue
        tokens.append(ps.stem(lowered))
    return tokens

## SKlearn DTM

https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html

In [30]:
from sklearn.feature_extraction.text import CountVectorizer

# Build a document-term matrix: one row per article, one column per vocabulary term.
cv = CountVectorizer(stop_words='english')
data_cv = cv.fit_transform(df.content)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older releases.
feature_names = (
    cv.get_feature_names_out()
    if hasattr(cv, 'get_feature_names_out')
    else cv.get_feature_names()
)

# Label the rows with the source frame's index so they line up with the articles.
data_dtm = pd.DataFrame(data_cv.toarray(), columns=feature_names, index=df.index)

# `pprint` was never imported in this notebook (NameError on a fresh kernel);
# ending the cell with the frame displays it (truncated) instead.
data_dtm

    00  000  0003587  0003649  000km  007  018  02  022  023  ...  zoned  \
0    0    0        0        0      0    0    0   0    0    0  ...      0   
1    0    2        2        0      0    0    0   0    0    0  ...      0   
2    0    0        0        0      0    0    0   0    0    0  ...      0   
3    0    1        0        0      0    0    0   0    0    0  ...      0   
4    0    0        0        0      0    0    0   0    0    0  ...      0   
..  ..  ...      ...      ...    ...  ...  ...  ..  ...  ...  ...    ...   
66   0    0        0        0      0    0    0   0    0    0  ...      0   
67   0    0        0        0      0    0    0   0    0    0  ...      0   
68   0    0        0        0      0    0    0   0    0    0  ...      0   
69   0    0        0        0      0    0    0   0    0    0  ...      0   
70   0    0        0        0      0    0    0   0    0    0  ...      0   

    zones  zookal  zoom  zqda3na9jp  ztprxqug8u  ºc  ðÿ  über  œairâ  
0       0       

## Gensim

https://www.machinelearningplus.com/nlp/gensim-tutorial/

### Dictionary ###
A **Dictionary** object maps each word in the tokenised text to an integer id. (https://radimrehurek.com/gensim/corpora/dictionary.html)

The object can be extended incrementally by calling the method *add_documents()*.

A Dictionary can be saved to disk and loaded again later.

In [39]:
import gensim
from gensim import corpora


# Create a gensim dictionary from one column of a pandas DataFrame.
def create_dictionary(df, column_name, save_to_file=None):
    """Build a gensim ``Dictionary`` from a text column.

    Every value in ``df[column_name]`` is tokenised with ``stem_tokenise`` and
    the resulting token lists are passed to ``corpora.Dictionary``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the documents.
    column_name : str
        Name of the column that contains the raw text.
    save_to_file : str, optional
        When truthy, the dictionary is also saved to this path.

    Returns
    -------
    gensim.corpora.Dictionary
        The dictionary that was built.
    """
    tokenised_docs = df[column_name].map(stem_tokenise)
    dictionary = corpora.Dictionary(tokenised_docs)
    if not save_to_file:
        return dictionary
    dictionary.save(save_to_file)
    return dictionary

# Rebuilding from the DataFrame is disabled; the dictionary is loaded from a saved file instead.
#dict = create_dictionary(df, "content")
# NOTE(review): binding the name `dict` shadows the Python builtin for the rest of the session.
# Later cells index this object as dict[id], so a rename has to be applied everywhere at once.
dict = corpora.Dictionary.load('my_dict.dict')
# Display the token -> id mapping.
dict.token2id 

## Bag of Words / TF-IDF

In [62]:
# Generate a bag of words for each document. The representation is a list of
# (id, count) tuples covering only the words that occur in the document.
def create_bow(data_frame, column_name, dict, save_to_file=None):
    """Convert every document in a column into a bag-of-words vector.

    Each value in ``data_frame[column_name]`` is tokenised with
    ``stem_tokenise`` and passed to ``dict.doc2bow`` with ``allow_update=True``.

    Parameters
    ----------
    data_frame : pandas.DataFrame
        Frame holding the documents.
    column_name : str
        Name of the column that contains the raw text.
    dict : gensim.corpora.Dictionary
        Dictionary used for the token -> id lookup.
        NOTE(review): ``allow_update=True`` presumably lets unseen tokens be
        added to this dictionary as a side effect -- confirm against the
        gensim documentation.
    save_to_file : str, optional
        When truthy, the corpus is also serialised to this path with
        ``corpora.MmCorpus.serialize``.

    Returns
    -------
    list
        One bag-of-words vector per document, in row order.
    """
    tokenised_docs = data_frame[column_name].map(stem_tokenise)
    my_corpus = []
    for tokens in tokenised_docs:
        my_corpus.append(dict.doc2bow(tokens, allow_update=True))
    if save_to_file:
        corpora.MmCorpus.serialize(save_to_file, my_corpus)
    return my_corpus

# Rebuilding the corpus is disabled; it is read back from the serialised Matrix Market file.
#my_corpus = create_bow(df, "content", dict, 'bow_corpus.mm')
my_corpus = corpora.MmCorpus('bow_corpus.mm')

# Show the word counts of the first document only (the loop stops after one pass).
for doc in my_corpus:
    first_doc_counts = [[dict[token_id], count] for token_id, count in doc]
    print(first_doc_counts)
    break

[['absolut', 1.0], ['accept', 1.0], ['acknowledg', 1.0], ['across', 1.0], ['act', 1.0], ['ad', 1.0], ['addict', 1.0], ['advertis', 2.0], ['aedt', 1.0], ['afternoon', 1.0], ['age', 1.0], ['allegedli', 3.0], ['almost', 1.0], ['also', 2.0], ['anoth', 1.0], ['appear', 1.0], ['area', 1.0], ['arrest', 2.0], ['ask', 1.0], ['australia', 10.0], ['away', 1.0], ['back', 1.0], ['background', 1.0], ['bail', 1.0], ['becam', 1.0], ['began', 1.0], ['behind', 1.0], ['believ', 1.0], ['black', 1.0], ['boob', 2.0], ['bought', 1.0], ['bra', 4.0], ['bra.sourc', 1.0], ['brought', 1.0], ['burwood', 1.0], ['buyer', 1.0], ['call', 3.0], ['came', 3.0], ['candace.sutton', 1.0], ['cap', 6.0], ['capsul', 6.0], ['carri', 3.0], ['cell', 1.0], ['centr', 1.0], ['chanc', 1.0], ['chaotic', 1.0], ['childhood', 1.0], ['chill', 1.0], ['choic', 1.0], ['circuz', 1.0], ['clear', 1.0], ['co-offend', 1.0], ['collect', 1.0], ['comfort', 1.0], ['commerci', 1.0], ['community-bas', 1.0], ['condom', 2.0], ['contain', 1.0], ['content'

In [64]:
from gensim import models
import numpy as np
import wordcloud as wc

# Create the TF-IDF model
tfidf = models.TfidfModel(my_corpus, smartirs='ntc')

# Show the TF-IDF weights of the first document only (rounded for readability).
for doc in tfidf[my_corpus]:
    print([[dict[token_id], np.around(weight, decimals=2)] for token_id, weight in doc])
    break

# get a word cloud going
# `wordcloud` is imported under the module alias `wc`, so the class has to be
# reached through it; the instance gets its own name instead of rebinding the alias.
word_cloud = wc.WordCloud(
    background_color="white",
    max_words=2000,
    width = 1024,
    height = 720,
    stopwords=stopwords.words("english")
)

# Generate the cloud from the first document's TF-IDF weights.
# generate_from_frequencies() expects a {word: weight} mapping, not a list of pairs.
# The weights are left unrounded: rounding to two decimals turns many of them into 0.0.
weights = {dict[token_id]: float(weight) for token_id, weight in tfidf[my_corpus[0]]}

# to_image() returns the rendered image, so the notebook shows the cloud itself.
word_cloud.generate_from_frequencies(weights).to_image()


[['absolut', 0.01], ['accept', 0.01], ['acknowledg', 0.02], ['across', 0.0], ['act', 0.01], ['ad', 0.0], ['addict', 0.02], ['advertis', 0.0], ['aedt', 0.0], ['afternoon', 0.02], ['age', 0.01], ['allegedli', 0.05], ['almost', 0.01], ['also', 0.0], ['anoth', 0.01], ['appear', 0.0], ['area', 0.01], ['arrest', 0.03], ['ask', 0.01], ['australia', 0.05], ['away', 0.01], ['back', 0.0], ['background', 0.02], ['bail', 0.02], ['becam', 0.01], ['began', 0.01], ['behind', 0.01], ['believ', 0.01], ['black', 0.01], ['boob', 0.03], ['bought', 0.01], ['bra', 0.08], ['bra.sourc', 0.02], ['brought', 0.01], ['burwood', 0.02], ['buyer', 0.01], ['call', 0.02], ['came', 0.02], ['candace.sutton', 0.02], ['cap', 0.09], ['capsul', 0.15], ['carri', 0.03], ['cell', 0.02], ['centr', 0.01], ['chanc', 0.01], ['chaotic', 0.02], ['childhood', 0.02], ['chill', 0.02], ['choic', 0.0], ['circuz', 0.02], ['clear', 0.01], ['co-offend', 0.02], ['collect', 0.0], ['comfort', 0.01], ['commerci', 0.02], ['community-bas', 0.02],

## Download W2V trained model

In [65]:
import gensim.downloader as api

# Get information about the model or dataset
# NOTE: this call is not the last expression of the cell, so its return value is
# not displayed; the commented dictionary below is a sample of what it returns.
api.info('glove-wiki-gigaword-50')
# {'base_dataset': 'Wikipedia 2014 + Gigaword 5 (6B tokens, uncased)',
#  'checksum': 'c289bc5d7f2f02c6dc9f2f9b67641813',
#  'description': 'Pre-trained vectors based on Wikipedia 2014 + Gigaword, 5.6B tokens, 400K vocab, uncased (https://nlp.stanford.edu/projects/glove/).',
#  'file_name': 'glove-wiki-gigaword-50.gz',
#  'file_size': 69182535,
#  'license': 'http://opendatacommons.org/licenses/pddl/',
#  (... truncated...)

# Download the pre-trained vectors through the gensim downloader API.
# The variable is called w2v_model, but the identifier names GloVe vectors.
w2v_model = api.load("glove-wiki-gigaword-50")
# Words most similar to 'blue'; sample output below.
w2v_model.most_similar('blue')
# [('red', 0.8901656866073608),
#  ('black', 0.8648407459259033),
#  ('pink', 0.8452916741371155),
#  ('green', 0.8346816301345825),
#  ... ]



[('red', 0.8901657462120056),
 ('black', 0.8648406863212585),
 ('pink', 0.845291793346405),
 ('green', 0.8346816301345825),
 ('yellow', 0.8320707082748413),
 ('purple', 0.8293111324310303),
 ('white', 0.8225342035293579),
 ('orange', 0.8114302158355713),
 ('bright', 0.799933910369873),
 ('colored', 0.7876655459403992)]

In [71]:
# Words most similar to 'son' according to the loaded vectors.
w2v_model.most_similar('son')

[('father', 0.9528983235359192),
 ('brother', 0.9449328184127808),
 ('cousin', 0.9256070256233215),
 ('uncle', 0.9207189679145813),
 ('nephew', 0.9195291996002197),
 ('grandson', 0.8999317288398743),
 ('grandfather', 0.8975841403007507),
 ('daughter', 0.8961816430091858),
 ('friend', 0.8672630786895752),
 ('elder', 0.8604500889778137)]