In [6]:
pip install bs4 nltk

Note: you may need to restart the kernel to use updated packages.


In [77]:
import pandas as pd
import numpy as np

import nltk
nltk.download('stopwords')
 
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
from string import punctuation
from collections import Counter
 
from collections import OrderedDict
import re
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
 
from html.parser import HTMLParser
from bs4 import BeautifulSoup

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ICHAL\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [97]:
porter = PorterStemmer()
wnl = WordNetLemmatizer() 
stop = stopwords.words('english')
stop.append("new")
stop.append("like")
stop.append("u")
stop.append("it'")
stop.append("'s")
stop.append("n't")
stop.append('mr.')
stop = set(stop)

In [98]:
# From http://ahmedbesbes.com/how-to-mine-newsfeed-data-and-extract-interactive-insights-in-python.html

def tokenizer(text):

    tokens_ = [word_tokenize(sent) for sent in sent_tokenize(text)]

    tokens = []
    for token_by_sent in tokens_:
        tokens += token_by_sent

    tokens = list(filter(lambda t: t.lower() not in stop, tokens))
    tokens = list(filter(lambda t: t not in punctuation, tokens))
    tokens = list(filter(lambda t: t not in [u"'s", u"n't", u"...", u"''", u'``', u'\u2014', u'\u2026', u'\u2013'], tokens))
     
    filtered_tokens = []
    for token in tokens:
        token = wnl.lemmatize(token)
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)

    filtered_tokens = list(map(lambda token: token.lower(), filtered_tokens))

    return filtered_tokens

In [99]:
class MLStripper(HTMLParser):
    def __init__(self):
        super().__init__()
        self.reset()
        self.fed = []
    def handle_data(self, d):
        self.fed.append(d)
    def get_data(self):
        return ''.join(self.fed)

def strip_tags(html):
    s = MLStripper()
    s.feed(html)
    return s.get_data()

In [100]:
def get_keywords(tokens, num):
    return Counter(tokens).most_common(num)

In [114]:
def build_article_df(urls):
    articles = []
    for index, row in urls.iterrows():
        try:
            data=row['text'].strip().replace("'", "")
            data = strip_tags(data)
            soup = BeautifulSoup(data)
            data = soup.get_text()
            data = data.encode('ascii', 'ignore').decode('ascii')
            document = tokenizer(data)
            top_3 = get_keywords(document, 3)
          
            unzipped = list(zip(*top_3))
            kw= list(unzipped[0])
            kw=",".join(str(x) for x in kw)
            articles.append((kw, row['title'], row['pubdate']))
        except Exception as e:
            print(e)
            #print data
            #break
            pass
        #break
    article_df = pd.DataFrame(articles, columns=['keywords', 'title', 'pubdate'])
    return article_df

In [131]:
df = pd.read_csv('tocsv.csv')
data = []
for index, row in df.iterrows():
    data.append((row['Title'], row['Permalink'], row['Date'], row['Content']))
data_df = pd.DataFrame(data, columns=['title' ,'url', 'pubdate', 'text' ])

In [132]:
data_df.tail()

Unnamed: 0,title,url,pubdate,text
143,Driving Digital by Isaac Sacolick - a book review,http://ericbrown.com/driving-digital-isaac-sac...,20170906,"<img class=""alignleft size-medium wp-image-975..."
144,Data and Culture go hand in hand,http://ericbrown.com/?p=9757,-11130,"Last week, I spent an afternoon talking to the..."
145,Data Quality - The most important data dimension?,http://ericbrown.com/data-quality-most-importa...,20170918,"<img class=""size-medium wp-image-9764 alignrig..."
146,"Be pragmatic, not dogmatic",http://ericbrown.com/be-pragmatic-not-dogmatic...,20170928,"<img class=""alignright size-medium wp-image-97..."
147,The Data Way,http://ericbrown.com/the-data-way.htm,20171003,"<img class=""alignleft size-medium wp-image-977..."


In [68]:
import nltk
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ICHAL\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ICHAL\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\wordnet.zip.


True

In [115]:
article_df = build_article_df(data_df)
article_df.head()

Unnamed: 0,keywords,title,pubdate
0,"data,big,culture",Building a Data Culture,20141118
1,"data,data-driven,make","Note to Self - Don't say ""Data Driven"" Anymore",20141120
2,"captured,canon,titmouse",Foto Friday - Titmouse on the Feeder,20141121
3,"mobility,organization,device",The Cloud - Gateway to Enterprise Mobility,20141121
4,"data,center,agile",The Agile Data Center,20141124


In [108]:
keywords_array=[]
for index, row in article_df.iterrows():
    keywords=row['keywords'].split(',')
    for kw in keywords:
        keywords_array.append((kw.strip(' '), row['keywords']))
kw_df = pd.DataFrame(keywords_array).rename(columns={0:'keyword', 1:'keywords'})

In [109]:
kw_df.head()

Unnamed: 0,keyword,keywords
0,data,"data,big,culture"
1,big,"data,big,culture"
2,culture,"data,big,culture"
3,data,"data,data-driven,make"
4,data-driven,"data,data-driven,make"


In [87]:
document = kw_df.keywords.tolist()
names = kw_df.keyword.tolist()

document_array = []
for item in document:
    items = item.split(',')
    document_array.append((items))

occurrences = OrderedDict((name, OrderedDict((name, 0) for name in names)) for name in names)

# Find the co-occurrences:
for l in document_array:
    for i in range(len(l)):
        for item in l[:i] + l[i + 1:]:
            occurrences[l[i]][item] += 1

co_occur = pd.DataFrame.from_dict(occurrences)

In [133]:
co_occur.to_csv('ericbrown_co-occurancy_matrix.csv')
co_occur.head()

Unnamed: 0,data,big,culture,may,skill,data-driven,make,company,decision,captured,...,manager,love,song,scene,isaac,quality,governance,pragmatic,dogmatic,thats
7d,0,0,0,0,0,0,0,0,0,10,...,0,0,0,0,0,0,0,0,0,0
abstract,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
access,5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
across,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
act,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [172]:
# latihan saja
text = data_df.iloc[0]['text']
data = text.strip().replace("'", "")
data = strip_tags(data)
soup = BeautifulSoup(data)
data = soup.get_text()
data = data.encode('ascii', 'ignore').decode('ascii')

tokens_ = [word_tokenize(sent) for sent in sent_tokenize(data)]
tokens = []
for token_by_sent in tokens_:
    tokens += token_by_sent

tokens = list(filter(lambda t: t.lower() not in stop, tokens))
tokens = list(filter(lambda t: t not in punctuation, tokens))
tokens = list(filter(lambda t: t not in [u"'s", u"n't", u"...", u"''", u'``', u'\u2014', u'\u2026', u'\u2013'], tokens))

filtered_tokens = []
for token in tokens:
    token = wnl.lemmatize(token)
    if re.search('[a-zA-Z]', token):
        filtered_tokens.append(token)

filtered_tokens = list(map(lambda token: token.lower(), filtered_tokens))

counts = Counter(filtered_tokens)

sorted_counts = sorted(counts.items(), key=lambda count: count[1], reverse=True)
sorted_counts

[('data', 29),
 ('big', 8),
 ('culture', 8),
 ('may', 7),
 ('skill', 6),
 ('time', 6),
 ('corporate', 5),
 ('failure', 5),
 ('analysis', 5),
 ('many', 4),
 ('initiative', 4),
 ('scientist', 4),
 ('project', 4),
 ('system', 3),
 ('right', 3),
 ('people', 3),
 ('one', 3),
 ('analytics', 3),
 ('listen', 3),
 ('willing', 3),
 ('spent', 3),
 ('company', 2),
 ('want', 2),
 ('money', 2),
 ('software', 2),
 ('training', 2),
 ('able', 2),
 ('analyze', 2),
 ('use', 2),
 ('need', 2),
 ('area', 2),
 ('organization', 2),
 ('fail', 2),
 ('program', 2),
 ('involves', 2),
 ('listening', 2),
 ('investigation', 2),
 ('success', 2),
 ('doesnt', 2),
 ('information', 2),
 ('act', 2),
 ('important', 2),
 ('aspect', 2),
 ('dont', 2),
 ('example', 2),
 ('argument', 2),
 ('show', 2),
 ('cultural', 2),
 ('make', 2),
 ('well', 2),
 ('working', 2),
 ('arent', 2),
 ('finding', 2),
 ('value', 2),
 ('end', 2),
 ('much', 2),
 ('small', 2),
 ('today', 1),
 ('theyre', 1),
 ('spending', 1),
 ('consulting', 1),
 ('servic

In [177]:
my_text = nltk.Text(filtered_tokens)
my_text.concordance('data')

Displaying 25 of 29 matches:
many company want big data today theyre spending money system s
ice able capture process analyze use data thing need done data science capabil
ess analyze use data thing need done data science capability skill companies n
ople skill able properly analyze use data theres one area many organization fa
y organization fail address building data analytics program skill area involve
lture play huge role success failure data analytics program company culture do
ogram company culture doesnt hearing data may provide conflicting information 
 provide conflicting information big data initiative may set failure beginning
inning experience ability listen act data one important aspect corporate cultu
spect corporate culture lead success data analytics big data dont corporate cu
ture lead success data analytics big data dont corporate culture leadership te
nformation example ceo doesnt listen data argument go belief may difficult tim
rgument go belief may difficult time d

In [181]:
from nltk import BigramCollocationFinder
finder = BigramCollocationFinder.from_words(filtered_tokens)

bigram_measures = nltk.collocations.BigramAssocMeasures()
finder.score_ngrams(bigram_measures.raw_freq)

[(('big', 'data'), 0.027874564459930314),
 (('corporate', 'culture'), 0.017421602787456445),
 (('data', 'initiative'), 0.013937282229965157),
 (('data', 'scientist'), 0.013937282229965157),
 (('data', 'analytics'), 0.010452961672473868),
 (('act', 'data'), 0.006968641114982578),
 (('analytics', 'program'), 0.006968641114982578),
 (('analyze', 'use'), 0.006968641114982578),
 (('data', 'analysis'), 0.006968641114982578),
 (('data', 'may'), 0.006968641114982578),
 (('time', 'data'), 0.006968641114982578),
 (('time', 'spent'), 0.006968641114982578),
 (('use', 'data'), 0.006968641114982578),
 (('ability', 'listen'), 0.003484320557491289),
 (('able', 'capture'), 0.003484320557491289),
 (('able', 'properly'), 0.003484320557491289),
 (('accept', 'failure'), 0.003484320557491289),
 (('accepting', 'competing'), 0.003484320557491289),
 (('address', 'building'), 0.003484320557491289),
 (('allows', 'continuously'), 0.003484320557491289),
 (('analysis', 'dont'), 0.003484320557491289),
 (('analysis',

In [185]:
my_text = nltk.Text(filtered_tokens)
my_text.findall('<.*><data><.*>')

big data today; use data thing; done data science; use data theres;
building data analytics; failure data analytics; hearing data may; big
data initiative; act data one; success data analytics; big data dont;
listen data argument; time data analysis; argument data top; big data
cultural; working data arent; curious data willing; investigating data
may; become data scientist; act data scientist; finding data
investigation; big data initiative; time data analysis; spent data
scientist; look data many; great data scientist; big data initiative;
big data initiative; big data mindset
