In [33]:
import bs4 as bs   #beautiful soup , useful Python utility for web scraping

In [34]:
import urllib.request    # parse XML and HTML

In [35]:
import re     #for regular expressions

In [36]:
import nltk         #for nlp processing

In [37]:
from nltk.tokenize import word_tokenize

In [38]:
scraped_data = urllib.request.urlopen('https://en.wikipedia.org/wiki/Small_data')     #accessing/scraping wiki data

In [39]:
article = scraped_data.read()    #reading the data byte-by-byte

In [40]:
print(article)

b'<!DOCTYPE html>\n<html class="client-nojs" lang="en" dir="ltr">\n<head>\n<meta charset="UTF-8"/>\n<title>Small data - Wikipedia</title>\n<script>document.documentElement.className = document.documentElement.className.replace( /(^|\\s)client-nojs(\\s|$)/, "$1client-js$2" );</script>\n<script>(window.RLQ=window.RLQ||[]).push(function(){mw.config.set({"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":false,"wgNamespaceNumber":0,"wgPageName":"Small_data","wgTitle":"Small data","wgCurRevisionId":872194454,"wgRevisionId":872194454,"wgArticleId":41530851,"wgIsArticle":true,"wgIsRedirect":false,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["CS1 errors: external links","Data management"],"wgBreakFrames":false,"wgPageContentLanguage":"en","wgPageContentModel":"wikitext","wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","O

In [41]:
parsed_article = bs.BeautifulSoup(article,'lxml')      #to parse the data

In [42]:
paragraphs = parsed_article.find_all('p')     #text enclosed within <p> tag is retrieved

In [43]:
article_text = ""     #to combine the paragraphs

In [44]:
for p in paragraphs:  
    article_text += p.text

In [45]:
#Pre-processing
#Removing square brackets and extra spaces
article_text = re.sub(r'\[[0-9]*\]', ' ', article_text)  
article_text = re.sub(r'\s+', ' ', article_text)

In [46]:
print(article_text)

Small data is data that is 'small' enough for human comprehension. It is data in a volume and format that makes it accessible, informative and actionable. The term "big data" is about machines and "small data" is about people. This is to say that eyewitness observations or five pieces of related data could be small data. Small data is what we used to think of as data. The only way to comprehend Big data is to reduce the data into small, visually-appealing objects representing various aspects of large data sets (such as histogram, charts, and scatter plots). Big Data is all about finding correlations, but Small Data is all about finding the causation, the reason why. A formal definition of small data has been proposed by Allen Bonde, former vice-president of Innovation at Actuate - now part of OpenText: "Small data connects people with timely, meaningful insights (derived from big data and/or “local” sources), organized and packaged – often visually – to be accessible, understandable, a

In [47]:
#Removing special characters and digits
formatted_article_text = re.sub('[^a-zA-Z]', ' ', article_text )  
formatted_article_text = re.sub(r'\s+', ' ', formatted_article_text)

In [48]:
print(formatted_article_text)

Small data is data that is small enough for human comprehension It is data in a volume and format that makes it accessible informative and actionable The term big data is about machines and small data is about people This is to say that eyewitness observations or five pieces of related data could be small data Small data is what we used to think of as data The only way to comprehend Big data is to reduce the data into small visually appealing objects representing various aspects of large data sets such as histogram charts and scatter plots Big Data is all about finding correlations but Small Data is all about finding the causation the reason why A formal definition of small data has been proposed by Allen Bonde former vice president of Innovation at Actuate now part of OpenText Small data connects people with timely meaningful insights derived from big data and or local sources organized and packaged often visually to be accessible understandable and actionable for everyday tasks Anoth

In [49]:
text = word_tokenize(article_text)

In [50]:
tagged = nltk.pos_tag(text)
print(tagged)

[('Small', 'NNP'), ('data', 'NNS'), ('is', 'VBZ'), ('data', 'NNS'), ('that', 'WDT'), ('is', 'VBZ'), ("'small", 'DT'), ("'", "''"), ('enough', 'RB'), ('for', 'IN'), ('human', 'JJ'), ('comprehension', 'NN'), ('.', '.'), ('It', 'PRP'), ('is', 'VBZ'), ('data', 'VBN'), ('in', 'IN'), ('a', 'DT'), ('volume', 'NN'), ('and', 'CC'), ('format', 'NN'), ('that', 'WDT'), ('makes', 'VBZ'), ('it', 'PRP'), ('accessible', 'JJ'), (',', ','), ('informative', 'JJ'), ('and', 'CC'), ('actionable', 'JJ'), ('.', '.'), ('The', 'DT'), ('term', 'NN'), ('``', '``'), ('big', 'JJ'), ('data', 'NN'), ("''", "''"), ('is', 'VBZ'), ('about', 'IN'), ('machines', 'NNS'), ('and', 'CC'), ('``', '``'), ('small', 'JJ'), ('data', 'NN'), ("''", "''"), ('is', 'VBZ'), ('about', 'IN'), ('people', 'NNS'), ('.', '.'), ('This', 'DT'), ('is', 'VBZ'), ('to', 'TO'), ('say', 'VB'), ('that', 'IN'), ('eyewitness', 'JJ'), ('observations', 'NNS'), ('or', 'CC'), ('five', 'CD'), ('pieces', 'NNS'), ('of', 'IN'), ('related', 'JJ'), ('data', 'NNS'

In [51]:
ne_tree = nltk.ne_chunk(tagged)
print(ne_tree)

(S
  (GPE Small/NNP)
  data/NNS
  is/VBZ
  data/NNS
  that/WDT
  is/VBZ
  'small/DT
  '/''
  enough/RB
  for/IN
  human/JJ
  comprehension/NN
  ./.
  It/PRP
  is/VBZ
  data/VBN
  in/IN
  a/DT
  volume/NN
  and/CC
  format/NN
  that/WDT
  makes/VBZ
  it/PRP
  accessible/JJ
  ,/,
  informative/JJ
  and/CC
  actionable/JJ
  ./.
  The/DT
  term/NN
  ``/``
  big/JJ
  data/NN
  ''/''
  is/VBZ
  about/IN
  machines/NNS
  and/CC
  ``/``
  small/JJ
  data/NN
  ''/''
  is/VBZ
  about/IN
  people/NNS
  ./.
  This/DT
  is/VBZ
  to/TO
  say/VB
  that/IN
  eyewitness/JJ
  observations/NNS
  or/CC
  five/CD
  pieces/NNS
  of/IN
  related/JJ
  data/NNS
  could/MD
  be/VB
  small/JJ
  data/NNS
  ./.
  Small/NNP
  data/NN
  is/VBZ
  what/WP
  we/PRP
  used/VBD
  to/TO
  think/VB
  of/IN
  as/RB
  data/NNS
  ./.
  The/DT
  only/JJ
  way/NN
  to/TO
  comprehend/VB
  Big/NNP
  data/NN
  is/VBZ
  to/TO
  reduce/VB
  the/DT
  data/NN
  into/IN
  small/JJ
  ,/,
  visually-appealing/JJ
  objects/NNS
  represen

In [52]:
for sent in nltk.sent_tokenize(article_text):
   for chunk in ne_tree:
      if hasattr(chunk, 'label'):
         print(chunk.label(), ' '.join(c[0] for c in chunk))

GPE Small
PERSON Small Data
PERSON Allen Bonde
ORGANIZATION Innovation
ORGANIZATION Actuate
PERSON Small
PERSON Martin Lindstrom
GPE Snapchat
PERSON Lindstrom
PERSON Lindstrom
PERSON Small Data
ORGANIZATION Small Data
PERSON Bonde
ORGANIZATION Forbes
PERSON Direct Marketing News
PERSON Martin Lindstrom
PERSON Small Data
PERSON Small Data
PERSON Small Data
GPE Lindstrom
ORGANIZATION Small Data
PERSON Lindstrom
PERSON Lindstrom
PERSON Lowes Foods
GPE North
PERSON Small Data
ORGANIZATION Cornell University
ORGANIZATION Cornell
ORGANIZATION Small Data Lab
PERSON Weill Cornell Medicine College
PERSON Deborah Estrin
ORGANIZATION Small Data Lab
GPE United States
ORGANIZATION Postal Service
ORGANIZATION USPS
ORGANIZATION OCR
GPE US
ORGANIZATION USPS
PERSON Boeing
ORGANIZATION Carnegie Mellon University
ORGANIZATION AI
ORGANIZATION Harvard Business Review
GPE Small
PERSON Small Data
PERSON Allen Bonde
ORGANIZATION Innovation
ORGANIZATION Actuate
PERSON Small
PERSON Martin Lindstrom
GPE Snapchat

PERSON Allen Bonde
ORGANIZATION Innovation
ORGANIZATION Actuate
PERSON Small
PERSON Martin Lindstrom
GPE Snapchat
PERSON Lindstrom
PERSON Lindstrom
PERSON Small Data
ORGANIZATION Small Data
PERSON Bonde
ORGANIZATION Forbes
PERSON Direct Marketing News
PERSON Martin Lindstrom
PERSON Small Data
PERSON Small Data
PERSON Small Data
GPE Lindstrom
ORGANIZATION Small Data
PERSON Lindstrom
PERSON Lindstrom
PERSON Lowes Foods
GPE North
PERSON Small Data
ORGANIZATION Cornell University
ORGANIZATION Cornell
ORGANIZATION Small Data Lab
PERSON Weill Cornell Medicine College
PERSON Deborah Estrin
ORGANIZATION Small Data Lab
GPE United States
ORGANIZATION Postal Service
ORGANIZATION USPS
ORGANIZATION OCR
GPE US
ORGANIZATION USPS
PERSON Boeing
ORGANIZATION Carnegie Mellon University
ORGANIZATION AI
ORGANIZATION Harvard Business Review
GPE Small
PERSON Small Data
PERSON Allen Bonde
ORGANIZATION Innovation
ORGANIZATION Actuate
PERSON Small
PERSON Martin Lindstrom
GPE Snapchat
PERSON Lindstrom
PERSON Lin

PERSON Boeing
ORGANIZATION Carnegie Mellon University
ORGANIZATION AI
ORGANIZATION Harvard Business Review


In [53]:
sentence_list = nltk.sent_tokenize(article_text)  #fommatted data doesn;t contain punctuation so article_text used

In [54]:
stopwords = nltk.corpus.stopwords.words('english')     #all stopwords stored here

In [55]:
word_frequencies = {}    #calculating freq of words(excluding stopwords)

In [56]:
for word in nltk.word_tokenize(formatted_article_text):  
    if word not in stopwords:
        if word not in word_frequencies.keys():    #encountered for the first time
            word_frequencies[word] = 1     
        else:                                      #incrementing count
            word_frequencies[word] += 1

In [57]:
print(word_frequencies)

{'Small': 16, 'data': 28, 'small': 14, 'enough': 1, 'human': 1, 'comprehension': 1, 'It': 2, 'volume': 1, 'format': 1, 'makes': 1, 'accessible': 2, 'informative': 1, 'actionable': 2, 'The': 5, 'term': 1, 'big': 2, 'machines': 1, 'people': 4, 'This': 2, 'say': 1, 'eyewitness': 1, 'observations': 3, 'five': 1, 'pieces': 2, 'related': 1, 'could': 1, 'used': 2, 'think': 1, 'way': 2, 'comprehend': 1, 'Big': 4, 'reduce': 2, 'visually': 2, 'appealing': 1, 'objects': 2, 'representing': 1, 'various': 1, 'aspects': 1, 'large': 1, 'sets': 2, 'histogram': 1, 'charts': 1, 'scatter': 1, 'plots': 1, 'Data': 14, 'finding': 2, 'correlations': 2, 'causation': 1, 'reason': 1, 'A': 1, 'formal': 1, 'definition': 2, 'proposed': 1, 'Allen': 1, 'Bonde': 2, 'former': 1, 'vice': 1, 'president': 1, 'Innovation': 1, 'Actuate': 1, 'part': 1, 'OpenText': 1, 'connects': 1, 'timely': 1, 'meaningful': 2, 'insights': 4, 'derived': 1, 'local': 1, 'sources': 1, 'organized': 2, 'packaged': 1, 'often': 1, 'understandable':

In [58]:
maximum_frequncy = max(word_frequencies.values())

In [59]:
#Calculating weighted freq of all words
for word in word_frequencies.keys():
    word_frequencies[word] = (word_frequencies[word]/maximum_frequncy)

In [60]:
sentence_scores = {}    #calculating freq of sentences

In [61]:
for sent in sentence_list:  
    for word in nltk.word_tokenize(sent.lower()):   #convert to lowercase
        if word in word_frequencies.keys():        
            if len(sent.split(' ')) < 30:      #for smaller sentences < 30
                if sent not in sentence_scores.keys():
                    sentence_scores[sent] = word_frequencies[word]
                else:
                    sentence_scores[sent] += word_frequencies[word]

In [62]:
import heapq    #to retrieve highest scores

In [63]:
summary_sentences = heapq.nlargest(7, sentence_scores, key=sentence_scores.get)

In [64]:
summary = ' '.join(summary_sentences)
print(summary) 

According to Martin Lindstrom, in his book, Small Data: "{In customer research, small data is} Seemingly insignificant behavioural observations containing very specific attributes pointing towards an unmet customer need. The Small Data Lab developed a series of apps, focusing not only on gathering data from patients' pain but also tracking habits in areas such as grocery shopping. In comparison to Big Data, Small Data has the power to trigger emotions and to provide insights into the reasons behind the behaviours of customers. This is to say that eyewitness observations or five pieces of related data could be small data. Big Data is all about finding correlations, but Small Data is all about finding the causation, the reason why. The term "big data" is about machines and "small data" is about people. Small data is what we used to think of as data.
