# Working with text data #

## 1.0 n-gram in text data: A sequence of n words used in text ##

In [3]:
from urllib2 import urlopen
from bs4 import BeautifulSoup
import pprint
# get the sequence of n words
def getNgrams(input, n):
  input = input.split(' ')
  output = []
  # return words from the text in groups of n
  for i in range(len(input) - n+1):
    # 0:2, 1:3, 2:4...
    output.append(input[i : i+n])
  return output

html = urlopen("http://en.wikipedia.org/wiki/Python_(programming_language)")
bsObj = BeautifulSoup(html, "html.parser")

# Get the body of text without the HTML
content = bsObj.find("div", {"id":"mw-content-text"}).get_text()

# Split into 2 word groups
ngrams = getNgrams(content, 2)

pprint.pprint(ngrams)
print("2-grams count is: "+str(len(ngrams)))

[[u'This', u'article'],
 [u'article', u'is'],
 [u'is', u'about'],
 [u'about', u'the'],
 [u'the', u'programming'],
 [u'programming', u'language.'],
 [u'language.', u'For'],
 [u'For', u'the'],
 [u'the', u'snake'],
 [u'snake', u'genus,'],
 [u'genus,', u'see'],
 [u'see', u'Python'],
 [u'Python', u'(genus).'],
 [u'(genus).', u'For'],
 [u'For', u'other'],
 [u'other', u'uses,'],
 [u'uses,', u'see'],
 [u'see', u'Python'],
 [u'Python',
  u'(disambiguation).\n\nPython\n\n\n\n\nParadigm\nmulti-paradigm:'],
 [u'(disambiguation).\n\nPython\n\n\n\n\nParadigm\nmulti-paradigm:',
  u'object-oriented,'],
 [u'object-oriented,', u'imperative,'],
 [u'imperative,', u'functional,'],
 [u'functional,', u'procedural,'],
 [u'procedural,', u'reflective\n\n\nDesigned\xa0by\nGuido'],
 [u'reflective\n\n\nDesigned\xa0by\nGuido', u'van'],
 [u'van', u'Rossum\n\n\nDeveloper\nPython'],
 [u'Rossum\n\n\nDeveloper\nPython', u'Software'],
 [u'Software', u'Foundation\n\n\nFirst\xa0appeared\n20\xa0February'],
 [u'Foundation\n\

## 1.1 Clean the data ##

In [13]:
from urllib2 import urlopen
from bs4 import BeautifulSoup
import re
import string
from collections import OrderedDict
import pprint

# Clean the data
def cleanInput(input):
    # Replace all newline characters with a space
    input = re.sub('\n+', " ", input)
    
    # Remove wikipedia citation marks which are of the form [24]
    input = re.sub('\[[0-9]*\]', "", input)
    
    # replace all instances of multiple spaces with single space
    input = re.sub(' +', " ", input)
    
    # Eleminate escape characters by encoding the content with UTF-8
    input = input.encode('utf-8')
    input = input.decode("ascii", "ignore")
    
    cleanInput = []
    input = input.split(' ')
    
    # remove all punctuation marks
    for item in input:
        item = item.strip(string.punctuation)
        # Remove all single character words except 'i' and 'a'
        if len(item) > 1 or (item.lower() == 'a' or item.lower() == 'i'):
            cleanInput.append(item)
    return cleanInput

# Same as in 1.1
def getNgrams(input, n):
    input = cleanInput(input)
    output = dict()
    for i in range(len(input)-n+1):
        newNGram = " ".join(input[i:i+n])
        if newNGram in output:
            output[newNGram] += 1
        else:
            output[newNGram] = 1
    return output

html = urlopen("http://en.wikipedia.org/wiki/Python_(programming_language)")
bsObj = BeautifulSoup(html, "html.parser")

# Get the textual data without the HTML
content = bsObj.find("div", {"id":"mw-content-text"}).get_text()

# 2-gram
ngrams = getNgrams(content, 2)

# Order the output in the form of a dictionary sorted in descending order in terms of value
ngrams = OrderedDict(sorted(ngrams.items(), 
                            key = lambda t: t[1], 
                            reverse=True))
pprint.pprint(ngrams)

OrderedDict([(u'Software Foundation', 37), (u'Python Software', 37), (u'of Python', 36), (u'of the', 34), (u'Foundation Retrieved', 32), (u'in the', 21), (u'such as', 20), (u'in Python', 19), (u'van Rossum', 19), (u'Retrieved 24', 18), (u'is a', 18), (u'February 2012', 17), (u'Python Enhancement', 15), (u'from the', 15), (u'to the', 15), (u'Python is', 14), (u'the Python', 14), (u'Enhancement Proposals', 14), (u'Proposals Python', 14), (u'Python has', 13), (u'can be', 12), (u'standard library', 12), (u'as a', 12), (u'Rossum Guido', 12), (u'be used', 11), (u'for the', 10), (u'Python and', 10), (u'of a', 10), (u'by the', 10), (u'programming language', 10), (u'statement which', 9), (u'Python Python', 9), (u'and a', 9), (u'to be', 9), (u'November 2008', 9), (u'December 2012', 9), (u'The Python', 9), (u'with the', 9), (u'24 November', 9), (u'used to', 8), (u'it is', 8), (u'for Python', 8), (u'August 2016', 8), (u'Retrieved December', 8), (u'Retrieved 19', 8), (u'11 February', 7), (u'Python 