# Working with text data #

## 1.0 n-gram in text data: A sequence of n words used in text ##

In [3]:
from urllib2 import urlopen
from bs4 import BeautifulSoup
import pprint
# get the sequence of n words
def getNgrams(input, n):
    """Return every run of n consecutive words found in the text.

    The text is split on single space characters only, so newlines and
    punctuation stay attached to the neighbouring words.  The result is a
    list of n-item lists, one per starting position; it is empty when the
    text holds fewer than n words.
    """
    words = input.split(' ')
    # Window `start` covers words[start : start+n]: 0:2, 1:3, 2:4, ... for n == 2.
    return [words[start:start + n] for start in range(len(words) - n + 1)]

# Download the Wikipedia article and parse it with the "html.parser" backend
html = urlopen("http://en.wikipedia.org/wiki/Python_(programming_language)")
bsObj = BeautifulSoup(html, "html.parser")

# Get the body of text without the HTML: only the text inside the
# <div id="mw-content-text"> element is kept
content = bsObj.find("div", {"id":"mw-content-text"}).get_text()

# Split into 2 word groups
ngrams = getNgrams(content, 2)

# Show every 2-gram, followed by how many were produced
pprint.pprint(ngrams)
print("2-grams count is: "+str(len(ngrams)))

[[u'This', u'article'],
 [u'article', u'is'],
 [u'is', u'about'],
 [u'about', u'the'],
 [u'the', u'programming'],
 [u'programming', u'language.'],
 [u'language.', u'For'],
 [u'For', u'the'],
 [u'the', u'snake'],
 [u'snake', u'genus,'],
 [u'genus,', u'see'],
 [u'see', u'Python'],
 [u'Python', u'(genus).'],
 [u'(genus).', u'For'],
 [u'For', u'other'],
 [u'other', u'uses,'],
 [u'uses,', u'see'],
 [u'see', u'Python'],
 [u'Python',
  u'(disambiguation).\n\nPython\n\n\n\n\nParadigm\nmulti-paradigm:'],
 [u'(disambiguation).\n\nPython\n\n\n\n\nParadigm\nmulti-paradigm:',
  u'object-oriented,'],
 [u'object-oriented,', u'imperative,'],
 [u'imperative,', u'functional,'],
 [u'functional,', u'procedural,'],
 [u'procedural,', u'reflective\n\n\nDesigned\xa0by\nGuido'],
 [u'reflective\n\n\nDesigned\xa0by\nGuido', u'van'],
 [u'van', u'Rossum\n\n\nDeveloper\nPython'],
 [u'Rossum\n\n\nDeveloper\nPython', u'Software'],
 [u'Software', u'Foundation\n\n\nFirst\xa0appeared\n20\xa0February'],
 [u'Foundation\n\

## 1.1 Clean the data ##

In [13]:
from urllib2 import urlopen
from bs4 import BeautifulSoup
import re
import string
from collections import OrderedDict
import pprint

# Clean the data
def cleanInput(input):
    """Split raw page text into a list of cleaned words.

    Citation marks such as [24] are removed, every run of whitespace counts
    as one separator, non-ASCII characters are dropped and surrounding
    punctuation is stripped from each word.  Single-character words are
    discarded unless they are 'a' or 'i' (in either case).

    Returns the cleaned words as a list, in their original order.
    """
    # Remove wikipedia citation marks which are of the form [24]
    input = re.sub(r'\[[0-9]*\]', "", input)

    # Replace every run of whitespace (newlines, tabs, non-breaking spaces,
    # repeated blanks) with a single space.  This has to happen before the
    # ASCII filter below: otherwise a non-breaking space is simply deleted
    # and the two words around it are glued together.
    input = re.sub(r'\s+', " ", input, flags=re.UNICODE)

    # Eliminate the remaining non-ASCII characters by round-tripping through
    # bytes and ignoring everything that is not ASCII
    input = input.encode('utf-8').decode("ascii", "ignore")

    words = []
    for item in input.split(' '):
        # remove all punctuation marks around the word
        item = item.strip(string.punctuation)
        # Remove all single character words except 'i' and 'a'; this also
        # drops the empty strings left behind by consecutive separators
        if len(item) > 1 or item.lower() in ('a', 'i'):
            words.append(item)
    return words

# Unlike the version in 1.0, this one counts how often each n-gram occurs
def getNgrams(input, n):
    """Count the n-grams of the cleaned text.

    The text is first passed through cleanInput; each run of n consecutive
    cleaned words is joined with single spaces and used as a dictionary key
    whose value is the number of times that n-gram occurs.
    """
    words = cleanInput(input)
    counts = dict()
    for start in range(len(words) - n + 1):
        key = " ".join(words[start:start + n])
        counts[key] = counts.get(key, 0) + 1
    return counts

# Download the Wikipedia article and parse it with the "html.parser" backend
html = urlopen("http://en.wikipedia.org/wiki/Python_(programming_language)")
bsObj = BeautifulSoup(html, "html.parser")

# Get the textual data without the HTML: only the text inside the
# <div id="mw-content-text"> element is kept
content = bsObj.find("div", {"id":"mw-content-text"}).get_text()

# 2-gram: map each pair of consecutive cleaned words to its count
ngrams = getNgrams(content, 2)

# Re-pack the counts into an OrderedDict sorted by count (t[1]),
# most frequent n-gram first
ngrams = OrderedDict(sorted(ngrams.items(), 
                            key = lambda t: t[1], 
                            reverse=True))
pprint.pprint(ngrams)

OrderedDict([(u'Software Foundation', 37), (u'Python Software', 37), (u'of Python', 36), (u'of the', 34), (u'Foundation Retrieved', 32), (u'in the', 21), (u'such as', 20), (u'in Python', 19), (u'van Rossum', 19), (u'Retrieved 24', 18), (u'is a', 18), (u'February 2012', 17), (u'Python Enhancement', 15), (u'from the', 15), (u'to the', 15), (u'Python is', 14), (u'the Python', 14), (u'Enhancement Proposals', 14), (u'Proposals Python', 14), (u'Python has', 13), (u'can be', 12), (u'standard library', 12), (u'as a', 12), (u'Rossum Guido', 12), (u'be used', 11), (u'for the', 10), (u'Python and', 10), (u'of a', 10), (u'by the', 10), (u'programming language', 10), (u'statement which', 9), (u'Python Python', 9), (u'and a', 9), (u'to be', 9), (u'November 2008', 9), (u'December 2012', 9), (u'The Python', 9), (u'with the', 9), (u'24 November', 9), (u'used to', 8), (u'it is', 8), (u'for Python', 8), (u'August 2016', 8), (u'Retrieved December', 8), (u'Retrieved 19', 8), (u'11 February', 7), (u'Python 

## 2: Using OpenRefine ##

In [14]:
# http://openrefine.org/download.html

## 3. Summarizing data using n-grams (2nd example) ##

In [12]:
'''
The data we are summarizing is the inauguration speech of the 9th president of the USA.
'''
from urllib2 import urlopen
from bs4 import BeautifulSoup
import re
import string
import operator
import pprint

def cleanInput(input):
    """Split raw text into a list of cleaned, lower-cased words.

    The text is lower-cased, citation marks such as [24] are removed, every
    run of whitespace counts as one separator, non-ASCII characters are
    dropped and surrounding punctuation is stripped from each word.
    Single-character words are discarded unless they are 'a' or 'i'.

    Returns the cleaned words as a list, in their original order.
    """
    input = input.lower()

    # Remove citation marks of the form [24]
    input = re.sub(r'\[[0-9]*\]', "", input)

    # Replace every run of whitespace (newlines, tabs, non-breaking spaces,
    # repeated blanks) with a single space.  This has to happen before the
    # ASCII filter below: otherwise a non-breaking space is simply deleted
    # and the two words around it are glued together.
    input = re.sub(r'\s+', " ", input, flags=re.UNICODE)

    # Drop the remaining non-ASCII characters
    input = input.encode('utf-8').decode("ascii", "ignore")

    words = []
    for item in input.split(' '):
        item = item.strip(string.punctuation)
        # Keep multi-character words plus the one-letter words 'a' and 'i';
        # this also drops the empty strings left by consecutive separators
        if len(item) > 1 or item.lower() in ('a', 'i'):
            words.append(item)
    return words

def getNgrams(input, n):
    """Return a dict mapping each n-gram of the cleaned text to its count.

    The words come from cleanInput; an n-gram is n consecutive words joined
    by single spaces.
    """
    words = cleanInput(input)
    tally = {}
    last_start = len(words) - n
    for start in range(last_start + 1):
        phrase = " ".join(words[start:start + n])
        # First sighting starts at zero, then every sighting adds one
        tally.setdefault(phrase, 0)
        tally[phrase] += 1
    return tally

# Download the speech and decode the raw bytes explicitly.  Wrapping the
# bytes in str() never decodes them: Python 3 returns their "b'...'" repr
# and Python 2 keeps a byte string that breaks on the first non-ASCII byte
# once it is re-encoded during cleaning.
# NOTE(review): assumes the file is served as UTF-8 -- confirm.
content = urlopen("http://pythonscraping.com/files/inaugurationSpeech.txt").read().decode("utf-8")
ngrams = getNgrams(content, 2)
# Sort the (n-gram, count) pairs by count, most frequent first
sortedNGrams = sorted(ngrams.items(),
                      key = operator.itemgetter(1),
                      reverse=True)
pprint.pprint(sortedNGrams)

[(u'of the', 213),
 (u'in the', 65),
 (u'to the', 61),
 (u'by the', 41),
 (u'the constitution', 34),
 (u'of our', 29),
 (u'to be', 26),
 (u'the people', 24),
 (u'from the', 24),
 (u'that the', 23),
 (u'it is', 23),
 (u'and the', 23),
 (u'of a', 22),
 (u'of their', 19),
 (u'the executive', 19),
 (u'may be', 19),
 (u'of that', 18),
 (u'for the', 16),
 (u'is the', 16),
 (u'have been', 16),
 (u'of its', 16),
 (u'of power', 16),
 (u'all the', 15),
 (u'with the', 15),
 (u'the government', 15),
 (u'which they', 13),
 (u'has been', 13),
 (u'there is', 13),
 (u'power to', 12),
 (u'the same', 11),
 (u'as to', 11),
 (u'the power', 11),
 (u'spirit of', 11),
 (u'that of', 11),
 (u'their own', 11),
 (u'can be', 10),
 (u'which the', 10),
 (u'the character', 10),
 (u'the united', 10),
 (u'of all', 10),
 (u'part of', 10),
 (u'which it', 10),
 (u'but the', 10),
 (u'upon the', 10),
 (u'united states', 10),
 (u'as the', 10),
 (u'of liberty', 10),
 (u'the president', 9),
 (u'the great', 9),
 (u'on the', 9)

## 3.1 Getting rid of unwanted words like 'the' using the BYU corpus ##

In [None]:
# import sys
# reload(sys)
# sys.setdefaultencoding("utf-8")

# -*- coding: utf-8 -*-
from __future__ import unicode_literals

from urllib2 import urlopen
from bs4 import BeautifulSoup
import re
import string
import codecs
import operator


# BYU corpus of common words, built once as a set for fast membership tests
# (words repeated in the literal simply collapse)
_COMMON_WORDS = frozenset([
    "the", "be", "and", "of", "a", "in", "to", "have", "it", "i", "that", "for", "you", "he",
    "with", "on", "do", "say", "this", "they", "is", "an", "at", "but","we", "his", "from",
    "that", "not", "by", "she", "or", "as", "what", "go", "their","can", "who", "get", "if",
    "would", "her", "all", "my", "make", "about", "know", "will","as", "up", "one", "time", "has",
    "been", "there", "year", "so", "think", "when", "which", "them", "some", "me", "people",
    "take", "out", "into", "just", "see", "him", "your", "come", "could", "now", "than", "like",
    "other", "how", "then", "its", "our", "two", "more", "these", "want", "way", "look", "first",
    "also", "new", "because", "day", "more", "use", "no", "man", "find", "here", "thing", "give",
    "many", "well",
])


def isCommon(ngram):
    """Return True if any word of the n-gram is in the common-word list.

    ngram may be an iterable of words or a single string of
    whitespace-separated words (the form used as keys by getNgrams).  A
    string is split into words first; iterating it directly would test
    single characters, so nearly every n-gram containing the letter "a" or
    "i" would be reported as common.  The comparison is case-sensitive.
    """
    if hasattr(ngram, "split"):
        ngram = ngram.split()
    return any(word in _COMMON_WORDS for word in ngram)

# Remove newlines, double spaces, citation marks
def cleanText(input):
    """Return the text lower-cased and normalised to plain ASCII.

    Citation marks such as [24] are removed, "u.s." is rewritten as "us",
    every run of whitespace becomes a single space and non-ASCII characters
    are dropped.
    """
    text = input.lower()
    text = re.sub(r'\[[0-9]*\]', "", text)
    text = re.sub(r"u\.s\.", "us", text)

    # Collapse newlines, tabs, non-breaking spaces and repeated blanks into
    # one space.  This must run before the ASCII filter: otherwise a
    # non-breaking space is deleted and its neighbouring words are glued
    # together.
    text = re.sub(r'\s+', " ", text, flags=re.UNICODE)

    # Drop every character that is not ASCII
    text = text.encode('utf-8').decode("ascii", "ignore")

    # Dropping a non-ASCII word can leave two spaces side by side
    return re.sub(r' +', " ", text)

# Remove punctuation and single character words
def cleanInput(input):
    """Return the words of the cleaned text as a list.

    The text is normalised with cleanText and split on single spaces; each
    piece has its surrounding punctuation stripped and is kept only if it is
    longer than one character or is 'a' or 'i'.
    """
    pieces = cleanText(input).split(' ')
    stripped = (piece.strip(string.punctuation) for piece in pieces)
    return [word for word in stripped
            if len(word) > 1 or word.lower() in ('a', 'i')]

def getNgrams(input, n):
    """Count how often each n-gram occurs in the cleaned text.

    Words are taken from cleanInput; every run of n consecutive words is
    joined with single spaces and used as a key of the returned dict, whose
    values are occurrence counts.
    """
    words = cleanInput(input)
    frequencies = {}
    window_count = len(words) - n + 1
    for offset in range(window_count):
        gram = " ".join(words[offset:offset + n])
        frequencies[gram] = frequencies.get(gram, 0) + 1
    return frequencies

def getFirstSentenceContaining(ngram, content):
    """Return the first sentence of content that contains ngram.

    Sentences are the pieces obtained by splitting content on ".", so the
    returned sentence has no trailing period and keeps any leading
    whitespace.  An empty string is returned when no sentence matches.
    """
    matching = (piece for piece in content.split(".") if ngram in piece)
    return next(matching, "")

# Download the text and decode the raw bytes explicitly.  Wrapping the
# bytes in str() never decodes them: Python 3 returns their "b'...'" repr
# and Python 2 keeps a byte string that breaks on the first non-ASCII byte
# once it is re-encoded during cleaning.
# NOTE(review): assumes the file is served as UTF-8 -- confirm.
content = urlopen("http://pythonscraping.com/files/space.txt").read().decode("utf-8")
ngrams = getNgrams(content, 2)
# Sort the (n-gram, count) pairs by count, most frequent first
sortedNGrams = sorted(ngrams.items(),
                      key = operator.itemgetter(1),
                      reverse = True)
print(sortedNGrams)