In [1]:
from urllib.request import urlopen
from bs4 import BeautifulSoup

def getNgrams(content, n):
  content = content.split(' ')
  output = []
  for i in range(len(content)-n+1):
    output.append(content[i:i+n])
  return output

html = urlopen('http://en.wikipedia.org/wiki/Python_(programming_language)')
bs = BeautifulSoup(html, 'html.parser')
content = bs.find('div', {'id':'mw-content-text'}).get_text()
ngrams = getNgrams(content, 2)
print(ngrams)
print('2-grams count is: '+str(len(ngrams)))

[['General-purpose,', 'high-level'], ['high-level', 'programming'], ['programming', 'language\n\n\nPythonParadigmMulti-paradigm:'], ['language\n\n\nPythonParadigmMulti-paradigm:', 'functional,'], ['functional,', 'imperative,'], ['imperative,', 'object-oriented,'], ['object-oriented,', 'structured,'], ['structured,', 'reflectiveDesigned\xa0byGuido'], ['reflectiveDesigned\xa0byGuido', 'van'], ['van', 'RossumDeveloperPython'], ['RossumDeveloperPython', 'Software'], ['Software', 'FoundationFirst\xa0appeared1990;'], ['FoundationFirst\xa0appeared1990;', '30\xa0years'], ['30\xa0years', 'ago\xa0(1990)[1]Stable'], ['ago\xa0(1990)[1]Stable', 'release3.8.6\n'], ['release3.8.6\n', ''], ['', ''], ['', '/'], ['/', '24\xa0September'], ['24\xa0September', '2020;'], ['2020;', '8\xa0days'], ['8\xa0days', 'ago\xa0(2020-09-24)[2]Preview'], ['ago\xa0(2020-09-24)[2]Preview', 'release3.9.0rc2\n'], ['release3.9.0rc2\n', ''], ['', ''], ['', '/'], ['/', '17\xa0September'], ['17\xa0September', '2020;'], ['2020;'

In [2]:
import re

def getNgrams(content, n):
    content = re.sub('\n|[[\d+\]]', ' ', content)
    content = bytes(content, 'UTF-8')
    content = content.decode('ascii', 'ignore')
    content = content.split(' ')
    content = [word for word in content if word != '']
    output = []
    for i in range(len(content)-n+1):
        output.append(content[i:i+n])
    return output

In [3]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
import string

def cleanSentence(sentence):
    sentence = sentence.split(' ')
    sentence = [word.strip(string.punctuation+string.whitespace) for word in sentence]
    sentence = [word for word in sentence if len(word) > 1 or (word.lower() == 'a' or word.lower() == 'i')]
    return sentence

def cleanInput(content):
    content = content.upper()
    content = re.sub('\n|[[\d+\]]', ' ', content)
    content = bytes(content, "UTF-8")
    content = content.decode("ascii", "ignore")
    sentences = content.split('. ')
    return [cleanSentence(sentence) for sentence in sentences]

def getNgramsFromSentence(content, n):
    output = []
    for i in range(len(content)-n+1):
        output.append(content[i:i+n])
    return output

def getNgrams(content, n):
    content = cleanInput(content)
    ngrams = []
    for sentence in content:
        ngrams.extend(getNgramsFromSentence(sentence, n))
    return(ngrams)

In [4]:
import string
print(string.punctuation)

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


In [5]:
from collections import Counter

def getNgrams(content, n):
    content = cleanInput(content)
    ngrams = Counter()
    ngrams_list = []
    for sentence in content:
        newNgrams = [' '.join(ngram) for ngram in getNgramsFromSentence(sentence, n)]
        ngrams_list.extend(newNgrams)
        ngrams.update(newNgrams)
    return(ngrams)

In [6]:
print(getNgrams(content, 2))

Counter({'PYTHON SOFTWARE': 42, 'SOFTWARE FOUNDATION': 41, 'OF THE': 37, 'IN PYTHON': 32, 'OF PYTHON': 26, 'IN THE': 25, 'FROM THE': 25, 'VAN ROSSUM': 23, 'THE PYTHON': 21, 'RETRIEVED FEBRUARY': 21, 'AS A': 20, 'SUCH AS': 20, 'RETRIEVED NOVEMBER': 17, 'ARCHIVED FROM': 17, 'THE ORIGINAL': 17, 'TO THE': 15, 'PYTHON ENHANCEMENT': 15, 'ORIGINAL ON': 15, 'PYTHON IS': 14, 'RETRIEVED SEPTEMBER': 14, 'ENHANCEMENT PROPOSALS': 14, 'RETRIEVED JUNE': 14, 'PROGRAMMING LANGUAGE': 13, 'IS A': 13, 'CAN BE': 13, 'BE USED': 13, 'RETRIEVED MARCH': 13, 'OF A': 12, 'IT IS': 12, 'FOR PYTHON': 11, 'FOR EXAMPLE': 11, 'ROSSUM GUIDO': 11, 'RETRIEVED APRIL': 11, 'RETRIEVED JANUARY': 11, 'GUIDO VAN': 10, 'STANDARD LIBRARY': 10, 'OTHER LANGUAGES': 10, 'TO PYTHON': 10, 'RETRIEVED MAY': 10, 'RETRIEVED DECEMBER': 10, 'AND A': 9, 'SYNTAX AND': 9, 'WITH THE': 9, 'FOR THE': 9, 'AS THE': 9, 'TO BE': 9, 'PROGRAMMING LANGUAGES': 9, 'STATEMENT WHICH': 9, 'USED TO': 9, 'FROM PYTHON': 9, 'RETRIEVED JULY': 9, 'THE LANGUAGE': 8

  content = re.sub('\n|[[\d+\]]', ' ', content)
