In [7]:
# Let's write a simple script
# to get the top 20 words with the highest frequency,
# and their frequency percentages, in an English Wikipedia article.

#Beautiful Soup is a Python library
#for pulling data out of HTML and XML files.
from bs4 import BeautifulSoup
#Requests is one of the most downloaded
#Python packages of all time,
#pulling in over 7,000,000 downloads every month.
#HTTP library for pulling pushing and authenticating
import requests
#lets you do Regular expression operations
#special text string for describing a search pattern.
#find and replace
import re
#The operator module exports a
#set of efficient functions
#corresponding to the intrinsic operators of Python.
#comparison, addition, greater than, less than
import operator
#parses json, formats it
import json
#The module provides just one function,
#tabulate, which takes a list of lists or another
#tabular data type as the first argument,
#and outputs a nicely formatted plain-text table:
from tabulate import tabulate
#system calls, deals with user arguments
import sys
#lists of common stop words for various languages, such as English
from stop_words import get_stop_words
#random
import random
#science tools
import numpy as np
import urllib


In [8]:
#get the words
def getWordList(url, timeout=10):
    """Download a web page and return the cleaned words of its paragraphs.

    The page is fetched with ``requests`` and parsed with BeautifulSoup's
    ``lxml`` parser. The text of every ``<p>`` tag is lower-cased, split on
    whitespace, and each piece is passed through ``clean_word``; pieces that
    are empty after cleaning are dropped.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before ``requests`` raises
            ``requests.exceptions.Timeout``. Without a timeout a stalled
            connection would block the notebook indefinitely.

    Returns:
        list: The cleaned, non-empty words in document order
        (duplicates are kept).

    Raises:
        requests.exceptions.RequestException: If the download fails or
            times out.
    """
    #raw data; bounded by a timeout so a dead server cannot hang the cell
    response = requests.get(url, timeout=timeout)
    #parse the page text with the lxml parser
    soup = BeautifulSoup(response.text, 'lxml')

    word_list = []
    #find the words in paragraph tags
    for paragraph in soup.find_all('p'):
        #lowercase and split into an array
        for word in paragraph.text.lower().split():
            #remove non-chars
            cleaned_word = clean_word(word)
            #keep the word only if something is left after cleaning
            if cleaned_word:
                word_list.append(cleaned_word)

    return word_list


#clean word with regex
def clean_word(word):
    """Return ``word`` with every character outside A-Z / a-z removed.

    Args:
        word: The string to clean.

    Returns:
        str: Only the ASCII letters of ``word``, in their original order;
        an empty string if there are none.
    """
    non_letters = re.compile('[^A-Za-z]+')
    return non_letters.sub('', word)


def createFrquencyTable(word_list):
    """Count how many times each word occurs in ``word_list``.

    Args:
        word_list: Iterable of hashable words (strings in this notebook).

    Returns:
        dict: Maps each distinct word to its number of occurrences.
        An empty input gives an empty dict.
    """
    word_count = {}
    for word in word_list:
        #index is the word; a word seen for the first time starts from zero
        word_count[word] = word_count.get(word, 0) + 1

    return word_count


#remove stop words
def remove_stop_words(frequency_list):
    """Drop English stop words from a sequence of (word, count) pairs.

    Args:
        frequency_list: Iterable of two-item (word, count) pairs. The words
            must be hashable, which holds for the strings used here.

    Returns:
        list: ``[word, count]`` lists for every word that is not in
        ``get_stop_words('en')``, in their original order.
    """
    #a set makes each membership test constant time instead of a list scan
    stop_words = set(get_stop_words('en'))

    return [[key, value] for key, value in frequency_list
            if key not in stop_words]


In [9]:
#sample values, named after scrapWiki's parameters
search_word = 'dog'
#scrapWiki treats a flag longer than two characters as "remove stop words"
search_modeFlag = 'Yes'
target_word = 'cat'

In [10]:
def scrapWiki(search_word, search_modeFlag, target_word, printing):
    """Look up a Wikipedia article and report its most frequent words.

    The function queries the Wikipedia search API for ``search_word``,
    picks an article title from the results, downloads that article with
    ``getWordList``, counts its words, optionally removes English stop
    words, and computes each word's share of the counted words. The share
    of ``target_word`` is always printed; the top-20 table is printed only
    when ``printing`` is truthy.

    Args:
        search_word: Term to search for; must be a string of at least two
            characters.
        search_modeFlag: Stop-word switch. A bool is used as is. For any
            other value the notebook's length convention is kept: more
            than two characters (e.g. "Yes") removes stop words, anything
            shorter (e.g. "No") keeps them.
        target_word: Word whose frequency percentage is returned.
        printing: If truthy, print the table of the 20 most frequent words.

    Returns:
        tuple: ``(percentage, next_word)``. ``percentage`` is the share of
        ``target_word`` among the counted words, in percent (0 when the
        word does not occur). ``next_word`` is the last entry of the
        frequency-sorted word list, or ``search_word`` when the page
        produced no words. ``(None, search_word)`` is returned for an
        invalid search word, an empty search result, or a failed request.
    """
    #local import: the notebook only does ``import urllib``, which by itself
    #does not guarantee that the ``urllib.parse`` submodule is loaded
    from urllib.parse import quote

    #Wikipedia search API endpoint and article base URL
    wikipedia_api_link = "https://en.wikipedia.org/w/api.php"
    wikipedia_link = "https://en.wikipedia.org/wiki/"
    #seconds to wait for the API before requests raises Timeout
    request_timeout = 10
    #number of rows shown in the result table
    top_n = 20
    percentage_value_targeted = 0
    nextWord = search_word

    #if the search word is not a string or is too small, report it and stop
    if not isinstance(search_word, str) or len(search_word) < 2:
        print("Enter valid string")
        return None, search_word

    #to remove stop words or not
    if isinstance(search_modeFlag, bool):
        search_mode = search_modeFlag
    else:
        search_mode = len(search_modeFlag) > 2

    #query parameters; requests URL-encodes them, so search words containing
    #spaces, '&' or '#' no longer corrupt the query string
    api_params = {
        'format': 'json',
        'action': 'query',
        'list': 'search',
        'srsearch': search_word,
    }

    try:
        #use requests to retrieve raw data from the wiki API
        response = requests.get(
            wikipedia_api_link, params=api_params, timeout=request_timeout)
        #turn HTTP error statuses into exceptions handled below
        response.raise_for_status()

        #format that data as a JSON dictionary
        data = json.loads(response.content.decode("utf-8"))

        #list of matching pages; empty when the API found nothing
        search_results = data.get('query', {}).get('search', [])
        if not search_results:
            print("No Wikipedia page found for '" + search_word + "'")
            return None, search_word

        #page title: the second hit when it matches the query exactly,
        #otherwise the first hit
        wikipedia_page_tag = search_results[0]['title']
        if (len(search_results) > 1
                and search_results[1]['title'] == search_word):
            wikipedia_page_tag = search_results[1]['title']

        #get actual wiki page based on retrieved title; the title is
        #percent-encoded so characters such as '?' or '#' stay in the path
        url = wikipedia_link + quote(
            wikipedia_page_tag.replace(' ', '_'), safe="/:(),")
        print("The URL is " + url)

        #get list of words from that page
        page_word_list = getWordList(url)

        #create table of word counts, dictionary
        page_word_count = createFrquencyTable(page_word_list)
        #sort the table by the frequency count, most frequent first
        sorted_word_frequency_list = sorted(
            page_word_count.items(), key=operator.itemgetter(1), reverse=True)
        #remove stop words if the user specified
        if search_mode:
            sorted_word_frequency_list = remove_stop_words(
                sorted_word_frequency_list)

        #sum the total words to calculate frequencies
        total_words_sum = sum(
            value for key, value in sorted_word_frequency_list)

        #final list which contains word, frequency (word count), percentage
        final_list = []
        for key, value in sorted_word_frequency_list:
            percentage_value = float(value * 100) / total_words_sum
            final_list.append([key, value, round(percentage_value, 4)])
            if key == target_word:
                percentage_value_targeted = percentage_value

        #next word is the last entry of the sorted list; with no words at
        #all it stays at search_word instead of hitting an unbound name
        if sorted_word_frequency_list:
            nextWord = sorted_word_frequency_list[-1][0]

        print('Freq of targeted word: ' + str(percentage_value_targeted))

        #just get the top 20 words (slicing is safe for shorter lists)
        final_list = final_list[:top_n]

        if printing:
            #headers before the table
            print_headers = ['Word', 'Frequency', 'Frequency Percentage']
            #print the table with tabulate
            print(tabulate(final_list, headers=print_headers, tablefmt='orgtbl'))

    #the server did not answer in time
    except requests.exceptions.Timeout:
        print("The server didn't respond. This word has no page.")
        return None, search_word
    #any other network or HTTP failure: report it instead of crashing
    except requests.exceptions.RequestException as error:
        print("Request failed: " + str(error))
        return None, search_word

    return percentage_value_targeted, nextWord


In [11]:
#analyse the "dog" article with stop words removed, print the table,
#and report the frequency of "animal"
a, nw = scrapWiki(
    search_word="dog",
    search_modeFlag="Yes",
    target_word="animal",
    printing=True,
)
print("Next word to explore : " + str(nw))

The URL is https://en.wikipedia.org/wiki/Dog
Freq of targeted word: 0.25906735751295334
| Word     |   Frequency |   Frequency Percentage |
|----------+-------------+------------------------|
| dogs     |         206 |                 4.8516 |
| dog      |         125 |                 2.9439 |
| wolves   |          31 |                 0.7301 |
| humans   |          30 |                 0.7065 |
| can      |          30 |                 0.7065 |
| human    |          29 |                 0.683  |
| breeds   |          27 |                 0.6359 |
| domestic |          26 |                 0.6123 |
| study    |          26 |                 0.6123 |
| pet      |          26 |                 0.6123 |
| years    |          20 |                 0.471  |
| also     |          19 |                 0.4475 |
| one      |          19 |                 0.4475 |
| may      |          18 |                 0.4239 |
| many     |          16 |                 0.3768 |
| people   |          15 |  

In [12]:
# Sampling Exercise
# Design a strategy to obtain samples
target_word = "information"

#NOTE(review): the start value is an empty list; its length is below
#scrapWiki's two-character minimum, so the call below only prints
#"Enter valid string" and returns (None, []) without contacting Wikipedia
searchInWord = []
rate, searchInWord = scrapWiki(
    search_word=searchInWord,
    search_modeFlag="Yes",
    target_word=target_word,
    printing=False,
)

#no rates are collected yet, so an empty JSON list is written to data.json
rates = []
with open('data.json', 'w') as outfile:
    json.dump(rates, outfile)

Enter valid string
