In [1]:
from requests import get
from requests.exceptions import RequestException
from contextlib import closing
from bs4 import BeautifulSoup
import pandas as pd
import re 

In [2]:
def simple_get(url, timeout=10):
    """
    Fetch `url` with an HTTP GET request.

    Returns the raw response body as bytes (`resp.content`) when
    `is_good_response` accepts the response, otherwise None.

    `timeout` is the number of seconds passed to `requests.get`; without
    it a server that never answers would block this call indefinitely.
    Any RequestException raised while requesting or reading the body is
    reported through `log_error` and None is returned.
    """
    try:
        # closing() releases the streamed connection when the block exits.
        with closing(get(url, stream=True, timeout=timeout)) as resp:
            return resp.content if is_good_response(resp) else None

    except RequestException as e:
        log_error('Error during requests to {0} : {1}'.format(url, str(e)))
        return None


def is_good_response(resp):
    """
    Returns True if the response seems to be HTML, False otherwise.

    A response qualifies when its status code is 200 and its
    Content-Type header contains 'html' (compared case-insensitively).
    A response without a Content-Type header is treated as not HTML
    instead of raising KeyError.
    """
    # .get() tolerates a missing header; `or ''` also covers a None value.
    content_type = (resp.headers.get('Content-Type') or '').lower()
    return resp.status_code == 200 and 'html' in content_type


def log_error(e):
    """
    Report an error by printing it to standard output.

    `e` is whatever should be shown (callers in this file pass a
    pre-formatted message string). Swap the body for a real logger
    if the messages need to go somewhere else.
    """
    message = str(e)
    print(message)

In [3]:
# Download the Wiktionary appendix page that the following cells parse.
raw_html = simple_get('https://en.wiktionary.org/wiki/Appendix:Kangaroo_words')
# simple_get returns None on a request error or when the response is not
# accepted as HTML; stop with a clear message rather than the TypeError
# that len(None) would raise.
if raw_html is None:
    raise RuntimeError('Could not download the kangaroo words page')
len(raw_html)

45648

In [4]:
# Parse the downloaded page, then keep at most the first 400 <li> elements
# that sit inside a <ul> under the #mw-content-text container.
html = BeautifulSoup(raw_html, 'html.parser')
kangaroo_messy_list = list(html.select('#mw-content-text ul li')[:400])
# Number of rows actually kept (never more than 400).
counter = len(kangaroo_messy_list)

In [5]:
# For every scraped row, concatenate (in order) the word characters found
# inside each <b>...</b> pair; the result is stored as that row's joey word.
# A raw string keeps the regex backslashes literal, avoiding the
# invalid-escape warning that "\w" and "\/" trigger in a normal string.
pattern = r"<b>(\w+?)<\/b>"
joey_list = [
    # With a single capture group, findall returns just the captured pieces.
    ''.join(re.findall(pattern, str(html_row)))
    for html_row in kangaroo_messy_list
]

In [6]:
# For every scraped row, rebuild the text around tag-word-tag sequences and
# store the first run of word characters as that row's kangaroo word.
# Raw strings keep the regex backslashes literal, avoiding the
# invalid-escape warning that "\w" and "\/" trigger in normal strings.
kangaroo_list = []
pattern = r"<\/*.+?>(\w+?)<\/*.+?>"
pattern_2 = r"(\w+)"
for html_row in kangaroo_messy_list:
    # re.split keeps the captured word and discards the surrounding tags,
    # so joining the pieces yields the row text with those tags removed.
    new_string = ''.join(re.split(pattern, str(html_row)))
    # Splitting on a captured word run puts the first word at index 1.
    m = re.split(pattern_2, new_string)
    kangaroo_list.append(m[1])

In [7]:
# One row per scraped list item, pairing each joey word with its kangaroo word.
df = pd.DataFrame(dict(joey_word=joey_list, kangaroo_word=kangaroo_list))

In [8]:
# Add both flag columns, switched off for every row to begin with.
for flag_column in ('anti-kangaroo', 'twin-kangaroo'):
    df[flag_column] = False
# Label-based slice: rows from index label 325 onward are flagged.
# NOTE(review): 325 presumably marks where the anti-kangaroo entries begin
# in the scraped list -- confirm against the current page.
df.loc[325:, 'anti-kangaroo'] = True

In [9]:
# Hand-picked joey words that replace the scraped value for specific
# kangaroo words. Keys are compared against 'kangaroo_word' exactly, so
# they must not carry stray whitespace (the previous 'frangible ' key had a
# trailing space and could never match a scraped word).
joey_overrides = {
    'alone': 'one',
    'amicability': 'amity',
    'blatherskite': 'blah',
    'canister': 'can',
    'feasted': 'ate',
    'frangible': 'frail',  # grand-kangaroo (fragile)
    'masculine': 'man',
}
for kangaroo_word, joey_word in joey_overrides.items():
    df.loc[df['kangaroo_word'] == kangaroo_word, 'joey_word'] = joey_word

In [10]:
# Kangaroo words to flag in the 'twin-kangaroo' column. Entries are matched
# against 'kangaroo_word' exactly, so they must not carry stray whitespace
# (the previous 'frangible ' entry had a trailing space and never matched).
twin_kangaroo_words = [
    'alone',
    'amicability',
    'blatherskite',
    'canister',
    'feasted',
    'frangible',  # grand-kangaroo
    'masculine',
]
df.loc[df['kangaroo_word'].isin(twin_kangaroo_words), 'twin-kangaroo'] = True

In [11]:
## manual fixes
# Replace the scraped joey for the kangaroo word 'strives'.
strives_rows = df['kangaroo_word'] == 'strives'
df.loc[strives_rows, 'joey_word'] = 'tries'

In [13]:
# Write the finished table to a CSV file; the relative path resolves
# against the current working directory.
output_csv = 'wiki_kangaroo_words.csv'
df.to_csv(output_csv)