<div class="alert alert-success">
**Requirements**
</div>

In [2]:
# DATA
% matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
plt.style.use('ggplot')

# NLTK
import nltk, re, pprint
import string
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.probability import FreqDist
import urllib
from urllib import request
from urllib.request import urlopen
from bs4 import BeautifulSoup

<div class="alert alert-success">
**Functions**
</div>

In [3]:
def raw_url(url):
    ''' Downloads a url and returns its parsed html as a string.

    The response bytes are decoded as latin-1, parsed with BeautifulSoup
    and the resulting tree is converted back to a str.

    url: address of the page to download.
    Returns: the str() form of the parsed document.
    Raises: whatever urllib raises when the page cannot be fetched.
    '''
    # The context manager closes the HTTP response even if read() fails.
    with request.urlopen(url) as response:
        html = response.read().decode('latin-1')
    # Name the parser explicitly (same one strip_html uses) so the result
    # does not depend on which parsers are installed and bs4 does not warn.
    soup = BeautifulSoup(html, "lxml")
    return str(soup)

In [4]:
def strip_html(url):
    ''' Extracts the visible text of a url and returns it as a list of strings.

    The page is decoded as latin-1, <script> elements are removed, runs of
    spaces are collapsed, the characters \\x93 and \\x94 are replaced by a
    plain double quote, and the text is split on tabs, newlines,
    non-breaking spaces and '|'.

    url: address of the page to download.
    Returns: list of whitespace-stripped text fragments (may contain '').
    Raises: whatever urllib raises when the page cannot be fetched.
    '''
    # The context manager closes the HTTP response even if read() fails.
    with request.urlopen(url) as response:
        html = response.read().decode('latin-1')
    soup = BeautifulSoup(html, "lxml")
    # soup('script') returns a list, so removing nodes while looping is safe.
    for script in soup('script'):
        script.extract()
    raw = soup.get_text()
    raw = re.sub(' +', ' ', raw)
    raw = raw.replace('\x93', '"').replace('\x94', '"')
    return [fragment.strip() for fragment in re.split(r'[\t\n\xa0\|]+', raw)]

In [5]:
def alphanumeric_split(s):
    ''' Splits a string that starts with digits followed by letters.

    Returns (digits, letters) when s begins with one or more digits
    immediately followed by one or more ASCII letters; otherwise returns
    (s, "Nothing").
    '''
    found = re.match("([0-9]+)([a-zA-Z]+)", s)
    if found is None:
        return s, "Nothing"
    digits, letters = found.groups()
    return digits, letters

In [6]:
def extract_text(text):
    ''' Extracts the list of relevant words from a string.

    The text is tokenized with nltk's word_tokenize; a token is kept when it
    is alphanumeric and is not a Spanish stop word.

    text: string to process.
    Returns: list of the kept tokens, in their original order and casing.
    '''
    # Build the stop-word set once instead of reloading the list per token.
    spanish_stopwords = set(stopwords.words('spanish'))
    # Compare in lower case so capitalised stop words are dropped too, the
    # same way relevant_words_extractor treats English stop words.
    return [w for w in word_tokenize(text)
            if w.isalnum() and w.lower() not in spanish_stopwords]

In [7]:
# Three-letter country codes of the benchmark group.
# Note: "USA" has an entry in `constitutions` below but is not listed here.
BENCHMARK_countries = [
    "ARG", "BOL", "BRA", "CHL", "COL", "ECU",
    "PER", "PRY", "URY", "VEN", "GEO", "CHE",
]

# A constitution url is built as path_start + <document id> + path_end.
path_start = 'https://www.constituteproject.org/constitution/'
path_end = '?lang=en'

# Country code -> document id used in the url.
constitutions = {
    "ARG": 'Argentina_1994',
    "BOL": 'Bolivia_2009',
    "BRA": 'Brazil_2014',
    "CHL": 'Chile_2014',
    "COL": 'Colombia_2013',
    "ECU": 'Ecuador_2011',
    "PER": 'Peru_2009',
    "PRY": 'Paraguay_2011',
    "URY": 'Uruguay_2004',
    "VEN": 'Venezuela_2009',
    "GEO": 'Georgia_2013',
    "CHE": 'Switzerland_2014',
    "USA": 'United_States_of_America_1992',
}

In [8]:
def hedge(text, word = " "):
    ''' Inserts a word in between the items of a sequence | Default = " "

    text: sequence of items (it must support len()).
    word: separator placed between every pair of consecutive items.
    Returns: a new list with the items of text interleaved with word;
             no separator is added after the last item.
    '''
    new_text = []
    for i, w in enumerate(text):
        new_text.append(w)
        # Every item except the last one is followed by the separator.
        if i != len(text) - 1:
            new_text.append(word)
    return new_text

In [9]:
def relevant_words_extractor(text):
    ''' Removes English stop words from a string.

    The text is split on runs of whitespace; words whose lower-case form is
    an English stop word are dropped and the rest are joined back together
    with single spaces.

    text: string to process.
    Returns: the filtered text as a single string.
    '''
    # Build the stop-word set once instead of reloading the list per word.
    english_stopwords = set(stopwords.words('english'))
    words = re.split(r'[\s]+', text)
    return ' '.join(w for w in words if w.lower() not in english_stopwords)

In [10]:
def constitution(country, complete = True):
    ''' Extracts the constitution of a country either complete or without stop words.

    country:  key of the `constitutions` dictionary (e.g. "ARG").
    complete: when true the text fragments are returned as downloaded;
              otherwise English stop words are removed from every fragment.
    Returns: list of text fragments without empty or boilerplate entries.
    Raises: KeyError when country is not a key of `constitutions`.
    '''
    trash = ['', 'Share', 'Home', 'No provisions found', 'Try a new topic or search term.']
    raw_list = strip_html(path_start + constitutions[country] + path_end)
    # Drop boilerplate BEFORE removing stop words: once 'No', 'a' or 'or' are
    # stripped, entries such as 'No provisions found' no longer match `trash`.
    entries = [x for x in raw_list if x not in trash]
    if complete:
        return entries
    reduced = [relevant_words_extractor(x) for x in entries]
    # Filter again: a fragment made only of stop words is now '' and a
    # fragment may have collapsed into one of the trash strings.
    return [x for x in reduced if x not in trash]

In [11]:
def constitution_path_finder(country):
    ''' Plots the length of every text fragment of a country's constitution.

    country: key of the `constitutions` dictionary (e.g. "ARG").
    Returns: None; the figure is drawn with matplotlib.
    '''
    lengths = [len(entry) for entry in constitution(country)]
    fig, ax = plt.subplots(figsize=(18, 6))
    ax.set_title("Pattern in {} Constitution".format(country))
    ax.set_xlabel("Entries")
    ax.set_ylabel("Length")
    # max() fails on an empty list, so only pin the axes when there is data.
    if lengths:
        ax.set_xlim(0, len(lengths))
        ax.set_ylim(0, max(lengths))
    ax.plot(lengths)

In [12]:
def constitution_counter(country):
    ''' Counts the structural headings found in a constitution.

    A text fragment is counted by its prefix: 'Part', 'Art' (which also
    covers 'Article'), 'Chapter' or 'Section'.

    country: key of the `constitutions` dictionary (e.g. "ARG").
    Returns: one-row DataFrame with the columns 'Parts', 'Articles',
             'Chapters' and 'Sections'.
    '''
    count_part    = 0
    count_article = 0
    count_chapter = 0
    count_section = 0

    for x in constitution(country, True):
        if x.startswith('Part'):
            count_part += 1
        elif x.startswith('Art'):
            # 'Art' is a prefix of 'Article', so one test covers both.
            count_article += 1
        elif x.startswith('Chapter'):
            count_chapter += 1
        elif x.startswith('Section'):
            count_section += 1

    rv = pd.DataFrame({'Parts': [count_part], 'Articles': [count_article],
                       'Chapters': [count_chapter], 'Sections': [count_section]})

    return rv

In [13]:
def constitution_issue_finder(country, issue):
    ''' Counts how many times a word appears in a constitution.

    Every text fragment is split on whitespace and the tokens that are
    exactly equal to issue are counted (the comparison is case-sensitive
    and punctuation attached to a token is not removed).

    country: key of the `constitutions` dictionary (e.g. "ARG").
    issue:   token to look for.
    Returns: total number of matching tokens over all fragments.
    '''
    fragments = constitution(country, True)
    return sum(
        re.split(r'[\s]+', fragment).count(issue)
        for fragment in fragments
    )

In [15]:
# Number of exact 'freedom' tokens per constitution; every iteration
# downloads the corresponding document.
for country in constitutions:
    print(country, constitution_issue_finder(country, 'freedom'))

ARG 2
BOL 8
BRA 7
CHL 14
COL 7
ECU 21
PER 16
PRY 14
URY 1
VEN 8
GEO 4
CHE 12
USA 1
