In [1]:
cd ..

/home/jovyan/dsi_plus/Projects/Project-4-semantic_search


In [2]:
# Execute the project's lib/__init__.py and load the names it defines into
# this notebook's namespace.
# NOTE(review): presumably this is what provides `requests` and `display`,
# which later cells use without importing them — confirm.
%run lib/__init__.py
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

# Develop text cleaning process for wiki articles

## Build function to load a wiki page to test cleaning process

In [175]:
def pull_wiki_page(page, timeout=10):
    '''
    Pull the page extract and the category list for a specified page using
    the Wikipedia API.

    Two separate "query" requests are sent, one with prop='extracts' and one
    with prop='categories', and each decoded JSON response is returned as is.

    Parameters
    ----------
    page : str
        Page title, sent as the "titles" parameter of the query.
    timeout : float, optional
        Seconds to wait for each request before giving up. Without a timeout
        a stalled connection would block the notebook indefinitely.

    Returns
    -------
    (data, cats) : tuple
        data - decoded JSON response of the 'extracts' query
        cats - decoded JSON response of the 'categories' query

    Raises
    ------
    requests.HTTPError
        If either request returns an HTTP error status.
    requests.Timeout
        If either request does not complete within `timeout` seconds.
    '''
    api_url = "https://en.wikipedia.org/w/api.php"

    def _query(prop):
        # Run one API query for `page`, requesting a single property.
        params = {"action": "query",
                  "titles": page,
                  "prop": prop,
                  "format": 'json',
                  }
        response = requests.get(api_url, params=params, timeout=timeout)
        # Fail loudly on 4xx/5xx instead of trying to decode an error page.
        response.raise_for_status()
        return response.json()

    #pull page extract
    data = _query('extracts')

    #pull page categories
    cats = _query('categories')

    return data, cats

## Set up tools for cleaning article
- use beautiful soup to parse html and extract text 
- use regular expressions to remove specific types of characters and parts of text
- use spacy to identify structure and provide lemmatized version of words
- use nltk to remove english stop words


In [181]:
import re 
from bs4 import BeautifulSoup
import spacy
import nltk  # natural language toolkit
from nltk.corpus import stopwords

In [177]:
# Make sure the NLTK stop word corpus is available locally.
nltk.download('stopwords')

# English stop words from NLTK, extended with extra project-specific terms.
# NOTE(review): 'displaystyle' is presumably residue from formula markup in
# the article text — confirm.
nltk_stop = stopwords.words('english') + ['displaystyle']

# spaCy English pipeline used by the cleaners for tokens and lemmas.
nlp = spacy.load('en')

[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [178]:
#Have a look at stop words
#nltk_stop

## First version of cleaner

In [48]:
def cleaner(text):
    '''
    First version of the text cleaner.

    Applies a fixed sequence of regular-expression substitutions, then keeps
    the spaCy lemma of every token whose surface form (orth_) is not in
    nltk_stop, and finally collapses runs of whitespace to single spaces.
    '''
    # (pattern, replacement) pairs, applied strictly in this order.
    substitutions = (
        ('<* />', ''),
        ('<.*>.*</.*>', ''),
        ('[\d]', ' '),
        ('{*}', ' '),
        ('[\n]', ' '),
        ('[^a-zA-Z ]', ' '),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)

    # Stop words are matched on the original token text, not on the lemma.
    lemmas = []
    for token in nlp(text):
        if token.orth_ in nltk_stop:
            continue
        lemmas.append(token.lemma_)

    # split()/join normalises any repeated whitespace left in the lemmas.
    return ' '.join(' '.join(lemmas).split())

In [179]:
def read_articles(pageid, page):
    '''
    Fetch one Wikipedia page and return every stage of its text clean-up.

    Parameters
    ----------
    pageid : str or int
        Id under which the page appears in data['query']['pages'] of the
        API response. It is converted to str before the lookup because the
        keys of a decoded JSON object are always strings.
    page : str
        Page title passed to pull_wiki_page.

    Returns
    -------
    (article, soup, extract, extract_clean) : tuple
        article       - the 'extract' field of the API response
        soup          - BeautifulSoup object parsed from article
        extract       - text taken from soup with get_text()
        extract_clean - extract after cleaner() has been applied

    Raises
    ------
    KeyError
        If pageid is not among the pages returned for `page`, or the
        returned page has no 'extract' field.
    '''
    # Only the extract response is used here; the categories are ignored.
    data, _cats = pull_wiki_page(page)

    # first data clean - extract from json query and parse html
    pages = data['query']['pages']
    page_key = str(pageid)
    if page_key not in pages:
        raise KeyError(
            'pageid %s not found in API response for page %r (got: %s)'
            % (page_key, page, ', '.join(pages))
        )
    article = pages[page_key]['extract']
    soup = BeautifulSoup(article, 'html.parser')
    extract = soup.get_text()

    # 2nd data clean including lemmatization and stop words
    extract_clean = cleaner(extract)

    return article, soup, extract, extract_clean

In [180]:
# Pull and clean a single test article. '3771060' is the key under which the
# page appears in the API's JSON response; 'Accuracy paradox' is its title.
article , soup, extract , extract_clean = read_articles('3771060', 'Accuracy paradox')

## Review results of extraction and cleaning process

In [170]:
#article

In [171]:
# cleans up and structures the raw article html
#soup

In [152]:
#extract text from cleaned html (soup)
extract

'The accuracy paradox for predictive analytics states that predictive models with a given level of accuracy may have greater predictive power than models with higher accuracy. It may be better to avoid the accuracy metric in favor of other metrics such as precision and recall.\nAccuracy is often the starting point for analyzing the quality of a predictive model, as well as an obvious criterion for prediction. Accuracy measures the ratio of correct predictions to the total number of cases evaluated. It may seem obvious that the ratio of correct predictions to cases should be a key metric. A predictive model may have high accuracy, but be useless.\nIn an example predictive model for an insurance fraud application, all cases that are predicted as high-risk by the model will be investigated. To evaluate the performance of the model, the insurance company has created a sample data set of 10,000 claims. All 10,000 cases in the validation sample have been carefully checked and it is known whi

In [172]:
#investigate impact of spacy nlp pipeline
# Build the Doc inside this cell so it does not depend on a `doc` variable
# left over from an earlier kernel session.
# NOTE(review): assumes the original `doc` was nlp(extract) — confirm.
doc = nlp(extract)

# Lemmas of all tokens whose lemma is not a stop word.
l = [token.lemma_ for token in doc if token.lemma_ not in nltk_stop]

display(len(l))
#l[:20]

516

In [53]:
# Run the spaCy pipeline on the extracted text and display the resulting Doc;
# its displayed form is the original text.
nlp(extract)

The accuracy paradox for predictive analytics states that predictive models with a given level of accuracy may have greater predictive power than models with higher accuracy. It may be better to avoid the accuracy metric in favor of other metrics such as precision and recall.
Accuracy is often the starting point for analyzing the quality of a predictive model, as well as an obvious criterion for prediction. Accuracy measures the ratio of correct predictions to the total number of cases evaluated. It may seem obvious that the ratio of correct predictions to cases should be a key metric. A predictive model may have high accuracy, but be useless.
In an example predictive model for an insurance fraud application, all cases that are predicted as high-risk by the model will be investigated. To evaluate the performance of the model, the insurance company has created a sample data set of 10,000 claims. All 10,000 cases in the validation sample have been carefully checked and it is known which 

In [173]:
#look at impact of cleaning
display(len(extract_clean))
extract_clean

2056

'the accuracy paradox predictive analytic state predictive model give level accuracy may great predictive power model high accuracy -PRON- may well avoid accuracy metric favor metric precision recall accuracy often start point analyze quality predictive model good obvious criterion prediction accuracy measure ratio correct prediction total numb case evaluate -PRON- may seem obvious ratio correct prediction case key metric a predictive model may high accuracy useless in example predictive model insurance fraud application case predict high risk model investigate to evaluate performance model insurance company create sample datum set claim all case validation sample carefully check know case fraudulent a table confusion assist analyze quality model the definition accuracy table confusion model m fraud calculation accuracy model m fraud show a m t n t p t n f p f n t p mathrm a m frac tn tp tn fp fn tp tn numb true negative case fp numb false positive case fn numb false negative case tp n

## Consider following changes to cleaning...
 - Remove single characters as they add no information
 - Remove all items within {}
 - Remove duplication in removal process
 - '-PRON-' is appearing as a result of the spacy processing - remove


In [196]:
def cleaner_v2(text):
    '''
    Second version of the text cleaner.

    Steps, in order:
      1. replace everything from a '{' to the last '}' on the same line
         with a space
      2. replace every character that is not a latin letter or a space
         with a space (this also removes digits and newlines, so the
         separate digit/newline steps of the first version are dropped)
      3. keep the spaCy lemma of each token whose lemma is not in nltk_stop
      4. remove the '-PRON-' marker that shows up in the lemmatized text
      5. drop single-character words and collapse redundant whitespace

    HTML tags are not handled here: the input is expected to be text
    already extracted with Beautiful Soup.
    '''
    without_braces = re.sub('{.*}', ' ', text)
    letters_only = re.sub('[^a-zA-Z ]', ' ', without_braces)

    lemmas = [token.lemma_
              for token in nlp(letters_only)
              if token.lemma_ not in nltk_stop]

    without_pron = re.sub('-PRON-', ' ', ' '.join(lemmas))

    # split() discards the extra spaces; one-letter words carry no information
    kept_words = [word for word in without_pron.split() if len(word) != 1]
    return ' '.join(kept_words)

In [197]:
#look at impact of revised cleaner
ec=cleaner_v2(extract)
print(len(ec))
ec

1879


'accuracy paradox predictive analytic state predictive model give level accuracy may great predictive power model high accuracy may well avoid accuracy metric favor metric precision recall accuracy often start point analyze quality predictive model good obvious criterion prediction accuracy measure ratio correct prediction total numb case evaluate may seem obvious ratio correct prediction case key metric predictive model may high accuracy useless example predictive model insurance fraud application case predict high risk model investigate evaluate performance model insurance company create sample datum set claim case validation sample carefully check know case fraudulent table confusion assist analyze quality model definition accuracy table confusion model fraud calculation accuracy model fraud show tn numb true negative case fp numb false positive case fn numb false negative case tp numb true positive case formula definition accuracy table table confusion fraud model fraud formula acc

## Use version 2 of cleaner to build functions for load process - wiki_api.py