# Webscrape college applications

This notebook was developed to work with PDFs scraped by `ColAppScrape.ipynb`.


In [3]:
import re, string, unicodedata
import nltk
import contractions
import inflect
from bs4 import BeautifulSoup
from nltk import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer, WordNetLemmatizer

## Define Helper Functions

In [6]:
def strip_html(text):
    """Return the text content of an HTML string with the markup removed.

    ``text`` is parsed by BeautifulSoup using the ``html.parser`` backend and
    the result of ``get_text()`` is returned.
    """
    return BeautifulSoup(text, "html.parser").get_text()

def get_words(text):
    """Tokenize ``text`` with NLTK's ``word_tokenize`` and return the tokens."""
    tokens = word_tokenize(text)
    return tokens

def remove_non_ascii(words):
    """Strip non-ASCII characters from each token in ``words``.

    Every token is NFKD-normalized first, so an accented letter decomposes
    into a base letter plus combining marks; encoding to ASCII with
    ``'ignore'`` then drops whatever cannot be represented. Tokens that end up
    empty are kept, so the result has the same length as the input.
    """
    def _to_ascii(token):
        decomposed = unicodedata.normalize('NFKD', token)
        return decomposed.encode('ascii', 'ignore').decode('utf-8', 'ignore')

    return [_to_ascii(token) for token in words]

def to_lowercase(words):
    """Return a new list with every token in ``words`` lower-cased."""
    return [token.lower() for token in words]

def replace_contractions(text):
    """Expand contractions in a string using ``contractions.fix``.

    The call passes ``leftovers=False`` through to the library unchanged.
    """
    expanded = contractions.fix(text, leftovers=False)
    return expanded

def remove_contractions(words):
    """Run ``replace_contractions`` over every token and return the new list."""
    return [replace_contractions(token) for token in words]

def remove_punctuation(words):
    """Delete punctuation from each token and drop tokens left empty.

    Any character that is neither a word character (``\\w``, which includes
    the underscore) nor whitespace is removed. Tokens that become the empty
    string are omitted from the result.
    """
    stripped = (re.sub(r'[^\w\s]', '', token) for token in words)
    return [token for token in stripped if token != '']

def replace_numbers(words):
    """Spell out every all-digit token in ``words``.

    Tokens for which ``str.isdigit()`` is true are replaced by the output of
    ``inflect``'s ``number_to_words``; every other token is passed through
    unchanged. The result has one element per input token.
    """
    # NOTE(review): number_to_words presumably returns a multi-word string
    # (spaces, hyphens, commas) for larger numbers, which would end up as a
    # single token here -- confirm this is acceptable downstream.
    engine = inflect.engine()
    return [
        engine.number_to_words(token) if token.isdigit() else token
        for token in words
    ]

def remove_stopwords(words):
    """Return the tokens in ``words`` that are not English stop words.

    Membership is tested against NLTK's ``stopwords.words('english')`` list.
    The comparison is case-sensitive, so callers should lower-case tokens
    first if they want capitalised stop words removed as well.
    """
    # Fetch the stop-word list once and keep it in a set. The previous
    # version called stopwords.words('english') for every token and then
    # scanned the returned list linearly.
    english_stopwords = set(stopwords.words('english'))
    return [word for word in words if word not in english_stopwords]

def stem_words(words):
    """Return the Lancaster stem of every token in ``words``."""
    lancaster = LancasterStemmer()
    return [lancaster.stem(token) for token in words]

def lemmatize_verbs(words):
    """Lemmatize every token in ``words``, treating each one as a verb.

    Uses NLTK's ``WordNetLemmatizer`` with ``pos='v'``.
    """
    wordnet = WordNetLemmatizer()
    return [wordnet.lemmatize(token, pos='v') for token in words]

# Punctuation stripped from the raw text before tokenization:
#   ~  `  :  ;  "  ,  '  .  ?
# Other punctuation (for example ! ( ) -) is intentionally left in place.
_PMARKS_RE = re.compile(r"""[~`:;",'.?]""")


def remove_pmarks(text):
    """Delete a fixed set of punctuation marks from a string.

    The characters removed are ``~ ` : ; " , ' . ?``. Everything else,
    including whitespace and any other punctuation, is returned unchanged.

    Args:
        text: The string to clean.

    Returns:
        ``text`` with every occurrence of the listed characters removed.
    """
    # One pass with a character class replaces the earlier regex plus four
    # separate str.replace calls (one of which removed '"' a second time).
    return _PMARKS_RE.sub('', text)

# The spelled-out phrase, tolerating any run of whitespace (double spaces or
# a line wrap) between the three words.
_SSN_PHRASE_RE = re.compile(r'social\s+security\s+number')

# The abbreviation, optionally plural, but only when it is not embedded in a
# longer run of letters. [^\W\d_] matches a single letter, so the lookarounds
# reject e.g. "classname" while still accepting "ssn", "ssns", "ssn#" or "ssn1".
_SSN_ABBREV_RE = re.compile(r'(?<![^\W\d_])ssn(s?)(?![^\W\d_])')


def handl_ssn(text):
    """Collapse mentions of a social security number into one token.

    Both the phrase ``social security number`` and the abbreviation ``ssn``
    are rewritten to ``socialsecuritynumber`` so that a later tokenizer keeps
    the concept as a single word. Matching is case-sensitive and expects
    lower-case input.

    Args:
        text: The (lower-cased) string to rewrite.

    Returns:
        ``text`` with every recognised mention replaced.
    """
    text = _SSN_PHRASE_RE.sub('socialsecuritynumber', text)
    # Keep an optional plural "s" so "ssns" still becomes
    # "socialsecuritynumbers", as it did with plain substring replacement.
    text = _SSN_ABBREV_RE.sub(r'socialsecuritynumber\1', text)
    return text

def normalize(text, verbose=True):
    """Turn a raw string into a list of normalized tokens.

    The stages run in this order:

    1. ``remove_pmarks``   -- strip a fixed set of punctuation from the string
       (apostrophes included, so "Don't" becomes "Dont").
    2. lower-case the whole string.
    3. ``handl_ssn``       -- collapse social-security-number mentions into a
       single token.
    4. ``get_words``       -- tokenize.
    5. ``remove_non_ascii`` -- drop non-ASCII characters from each token.
    6. ``replace_numbers`` -- spell out all-digit tokens.

    The token-level helpers ``to_lowercase``, ``remove_contractions``,
    ``remove_punctuation`` and ``remove_stopwords`` are deliberately not
    applied: case and punctuation are handled on the raw string above, and
    stop words are kept for this use case.

    Args:
        text: The raw string to normalize.
        verbose: When true (the default), print one progress line per stage
            in the form ``<label> DONE``. Pass ``False`` to run silently,
            e.g. when normalizing many documents in a loop.

    Returns:
        The list of normalized tokens.
    """
    def _stage(label, step, value):
        # Print the label before running the step so that, if the step raises,
        # the last line of output identifies the stage that failed.
        if verbose:
            print(label, end='')
        result = step(value)
        if verbose:
            print(' DONE')
        return result

    # String-level stages.
    text = _stage('Remove punctuation.', remove_pmarks, text)
    text = _stage('Converting case....', str.lower, text)
    text = _stage('Handle ss numbers..', handl_ssn, text)

    # Token-level stages.
    words = _stage('Tokenize string....', get_words, text)
    words = _stage('Remove non ascii...', remove_non_ascii, words)
    words = _stage('Replace numbers....', replace_numbers, words)

    return words

In [8]:
# Smoke-test input for normalize(): a sentence ending in a period, a
# contraction, the social-security phrase and its abbreviation in several
# capitalisations, and a numbered list whose leading digits exercise
# replace_numbers().
some_test_text = '''Here are a few words that I will use to test.

Don't think this is the end of it.

Social Security Number

ssn

SSN

social security number

1. Hi hi hi
2. Three four five
6. What?'''

# Print the resulting token list.
# NOTE(review): the saved output for this cell also echoes the intermediate
# text after the SSN step, which the current normalize() does not print --
# the output looks stale; re-run the cell to confirm.
print(normalize(some_test_text))

Remove punctuation. DONE
Converting case.... DONE
Handle ss numbers.. DONE
here are a few words that i will use to test

dont think this is the end of it

socialsecuritynumber

socialsecuritynumber

socialsecuritynumber

socialsecuritynumber

1 hi hi hi
2 three four five
6 what
Tokenize string.... DONE
Remove non ascii... DONE
Replace numbers.... DONE
['here', 'are', 'a', 'few', 'words', 'that', 'i', 'will', 'use', 'to', 'test', 'dont', 'think', 'this', 'is', 'the', 'end', 'of', 'it', 'socialsecuritynumber', 'socialsecuritynumber', 'socialsecuritynumber', 'socialsecuritynumber', 'one', 'hi', 'hi', 'hi', 'two', 'three', 'four', 'five', 'six', 'what']
