# Webscrape college applications

This notebook demonstrates:
1. Pre-processing
2. Counting word occurrences
3. Making individual dataframes
4. Merging those dataframes

Inspired by: https://www.kdnuggets.com/2018/03/text-data-preprocessing-walkthrough-python.html

In [None]:
import re, string, unicodedata
from collections import Counter
from operator import itemgetter

import contractions
import inflect
import nltk
import pandas as pd
from bs4 import BeautifulSoup
from nltk import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer, WordNetLemmatizer
from pandas import Series, DataFrame

## Define Helper Functions

In [None]:
def strip_html(text):
    """Return the text content of ``text`` with HTML markup removed.

    The input is parsed with BeautifulSoup's ``html.parser`` backend and the
    result of ``get_text()`` is returned.
    """
    return BeautifulSoup(text, "html.parser").get_text()

def stem_words(words):
    """Stem every token with NLTK's Lancaster stemmer.

    Args:
        words: Iterable of tokenized words.

    Returns:
        A new list with the stem of each word, in input order.
    """
    lancaster = LancasterStemmer()
    return [lancaster.stem(token) for token in words]

def lemmatize_verbs(words):
    """Lemmatize every token as a verb with NLTK's WordNet lemmatizer.

    Args:
        words: Iterable of tokenized words.

    Returns:
        A new list with the verb lemma (``pos='v'``) of each word, in input
        order.
    """
    wordnet = WordNetLemmatizer()
    return [wordnet.lemmatize(token, pos='v') for token in words]

def remove_pmarks(text):
    """Delete a fixed set of punctuation marks from ``text``.

    The characters removed are: tilde, backtick, colon, semicolon, double
    quote, comma, single quote, period and question mark. Every other
    character (hyphens, parentheses, exclamation marks, ...) is kept.
    """
    unwanted_marks = re.compile(r"""[~`:;",'.?]""")
    return unwanted_marks.sub('', text)

def remove_common_words(text):
    """Strip a short list of very frequent English words from ``text``.

    Only whole words are removed (the pattern is anchored on word boundaries),
    so "item" or "attic" are left alone. Matching is case-sensitive and the
    listed words are lowercase. Surrounding whitespace is not touched, so runs
    of spaces can remain where a word used to be.
    """
    words_to_drop = ['a','is','it','of','in','at','to','the']
    # A single alternation removes every listed word in one pass.
    drop_pattern = re.compile(r'\b(?:' + '|'.join(words_to_drop) + r')\b')
    return drop_pattern.sub('', text)

def combine_cword(text):
    """Collapse known multi-word terms into single tokens.

    Whole-word occurrences of "social security number" and "ssn" both become
    "socialsecuritynumber", and "high school" becomes "highschool". Matching
    is case-sensitive and requires exactly the spacing shown.
    """
    replacements = (
        ('social security number', 'socialsecuritynumber'),
        ('ssn', 'socialsecuritynumber'),
        ('high school', 'highschool'),
    )
    for phrase, merged in replacements:
        text = re.sub(r'\b' + phrase + r'\b', merged, text)
    return text

def get_words(text):
    """Split ``text`` into tokens using ``nltk.word_tokenize``."""
    tokens = nltk.word_tokenize(text)
    return tokens

def remove_non_ascii(words):
    """Strip non-ASCII characters from each token.

    Each word is NFKD-normalized first, so accented letters decompose into a
    base letter plus combining marks; anything that still cannot be encoded
    as ASCII is then dropped. A word made up only of such characters becomes
    an empty string (it is not removed from the list).

    Args:
        words: Iterable of string tokens.

    Returns:
        A new list with one ASCII-only string per input word, in input order.
    """
    def _to_ascii(token):
        decomposed = unicodedata.normalize('NFKD', token)
        return decomposed.encode('ascii', 'ignore').decode('utf-8', 'ignore')

    return [_to_ascii(token) for token in words]

def replace_numbers(words):
    """Spell out integer tokens in words.

    Tokens for which ``str.isdigit()`` is true are passed to inflect's
    ``number_to_words``; all other tokens are kept unchanged.

    Args:
        words: Iterable of string tokens.

    Returns:
        A new list with the same length and order as the input.
    """
    engine = inflect.engine()
    return [
        engine.number_to_words(token) if token.isdigit() else token
        for token in words
    ]

def normalize(text):
    """Run the full preprocessing pipeline on a raw string.

    The steps run in this order, each one printing a progress label followed
    by " DONE":

    1. remove punctuation marks (``remove_pmarks``)
    2. convert to lowercase
    3. remove common words (``remove_common_words``)
    4. merge compound terms (``combine_cword``)
    5. tokenize (``get_words``)
    6. strip non-ASCII characters from each token (``remove_non_ascii``)
    7. spell out integer tokens (``replace_numbers``)

    Args:
        text: The raw input string.

    Returns:
        The list of tokens produced by the last step.
    """
    def _run_step(label, step, value):
        # The label is printed before the work starts and " DONE" after it,
        # so a failing step is identifiable from the output.
        print(label, end='')
        outcome = step(value)
        print(' DONE')
        return outcome

    # String-level steps.
    text = _run_step('Remove punctuation.', remove_pmarks, text)
    text = _run_step('Converting case....', lambda raw: raw.lower(), text)
    text = _run_step('Remove freq words..', remove_common_words, text)
    text = _run_step('Handle compounds...', combine_cword, text)

    # Token-level steps.
    words = _run_step('Tokenize string....', get_words, text)
    words = _run_step('Remove non ascii...', remove_non_ascii, words)
    words = _run_step('Replace numbers....', replace_numbers, words)
    return words

def get_sorted_count(words):
    """Count how often each distinct word occurs.

    Args:
        words: Iterable of hashable tokens (typically strings).

    Returns:
        A list of ``[word, count]`` pairs, one per distinct word, ordered by
        word in descending order. Note that the order is by word, not by
        count. An empty input gives an empty list.
    """
    # Count in a single pass rather than rescanning the whole token list once
    # per distinct word.
    counts = Counter(words)
    return [[word, counts[word]] for word in sorted(counts, reverse=True)]

In [None]:
# First sample input: short lines mixing apostrophes, repeated words, the
# compound terms in several casings ("high school", "Social Security Number",
# "ssn"/"SSN") and numbered list items ending in punctuation.
some_test_text = '''Here are a few words that I will use to test.
high
Don't think this is the end of it.
high school
Social Security Number
ssn 'Here are a few words that I will use to test.
high
Don't think this is the end of it.
high school
SSN
high school
social security number
1. Hi hi hi
2. Three four five
6. What?'''

# Second sample input: a longer prose paragraph. It repeats some phrases from
# some_test_text ("high school", "SSN", the closing sentences), so the two
# texts share part of their vocabulary.
more_test_text = '''Cookie information can be found through the use of a
high school SSN high school plug-in for your web browser. (I use 
'Cookie Manager' on FireFox, although there are many other options 
for FireFox and other browsers). The two cookies you are looking 
for are called Y and T, and they are linked to the domain yahoo.com.
Extract the data from these cookies, and paste it into the appropriate 
variables... a cookie will expire after a certain amount of time, 
which varies between computers. This means that you may have to 
re-fetch the Y and T cookie data every few days, or you will not be
able to archive private groups. 'Here are a few words that I 
will use to test. high Don't think this is the end of it. high school'''

# Show the distinct tokens each sample normalizes to, in sorted order.
for sample_text in (some_test_text, more_test_text):
    unique_tokens = set(normalize(sample_text))
    print(sorted(unique_tokens))

In [None]:
# Build one word-frequency table per sample text. The count column gets a
# distinct name in each table (freq1 / freq2) so the two tables can be merged
# on 'word' without the count columns colliding.
# The frames are assigned by name rather than written into vars(), so the
# variables are visible to readers and tooling and the cell does not depend on
# running at module level.
some_test_text_df = DataFrame(
    get_sorted_count(normalize(some_test_text)),
    columns=['word', 'freq1'],
)
more_test_text_df = DataFrame(
    get_sorted_count(normalize(more_test_text)),
    columns=['word', 'freq2'],
)

print(some_test_text_df.head())
print(more_test_text_df.head())

In [None]:
# Outer join on 'word': every word from either text appears once, and a word
# missing from one text has NaN in that text's count column.
some_test_text_df.merge(more_test_text_df, on='word', how='outer').head()