# Webscrape college applications

This notebook was developed to work with pdfs scraped with `ColAppScrape.ipynb`.


In [16]:
import re, string, unicodedata
import os, sys
import nltk
import contractions
import inflect
import PyPDF2
import pandas as pd
from pandas import Series, DataFrame
from operator import itemgetter
from bs4 import BeautifulSoup
from nltk import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer, WordNetLemmatizer

## Define Helper Functions

In [17]:
def strip_html(text):
    soup = BeautifulSoup(text, "html.parser")
    return soup.get_text()

def stem_words(words):
    # Stem words in list of tokenized words
    stemmer = LancasterStemmer()
    stems = []
    for word in words:
        stem = stemmer.stem(word)
        stems.append(stem)
    return stems

def lemmatize_verbs(words):
    # Lemmatize verbs in list of tokenized words
    lemmatizer = WordNetLemmatizer()
    lemmas = []
    for word in words:
        lemma = lemmatizer.lemmatize(word, pos='v')
        lemmas.append(lemma)
    return lemmas

def remove_pmarks(text):
    text = re.sub(r'~|`|:|;|"|,|=|-|_|#', ' ', text)
    text = str.replace(text, '"', ' ')
    text = str.replace(text, "'", ' ')
    text = str.replace(text, '.', ' ')
    text = str.replace(text, '?', ' ')
    text = str.replace(text, ')', ' ')
    text = str.replace(text, '(', ' ')
    text = str.replace(text, '[', ' ')
    text = str.replace(text, ']', ' ')
    text = str.replace(text, '|', ' ')
    text = str.replace(text, '@', ' ')
    text = str.replace(text, '$', ' ')
    text = str.replace(text, '%', ' ')
    text = str.replace(text, '&', ' ')
    text = str.replace(text, r'/', ' ')
    # text = str.replace(text, r'\', ' ')
    return text

def remove_nontxt(text):
    text = str.replace(text, r'\\n', '')
    text = str.replace(text, r'\n', '')
    return(text)

def remove_common_words(text):
    words_to_drop = ['a', 'all', 'an', 'and', 'are', 'as', 'at', 'be', 'but',
                     'by', 'can', 'for', 'i', 'if', 'in', 'is', 'it', 
                     'my', 'not', 'of', 'on', 'or', 'that', 
                     'the', 'to', 'will', 'with', 'you']
    for word in words_to_drop:
        text = re.sub(''.join((r'\b', word, r'\b')), '', text)
    return text

def combine_cword(text):
    compound_words = {
        'social security number': 'socialsecuritynumber',
        'ssn': 'socialsecuritynumber',
        'high school': 'highschool',
        'application for admission': 'applicationforadmission',
        'admission application': 'applicationforadmission',
        'admissions application': 'applicationforadmission',
        'office of admission': 'officeofadmission',
        'admission office': 'officeofadmission',
        'admissions office': 'officeofadmission',
        'application fee': 'applicationfee'
    }
    for cword in compound_words:
        text = re.sub(''.join((r'\b', cword, r'\b')), compound_words[cword], text)
    return text

def get_words(text):
    return nltk.word_tokenize(text)

def remove_non_ascii(words):
    # Remove non-ASCII characters from list of tokenized words
    new_words = []
    for word in words:
        new_word = unicodedata.normalize('NFKD', word).encode('ascii', 'ignore').decode('utf-8', 'ignore')
        new_words.append(new_word)
    return new_words

def replace_numbers(words):
    # Replace all interger occurrences in list of tokenized words with textual representation
    p = inflect.engine()
    new_words = []
    for word in words:
        if word.isdigit():
            new_word = p.number_to_words(word)
            new_words.append(new_word)
        else:
            new_words.append(word)
    return new_words

def remove_numbers(words):
    # Remove all intergers
    new_words = []
    for word in words:
        if word.isdigit() == False:
            new_words.append(word)
    return new_words

def replace_nolen(words):
    # Replcae words that are no length (i.e. '')
    new_words = []
    for word in words:
        if len(word) > 0:
            new_words.append(word)
    return new_words

def normalize(text):
    text = remove_pmarks(text)
    text = remove_nontxt(text)
    text = text.lower()
    text = remove_common_words(text)
    text = combine_cword(text)
    words = get_words(text)
    words = remove_non_ascii(words)
    words = remove_numbers(words)
    words = replace_nolen(words)
    return words

def get_sorted_count(words):
    index_list = sorted(set(words))
    count_list = list(range(len(index_list)))
    for i in range(len(index_list)):
        count_list[i] = 0
        for word in words:
            if word == index_list[i]:
                count_list[i] = count_list[i] + 1
    grand_list = []
    for i in range(len(index_list)):
        item_list = []
        item_list.append(index_list[i])
        item_list.append(count_list[i])
        grand_list.append(item_list)
    return sorted(grand_list, key=itemgetter(1), reverse=True)

def i_am_done(tone):
    # https://stackoverflow.com/questions/16573051/sound-alarm-when-code-finishes 
    import winsound
    duration = 1000  # millisecond
    freq = tone # Hz
    winsound.Beep(freq, duration)

SyntaxError: EOL while scanning string literal (<ipython-input-17-11e2720bd15d>, line 39)

# Implement Text Processing

In [3]:
all_apps_df = DataFrame([], columns=['word'])

In [4]:
# test_file = open(os.path.join('pprapps', 'carver.edu2.pdf'), 'rb')
# test_pdf = PyPDF2.PdfFileReader(test_file)
# test_pdf.numPages

In [6]:
import time
from tqdm import tqdm_notebook

# PyPDF2 has an undocumented PdfReadWarning
# (PdfReadWarning: Superfluous whitespace found in object header b'1' b'0')
# Example: Throws this warning for carver.edu2.pdf (among others)
# https://stackoverflow.com/questions/5644836/in-python-how-does-one-catch-warnings-as-if-they-were-exceptions/39077786
# import warnings
# warnings.filterwarnings("error")

error_log = []

# for file in tqdm_notebook(os.listdir(os.path.join('pprapps'))[224:228]):
# for file in os.listdir(os.path.join('pprapps'))[:3]:
for file in tqdm_notebook(os.listdir(os.path.join('pprapps'))):
    # print(file)
    try:
        # print('a')
        app_file = PyPDF2.PdfFileReader(
            open(os.path.join('pprapps', file), 'rb'))
    except:
        # print('b')
        error_log.append('Error loading : ' + os.path.join(
            'pprapps', file))
    
    try:
        # print('c')
        num_of_pgs = app_file.numPages
    except:
        # print('d')
        num_of_pgs = 100
        error_log.append('Error getting no of pgs : ' + os.path.join(
            'pprapps', file))
        
    if num_of_pgs < 20:
        # print('e')
        app_file_text = []
        try:
            for pageNum in range(app_file.numPages):
                app_file_text.append(
                    app_file.getPage(pageNum).extractText().encode(
                        'utf-8', 'ignore').decode('utf-8', 'ignore'))
            # Use file name to create an unique header
            f_col = str.replace(file, 'pdf', '')
            f_col = f_col[:-1]

            app_file_listed = get_sorted_count(
                normalize(str(app_file_text)))
            app_file_listed.append(['0pages',num_of_pgs])

            next_record = DataFrame(app_file_listed, columns=['word',f_col])

            all_apps_df = pd.merge(
                all_apps_df, next_record, on='word', how='outer')
        except KeyError:
            error_log.append('Error (KeyError) : ' + os.path.join(
            'pprapps', file))
        except TypeError:
            error_log.append('Error (TypeError) : ' + os.path.join(
            'pprapps', file))
    else:
        # print('f')
        error_log.append('Longer than 19 pgs : ' + os.path.join(
            'pprapps', file))

# https://stackoverflow.com/questions/16573051/sound-alarm-when-code-finishes        
i_am_done(350)
        






In [7]:
# Set aside saved version of the data frame
all_apps_df_saved = all_apps_df

In [8]:
all_apps_df.fillna(0)[all_apps_df['word'] == '0pages']

Unnamed: 0,word,aacc.edu0,aacc.edu1,aamu.edu2,albanytech.edu0
1153,0pages,12.0,6.0,6.0,2.0


In [9]:
all_apps_df.fillna(0).head()

Unnamed: 0,word,aacc.edu0,aacc.edu1,aamu.edu2,albanytech.edu0
0,management,28.0,0.0,0.0,0.0
1,college,26.0,0.0,2.0,9.0
2,information,21.0,0.0,3.0,7.0
3,your,19.0,0.0,5.0,6.0
4,business,17.0,0.0,0.0,0.0


In [10]:
all_apps_df = all_apps_df_saved

In [11]:
# all_apps_df = all_apps_df[all_apps_df.columns[1:]]
all_apps_df = all_apps_df.rename(columns={'word': 'rootdom'})
all_apps_df.set_index(['rootdom']).fillna(0).transpose().to_csv('all_apps_df.csv')

In [12]:
all_apps_df.set_index(['rootdom']).fillna(0).transpose().head()

rootdom,management,college,information,your,business,highschool,studies,science,application,arts,...,typeand/,united,via,viewed,violation,visiting,voice,voicepursuant,white,wishes
aacc.edu0,28.0,26.0,21.0,19.0,17.0,17.0,17.0,14.0,13.0,13.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
aacc.edu1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
aamu.edu2,0.0,2.0,3.0,5.0,0.0,0.0,8.0,0.0,12.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
albanytech.edu0,0.0,9.0,7.0,6.0,0.0,3.0,0.0,1.0,5.0,0.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [13]:
all_apps_df = all_apps_df.set_index(['rootdom']).fillna(0).transpose()

In [14]:
all_apps_df.head()

rootdom,management,college,information,your,business,highschool,studies,science,application,arts,...,typeand/,united,via,viewed,violation,visiting,voice,voicepursuant,white,wishes
aacc.edu0,28.0,26.0,21.0,19.0,17.0,17.0,17.0,14.0,13.0,13.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
aacc.edu1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
aamu.edu2,0.0,2.0,3.0,5.0,0.0,0.0,8.0,0.0,12.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
albanytech.edu0,0.0,9.0,7.0,6.0,0.0,3.0,0.0,1.0,5.0,0.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [15]:
all_apps_df = pd.read_csv('all_apps_df.csv')

In [None]:
aaa_appcodes2_df = pd.read_stata(os.path.join('App_Rec_Train', 'aaa_appcodes2.dta'), index_col='rootdom')

In [None]:
aaa_appcodes2_df.head().fillna(0).reset_index()

In [None]:
# all_apps_df = all_apps_df.rename(columns={'Unnamed: 0': 'rootdom'})
all_apps_df.head()

In [None]:
# Merge reference:
# https://pandas.pydata.org/pandas-docs/stable/merging.html#joining-on-index
# https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.merge.html
pd.merge(
    aaa_appcodes2_df.reset_index().fillna(0), 
    all_apps_df.reset_index().fillna(0), 
    left_on='rootdom', 
    right_on='Unnamed: 0',
    how='left').head(n=10)

In [None]:
all_apps_df = pd.merge(
    aaa_appcodes2_df.reset_index().fillna(0), 
    all_apps_df.reset_index().fillna(0), 
    left_on='rootdom', 
    right_on='level_0',
    how='outer')

In [None]:
all_apps_df[['rootdom','isApp','socialsecuritynumber',
             'highschool','applicationforadmission',
             'officeofadmission','applicationfee',
             'signature','undergraduate']][all_apps_df['isApp'] == 1]

In [None]:
all_apps_df[all_apps_df['highschool'] > 0]

In [None]:
error_log

In [None]:
list_of_isApp_df = pd.read_csv(
    os.path.join('app_rec_train', 'list_of_isApp.csv'))

In [None]:
lengths = []
for app in list_of_isApp_df['rootdom']:
    try:
        # print(os.path.join('pprapps', app + '.pdf'))
        app_c = PyPDF2.PdfFileReader(os.path.join('pprapps', app + '.pdf'))
        lengths.append(app_c.numPages)
        # lengths.append(app)
    except FileNotFoundError:
        print('{} : {}'.format(
                'Could not find', os.path.join('pprapps', app + '.pdf')))

In [None]:
list_of_isApp_df.head()

In [None]:
print (max(lengths))