In [1]:
# Set Working Directory
import os

# Remember the directory the kernel started in the first time this cell runs,
# then always move to its parent. A bare os.chdir('..') is relative to the
# *current* directory, so re-running the cell would climb one level further
# each time.
if '_NOTEBOOK_START_DIR' not in globals():
    _NOTEBOOK_START_DIR = os.getcwd()
os.chdir(os.path.join(_NOTEBOOK_START_DIR, os.pardir))

In [2]:
# Load Requirements
import pandas as pd
import  numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem.snowball import SnowballStemmer 

# nltk.download('stopwords')
# nltk.download('wordnet')

from keras.preprocessing.text import Tokenizer

from helper import *

  return f(*args, **kwds)
  return f(*args, **kwds)
Using TensorFlow backend.


In [3]:
# Load Data
# load_data is not defined in this notebook; presumably it is provided by
# `from helper import *` (confirm). It is unpacked into a train and a test
# frame, and both shapes are printed as a quick sanity check.
data_train, data_test = load_data()
print('data_train shape:', data_train.shape)
print('data_test shape:', data_test.shape)

data_train shape: (400277, 25)
data_test shape: (50064, 16)


In [4]:
# Load Features
# load_features is presumably provided by `from helper import *` (confirm).
# It receives both the train and the test frame and returns a single feature
# frame, whose shape is printed for inspection.
data_features = load_features(data_train, data_test)
print('data_features shape:', data_features.shape)

data_features shape: (450341, 16)


# Combine text features

In [5]:
# Collapse each row into one space-separated string. 'FTE' and 'Total' are
# left out, and missing values become "" before joining (this assumes the
# remaining columns hold text).
text_to_process = (
    data_features
        .drop(columns=['FTE', 'Total'])
        .fillna("")
        .apply(" ".join, axis=1)
)

text_to_process.sample(5)

226659    WORKSHOP PARTICIPANT             TEACHER ASSIS...
222590    Department Chair Supp   ED. SUPPORT TEAM CHA  ...
59040     Other Pupil Transportation Services  Support S...
300482    RETIREMENT CONTRIB. EMPLOYEE BENEFITS GENERAL ...
61667     ADDITIONAL/EXTRA DUTY PAY/STIP INSTRUCTIONAL S...
dtype: object

# Case Normalization

In [6]:
# Lower-case every combined phrase
text_to_process_lowercase = text_to_process.map(str.lower)
text_to_process_lowercase.sample(5)

365973    supplies staff development staff development  ...
176242    contra benefits employee benefits general fund...
65622     employee benefits general elementary education...
218221    retirement contrib. employee benefits general ...
210300    employee benefits undesignated general fund te...
dtype: object

# Punctuation Removal

In [7]:
# Replace every character that is not a-z, 0-9 or "-" with a space
text_to_process_no_punctuation = text_to_process_lowercase.map(
    lambda phrase: re.sub(r"[^a-z0-9-]", " ", phrase)
)
text_to_process_no_punctuation.sample(5)

208640    personal services - teachers instruction - spe...
361093    department chair supp   grade level dept cha  ...
348989    employee benefits general elementary education...
62778     employee benefits general elementary education...
377046    professional   tech svcs          digital desi...
dtype: object

In [8]:
# The lowercase series is no longer needed once the punctuation-free series exists.
# After this cell, re-running the punctuation cell requires re-running the
# case-normalization cell first, because its input name is gone.
del text_to_process_lowercase # for memory management

# Tokenization

In [9]:
# Split each phrase on runs of whitespace into a list of tokens
text_to_process_tokens = text_to_process_no_punctuation.map(str.split)
text_to_process_tokens.sample(5)

292917    [employer, pd, med, contribution, school, libr...
146937    [employee, travel, including, in-district, and...
15575     [employee, benefits, instructional, staff, tra...
310791    [purchase, of, equipment-other, than, buses, a...
248837    [salaries, and, wages, for, teachers, and, oth...
dtype: object

In [10]:
# The punctuation-free series has been tokenized and is no longer needed.
# After this cell, re-running the tokenization cell requires re-running the
# punctuation cell first, because its input name is gone.
del text_to_process_no_punctuation # for memory management

# Remove stop words

In [11]:
# Build the stopword set once, outside the per-row lambda, so the corpus list
# is not re-created for every token; a frozenset also makes each membership
# test a hash lookup instead of a linear scan.
english_stopwords = frozenset(stopwords.words('english'))

text_to_process_no_stopwords = text_to_process_tokens.apply(
    lambda tokens: [word for word in tokens if word not in english_stopwords]
)
text_to_process_no_stopwords.sample(5)

45979     [employee, benefits, security, services, gener...
207916    [professional, technical, services, debt, serv...
89960     [books, periodicals, english, language, arts, ...
124402    [employee, benefits, general, elementary, educ...
289354    [equipment, maintenance, repairs, basic, educa...
dtype: object

In [12]:
# The raw token lists have been filtered and are no longer needed.
# After this cell, re-running the stop-word cell requires re-running the
# tokenization cell first, because its input name is gone.
del text_to_process_tokens # for memory management

# Lemmatization

In [13]:
# A single lemmatizer instance is shared by all rows rather than constructing
# a new WordNetLemmatizer for every token.
wordnet_lemmatizer = WordNetLemmatizer()

text_to_process_lemmed = text_to_process_no_stopwords.apply(
    lambda tokens: [wordnet_lemmatizer.lemmatize(word) for word in tokens]
)
text_to_process_lemmed.sample(5)

90682     [employee, benefit, instructional, staff, trai...
19712     [salary, part, time, employee, instructional, ...
201251    [contra, benefit, employee, benefit, general, ...
259858    [salary, wage, teacher, professi, title, part,...
196551    [personal, service, -, compensation, instructi...
dtype: object

In [14]:
# The stopword-filtered token lists have been lemmatized and are no longer needed.
# After this cell, re-running the lemmatization cell requires re-running the
# stop-word cell first, because its input name is gone.
del text_to_process_no_stopwords # for memory management

# Combine Steps into Function

In [15]:
# Lazily-built defaults shared by every text_processing call (filled on first use).
_TEXT_PROCESSING_DEFAULTS = {}


def _default_text_resources():
    """Return the (stop word set, lemmatizer) pair used when the caller supplies none."""
    if not _TEXT_PROCESSING_DEFAULTS:
        # Built once: stopwords.words() hands back a fresh list on each call, so
        # evaluating it per token is needlessly slow, and a frozenset gives
        # hash-based membership tests.
        _TEXT_PROCESSING_DEFAULTS.update(
            stop_words=frozenset(stopwords.words("english")),
            lemmatizer=WordNetLemmatizer(),
        )
    return (_TEXT_PROCESSING_DEFAULTS["stop_words"],
            _TEXT_PROCESSING_DEFAULTS["lemmatizer"])


def text_processing(phrase, stop_words=None, lemmatizer=None):
    """
    Return string processed_phrase: phrase after processing, with the
    surviving tokens re-joined by single spaces

    param string phrase: phrase to be processed
    param collection stop_words: tokens to discard (anything supporting `in`);
        defaults to the NLTK English stop word list
    param object lemmatizer: object exposing lemmatize(word);
        defaults to a shared WordNetLemmatizer

    Required Libraries: re, nltk
    """

    # Resolve defaults lazily so NLTK resources are only touched when needed
    if stop_words is None or lemmatizer is None:
        default_stop_words, default_lemmatizer = _default_text_resources()
        if stop_words is None:
            stop_words = default_stop_words
        if lemmatizer is None:
            lemmatizer = default_lemmatizer

    # Case Normalization
    processed_phrase = phrase.lower()

    # Remove Punctuation: everything except a-z, 0-9 and "-" becomes a space.
    # Hyphens are kept so compounds such as "in-district" stay whole; a hyphen
    # surrounded by spaces therefore survives as its own "-" token.
    processed_phrase = re.sub(r"[^a-z0-9-]", " ", processed_phrase)

    # Tokenize Phrase
    tokens = processed_phrase.split()

    # Remove Stopwords (tested on the raw token), then Lemmatize what remains
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]

    # Recombine list into phrase
    return ' '.join(tokens)

In [16]:
# Build the combined text column. 'combined_text' itself is dropped (when it
# already exists) before joining, so re-running this cell does not fold the
# previous result into the new one.
data_features['combined_text'] = (data_features
                                      .drop(columns=['FTE', 'Total'])
                                      .drop(columns=['combined_text'], errors='ignore')
                                      .fillna("")
                                      .apply(" ".join, axis=1)
                                 )

# Run the full cleaning pipeline on every combined phrase
text_processed = data_features['combined_text'].apply(text_processing)

In [17]:
# Spot-check five randomly chosen processed phrases (no random_state is set,
# so the rows shown differ from run to run)
text_processed.sample(5)

232981             coaching supplement track coach head-gir
90694     summer ed art sport - sea teacher afterschool ...
318627    consultant school parental involvement fed thr...
305988    salary part time employee instructional staff ...
195162    contra benefit employee benefit general fund l...
Name: combined_text, dtype: object