In [9]:
import spacy
import pandas as pd
from sqlalchemy import create_engine
from spacy.matcher import Matcher
from spacy.matcher import PhraseMatcher
from collections import defaultdict
from spacy.lang.en.stop_words import STOP_WORDS
from IPython.core.display import display, HTML
from configparser import ConfigParser, ExtendedInterpolation

import string
from collections import defaultdict
import pandas as pd
import numpy as np
import spacy
import nltk
from nltk.corpus import words as nltk_words
import scipy as sp

from sklearn import metrics
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import  RandomizedSearchCV
from sklearn.preprocessing import Imputer, StandardScaler, OneHotEncoder, LabelBinarizer, FunctionTransformer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation, PCA
from sklearn.ensemble import  RandomForestRegressor
from sklearn.linear_model import Lasso, Ridge
from sklearn.feature_selection import SelectFromModel


from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectFromModel, SelectKBest, SelectPercentile
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.decomposition import PCA, TruncatedSVD




In [10]:
# configuration for data, acronyms, and gensim paths
CONFIG_PATH = '../../config.ini'

config = ConfigParser(interpolation=ExtendedInterpolation())
# ConfigParser.read() skips files it cannot open and returns the list of files
# it actually parsed, so an empty list means the config was never loaded.
# Fail here with the path instead of a KeyError on the first lookup below.
if not config.read(CONFIG_PATH):
    raise FileNotFoundError(
        'Could not read configuration file: {}'.format(CONFIG_PATH))

# database connection string for the project database
DB_PATH = config['DATABASES']['PROJECT_DB_PATH']

# NLP resource locations
AIRLINE_ACRONYMS_FILEPATH = config['NLP']['AIRLINE_ACRONYMS_FILEPATH']
AIRLINE_CLEANED_TEXT_PATH = config['NLP']['AIRLINE_CLEANED_TEXT_PATH']
GENSIM_DICTIONARY_PATH = config['NLP']['GENSIM_DICTIONARY_PATH']
GENSIM_CORPUS_PATH = config['NLP']['GENSIM_CORPUS_PATH']

In [11]:
# load every report section from the project database
engine = create_engine(DB_PATH)
df = pd.read_sql("SELECT * FROM Sections", con=engine)

# the annual report from 1992 was scanned in poor quality
# and the text was not legible
df = df[df.filename != 'southwest-airlines-co_annual_report_1992.docx']

# filter to relevant sections
# na=False treats rows with missing section_text as non-matches; without it the
# mask contains NaN and boolean indexing raises.
# NOTE(review): this is a substring match, so it also keeps sections that only
# contain words such as "feel" or "coffee" - confirm that is intended.
df = df[df['section_text'].str.contains('fee', na=False)]
df.head()

Unnamed: 0,section_id,filename,section_name,section_text,criteria,section_length
291,292,southwest-airlines-co_annual_report_1994.docx,DEPARTMENT OF TRANSPORTATION RANKINGS FOR 1994...,A multitude of challenges faced the People of ...,"<, f, u, n, c, t, i, o, n, , h, e, a, d, i, n...",2849
297,298,southwest-airlines-co_annual_report_1994.docx,RESULTS OF OPERATIONS,1994 COMPARED WITH 1993 The Company's consolid...,"<, f, u, n, c, t, i, o, n, , h, e, a, d, i, n...",13806
305,306,southwest-airlines-co_annual_report_1994.docx,ACQUISITION,"On December 31, 1993, Southwest exchanged 3,57...","<, f, u, n, c, t, i, o, n, , h, e, a, d, i, n...",2141
308,309,southwest-airlines-co_annual_report_1994.docx,ACCRUED LIABILITIES (IN THOUSANDS) LONG-TERM D...,"On March 1, 1993, the Company redeemed the $10...","<, f, u, n, c, t, i, o, n, , h, e, a, d, i, n...",1855
359,360,southwest-airlines-co_annual_report_1995.docx,SECRET NUMBER 1 STICK TO WHAT YOU’RE GOOD AT.,"Since 1971, Southwest Airlines has offered sin...","<, f, u, n, c, t, i, o, n, , s, t, y, l, e, ...",2566


In [12]:
# collect the text of every matching section into a plain Python list
text = list(df['section_text'].values)

# preview the first 299 characters of the first matching section
text[0][:299]

'A multitude of challenges faced the People of Southwest Airlines in 1994. The mark of a true champion is the ability to “rise to the occasion” and meet challenges. We believe our Employees showed their true Southwest Spirit in 1994, accomplishing three- or four-fold what a normal year would  bring.'

In [15]:
class GrammarTransformer(BaseEstimator, TransformerMixin):
    """Derive simple surface-level features from a collection of strings.

    For every input string the transformer emits one row with:

    * ``title_punc_count``        - number of characters found in ``string.punctuation``
    * ``title_exclamation_count`` - number of ``'!'`` characters
    * ``title_all_caps``          - result of ``str.isupper()``
    * ``contains_numbers``        - True when any character is a digit
    * ``sent_len``                - length of the string in characters
    * ``num_distinct_words``      - number of distinct whitespace-separated tokens
    """

    # Output column names, in the order transform() emits them.
    FEATURE_NAMES = (
        'title_punc_count',
        'title_exclamation_count',
        'title_all_caps',
        'contains_numbers',
        'sent_len',
        'num_distinct_words',
    )

    def __init__(self):
        self.columns = None

    def fit(self, X, y=None):
        """Record the output feature names; nothing is learned from ``X``.

        Returns
        -------
        self
        """
        # Previously ``columns`` was never assigned, so get_feature_names()
        # always returned None.
        self.columns = list(self.FEATURE_NAMES)
        return self

    def transform(self, X):
        """Compute the features for every string in ``X``.

        Parameters
        ----------
        X : pandas.Series or other iterable of str
            A Series keeps its index in the result; any other iterable is
            wrapped in a Series with a default index.

        Returns
        -------
        pandas.DataFrame
            One row per input string, columns as listed in ``FEATURE_NAMES``.
        """
        # Accept plain lists as well as Series (the notebook passes both).
        texts = X if isinstance(X, pd.Series) else pd.Series(list(X))
        punctuation = set(string.punctuation)

        features = pd.DataFrame(index=texts.index)
        features['title_punc_count'] = texts.apply(
            lambda sent: sum(1 for char in sent if char in punctuation))
        features['title_exclamation_count'] = texts.str.count('!')
        features['title_all_caps'] = texts.apply(lambda sent: sent.isupper())
        features['contains_numbers'] = texts.apply(
            lambda sent: any(char.isdigit() for char in sent))
        features['sent_len'] = texts.apply(len)
        features['num_distinct_words'] = texts.apply(
            lambda sent: len(set(sent.split())))

        self.columns = list(features.columns)
        return features

    def get_feature_names(self):
        """Return the output column names (``None`` before fit/transform)."""
        return self.columns
        

In [21]:
# fit() returns the transformer itself, so construction and fitting can be chained
grammar = GrammarTransformer().fit(text)

# compute the surface features and keep the source text alongside them for inspection
result = grammar.transform(df['section_text']).assign(text=df['section_text'])
result.head()

Unnamed: 0,title_punc_count,title_exclamation_count,title_all_caps,contains_numbers,sent_len,num_distinct_words,text
291,61,0,False,True,2849,273,A multitude of challenges faced the People of ...
297,434,0,False,True,13806,649,1994 COMPARED WITH 1993 The Company's consolid...
305,54,0,False,True,2141,172,"On December 31, 1993, Southwest exchanged 3,57..."
308,83,0,False,True,1855,174,"On March 1, 1993, the Company redeemed the $10..."
359,84,0,False,True,2566,243,"Since 1971, Southwest Airlines has offered sin..."


In [22]:
class LocationTransformer(BaseEstimator, TransformerMixin):
    """
    Count spaCy named entities of selected types in a text column.

    For each row, the text in ``text_field`` is run through the spaCy pipeline
    and the entities labelled 'GPE', 'CARDINAL', 'DATE' and 'QUANTITY' are
    counted. The counts are returned as the columns ``gpe_count``,
    ``cardinal_count``, ``time_count`` (this holds the 'DATE' count) and
    ``quantity_count``; the text column itself is removed from the result.

    NOTE(review): relies on a module-level spaCy pipeline named ``nlp`` that is
    not defined in this class - confirm it is loaded before ``transform`` runs.
    """

    # spaCy entity label -> name of the output count column (order is preserved)
    ENTITY_COLUMNS = {
        'GPE': 'gpe_count',
        'CARDINAL': 'cardinal_count',
        'DATE': 'time_count',
        'QUANTITY': 'quantity_count',
    }

    def __init__(self, text_field):
        self.text_field = text_field
        self.columns = None

    def fit(self, X, y=None):
        """No fitting is required; returns ``self``."""
        return self

    def transform(self, df):
        """Return a copy of ``df`` with entity-count columns and no text column.

        Parameters
        ----------
        df : pandas.DataFrame
            Must contain the column named by ``self.text_field``. The frame is
            not modified, so the same input can be transformed repeatedly or
            shared with other transformers.

        Returns
        -------
        pandas.DataFrame
            ``df`` without ``text_field`` plus the four count columns.
        """
        counts = {column: [] for column in self.ENTITY_COLUMNS.values()}

        for doc in nlp.pipe(df[self.text_field], disable=['tagger']):
            tally = dict.fromkeys(self.ENTITY_COLUMNS, 0)
            for ent in doc.ents:
                if ent.label_ in tally:
                    tally[ent.label_] += 1
            for label, column in self.ENTITY_COLUMNS.items():
                counts[column].append(tally[label])

        # drop() returns a new frame, so the caller's data is left untouched
        # (the in-place version raised KeyError on a second call).
        features = df.drop(self.text_field, axis=1)
        for column, values in counts.items():
            features[column] = values

        self.columns = features.columns

        return features

    def get_feature_names(self):
        """Return the columns of the last transformed frame (``None`` before that)."""
        return self.columns

In [23]:
class HousingTransformer(BaseEstimator, TransformerMixin):
    """
    extract housing features, using dependency parsing to combine descriptive words with their ROOT or noun subject
    e.g. 'beautiful brownstone 1 bedroom' ---> 'beautiful bedroom', 'brownstone bedroom', '1 bedroom'
    this may improve upon n-grams which would connect words by character location, not meaning

    NOTE(review): relies on a module-level spaCy pipeline named ``nlp`` that is
    not defined in this class - confirm it is loaded before ``transform`` runs.
    """

    def __init__(self, text_field):
        self.text_field = text_field
        self.columns = None

    def fit(self, X, y=None):
        """No fitting is required; returns ``self``."""
        return self

    @staticmethod
    def _space_prefixed(terms):
        """Join terms into one string with a single space before each term."""
        return ''.join(' ' + term for term in terms)

    def transform(self, df):
        """Build one space-separated string of housing terms per row.

        Parameters
        ----------
        df : pandas.DataFrame
            Must contain the column named by ``self.text_field``. The frame is
            not modified; helper columns are added to an internal copy only.

        Returns
        -------
        pandas.Series
            The ``housing_features`` column: home types, sized terms
            (``<number>_<head>``) and described terms (``<left word>_<head>``),
            concatenated with spaces.
        """
        home_type = []
        home_size = []
        home_type_descriptions = []
        home_description_count = []

        for doc in nlp.pipe(df[self.text_field], disable=['tagger','ner']):
            # containers for housing types and descriptions for each listing
            type_terms = []
            size_terms = []
            description_terms = []

            for token in doc:
                # find root and nsubj - most likely to be housing types
                if token.dep_ not in ('ROOT', 'nsubj'):
                    continue
                head = token.lemma_.lower()
                type_terms.append(head)

                # pair every left child with the head in a single pass; pairs
                # whose left child is a number additionally describe the size
                for left_term in [t.text for t in token.lefts]:
                    pair = '{}_{}'.format(left_term, head)
                    description_terms.append(pair)
                    if left_term.isdigit():
                        size_terms.append(pair)

            home_type.append(self._space_prefixed(type_terms))
            home_size.append(self._space_prefixed(size_terms))
            home_type_descriptions.append(self._space_prefixed(description_terms))
            home_description_count.append(len(description_terms))

        # work on a copy so the helper columns never leak into the caller's frame
        features = df.copy()
        features['home_type'] = home_type
        features['home_size'] = home_size
        features['home_type_descriptions'] = home_type_descriptions
        features['home_description_count'] = home_description_count

        # combine all housing text features into a single column, space separated
        # this single column will be used for text vectorization of housing features
        # ('_space' is kept only so get_feature_names() reports the same columns as before)
        features['_space'] = ' '
        features['housing_features'] = (
            features['home_type'] + ' '
            + features['home_size'] + ' '
            + features['home_type_descriptions']
        )
        self.columns = features.columns

        return features['housing_features']

    def get_feature_names(self):
        """Return the columns of the internal working frame from the last transform."""
        return self.columns

In [None]:

# SOURCE: scikit-learn.org/stable/auto_examples/hetero_feature_union.html
class ItemSelector(BaseEstimator, TransformerMixin):
    """Pipeline step that picks one entry out of its input by key.

    ``key`` is whatever the input's ``[]`` operator accepts, e.g. a column
    name for a DataFrame or a key for a dict.
    """

    def __init__(self, key):
        self.key = key

    def fit(self, x, y=None):
        """Nothing to learn; returns ``self``."""
        return self

    def transform(self, data_dict):
        """Return ``data_dict[self.key]``."""
        selected = data_dict[self.key]
        return selected


class DummyTransformer(BaseEstimator, TransformerMixin):
    """One-hot encode with ``pd.get_dummies`` using the columns seen in ``fit``.

    Columns learned during ``fit`` that are absent at transform time are added
    and filled with 0; columns not seen during ``fit`` are left out of the result.
    """

    def __init__(self):
        self.dummy_cols = None

    def fit(self, X, y=None):
        """Remember the dummy columns produced for ``X``; returns ``self``."""
        encoded = pd.get_dummies(X)
        self.dummy_cols = encoded.columns
        return self

    def transform(self, X):
        """Encode ``X`` and return exactly the fitted columns, in fitted order."""
        encoded = pd.get_dummies(X)
        absent = [name for name in self.dummy_cols if name not in encoded.columns]
        for name in absent:
            encoded[name] = 0

        return encoded[self.dummy_cols]

    def get_feature_names(self):
        """Return the dummy column names learned in ``fit`` (``None`` before that)."""
        return self.dummy_cols
        
    
class SparseMatrixTransformer(BaseEstimator, TransformerMixin):
    """Pipeline step that casts its input to float and wraps it in a CSR sparse matrix.

    Note: lets numeric features be combined with the sparse matrices produced
    for text data in an sklearn pipeline.
    """

    def fit(self, X, y=None):
        """Nothing to learn; returns ``self``."""
        return self

    def transform(self, X):
        """Return ``X`` as a ``scipy.sparse.csr_matrix`` of floats."""
        as_float = X.astype(float)
        return sp.sparse.csr_matrix(as_float)