# Sklearn FeatureUnion
- Use custom transformers for feature engineering
- Then merge the features horizontally for feeding into an ML classifier

## FeatureUnion & Pipelines with Pandas
https://towardsdatascience.com/custom-transformers-and-ml-data-pipelines-with-python-20ea2a7adb65

# Feature Engineering Plan

Based on previous data exploration, we'll start with the following:
- Drop location
- Convert keyword to a categorical
- Vectorize tweet text using TF-IDF
- Create categorical indicators from the text:
    - all capitalized
    - all lowercased
    - count of hashtags
    - count of user handles
    - contains a date
    - contains link
    - contains timestamp

In [128]:
import numpy as np 
import pandas as pd
import spacy
import regex as re
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.model_selection import train_test_split#, cross_val_score
from sklearn import metrics, preprocessing
from sklearn.decomposition import PCA, TruncatedSVD # SparsePCA may avoid problems below
from scipy import sparse
from collections import Counter

In [129]:
spacy.prefer_gpu()

False

In [130]:
from spacy.tokenizer import Tokenizer
from spacy.lang.en import English
from spacy.matcher import Matcher

In [131]:
# !python -m spacy download en_core_web_sm

# nlp = English() # This does not include certain features like lemmatization!

nlp = spacy.load("en_core_web_sm") # includes more features!

In [132]:
# # Load English tokenizer, tagger, parser, NER and word vectors
# parser = English()

In [133]:
# Create a blank Tokenizer with just the English vocab
tokenizer = Tokenizer(nlp.vocab)

In [134]:
stop_words = spacy.lang.en.stop_words.STOP_WORDS

In [135]:
# # Create matcher for hashtags
# matcher = Matcher(nlp.vocab)
# matcher.add('HASHTAG', None, [{'ORTH': '#'}, {'IS_ASCII': True}])

In [136]:
def get_token_counts(text_series, return_df = True, tokenizer=Tokenizer(nlp.vocab)):
    """
    Helper function to get most common tokens.
    """
    token_counts = Counter()
    for doc in tokenizer.pipe(text_series, batch_size=50):
        for token in doc:
            # Skip url-like
            if token.like_url:
                continue
            # Skip emails
            if token.like_email:
                continue
            if token.text.lower() in custom_stopwords:
                continue
            token_counts[token.orth_.lower()] += 1 # Equivalently, token.text

    if return_df:
        token_counts_df = pd.DataFrame.from_dict(token_counts, orient='index').reset_index().sort_values(by=0, ascending=False)
        return token_counts_df
    else:
        return token_counts

# Feature selector transformer
- Feed it the columns you want, and it returns a dataframe with just those features

In [137]:
# Custom Transformer that extracts columns passed as argument to its constructor 
class FeatureSelector( BaseEstimator, TransformerMixin ):
    # Class Constructor 
    def __init__(self, feature_names):
        self.feature_names = feature_names 
    
    # Return self nothing else to do here    
    def fit(self, X, y = None):
        return self 
    
    # Method that describes what we need this transformer to do
    # This one pulls up the list of feature columns you pass in and returns just those columns
    def transform(self, X, y = None):
        return X[self.feature_names] 

# Text processing transformer
- Take in tweet text
- Create features
    - contains hashtag
    - isupper
    - islower
    - has mispellings

# Create the text feature pipeline
- Takes in the tweet text and returns various meta features about it
- Does not tokenize or encode the text itself (taken care of in a separate pipeline)

In [138]:
# Custom transformer that takes in a string and returns new categorical features
class CategoricalTextTransformer(BaseEstimator, TransformerMixin):
    # Class constructor method that takes in a list of values as its argument
    def __init__(self, use_count_hashtags=True, use_count_user_handles=True):
        self.use_count_hashtags = use_count_hashtags
        self.use_count_user_handles = use_count_user_handles
        self.hashtag_pattern = re.compile("(?:^|\s)[＃#]{1}(\w+)", re.UNICODE)
        self.user_handle_pattern = re.compile("(?<=^|(?<=[^a-zA-Z0-9-_\.]))@[A-Za-z0-9_]+[A-Za-z0-9-_]+", re.UNICODE)

        
        
    # Return self nothing else to do here
    def fit(self, X, y = None):
        return self
    
    
    # Test helper func to just return the text in all lower case
    def is_lower(self, obj):
        if obj.islower():
            return 1
        else:
            return 0
    
    
    def is_upper(self, obj):
        if obj.isupper():
            return 1
        else:
            return 0


    def count_hashtags(self, obj):
        hashtag_count = len(re.findall(self.hashtag_pattern, obj))
        return hashtag_count
        
        
    def count_user_handles(self, obj):
        user_handle_count = len(re.findall(self.user_handle_pattern, obj))
        return user_handle_count
        
        
    # Transformer method to take in strings from a dataframe and return some extra features
    def transform(self, X , y = None):
        # Copy the incoming df to prevent setting on copy errors
        X = X.copy()
        
        # Return binary indicator of whether tweet is all lowercase
        X['is_lower'] = X['text'].apply(self.is_lower)
        
        # Return binary indicator of whether tweet is all uppercase
        X['is_upper'] = X['text'].apply(self.is_upper)
    
        if self.count_hashtags:
            # Count the number of hashtags in the text
            X['hashtag_count'] = X['text'].apply(self.count_hashtags)
        
        if self.count_user_handles:
            # Count number of user handles
            X['user_handle_count'] = X['text'].apply(self.count_user_handles)
    
        # Drop original text col
        # The only thing remaining now will be the lowercased text
        X = X.drop('text', axis=1)
        
        # returns numpy array
        return X.values 
    
    
    # Transformer method to take in strings from a dataframe and return some extra features
    def fit_transform(self, X , y = None):
        # Copy the incoming df to prevent setting on copy errors
        X = X.copy()
        
        # Return binary indicator of whether tweet is all lowercase
        X['is_lower'] = X['text'].apply(self.is_lower)
        
        # Return binary indicator of whether tweet is all uppercase
        X['is_upper'] = X['text'].apply(self.is_upper)
        
        if self.count_hashtags:
            # Count the number of hashtags in the text
            X['hashtag_count'] = X['text'].apply(self.count_hashtags)
        
        if self.count_user_handles:
            # Count number of user handles
            X['user_handle_count'] = X['text'].apply(self.count_user_handles)
        
        # Drop original text col
        # The only thing remaining now will be the lowercased text
        X = X.drop('text', axis=1)
        
        # returns numpy array
        return X.values 
    
    

In [139]:
# Custom transformer processes the keyword feature as a categorical
class CategoricalLemmatizedKeywordTransformer(BaseEstimator, TransformerMixin):
    # Class constructor method that takes in a list of values as its argument
    def __init__(self):
        self.ohe_model = preprocessing.OneHotEncoder(handle_unknown='error',
                                         drop='first',
                                         sparse=False)
        
    def spacy_lemmatizer(self, obj):
        doc = nlp(obj)

        # There should only be one keyword (not removing %20 spaces)
        if len(doc) > 1:
            print('More than one token found; expecting single token')
            
        return doc[0].lemma_
    
    
    # Return self nothing else to do here
    def fit(self, X, y = None):
        return self
    

    # Transformer method to take in strings from a dataframe and return some extra features
    def transform(self, X , y = None):
        # Copy the incoming df to prevent setting on copy errors
        X = X.copy()
    
        # Fill NaNs with "None"
        # Missing values will cause the one-hot encoding to fail
        X = X.fillna("none")

        # Convert the keywords to the lemmatized version
        X['lemmatized_keyword'] = X['keyword'].apply(self.spacy_lemmatizer)
        
#         # Drop the keyword col
#         X = X.drop('keyword', axis=1)
        
        X = self.ohe_model.transform(X[['lemmatized_keyword']])
        
        return X
        
        
    # Transformer method to take in strings from a dataframe and return some extra features
    def fit_transform(self, X , y = None):
        # Copy the incoming df to prevent setting on copy errors
        X = X.copy()
        
        # Fill NaNs with "None"
        # Missing values will cause the one-hot encoding to fail
        X = X.fillna("none")
        
        # Convert the keywords to the lemmatized version
        X['lemmatized_keyword'] = X['keyword'].apply(self.spacy_lemmatizer)
        
#         # Drop the keyword col
#         X = X.drop('keyword', axis=1)
        
        X = self.ohe_model.fit_transform(X[['lemmatized_keyword']])
        
        # categorical_features = boolean mask for categorical columns
        # sparse = False output an array not sparse matrix
        
#         # One-hot encode the keyword col
#         X = pd.get_dummies(X, 
#                            columns=['keyword'], 
#                            drop_first=True, 
#                            dummy_na=True)

#         # Drop original keyword col
#         # The only thing remaining now will be the keyword labels
#         X = X.drop('keyword', axis=1)
        
        # returns numpy array
        return X 

In [140]:
# Custom transformer processes the keyword feature as a categorical
class CategoricalRawKeywordTransformer(BaseEstimator, TransformerMixin):
    # Class constructor method that takes in a list of values as its argument
    def __init__(self):
        self.ohe_model = preprocessing.OneHotEncoder(handle_unknown='error',
                                         drop='first',
                                         sparse=False)

        
    # Return self nothing else to do here
    def fit(self, X, y = None):
        return self
    

    # Transformer method to take in strings from a dataframe and return some extra features
    def transform(self, X , y = None):
        # Copy the incoming df to prevent setting on copy errors
        X = X.copy()
    
        # Fill NaNs with "None"
        # Missing values will cause the one-hot encoding to fail
        X = X.fillna("none")
        
        X = self.ohe_model.transform(X)
        
        return X
        
        
    # Transformer method to take in strings from a dataframe and return some extra features
    def fit_transform(self, X , y = None):
        # Copy the incoming df to prevent setting on copy errors
        X = X.copy()
        
        # Fill NaNs with "None"
        # Missing values will cause the one-hot encoding to fail
        X = X.fillna("none")
        
#         # Instantiate OneHotEncoder
#         ohe = preprocessing.OneHotEncoder(handle_unknown='error',
#                                          drop='first',
#                                          sparse=False) 
        
        X = self.ohe_model.fit_transform(X)
        
        # categorical_features = boolean mask for categorical columns
        # sparse = False output an array not sparse matrix
        
#         # One-hot encode the keyword col
#         X = pd.get_dummies(X, 
#                            columns=['keyword'], 
#                            drop_first=True, 
#                            dummy_na=True)

#         # Drop original keyword col
#         # The only thing remaining now will be the keyword labels
#         X = X.drop('keyword', axis=1)
        
        # returns numpy array
        return X 

In [141]:
class DenseTfidfVectorizer(TfidfVectorizer):
    def __init__(self, pca=False, target_dim = None, remove_hashtag=True, remove_user_handle=True, remove_stop_words=True):
        self.tfidf_model = TfidfVectorizer(tokenizer=self.spacy_tokenizer)
        self.pca = pca
        self.target_dim = target_dim
        self.remove_stop_words = remove_stop_words
        self.remove_hashtag = remove_hashtag
        self.matcher = Matcher(nlp.vocab)
        self.matcher.add('HASHTAG', None, [{'ORTH': '#'}, {'IS_ASCII': True}])
        self.remove_user_handle = remove_user_handle
        self.user_handle_pattern = re.compile("(?<=^|(?<=[^a-zA-Z0-9-_\.]))(@[A-Za-z]+[A-Za-z0-9-_]+)", re.UNICODE)
        
    def spacy_tokenizer(self, obj):
        doc = nlp(obj)

        # Looks for hashtags
        matches = self.matcher(doc)
        spans = []
        for match_id, start, end in matches:
            spans.append(doc[start:end])

        for span in spans:
            span.merge()

            
        # Create a list of user handles
        user_handles = re.findall(self.user_handle_pattern, doc.text)      
        
        # Convert spacy tokens to a list of string tokens
        token_list = [t.text.lower() for t in doc if not t.is_punct | t.is_space]
        
        if self.remove_stop_words:
            token_list = [t for t in token_list if t not in stop_words]
        
        if self.remove_user_handle:
            token_list = [t for t in token_list if t not in user_handles]
        
        if self.remove_hashtag:
            token_list = [t.replace("#", "") for t in token_list]
            
        return token_list

        
    def transform(self, raw_documents):
        X = self.tfidf_model.transform(raw_documents['text'])

        # If PCA
        if self.pca:
            # PCA requires a dense matrix
            # Tf-idf returns a sparse one
            X = X.todense()
            
            # Run standard scaler for PCA
            X = preprocessing.StandardScaler().fit_transform(X)
            
            # Run PCA and return reduced array
            X = self.pca_model.transform(X)

            return X

        return X.toarray() # Changes the scipy sparse array to a numpy matrix

    
    def fit_transform(self, raw_documents, y=None):
        X = self.tfidf_model.fit_transform(raw_documents['text'], y=y)

        # If PCA
        if self.pca:
            self.pca_model = PCA(n_components=self.target_dim)
            
            # PCA requires a dense matrix
            # Tf-idf returns a sparse one
            X = X.todense()
            
            # Run standard scaler for PCA
            X = preprocessing.StandardScaler().fit_transform(X)
            
            # Run PCA and return reduced array
            X = self.pca_model.fit_transform(X)

            return X
            
        return X.toarray()

In [142]:
class TextVectorizer(BaseEstimator, TransformerMixin):
    def __init__(self, vec_method='tfidf', pca=False, target_dim = None, trunc_svd=False, remove_hashtag=True, remove_user_handle=True, remove_stop_words=True):
        self.vec_method = vec_method
        if self.vec_method == 'tfidf':
            self.tfidf_model = TfidfVectorizer(tokenizer=self.spacy_tokenizer)
        elif self.vec_method == 'bow':
            self.bow_model = CountVectorizer(tokenizer=self.spacy_tokenizer)
        self.pca = pca
        self.trunc_svd = trunc_svd
        self.target_dim = target_dim
        self.remove_stop_words = remove_stop_words
        self.remove_hashtag = remove_hashtag
        self.matcher = Matcher(nlp.vocab)
        self.matcher.add('HASHTAG', None, [{'ORTH': '#'}, {'IS_ASCII': True}])
        self.remove_user_handle = remove_user_handle
        self.user_handle_pattern = re.compile("(?<=^|(?<=[^a-zA-Z0-9-_\.]))(@[A-Za-z]+[A-Za-z0-9-_]+)", re.UNICODE)
        
    def spacy_tokenizer(self, obj):
        doc = nlp(obj)

        # Looks for hashtags
        matches = self.matcher(doc)
        spans = []
        for match_id, start, end in matches:
            spans.append(doc[start:end])

        for span in spans:
            span.merge()

            
        # Create a list of user handles
        user_handles = re.findall(self.user_handle_pattern, doc.text)      
        
        # Convert spacy tokens to a list of string tokens
        token_list = [t.text.lower() for t in doc if not t.is_punct | t.is_space]
        
        if self.remove_stop_words:
            token_list = [t for t in token_list if t not in stop_words]
        
        if self.remove_user_handle:
            token_list = [t for t in token_list if t not in user_handles]
        
        if self.remove_hashtag:
            token_list = [t.replace("#", "") for t in token_list]
            
        return token_list

    # Return self nothing else to do here
    def fit(self, X, y = None):
        return self
    
    
    def transform(self, raw_documents):
        if self.vec_method == 'tfidf':
            X = self.tfidf_model.transform(raw_documents['text'])
        elif self.vec_method == 'bow':
            X = self.bow_model.transform(raw_documents['text'])
            
        # If PCA
        if self.pca:
            # PCA requires a dense matrix
            # Tf-idf returns a sparse one
            X = X.todense()
            
            # Run standard scaler for PCA
            X = preprocessing.StandardScaler().fit_transform(X)
            
            # Run PCA and return reduced array
            X = self.pca_model.transform(X)

            return X

        # If truncated SVD
        if self.trunc_svd:
#             # PCA requires a dense matrix
#             # Tf-idf returns a sparse one
#             X = X.todense()
            
            # Run standard scaler for truncated svd - unsure if needed
            X = preprocessing.StandardScaler().fit_transform(X)
            
            # Transform using trained model and return reduced array
            X = self.trunc_svd_model.transform(X)

            return X
        
        return X.toarray() # Changes the scipy sparse array to a numpy matrix

    
    def fit_transform(self, raw_documents, y=None):
        if self.vec_method == 'tfidf':
            X = self.tfidf_model.fit_transform(raw_documents['text'], y=y)
        elif self.vec_method == 'bow':
            X = self.bow_model.fit_transform(raw_documents['text'])
            
        # TODO: try sklearn.truncatedSVD which can work on scipy sparse data; same as LSA
        # If PCA
        if self.pca:
            self.pca_model = PCA(n_components=self.target_dim)
            
            # PCA requires a dense matrix
            # Tf-idf returns a sparse one
            X = X.todense()
            
            # Run standard scaler for PCA
            X = preprocessing.StandardScaler().fit_transform(X)
            
            # Run PCA and return reduced array
            X = self.pca_model.fit_transform(X)

            return X
        
        
        if self.trunc_svd:
            self.trunc_svd_model = TruncatedSVD(n_components=self.target_dim) # Recommended 100
            
#             # TruncatedSVD may work w/o a dense matrix
#             # Tf-idf returns a sparse one
#             X = X.todense()
            
            # Run standard scaler - unsure if needed for svd
            X = preprocessing.StandardScaler().fit_transform(X)
            
            # Run PCA and return reduced array
            X = self.trunc_svd_model.fit_transform(X)

            return X
        
        return X.toarray()

In [143]:
custom_stopwords = [
    r"'quantit\x89Û_https://t.co/64cyMG1lTG",
    r"'quantitÛ_https://t.co/64cyMG1lTG",
    r"'quantitû_https://t.co/64cymg1ltg",
    r"quantitÛ_https://t.co/64cyMG1lTG",
    r"quantitû_https://t.co/64cymg1ltg",
    r"\\r\\n",
    r"\r\n",
    r"indiahttp://www.informationng.com/?p=309943",
]

In [144]:
# Modified from https://github.com/dipanjanS/practical-machine-learning-with-python/blob/master/bonus%20content/nlp%20proven%20approach/contractions.py
CONTRACTION_MAP = {
"ain't": "is not",
"aren't": "are not",
"can't": "cannot",
"can't've": "cannot have",
"'cause": "because",
"could've": "could have",
"couldn't": "could not",
"couldn't've": "could not have",
"didn't": "did not",
"doesn't": "does not",
"don't": "do not",
"hadn't": "had not",
"hadn't've": "had not have",
"hasn't": "has not",
"haven't": "have not",
"he'd": "he would",
"he'd've": "he would have",
"he'll": "he will",
"he'll've": "he he will have",
"he's": "he is",
"how'd": "how did",
"how'd'y": "how do you",
"how'll": "how will",
"how's": "how is",
"I'd": "I would",
"I'd've": "I would have",
"I'll": "I will",
"I'll've": "I will have",
"I'm": "I am",
"I've": "I have",
"i'd": "i would",
"i'd've": "i would have",
"i'll": "i will",
"i'll've": "i will have",
"i'm": "i am",
"i've": "i have",
"isn't": "is not",
"it'd": "it would",
"it'd've": "it would have",
"it'll": "it will",
"it'll've": "it will have",
"it's": "it is",
"let's": "let us",
"ma'am": "madam",
"mayn't": "may not",
"might've": "might have",
"mightn't": "might not",
"mightn't've": "might not have",
"must've": "must have",
"mustn't": "must not",
"mustn't've": "must not have",
"needn't": "need not",
"needn't've": "need not have",
"o'clock": "of the clock",
"oughtn't": "ought not",
"oughtn't've": "ought not have",
"shan't": "shall not",
"sha'n't": "shall not",
"shan't've": "shall not have",
"she'd": "she would",
"she'd've": "she would have",
"she'll": "she will",
"she'll've": "she will have",
"she's": "she is",
"should've": "should have",
"shouldn't": "should not",
"shouldn't've": "should not have",
"so've": "so have",
"so's": "so as",
"that'd": "that would",
"that'd've": "that would have",
"that's": "that is",
"there'd": "there would",
"there'd've": "there would have",
"there's": "there is",
"they'd": "they would",
"they'd've": "they would have",
"they'll": "they will",
"they'll've": "they will have",
"they're": "they are",
"they've": "they have",
"to've": "to have",
"wasn't": "was not",
"we'd": "we would",
"we'd've": "we would have",
"we'll": "we will",
"we'll've": "we will have",
"we're": "we are",
"we've": "we have",
"weren't": "were not",
"what'll": "what will",
"what'll've": "what will have",
"what're": "what are",
"what's": "what is",
"what've": "what have",
"when's": "when is",
"when've": "when have",
"where'd": "where did",
"where's": "where is",
"where've": "where have",
"who'll": "who will",
"who'll've": "who will have",
"who's": "who is",
"who've": "who have",
"why's": "why is",
"why've": "why have",
"will've": "will have",
"won't": "will not",
"won't've": "will not have",
"would've": "would have",
"wouldn't": "would not",
"wouldn't've": "would not have",
"y'all": "you all",
"y'all'd": "you all would",
"y'all'd've": "you all would have",
"y'all're": "you all are",
"y'all've": "you all have",
"you'd": "you would",
"you'd've": "you would have",
"you'll": "you will",
"you'll've": "you will have",
"you're": "you are",
"you've": "you have",
"can 't": "cannot",
"b/c" : "because",
"cuz": "because",
"kinda": "kind of",
"hes": "he is",
"shes" : "she is",
"oh my god": "omg",
"omfg": "omg",
"didnt": "did not",
"iûªm": "i am",
"youûªve": "you have",
"ûª": "'",
}

In [115]:
line_ending_pattern = re.compile("(\r\n|\r|\n)", re.UNICODE)
line_end_str = train_text_preprocessed[6272]

test_doc = nlp(line_end_str)
for token in test_doc:
    print(token)
    if len(re.findall(line_ending_pattern, token.text)) > 0:
        print(['LINEENDING'])
    else:
        print(token.text)


# handle_test = "@asdfs @asdfklj asdfl;kjsl;fksdl;fj"
# no_handle = "#asdf asdflkfdljk sdl;s. asdfpike!"
# # print(len(re.findall(user_handle_pattern, handle_test)))
# handle_test2 = X_train.loc[180, 'text']
# print(handle_test2)
# for test_str in [handle_test2]:
#     test_doc = nlp(test_str)
#     for token in test_doc:
#         print(token)
#         if len(re.findall(user_handle_pattern, token.text)) > 0:
#             print(['HANDLE'])
#         else:
#             print(token.text)

okay
okay
welcome
welcome
rain
rain

 
['LINEENDING']
Gave
Gave
storm
storm
weather
weather


In [145]:
class TextPreprocessor(BaseEstimator, TransformerMixin):
    """
    Takes in a pandas series of strings, and returns the same.
    """
    def __init__(self, expand_contractions=True, strip_url=True, strip_emails=True, strip_stopwords=True, strip_punct_flag=True, tag_numbers=True, lemmatize=True):
        self.expand_contractions_flag = expand_contractions
        self.strip_url_flag = strip_url
        self.strip_emails_flag = strip_emails
        self.strip_stopwords_flag = strip_stopwords
        self.strip_punct_flag = strip_punct_flag
        self.tag_numbers = tag_numbers
        self.lemmatize = lemmatize
        self.hashtag_pattern = re.compile("(?:^|\s)[＃#]{1}(\w+)", re.UNICODE)
        self.user_handle_pattern = re.compile("(?<=^|(?<=[^a-zA-Z0-9-_\.]))@[A-Za-z0-9_]+[A-Za-z0-9-_]+", re.UNICODE)
        self.line_break_pattern = re.compile("(\r\n|\r|\n)", re.UNICODE)
        
        
        
    def expand_contractions(self, text, contraction_mapping=CONTRACTION_MAP):
        contractions_pattern = re.compile('({})'.format('|'.join(contraction_mapping.keys())), 
                                          flags=re.IGNORECASE|re.DOTALL)
        def expand_match(contraction):
            match = contraction.group(0)
            first_char = match[0]
            expanded_contraction = contraction_mapping.get(match)\
                                    if contraction_mapping.get(match)\
                                    else contraction_mapping.get(match.lower())                       
            expanded_contraction = first_char+expanded_contraction[1:]
            return expanded_contraction

        expanded_text = contractions_pattern.sub(expand_match, text)
        expanded_text = re.sub("'", "", expanded_text)
        return expanded_text
    
#     def spacy_lemmatizer(self, token):
#         return token.lemma_
    
    def spacy_token_preprocessing(self, text):
        doc = nlp(text)
        cleaned_tokens = []
        for token in doc:
            if token.like_url & self.strip_url_flag:
                cleaned_tokens.append("[LINK]")
            elif token.like_email & self.strip_emails_flag:
                cleaned_tokens.append("[EMAIL]")
            elif len(re.findall(self.user_handle_pattern, token.text)) > 0:
                cleaned_tokens.append("[HANDLE]")
            elif token.is_punct & self.strip_punct_flag:
                continue
            elif token.like_num & self.tag_numbers:
                cleaned_tokens.append("[NUM]")
            elif len(re.findall(self.line_break_pattern, token.text)) > 0:
                continue
            elif self.strip_stopwords_flag:
                if (token.text.lower() in custom_stopwords) | token.is_stop:
                    continue
                elif self.lemmatize:
                    cleaned_tokens.append(token.lemma_)
                else:
                    cleaned_tokens.append(token.text)
            elif self.lemmatize:
                cleaned_tokens.append(token.lemma_)
            else:
                cleaned_tokens.append(token.text)
                
        return " ".join(cleaned_tokens)
    
    
    # Return self nothing else to do here
    def fit(self, X, y = None):
        return self
    
    
    def transform(self, raw_documents):
        X = raw_documents.copy()
        
        if self.expand_contractions_flag:
            # Expand contractions in text
            X = X.apply(self.expand_contractions)
        
        # All the remaining preprocessing
        X = X.apply(self.spacy_token_preprocessing)
            
        return X

    
    def fit_transform(self, raw_documents, y=None):
        X = raw_documents.copy()
        
        if self.expand_contractions_flag:
            # Expand contractions in text
            X = X.apply(self.expand_contractions)
            
        # All the remaining preprocessing
        X = X.apply(self.spacy_token_preprocessing)
        
        return X

In [146]:
fs = FeatureSelector(['text'])
fs_transformed = fs.fit_transform(X_train)
print(type(fs_transformed))
text_processor_pipe = TextPreprocessor()
transformed = text_processor_pipe.fit_transform(fs_transformed)

<class 'pandas.core.frame.DataFrame'>


TypeError: expected string or buffer

In [120]:
# X_train.loc[15:20, 'text']
X_train.loc[180:190, 'text']

180    @20skyhawkmm20 @traplord_29 @FREDOSANTANA300 @...
181    If I get run over by an ambulance am I lucky? ...
182    #news Twelve feared killed in Pakistani air am...
183    http://t.co/7xGLah10zL Twelve feared killed in...
184                   @TanSlash waiting for an ambulance
185    @fouseyTUBE you ok? Need a ambulance. Hahahah ...
186    AMBULANCE SPRINTER AUTOMATIC FRONTLINE VEHICLE...
187    Pakistan air ambulance helicopter crash kills ...
188    @TheNissonian @RejectdCartoons nissan are you ...
189    EMS1: NY EMTs petition for $17 per hour Û÷min...
190    http://t.co/FCqmKFfflW Twelve feared killed in...
Name: text, dtype: object

In [121]:
text_processor = TextPreprocessor()

In [122]:
text_processor.fit_transform(X_train.loc[180:190, 'text'])

180    [HANDLE] [HANDLE] [HANDLE] [HANDLE] hella craz...
181           run ambulance lucky justsaye randomthought
182    news [NUM] fear kill pakistani air ambulance h...
183    [LINK] [NUM] fear kill pakistani air ambulance...
184                              [HANDLE] wait ambulance
185       [HANDLE] ok need ambulance hahahah good [LINK]
186    AMBULANCE SPRINTER AUTOMATIC FRONTLINE vehicle...
187    pakistan air ambulance helicopter crash kill [...
188    [HANDLE] [HANDLE] nissan ok need medical assis...
189    ems1 NY emt petition $ [NUM] hour û÷minimum w...
190    [LINK] [NUM] fear kill pakistani air ambulance...
Name: text, dtype: object

In [123]:
%time train_text_preprocessed = text_processor.fit_transform(X_train['text'])

Wall time: 53.7 s


In [124]:
pd.DataFrame(train_text_preprocessed)

Unnamed: 0,text
0,deed reason earthquake allah forgive
1,forest fire near La Ronge Sask Canada
2,resident ask shelter place notify officer evac...
3,[NUM] people receive wildfire evacuation order...
4,get send photo Ruby Alaska smoke wildfire pour...
...,...
7608,[NUM] giant crane hold bridge collapse nearby ...
7609,[HANDLE] [HANDLE] control wild fire California...
7610,M1.94 01:04 UTC]?5 km S Volcano Hawaii [LINK]
7611,Police investigate e bike collide car Little P...


In [126]:
test_token_counts = get_token_counts(train_text_preprocessed)
test_token_counts

Unnamed: 0,index,0
118,[link],4723
114,[handle],2667
21,[num],2337
158,,568
111,like,394
...,...,...
6720,ft.åêm.o.p.,1
6719,detonate&amp;shot,1
6718,succeed,1
6717,fiya,1


In [108]:
test_token_counts = get_token_counts(train_text_preprocessed)
test_token_counts.to_csv("test_token_counts.csv", index=False)

In [113]:
train_text_preprocessed[6272]

'okay welcome rain \r\n Gave storm weather'

In [31]:
# Categorical text features
cat_text_features = ['text']

# Text features for text pipeline
text_features = ['text']

# Categorical features for text pipeline
cat_features = ['keyword']

# Define categorical pipeline
cat_text_pipeline = Pipeline(
    steps = [('cat_text_selector', FeatureSelector(cat_text_features)),
             ('cat_text_transformer', CategoricalTextTransformer()),
            ],
    verbose = True
)

# Define the text training pipeline
text_pipeline = Pipeline(
    steps = [('text_selector', FeatureSelector(text_features)),
#              ('text_transformer', TextTokenizerTransformer()),
             ('text_tfidf', DenseTfidfVectorizer())
            ],
    verbose = True
)

# Define the bow text training pipeline
text_bow_pipeline = Pipeline(
    steps = [('text_selector', FeatureSelector(text_features)),
             ('text_bow', TextVectorizer(vec_method='bow'))
            ],
    verbose = True
)

# Define the keyword categorical training pipeline
cat_raw_keyword_pipeline = Pipeline(
    steps = [('cat_selector', FeatureSelector(cat_features)),
             ('cat_transformer', CategoricalRawKeywordTransformer())
            ],
    verbose = True
)

# Define the lemmatized keyword categorical pipeline
cat_lemma_keyword_pipeline = Pipeline(
    steps = [('cat_selector', FeatureSelector(cat_features)),
             ('cat_transformer', CategoricalLemmatizedKeywordTransformer())
            ],
    verbose = True
)

# Define the text training pipeline
text_pca_50_pipeline = Pipeline(
    steps = [('text_selector', FeatureSelector(text_features)),
             ('text_tfidf', DenseTfidfVectorizer(pca=True, target_dim = 50))
            ],
    verbose = True
)

# Define the bow text training with pca pipeline
text_bow_pca_50_pipeline = Pipeline(
    steps = [('text_selector', FeatureSelector(text_features)),
             ('text_bow', TextVectorizer(vec_method='bow', pca=True, target_dim=50))
            ],
    verbose = True
)

# Define the text training pipeline
test_text_pipeline = Pipeline(
    steps = [('text_selector', FeatureSelector(text_features)),
             ('text_test_pipe', TextPreprocessor()),
             ('text_tfidf', DenseTfidfVectorizer())
            ],
    verbose = True
)

# Create the full pipeline

In [32]:
# #Combining numerical and categorical piepline into one full big pipeline horizontally 
# #using FeatureUnion
# full_pipeline = FeatureUnion( transformer_list = [ ( 'categorical_pipeline', categorical_pipeline ), 
                                                  
#                                                   ( 'numerical_pipeline', numerical_pipeline ) ] )

# Combine all our pipelines into a single one inside the FeatureUnion object
# Right now we only have one pipeline which is our text one
full_raw_keyword_pipeline = FeatureUnion(
    transformer_list=[
        ('cat_raw_keyword_pipeline', cat_raw_keyword_pipeline),
        ('text_pipeline', text_pipeline),
        ('cat_text_pipeline', cat_text_pipeline),
                     ]
)

full_lemma_keyword_pipeline = FeatureUnion(
    transformer_list=[
        ('cat_raw_keyword_pipeline', cat_lemma_keyword_pipeline),
        ('text_pipeline', text_pipeline),
        ('cat_text_pipeline', cat_text_pipeline),
                     ]
)

full_raw_keyword_pca_50_pipeline = FeatureUnion(
    transformer_list=[
        ('cat_raw_keyword_pipeline', cat_raw_keyword_pipeline),
        ('text_pipeline', text_pca_50_pipeline),
        ('cat_text_pipeline', cat_text_pipeline),
                     ]
)

full_lemma_keyword_pca_50_pipeline = FeatureUnion(
    transformer_list=[
        ('cat_raw_keyword_pipeline', cat_lemma_keyword_pipeline),
        ('text_pipeline', text_pca_50_pipeline),
        ('cat_text_pipeline', cat_text_pipeline),
                     ]
)

full_raw_keyword_bow_pipeline = FeatureUnion(
    transformer_list=[
        ('cat_raw_keyword_pipeline', cat_raw_keyword_pipeline),
        ('cat_text_pipeline', cat_text_pipeline),
        ('text_pipeline', text_bow_pipeline),
                     ]
)

full_lemma_keyword_bow_pipeline = FeatureUnion(
    transformer_list=[
        ('cat_raw_keyword_pipeline', cat_lemma_keyword_pipeline),
        ('cat_text_pipeline', cat_text_pipeline),
        ('text_pipeline', text_bow_pipeline),
                     ]
)

full_raw_keyword_bow_pca_50_pipeline = FeatureUnion(
    transformer_list=[
        ('cat_raw_keyword_pipeline', cat_raw_keyword_pipeline),
        ('cat_text_pipeline', cat_text_pipeline),
        ('text_pipeline', text_bow_pca_50_pipeline),
                     ]
)

test_text_pipeline = FeatureUnion(
    transformer_list=[
        ('cat_raw_keyword_pipeline', cat_raw_keyword_pipeline),
        ('cat_text_pipeline', cat_text_pipeline),
        ('text_pipeline', test_text_pipeline),
                     ]
)

# Test pipeline

In [29]:
train_df = pd.read_csv("data/train.csv")

# Can't run the full pipeline with the training and the model
- The data transformation part of the pipeline that does TFIDF will return different number of features based on the data fed in

# Solution: Separate the feature pipeline with the model pipeline

In [30]:
X_train = train_df.copy()
y_train = X_train.pop('target').values

test_df = pd.read_csv('data/test.csv')

In [35]:
def transform_and_save_data(pipeline, description, save_dir="./", keep_y = True, sparse_output = True):
    # Keep target with features for h2o models
    if keep_y:
        train_processed = pipeline.fit_transform(X_train)
        combined_train = np.concatenate([train_processed, y_train.reshape(-1, 1)], axis=1)
    elif not keep_y:
        train_processed = pipeline.fit_transform(X_train)
        np.save(save_dir + description + '_y_train', y_train)
    
    test_processed = pipeline.transform(test_df)
    
    # Check that the dimensions are correct
    if keep_y:
        assert(combined_train.shape[1] == test_processed.shape[1] + 1), "Shapes incorrect"
    else:
        assert(combined_train.shape[1] == test_processed.shape[1]), "Shapes incorrect"
    
    if sparse_output:
        sparse.save_npz(save_dir + description + '_train_sparse', sparse.csr_matrix(combined_train))
        sparse.save_npz(save_dir + description + '_test_sparse', sparse.csr_matrix(test_processed))
    elif not sparse_output:
        np.save(save_dir + description + '_train_ndarray', combined_train)
        np.save(save_dir + description + '_test_ndarray', test_processed)
        
    print("Done!")

In [36]:
%%time
transform_and_save_data(test_text_pipeline, "test_text_pipeline", keep_y=True, sparse_output=True)


[Pipeline] ...... (step 1 of 2) Processing cat_selector, total=   0.0s
[Pipeline] ... (step 2 of 2) Processing cat_transformer, total=   0.0s
[Pipeline] . (step 1 of 2) Processing cat_text_selector, total=   0.0s
[Pipeline]  (step 2 of 2) Processing cat_text_transformer, total=   0.2s
[Pipeline] ..... (step 1 of 3) Processing text_selector, total=   0.0s
[Pipeline] .... (step 2 of 3) Processing text_test_pipe, total=   0.0s
[Pipeline] ........ (step 3 of 3) Processing text_tfidf, total= 6.9min
Done!
Wall time: 9min


In [22]:
%%time
transform_and_save_data(full_raw_keyword_bow_pipeline, "full_raw_keyword_bow_pipeline_tt", keep_y=True, sparse_output=False)
# transform_and_save_data(full_lemma_keyword_pca_50_pipeline, "full_lemma_keyword_pca_50_pipeline_ft", keep_y=False, sparse_output=True)
# transform_and_save_data(full_lemma_keyword_pca_50_pipeline, "full_lemma_keyword_pca_50_pipeline_tf", keep_y=True, sparse_output=False)
# transform_and_save_data(full_lemma_keyword_pca_50_pipeline, "full_lemma_keyword_pca_50_pipeline_ff", keep_y=False, sparse_output=False)

[Pipeline] ...... (step 1 of 2) Processing cat_selector, total=   0.0s
[Pipeline] ... (step 2 of 2) Processing cat_transformer, total=   0.0s
[Pipeline] . (step 1 of 2) Processing cat_text_selector, total=   0.0s
[Pipeline]  (step 2 of 2) Processing cat_text_transformer, total=   0.2s
[Pipeline] ..... (step 1 of 2) Processing text_selector, total=   0.0s
[Pipeline] .......... (step 2 of 2) Processing text_bow, total= 3.3min
Done!
CPU times: user 4min 41s, sys: 3.18 s, total: 4min 44s
Wall time: 4min 46s


In [39]:
%%time
X_train_processed = full_raw_keyword_pipeline.fit_transform(X_train)

[Pipeline] ...... (step 1 of 2) Processing cat_selector, total=   0.0s
[Pipeline] ... (step 2 of 2) Processing cat_transformer, total=   0.0s
[Pipeline] ..... (step 1 of 2) Processing text_selector, total=   0.0s
[Pipeline] ........ (step 2 of 2) Processing text_tfidf, total= 3.0min
[Pipeline] . (step 1 of 2) Processing cat_text_selector, total=   0.0s
[Pipeline]  (step 2 of 2) Processing cat_text_transformer, total=   0.1s
CPU times: user 2min 57s, sys: 1.64 s, total: 2min 59s
Wall time: 2min 59s


In [46]:
X_train_processed.shape

NameError: name 'X_train_processed' is not defined

In [26]:
%%time
test_processed = full_raw_keyword_pipeline.transform(test_df)

CPU times: user 1min 20s, sys: 972 ms, total: 1min 21s
Wall time: 1min 21s


In [30]:
# Save training and test numpy arrays
sparse.save_npz('raw_keyword_categorical_X_train_20k_feat', sparse.csr_matrix(X_train_processed))
np.save('raw_keyword_categorical_y_train', y_train)
sparse.save_npz('raw_keyword_categorical_test_processed_20k_feat', sparse.csr_matrix(test_processed))

# Create PCA'd (50 dims) training and test sets for the lemma and raw pipelines

In [None]:
%%time
for pipeline, name in zip([full_raw_keyword_pca_50_pipeline, full_lemma_keyword_pca_50_pipeline], ['full_raw_keyword_pca_50_pipeline', 'full_lemma_keyword_pca_50_pipeline']):
    X_train_processed = pipeline.fit_transform(X_train)
    test_processed = pipeline.transform(test_df)
    
    np.save(name + '_X_train', X_train_processed)
    np.save(name + '_test_processed', test_processed)

In [None]:
%%time
# Process text and categorical features
X_train_processed = full_lemma_keyword_pipeline.fit_transform(X_train)

In [None]:
%%time
# Preprocess test data
test_processed = full_lemma_keyword_pipeline.transform(test_df)

In [None]:
# Save training and test numpy arrays
np.save('raw_keyword_categorical_X_train', X_train_processed)
np.save('raw_keyword_categorical_y_train', y_train)
np.save('raw_keyword_categorical_test_processed', test_processed)

In [None]:
from scipy import sparse
np.save('lemma_keyword_categorical_X_train_csr', sparse.csr_matrix(X_train_processed))
np.save('lemma_keyword_categorical_y_train', y_train)
# np.save('raw_keyword_categorical_y_train_csr', sparse.csr_matrix(y_train)) # Don't save as a sparse matrix, else you will need to reshape it later for training
np.save('lemma_keyword_categorical_test_processed_csr', sparse.csr_matrix(test_processed))

# Saving processed data
- Save the output of the transform pipelines to save memory
- Can save it raw (very large)
- Or save as a sparse matrix
- Do not save the target labels as a sparse as you'll have to reshape it from (1, n) to (n, ) later, and the space savings is probably very small
- Note that there were some problems reading the sparse matrix into some sklearn models at training
    - Will need to look into this problem more

In [None]:
%%time
lrcv =  LogisticRegressionCV(cv=10, 
                             max_iter = 4000, # Try 4000...
                             random_state=42, 
                             n_jobs=-1,
                             scoring = 'f1',
                            )
lrcv.fit(X_train_processed, y_train)

In [None]:
# #The full pipeline as a step in another pipeline with an estimator as the final step
# full_pipeline_m = Pipeline(steps = [
#     ('full_pipeline', full_pipeline),
#     ('model', LogisticRegressionCV(cv=5, random_state=42, n_jobs=-1)) 
# ])

# #Can call fit on it just like any other pipeline
# full_pipeline_m.fit(X_train, y_train)

In [None]:
X_test = train_df.copy().sample(1000, random_state=42)
y_test = X_test.pop('target').values

In [None]:
# Preprocess test data
X_test_processed = full_pipeline.transform(X_test)

In [None]:
X_test_processed.shape

In [None]:
X_train_processed.shape

In [None]:
%%time
# Predict
predicted = lrcv.predict(X_test_processed) 

In [None]:
# full_pipeline_m.get_params()

In [None]:
# # %%time
# # # Predicting with a test dataset
# predicted = pipe.predict(X_test)

# # Model Accuracy
print("Logistic Regression Accuracy:",metrics.accuracy_score(y_test, predicted))
print("Logistic Regression Precision:",metrics.precision_score(y_test, predicted))
print("Logistic Regression Recall:",metrics.recall_score(y_test, predicted))

# LR with tfidf, upper, lower text
Logistic Regression Accuracy: 0.851  
Logistic Regression Precision: 0.9287925696594427  
Logistic Regression Recall: 0.704225352112676  



# Same as above but using the keyword column as a categorical feature

In [None]:
%%time
# # # Predicting with a test dataset
# predicted = pipe.predict(X_test)

# # Model Accuracy
print("Logistic Regression Accuracy:",metrics.accuracy_score(y_test, predicted))
print("Logistic Regression Precision:",metrics.precision_score(y_test, predicted))
print("Logistic Regression Recall:",metrics.recall_score(y_test, predicted))

# Bumped up the LRCV iterations to 4000 due to non-convergence at iterations=100

In [None]:
%%time
# # # Predicting with a test dataset
# predicted = pipe.predict(X_test)

# # Model Accuracy
print("Logistic Regression Accuracy:",metrics.accuracy_score(y_test, predicted))
print("Logistic Regression Precision:",metrics.precision_score(y_test, predicted))
print("Logistic Regression Recall:",metrics.recall_score(y_test, predicted))

In [None]:
%%time
# # # Predicting with a test dataset
# predicted = pipe.predict(X_test)

# # Model Accuracy
print("Logistic Regression Accuracy:",metrics.accuracy_score(y_test, predicted))
print("Logistic Regression Precision:",metrics.precision_score(y_test, predicted))
print("Logistic Regression Recall:",metrics.recall_score(y_test, predicted))

In [None]:
metrics.f1_score(y_test,predicted)

# Upped cv to 10 using additional feature of num_hashtags in text
# Model training takes under 40 mins

In [None]:
%%time
# # # Predicting with a test dataset
# predicted = pipe.predict(X_test)

# # Model Accuracy
print("Logistic Regression Accuracy:",metrics.accuracy_score(y_test, predicted))
print("Logistic Regression Precision:",metrics.precision_score(y_test, predicted))
print("Logistic Regression Recall:",metrics.recall_score(y_test, predicted))

In [None]:
metrics.f1_score(y_test,predicted)

In [None]:
for key, vals in lrcv.scores_.items():
    for idx, val in enumerate(vals):
        print(idx)
        print(val)
        print()

In [None]:
lrcv.classes_

In [None]:
lrcv.scores_[1].mean(axis=0).max()

In [None]:
lrcv.scores_[1]

In [None]:
lrcv.scores_[1].mean(axis=0)

In [None]:
lrcv.scores_[1][0].mean()

In [None]:
print ('Max auc_roc:', searchCV.scores_[1].mean(axis=0).max())

In [None]:
test_df = pd.read_csv('data/test.csv')
test_df.head()

In [None]:
%%time
# Preprocess test data
test_processed = full_pipeline.transform(test_df)

In [None]:
test_predictions = lrcv.predict(test_processed)

In [None]:
test_predictions

# Merge predictions with correct ids

In [20]:
test_id = test_df['id']

In [22]:
new_predictions = pd.read_csv("h2o_predictions.csv")

In [24]:
test_predictions_df = pd.DataFrame([test_id, new_predictions['predict']]).T
test_predictions_df.columns = ['id', 'target']

(3263, 2)

In [None]:
test_predictions_df = pd.DataFrame([test_id, test_predictions]).T
test_predictions_df.columns = ['id', 'target']

In [None]:
test_predictions_df.head()

In [27]:
test_predictions_df.to_csv('test_preds_glm.csv', index=False)