# Sklearn FeatureUnion
- Use custom transformers for feature engineering
- Then merge the features horizontally for feeding into an ML classifier

## FeatureUnion & Pipelines with Pandas
https://towardsdatascience.com/custom-transformers-and-ml-data-pipelines-with-python-20ea2a7adb65

In [4]:
# import numpy as np
# import pandas as pd
# import spacy
# from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.pipeline import Pipeline
# from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
# from sklearn.model_selection import train_test_split
# from sklearn import metrics

import numpy as np 
import pandas as pd
import spacy
from sklearn.base import BaseEstimator, TransformerMixin



In [5]:
spacy.prefer_gpu()

True

In [10]:
from spacy.tokenizer import Tokenizer
from spacy.lang.en import English
from spacy.matcher import Matcher
stop_words = spacy.lang.en.stop_words.STOP_WORDS

In [11]:
# !python -m spacy download en_core_web_sm

nlp = spacy.load("en_core_web_sm")

In [12]:
# Load English tokenizer, tagger, parser, NER and word vectors
parser = English()

# Feature selector transformer
- Feed it the columns you want, and it returns a dataframe with just those features

In [14]:
# Custom Transformer that extracts columns passed as argument to its constructor 
class FeatureSelector( BaseEstimator, TransformerMixin ):
    #Class Constructor 
    def __init__(self, feature_names):
        self.feature_names = feature_names 
    
    #Return self nothing else to do here    
    def fit(self, X, y = None):
        return self 
    
    #Method that describes what we need this transformer to do
    def transform(self, X, y = None):
        return X[self.feature_names] 

# Text processing transformer
- Take in tweet text
- Create features
    - contains hashtag
    - isupper
    - islower
    - has mispellings

In [None]:
#Custom transformer that breaks dates column into year, month and day into separate columns and
#converts certain features to binary 
class CategoricalTransformer( BaseEstimator, TransformerMixin ):
    #Class constructor method that takes in a list of values as its argument
    def __init__(self, use_dates = ['year', 'month', 'day'] ):
        self._use_dates = use_dates
        
    #Return self nothing else to do here
    def fit( self, X, y = None  ):
        return self

    #Helper function to extract year from column 'dates' 
    def get_year( self, obj ):
        return str(obj)[:4]
    
    #Helper function to extract month from column 'dates'
    def get_month( self, obj ):
        return str(obj)[4:6]
    
    #Helper function to extract day from column 'dates'
    def get_day(self, obj):
        return str(obj)[6:8]
    
    #Helper function that converts values to Binary depending on input 
    def create_binary(self, obj):
        if obj == 0:
            return 'No'
        else:
            return 'Yes'
    
    #Transformer method we wrote for this transformer 
    def transform(self, X , y = None ):
       #Depending on constructor argument break dates column into specified units
       #using the helper functions written above 
       for spec in self._use_dates:
        
        exec( "X.loc[:,'{}'] = X['date'].apply(self.get_{})".format( spec, spec ) )
       #Drop unusable column 
       X = X.drop('date', axis = 1 )
       
       #Convert these columns to binary for one-hot-encoding later
       X.loc[:,'waterfront'] = X['waterfront'].apply( self.create_binary )
       
       X.loc[:,'view'] = X['view'].apply( self.create_binary )
       
       X.loc[:,'yr_renovated'] = X['yr_renovated'].apply( self.create_binary )
       #returns numpy array
       return X.values 

In [6]:
# Create a blank Tokenizer with just the English vocab
tokenizer = Tokenizer(nlp.vocab)

In [71]:
def spacy_tokenizer(string):
    doc = nlp(string)
#     return doc.text.split() # Disregards puncutation and contractions
#     return [token.orth_ for token in doc] # This should work better, but isn't perfect. This also splits hashtags hash symbol and the word tag
#     return [token.orth_ for token in doc if not token.is_punct | token.is_space]

    # Looks for hashtags
    matches = matcher(doc)
    spans = []
    for match_id, start, end in matches:
        spans.append(doc[start:end])


    for span in spans:
        span.merge()

#     print([t.text for t in doc])
#     return [t.text for t in doc if not t.is_punct | t.is_space]

#     return string.isupper(), string.islower(), [t.text for t in doc if not t.is_punct | t.is_space]
    return [t.text.lower() for t in doc if t not in stop_words and not t.is_punct | t.is_space]

In [91]:
def spacy_tokenizer_dataframe(df):
    """
    Works on a dataframe rather than directly on the strings
    Can't use this one for the sklearn pipeline
    """
    doc = nlp(df['text'])

    matches = matcher(doc)
    spans = []
    for match_id, start, end in matches:
        spans.append(doc[start:end])

    for span in spans:
        span.merge()

    df['upper'] = df['text'].isupper()
    df['lower'] = df['text'].islower()
    df['token_list'] = [t.text.lower() for t in doc if t not in stop_words and not t.is_punct | t.is_space]
    return df

In [48]:
# spacy_tokenizer("This is a   sentence!")

In [27]:
doc = nlp("Some text is going down the street")

In [None]:
doc

In [None]:
dir(doc)

In [None]:
# for el in doc.vocab:
#     print(el.text)

# for chunk in doc.noun_chunks:
#     print(chunk)

# for sent in doc.sents:
#     print(sent)

doc.text.split()

In [72]:
train_df = pd.read_csv("data/train.csv")

In [73]:
train_df.head(30)

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1
5,8,,,#RockyFire Update => California Hwy. 20 closed...,1
6,10,,,#flood #disaster Heavy rain causes flash flood...,1
7,13,,,I'm on top of the hill and I can see a fire in...,1
8,14,,,There's an emergency evacuation happening now ...,1
9,15,,,I'm afraid that the tornado is coming to our a...,1


In [63]:
train_df = train_df.head(30).apply(spacy_tokenizer, axis=1)

In [64]:
for el in train_df.head(30)['token_list']:
    print(el)

['our', 'deeds', 'are', 'the', 'reason', 'of', 'this', '#earthquake', 'may', 'allah', 'forgive', 'us', 'all']
['forest', 'fire', 'near', 'la', 'ronge', 'sask', 'canada']
['all', 'residents', 'asked', 'to', 'shelter', 'in', 'place', 'are', 'being', 'notified', 'by', 'officers', 'no', 'other', 'evacuation', 'or', 'shelter', 'in', 'place', 'orders', 'are', 'expected']
['13,000', 'people', 'receive', '#wildfires', 'evacuation', 'orders', 'in', 'california']
['just', 'got', 'sent', 'this', 'photo', 'from', 'ruby', '#alaska', 'as', 'smoke', 'from', '#wildfires', 'pours', 'into', 'a', 'school']
['#rockyfire', 'update', '=', '>', 'california', 'hwy', '20', 'closed', 'in', 'both', 'directions', 'due', 'to', 'lake', 'county', 'fire', '#cafire', '#wildfires']
['#flood', '#disaster', 'heavy', 'rain', 'causes', 'flash', 'flooding', 'of', 'streets', 'in', 'manitou', 'colorado', 'springs', 'areas']
['i', "'m", 'on', 'top', 'of', 'the', 'hill', 'and', 'i', 'can', 'see', 'a', 'fire', 'in', 'the', 'wood

In [12]:
matcher = Matcher(nlp.vocab)
matcher.add('HASHTAG', None, [{'ORTH': '#'}, {'IS_ASCII': True}])

doc = nlp("this is an #example of an awesome tweet with #manyhashtags #amazing")
matches = matcher(doc)
spans = []
for match_id, start, end in matches:
    spans.append(doc[start:end])
    
    
for span in spans:
    span.merge()

print([t.text for t in doc])

['this', 'is', 'an', '#example', 'of', 'an', 'awesome', 'tweet', 'with', '#manyhashtags', '#amazing']


In [13]:
spans

[#example, #manyhashtags, #amazing]

In [74]:
tfidf = TfidfVectorizer(tokenizer = spacy_tokenizer)
# x = v.fit_transform(df['sent'])

In [75]:
x = tfidf.fit_transform(train_df['text'].head(30))

In [83]:
x.todense()[0]

matrix([[0.        , 0.        , 0.        , 0.        , 0.29865656,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.26628554, 0.29865656, 0.        ,
         0.        , 0.        , 0.26628554, 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.29865656, 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0

In [99]:
# X_train, X_test, y_train, y_test = train_test_split(train_df['text'], train_df['target'], test_size=0.3, random_state=42)

# classifier = LogisticRegression()

# # Create pipeline using Bag of Words
# pipe = Pipeline([
#                 ('vectorizer', tfidf),
#                  ('classifier', classifier)])

In [101]:
# classifier = LogisticRegressionCV(cv=5, random_state=42, n_jobs=-1).fit(train_df['text'], train_df['target'])
classifier = LogisticRegressionCV(cv=5, random_state=42, n_jobs=-1)
# Create pipeline using Bag of Words
pipe = Pipeline([
                ('vectorizer', tfidf),
                 ('classifier', classifier)])

In [102]:
%%time
# model generation
pipe.fit(train_df['text'],train_df['target'])

CPU times: user 2min 41s, sys: 1.13 s, total: 2min 42s
Wall time: 2min 44s


Pipeline(memory=None,
         steps=[('vectorizer',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_patt...
                                 tokenizer=<function spacy_tokenizer at 0x7f000293e320>,
                                 use_idf=True, vocabulary=None)),
                ('classifier',
                 LogisticRegressionCV(Cs=10, class_weight=None, cv=5,
                                      dual=Fal

In [103]:
%%time
# Predicting with a test dataset
predicted = pipe.predict(X_test)

# Model Accuracy
print("Logistic Regression Accuracy:",metrics.accuracy_score(y_test, predicted))
print("Logistic Regression Precision:",metrics.precision_score(y_test, predicted))
print("Logistic Regression Recall:",metrics.recall_score(y_test, predicted))

Logistic Regression Accuracy: 0.8507005253940455
Logistic Regression Precision: 0.9194630872483222
Logistic Regression Recall: 0.7091097308488613
CPU times: user 47.5 s, sys: 359 ms, total: 47.8 s
Wall time: 47.5 s


# Wow, 85% cross-validation accuracy with almost no feature engineering