# Sklearn FeatureUnion
- Use custom transformers for feature engineering
- Then merge the features horizontally for feeding into an ML classifier

## FeatureUnion & Pipelines with Pandas
https://towardsdatascience.com/custom-transformers-and-ml-data-pipelines-with-python-20ea2a7adb65

In [1]:
# import numpy as np
# import pandas as pd
# import spacy
# from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.pipeline import Pipeline
# from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
# from sklearn.model_selection import train_test_split
# from sklearn import metrics

import numpy as np
import pandas as pd
import spacy
from sklearn import metrics
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import FeatureUnion, Pipeline


In [2]:
# Ask spaCy to run on the GPU when one is usable; the call reports whether
# that succeeded (the recorded output of this cell was True).
spacy.prefer_gpu()

True

In [3]:
from spacy.tokenizer import Tokenizer
from spacy.lang.en import English
from spacy.matcher import Matcher
# spaCy's built-in English stop-word collection, consulted by the tokenizer
# functions further down when filtering tokens.
stop_words = spacy.lang.en.stop_words.STOP_WORDS

In [4]:
# !python -m spacy download en_core_web_sm

# Loading the pretrained en_core_web_sm model is left disabled; the shared
# `nlp` object is built from the bare English language class instead.
# nlp = spacy.load("en_core_web_sm")
nlp = English()

In [5]:
# Second English() instance, constructed exactly like `nlp` above.
# NOTE(review): no pretrained model is loaded here, so presumably no tagger,
# parser, NER or word vectors are available on this object - confirm.
# `parser` is not referenced again in the visible code.
parser = English()

In [6]:
# Create a blank Tokenizer that shares the vocab of `nlp`.
# It is not referenced again in the visible code; the tokenizer functions
# below call `nlp` directly.
tokenizer = Tokenizer(nlp.vocab)

In [7]:
# Matcher for hashtags: a literal '#' token followed by one ASCII token.
# The patterns are passed as a list in the second argument, which is the form
# accepted by spaCy >= 2.2.2 and the only form spaCy 3 supports; the older
# add(key, on_match, *patterns) call fails on spaCy 3.
matcher = Matcher(nlp.vocab)
matcher.add('HASHTAG', [[{'ORTH': '#'}, {'IS_ASCII': True}]])

# Feature selector transformer
- Feed it the columns you want, and it returns a dataframe with just those features

In [8]:
# Custom Transformer that extracts columns passed as argument to its constructor 
class FeatureSelector(BaseEstimator, TransformerMixin):
    """Transformer that keeps only the columns named at construction time."""

    def __init__(self, feature_names):
        # Stored under the same name as the constructor argument
        self.feature_names = feature_names

    def fit(self, X, y=None):
        """Nothing to learn here, so simply hand back the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Index X with the stored feature names and return the result."""
        wanted = self.feature_names
        return X[wanted]

# Text processing transformer
- Take in tweet text
- Create features
    - contains hashtag
    - isupper
    - islower
    - has misspellings

# Create the text pipeline

In [9]:
# Custom transformer that takes in a string and returns new categorical features
class CategoricalTransformer(BaseEstimator, TransformerMixin):
    """Swap the 'text' column for binary case-indicator columns.

    transform() adds 'is_lower' and 'is_upper' to a copy of the input, removes
    'text', and returns whatever columns remain as a numpy array.
    """

    def __init__(self):
        """Take no configuration."""

    def fit(self, X, y=None):
        """Nothing is learned; return self."""
        return self

    def is_lower(self, obj):
        """Return 1 when obj.islower() is true, otherwise 0."""
        return 1 if obj.islower() else 0

    def is_upper(self, obj):
        """Return 1 when obj.isupper() is true, otherwise 0."""
        return 1 if obj.isupper() else 0

    def transform(self, X, y=None):
        """Build the indicator columns from X['text'] and return the values."""
        # Work on a copy so the caller's frame is never modified
        frame = X.copy()
        text = frame['text']

        # One 0/1 flag per row for all-lowercase and all-uppercase text
        frame['is_lower'] = text.apply(self.is_lower)
        frame['is_upper'] = text.apply(self.is_upper)

        # The raw text is dropped; only the remaining columns are returned
        return frame.drop('text', axis=1).values

In [43]:
# Custom transformer that takes in a string and returns some features
class TextTransformer(BaseEstimator, TransformerMixin):
    """Embed the 'text' column as a TF-IDF bag of words.

    The vectorizer is learned in fit() and reused in transform(), so data
    transformed later is projected onto the same vocabulary as the data the
    transformer was fitted on. Tokenization is done by spacy_tokenizer().
    """

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Learn the TF-IDF vocabulary and weights from X['text'].

        Parameters
        ----------
        X : object indexable by 'text', yielding an iterable of strings
        y : ignored

        Returns
        -------
        self
        """
        self.tfidf_ = TfidfVectorizer(tokenizer=self.spacy_tokenizer)
        self.tfidf_.fit(X['text'])
        return self

    def spacy_tokenizer(self, obj):
        """Tokenize one string into lowercase tokens.

        Hashtag matches found by the module-level `matcher` are merged into a
        single token; punctuation, whitespace and stop-word tokens are dropped.
        """
        doc = nlp(obj)

        # Collect the hashtag matches as spans
        hashtag_spans = [doc[start:end] for _, start, end in matcher(doc)]

        # Span.merge() no longer exists in current spaCy; merging goes through
        # the retokenizer. filter_spans() discards overlapping matches, which
        # the retokenizer refuses to merge.
        with doc.retokenize() as retokenizer:
            for span in spacy.util.filter_spans(hashtag_spans):
                retokenizer.merge(span)

        tokens = []
        for t in doc:
            if t.is_punct or t.is_space:
                continue
            lowered = t.text.lower()
            # Compare the token *text* with the stop-word strings; testing the
            # Token object itself against the set never filtered anything.
            if lowered not in stop_words:
                tokens.append(lowered)
        return tokens

    def transform(self, X, y=None):
        """Return the TF-IDF matrix for X['text'].

        If fit() has not been called yet, the vectorizer is fitted on X first
        so that callers that invoke transform() directly keep working.
        """
        if not hasattr(self, 'tfidf_'):
            self.fit(X)
        return self.tfidf_.transform(X['text'])

In [44]:
# # Categorical features to pass down the categorical pipeline 
# categorical_features = ['date', 'waterfront', 'view', 'yr_renovated']

# #Numerical features to pass down the numerical pipeline 
# numerical_features = ['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors',
#                 'condition', 'grade', 'sqft_basement', 'yr_built']

# Columns routed to the categorical pipeline
# (in this case it's still just 'text' until we include keywords and locations)
cat_features = ['text']

# Columns routed to the text pipeline
text_features = ['text']

# #Defining the steps in the categorical pipeline 
# categorical_pipeline = Pipeline( steps = [ ( 'cat_selector', FeatureSelector(categorical_features) ),
                                  
#                                   ( 'cat_transformer', CategoricalTransformer() ), 
                                  
#                                   ( 'one_hot_encoder', OneHotEncoder( sparse = False ) ) ] )
    
# #Defining the steps in the numerical pipeline     
# numerical_pipeline = Pipeline( steps = [ ( 'num_selector', FeatureSelector(numerical_features) ),
                                  
#                                   ( 'num_transformer', NumericalTransformer() ),
                                  
#                                   ('imputer', SimpleImputer(strategy = 'median') ),
                                  
#                                   ( 'std_scaler', StandardScaler() ) ] )

# Categorical branch: pick the categorical columns, then derive the
# case-indicator features from them
cat_pipeline = Pipeline(
    [
        ('cat_selector', FeatureSelector(cat_features)),
        ('cat_transformer', CategoricalTransformer()),
    ],
    verbose=True,
)

# Text branch: pick the text columns, then embed them with TextTransformer
text_pipeline = Pipeline(
    [
        ('text_selector', FeatureSelector(text_features)),
        ('text_transformer', TextTransformer()),
    ],
    verbose=True,
)


# Create the full pipeline

In [55]:
# #Combining numerical and categorical pipeline into one full big pipeline horizontally 
# #using FeatureUnion
# full_pipeline = FeatureUnion( transformer_list = [ ( 'categorical_pipeline', categorical_pipeline ), 
                                                  
#                                                   ( 'numerical_pipeline', numerical_pipeline ) ] )

# Join the text branch and the categorical branch inside one FeatureUnion,
# in that order
full_pipeline = FeatureUnion([
    ('text_pipeline', text_pipeline),
    ('cat_pipeline', cat_pipeline),
])

# Test pipeline

In [46]:
# Read the training set; the path is relative to the working directory
train_df = pd.read_csv("data/train.csv")

In [47]:
# Draw a reproducible 50-row sample to exercise the pipeline, then preview it
train_small_df = train_df.sample(50, random_state=42)
train_small_df.head()

Unnamed: 0,id,keyword,location,text,target
2644,3796,destruction,,So you have a new weapon that can cause un-ima...,1
2227,3185,deluge,,The f$&amp;@ing things I do for #GISHWHES Just...,0
5448,7769,police,UK,DT @georgegalloway: RT @Galloway4Mayor: ÛÏThe...,1
132,191,aftershock,,Aftershock back to school kick off was great. ...,0
6845,9810,trauma,"Montgomery County, MD",in response to trauma Children of Addicts deve...,0


In [48]:
# Detach the labels: pop() removes 'target' from train_small_df and returns it
train_small_y = train_small_df.pop('target')

In [49]:
# Preview the first few labels
train_small_y.head()

2644    1
2227    0
5448    1
132     0
6845    0
Name: target, dtype: int64

In [57]:
# The union has not been fitted at this point, so fit and transform in one
# step instead of calling transform() on an unfitted estimator.
x_50 = full_pipeline.fit_transform(train_small_df, train_small_y)

In [None]:
# # Create a blank Tokenizer with just the English vocab
# tokenizer = Tokenizer(nlp.vocab)

In [None]:
def spacy_tokenizer(string):
    """Tokenize one string into a list of lowercase tokens.

    Hashtag matches found by the module-level `matcher` are merged into a
    single token; punctuation, whitespace and stop-word tokens are dropped.
    """
    doc = nlp(string)

    # Collect the hashtag matches as spans
    hashtag_spans = [doc[start:end] for _, start, end in matcher(doc)]

    # Span.merge() no longer exists in current spaCy; merging goes through the
    # retokenizer. filter_spans() discards overlapping matches, which the
    # retokenizer refuses to merge.
    with doc.retokenize() as retokenizer:
        for span in spacy.util.filter_spans(hashtag_spans):
            retokenizer.merge(span)

    tokens = []
    for t in doc:
        if t.is_punct or t.is_space:
            continue
        lowered = t.text.lower()
        # Compare the token *text* with the stop-word strings; testing the
        # Token object itself against the set never filtered anything.
        if lowered not in stop_words:
            tokens.append(lowered)
    return tokens

In [None]:
def spacy_tokenizer_dataframe(df):
    """
    Add 'upper', 'lower' and 'token_list' entries derived from df['text'].

    Can't use this one for the sklearn pipeline.

    NOTE(review): df['text'] is handed straight to nlp() and has str methods
    called on it, so it must be a single string. Presumably this function is
    meant to be applied row by row (e.g. DataFrame.apply(..., axis=1)) rather
    than called on a whole dataframe - confirm against the caller.
    """
    text = df['text']
    doc = nlp(text)

    # Collect the hashtag matches as spans
    hashtag_spans = [doc[start:end] for _, start, end in matcher(doc)]

    # Span.merge() no longer exists in current spaCy; merging goes through the
    # retokenizer. filter_spans() discards overlapping matches, which the
    # retokenizer refuses to merge.
    with doc.retokenize() as retokenizer:
        for span in spacy.util.filter_spans(hashtag_spans):
            retokenizer.merge(span)

    tokens = []
    for t in doc:
        if t.is_punct or t.is_space:
            continue
        lowered = t.text.lower()
        # Compare the token *text* with the stop-word strings; testing the
        # Token object itself against the set never filtered anything.
        if lowered not in stop_words:
            tokens.append(lowered)

    df['upper'] = text.isupper()
    df['lower'] = text.islower()
    df['token_list'] = tokens
    return df

In [None]:
# No module-level `tfidf` is defined anywhere above, so build the vectorizer
# here, using the hashtag-aware tokenizer, before fitting on the first 30 texts.
tfidf = TfidfVectorizer(tokenizer=spacy_tokenizer)
x = tfidf.fit_transform(train_df['text'].head(30))

In [None]:
# Show the first row of the matrix in dense form
x.todense()[0]

In [None]:
# X_train, X_test, y_train, y_test = train_test_split(train_df['text'], train_df['target'], test_size=0.3, random_state=42)

# classifier = LogisticRegression()

# # Create pipeline using Bag of Words
# pipe = Pipeline([
#                 ('vectorizer', tfidf),
#                  ('classifier', classifier)])

In [None]:
# Cross-validated logistic regression (5 folds, all cores)
classifier = LogisticRegressionCV(cv=5, random_state=42, n_jobs=-1)

# Bag-of-words pipeline: TF-IDF over spacy_tokenizer tokens feeding the
# classifier. The vectorizer is created here so the cell does not depend on a
# `tfidf` variable left over from an earlier cell.
pipe = Pipeline([
    ('vectorizer', TfidfVectorizer(tokenizer=spacy_tokenizer)),
    ('classifier', classifier),
])

In [None]:
%%time
# model generation: fit the whole pipeline on every row of train_df
pipe.fit(train_df['text'],train_df['target'])

In [None]:
%%time
# Predicting with a test dataset
predicted = pipe.predict(X_test)

# Model Accuracy
print("Logistic Regression Accuracy:",metrics.accuracy_score(y_test, predicted))
print("Logistic Regression Precision:",metrics.precision_score(y_test, predicted))
print("Logistic Regression Recall:",metrics.recall_score(y_test, predicted))

# Scratch

In [None]:
# # Custom transformer that breaks dates column into year, month and day into separate columns and
# # converts certain features to binary 
# class CategoricalTransformer( BaseEstimator, TransformerMixin ):
#     # Class constructor method that takes in a list of values as its argument
#     def __init__(self, use_dates = ['year', 'month', 'day'] ):
#         self._use_dates = use_dates
        
#     # Return self nothing else to do here
#     def fit( self, X, y = None  ):
#         return self

#     # Helper function to extract year from column 'dates' 
#     def get_year( self, obj ):
#         return str(obj)[:4]
    
#     # Helper function to extract month from column 'dates'
#     def get_month( self, obj ):
#         return str(obj)[4:6]
    
#     # Helper function to extract day from column 'dates'
#     def get_day(self, obj):
#         return str(obj)[6:8]
    
#     # Helper function that converts values to Binary depending on input 
#     def create_binary(self, obj):
#         if obj == 0:
#             return 'No'
#         else:
#             return 'Yes'
    
#     # Transformer method we wrote for this transformer 
#     def transform(self, X , y = None ):
#         # Depending on constructor argument break dates column into specified units
#         #using the helper functions written above 
#         for spec in self._use_dates:

#         exec( "X.loc[:,'{}'] = X['date'].apply(self.get_{})".format( spec, spec ) )
#         # Drop unusable column 
#         X = X.drop('date', axis = 1 )

#         # Convert these columns to binary for one-hot-encoding later
#         X.loc[:,'waterfront'] = X['waterfront'].apply( self.create_binary )

#         X.loc[:,'view'] = X['view'].apply( self.create_binary )

#         X.loc[:,'yr_renovated'] = X['yr_renovated'].apply( self.create_binary )
#         # returns numpy array
#         return X.values 