# Sklearn FeatureUnion
- Use custom transformers for feature engineering
- Then merge the features horizontally for feeding into an ML classifier

## FeatureUnion & Pipelines with Pandas
https://towardsdatascience.com/custom-transformers-and-ml-data-pipelines-with-python-20ea2a7adb65

In [18]:
# import numpy as np
# import pandas as pd
# import spacy
# from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.pipeline import Pipeline
# from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
# from sklearn.model_selection import train_test_split
# from sklearn import metrics

import numpy as np
import pandas as pd
import spacy
from sklearn import metrics
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegressionCV
from sklearn.pipeline import FeatureUnion, Pipeline


In [2]:
# Ask spaCy to use the GPU if it can; the recorded output below shows the call returned True
spacy.prefer_gpu()

True

In [3]:
from spacy.tokenizer import Tokenizer
from spacy.lang.en import English
from spacy.matcher import Matcher
# Module-level alias for spaCy's English stop-word collection; read by the tokenizer helpers below
stop_words = spacy.lang.en.stop_words.STOP_WORDS

In [8]:
# !python -m spacy download en_core_web_sm

# nlp = spacy.load("en_core_web_sm")
# Use the bare English language class instead of the downloaded en_core_web_sm model.
# `nlp` is the shared object the tokenizer helpers below call on each string.
nlp = English()

In [5]:
# Second English() instance.
# NOTE(review): English() is built here without loading a trained model, so it presumably
# provides tokenization only - not a tagger, parser, NER or word vectors. Confirm before relying on it.
# `parser` is not referenced again in the visible code.
parser = English()

# Feature selector transformer
- Feed it the columns you want, and it returns a dataframe with just those features

In [6]:
class FeatureSelector(BaseEstimator, TransformerMixin):
    """Pipeline step that narrows its input down to the requested columns.

    ``feature_names`` is stored as given and later used to index ``X``
    (``X[feature_names]``), so it should be whatever key ``X`` accepts -
    the visible caller passes a list of column names.
    """

    def __init__(self, feature_names):
        self.feature_names = feature_names

    def fit(self, X, y=None):
        """Nothing is learned from the data; hand back the instance itself."""
        return self

    def transform(self, X, y=None):
        """Return ``X`` indexed by the configured ``feature_names``."""
        wanted = self.feature_names
        return X[wanted]

# Text processing transformer
- Take in tweet text
- Create features
    - contains hashtag
    - isupper
    - islower
    - has misspellings

In [28]:
class TextTransformer(BaseEstimator, TransformerMixin):
    """Turn a frame holding a ``'text'`` column into derived text features.

    For now the only feature produced is ``'lower'``: the text lower-cased.
    The original ``'text'`` column is removed from the result.
    """

    def __init__(self):
        # No constructor arguments yet; kept so the step can grow options later.
        pass

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the instance itself."""
        return self

    def lower_text(self, obj):
        """Test helper: return ``obj`` lower-cased (``obj`` must offer ``.lower()``)."""
        return obj.lower()

    def transform(self, X, y=None):
        """Return the derived features for ``X`` as a numpy array.

        Parameters
        ----------
        X : frame indexable by ``'text'`` (the pipeline passes the output of
            ``FeatureSelector(['text'])``).
        y : ignored; present for scikit-learn API compatibility.

        Returns
        -------
        ``.values`` of a new frame in which ``'text'`` has been replaced by a
        trailing ``'lower'`` column.
        """
        lowered = X['text'].apply(self.lower_text)

        # Build the result on a new frame instead of writing 'lower' into X:
        # X belongs to the caller (and is usually a slice of the full frame),
        # so assigning into it leaked a column back and triggered pandas'
        # SettingWithCopy warning.
        features = X.drop('text', axis=1).assign(lower=lowered)

        # returns numpy array
        return features.values

In [29]:
# # Custom transformer that breaks dates column into year, month and day into separate columns and
# # converts certain features to binary 
# class CategoricalTransformer( BaseEstimator, TransformerMixin ):
#     # Class constructor method that takes in a list of values as its argument
#     def __init__(self, use_dates = ['year', 'month', 'day'] ):
#         self._use_dates = use_dates
        
#     # Return self nothing else to do here
#     def fit( self, X, y = None  ):
#         return self

#     # Helper function to extract year from column 'dates' 
#     def get_year( self, obj ):
#         return str(obj)[:4]
    
#     # Helper function to extract month from column 'dates'
#     def get_month( self, obj ):
#         return str(obj)[4:6]
    
#     # Helper function to extract day from column 'dates'
#     def get_day(self, obj):
#         return str(obj)[6:8]
    
#     # Helper function that converts values to Binary depending on input 
#     def create_binary(self, obj):
#         if obj == 0:
#             return 'No'
#         else:
#             return 'Yes'
    
#     # Transformer method we wrote for this transformer 
#     def transform(self, X , y = None ):
#         # Depending on constructor argument break dates column into specified units
#         #using the helper functions written above 
#         for spec in self._use_dates:

#         exec( "X.loc[:,'{}'] = X['date'].apply(self.get_{})".format( spec, spec ) )
#         # Drop unusable column 
#         X = X.drop('date', axis = 1 )

#         # Convert these columns to binary for one-hot-encoding later
#         X.loc[:,'waterfront'] = X['waterfront'].apply( self.create_binary )

#         X.loc[:,'view'] = X['view'].apply( self.create_binary )

#         X.loc[:,'yr_renovated'] = X['yr_renovated'].apply( self.create_binary )
#         # returns numpy array
#         return X.values 

# Create the text pipeline

In [30]:
# # Categorical features to pass down the categorical pipeline
# categorical_features = ['date', 'waterfront', 'view', 'yr_renovated']

# #Numerical features to pass down the numerical pipeline 
# numerical_features = ['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors',
#                 'condition', 'grade', 'sqft_basement', 'yr_built']

# Text features for text pipeline: the column name(s) handed to the
# FeatureSelector that opens text_pipeline
text_features = ['text']

# #Defining the steps in the categorical pipeline 
# categorical_pipeline = Pipeline( steps = [ ( 'cat_selector', FeatureSelector(categorical_features) ),
                                  
#                                   ( 'cat_transformer', CategoricalTransformer() ), 
                                  
#                                   ( 'one_hot_encoder', OneHotEncoder( sparse = False ) ) ] )
    
# #Defining the steps in the numerical pipeline     
# numerical_pipeline = Pipeline( steps = [ ( 'num_selector', FeatureSelector(numerical_features) ),
                                  
#                                   ( 'num_transformer', NumericalTransformer() ),
                                  
#                                   ('imputer', SimpleImputer(strategy = 'median') ),
                                  
#                                   ( 'std_scaler', StandardScaler() ) ] )

# Text branch: first isolate the text column(s), then derive features from them.
# verbose=True is handed to Pipeline as-is.
text_pipeline = Pipeline(
    steps=[
        ('text_selector', FeatureSelector(text_features)),
        ('text_transformer', TextTransformer()),
    ],
    verbose=True,
)


# Create the full pipeline

In [31]:
# #Combining numerical and categorical pipeline into one full big pipeline horizontally
# #using FeatureUnion
# full_pipeline = FeatureUnion( transformer_list = [ ( 'categorical_pipeline', categorical_pipeline ), 
                                                  
#                                                   ( 'numerical_pipeline', numerical_pipeline ) ] )

# FeatureUnion is the single entry point that gathers every feature pipeline.
# At the moment the text pipeline is the only member.
full_pipeline = FeatureUnion(transformer_list=[('text_pipeline', text_pipeline)])

# Test pipeline

In [32]:
# Load the training data; later cells read its 'text' and 'target' columns
train_df = pd.read_csv("data/train.csv")

In [33]:
# Fit and transform in one call: full_pipeline has not been fitted anywhere before
# this point, and scikit-learn expects fit to precede transform. Both custom steps
# have a no-op fit, so the transformed values are the same as a bare transform().
full_pipeline.fit_transform(train_df)

array([['our deeds are the reason of this #earthquake may allah forgive us all'],
       ['forest fire near la ronge sask. canada'],
       ["all residents asked to 'shelter in place' are being notified by officers. no other evacuation or shelter in place orders are expected"],
       ...,
       ['m1.94 [01:04 utc]?5km s of volcano hawaii. http://t.co/zdtoyd8ebj'],
       ['police investigating after an e-bike collided with a car in little portugal. e-bike rider suffered serious non-life threatening injuries.'],
       ['the latest: more homes razed by northern california wildfire - abc news http://t.co/ymy4rskq3d']],
      dtype=object)

In [9]:
# Create a blank Tokenizer with just the English vocab
# NOTE(review): `tokenizer` is not referenced again in the visible code - confirm it is still needed
tokenizer = Tokenizer(nlp.vocab)

In [10]:
def spacy_tokenizer(string):
    """Tokenize ``string`` into lower-cased tokens, keeping hashtags in one piece.

    Relies on the module-level ``nlp``, ``matcher`` and ``stop_words`` objects.

    Parameters
    ----------
    string : the raw text to tokenize.

    Returns
    -------
    list of str: lower-cased token texts with punctuation, whitespace tokens
    and stop words removed. A '#' and the token after it are returned as a
    single token (e.g. '#earthquake').
    """
    doc = nlp(string)

    # Looks for hashtags. Keep only non-overlapping matches (earliest first):
    # the retokenizer rejects overlapping merges, which input such as '##tag'
    # would otherwise produce.
    hashtag_spans = []
    last_end = 0
    for _, start, end in sorted(matcher(doc), key=lambda m: (m[1], m[2])):
        if start >= last_end:
            hashtag_spans.append(doc[start:end])
            last_end = end

    # Merge all hashtag spans in one pass; Doc.retokenize replaces the
    # deprecated per-span Span.merge().
    with doc.retokenize() as retokenizer:
        for span in hashtag_spans:
            retokenizer.merge(span)

    # Compare the token's *text* with the stop-word set: a Token object is
    # never a member of a set of strings, so `t not in stop_words` filtered nothing.
    return [
        tok.text.lower()
        for tok in doc
        if not (tok.is_punct or tok.is_space)
        and tok.text.lower() not in stop_words
    ]

In [None]:
def spacy_tokenizer_dataframe(df):
    """
    Add tokenizer-derived fields to one record that holds a 'text' string.

    Despite the name, ``df`` has to be a single row (for example what
    ``DataFrame.apply(..., axis=1)`` passes in): ``nlp`` and ``str.isupper``
    need a plain string, not a whole column.
    Can't use this one for the sklearn pipeline.

    Sets three fields on ``df`` and returns it:
      - 'upper': ``text.isupper()``
      - 'lower': ``text.islower()``
      - 'token_list': lower-cased tokens without punctuation, whitespace
        tokens or stop words, with each hashtag kept as one token

    Relies on the module-level ``nlp``, ``matcher`` and ``stop_words`` objects.
    """
    text = df['text']
    doc = nlp(text)

    # Keep only non-overlapping hashtag matches (earliest first); the
    # retokenizer rejects overlapping merges.
    hashtag_spans = []
    last_end = 0
    for _, start, end in sorted(matcher(doc), key=lambda m: (m[1], m[2])):
        if start >= last_end:
            hashtag_spans.append(doc[start:end])
            last_end = end

    # One-pass merge; replaces the deprecated per-span Span.merge().
    with doc.retokenize() as retokenizer:
        for span in hashtag_spans:
            retokenizer.merge(span)

    df['upper'] = text.isupper()
    df['lower'] = text.islower()
    # Compare token *text* with the stop-word set; a Token object is never a
    # member of a set of strings, so the old check filtered nothing.
    df['token_list'] = [
        tok.text.lower()
        for tok in doc
        if not (tok.is_punct or tok.is_space)
        and tok.text.lower() not in stop_words
    ]
    return df

In [None]:
# spacy_tokenizer("This is a   sentence!")

In [11]:
# Sample Doc used by the exploratory cells that follow
doc = nlp("Some text is going down the street")

In [12]:
# Echo the Doc; the recorded output shows it displayed as its text
doc

Some text is going down the street

In [13]:
# List the attributes available on the Doc (the recorded output includes both `merge` and `retokenize`)
dir(doc)

['_',
 '__bytes__',
 '__class__',
 '__delattr__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__len__',
 '__lt__',
 '__ne__',
 '__new__',
 '__pyx_vtable__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__unicode__',
 '_bulk_merge',
 '_py_tokens',
 '_realloc',
 '_vector',
 '_vector_norm',
 'cats',
 'char_span',
 'count_by',
 'doc',
 'ents',
 'extend_tensor',
 'from_array',
 'from_bytes',
 'from_disk',
 'get_extension',
 'get_lca_matrix',
 'has_extension',
 'has_vector',
 'is_nered',
 'is_parsed',
 'is_sentenced',
 'is_tagged',
 'lang',
 'lang_',
 'mem',
 'merge',
 'noun_chunks',
 'noun_chunks_iterator',
 'print_tree',
 'remove_extension',
 'retokenize',
 'sentiment',
 'sents',
 'set_extension',
 'similarity',
 'tensor',
 'text',
 'text_with_ws',
 'to_array',
 'to_byte

In [14]:
# for el in doc.vocab:
#     print(el.text)

# for chunk in doc.noun_chunks:
#     print(chunk)

# for sent in doc.sents:
#     print(sent)

# Baseline for comparison: plain whitespace split of the Doc's text
doc.text.split()

['Some', 'text', 'is', 'going', 'down', 'the', 'street']

In [None]:
# Re-read the training CSV (same path as the earlier load), resetting train_df
train_df = pd.read_csv("data/train.csv")

In [None]:
# Preview the first 30 rows
train_df.head(30)

In [None]:
# Enrich the first 30 rows one row at a time (axis=1 passes each row to the callback).
# The row-oriented helper is the one needed here: spacy_tokenizer expects a plain
# string and would not create the 'token_list' column that the next cell reads.
# NOTE: this rebinds train_df to just those 30 enriched rows.
train_df = train_df.head(30).apply(spacy_tokenizer_dataframe, axis=1)

In [None]:
# Print the 'token_list' value of each of the first 30 rows, one per line
for el in train_df.head(30)['token_list']:
    print(el)

In [None]:
# Hashtag pattern: a literal '#' token followed by one ASCII token.
# `matcher` is the module-level object the tokenizer helpers look up when called.
# NOTE(review): (key, on_match, *patterns) is the argument form used here; confirm
# it matches the installed spaCy version.
matcher = Matcher(nlp.vocab)
matcher.add('HASHTAG', None, [{'ORTH': '#'}, {'IS_ASCII': True}])

# Demo: find the hashtags in a sample tweet and merge each into a single token
doc = nlp("this is an #example of an awesome tweet with #manyhashtags #amazing")
matches = matcher(doc)
spans = []
for match_id, start, end in matches:
    spans.append(doc[start:end])

# Merge all spans in one retokenization pass; this replaces the deprecated
# per-span Span.merge(). The hashtags in the sample text do not overlap.
with doc.retokenize() as retokenizer:
    for span in spans:
        retokenizer.merge(span)

print([t.text for t in doc])

In [None]:
# Inspect the hashtag spans collected by the matcher demo
spans

In [None]:
# TF-IDF vectorizer that delegates tokenization to spacy_tokenizer
tfidf = TfidfVectorizer(tokenizer = spacy_tokenizer)
# x = v.fit_transform(df['sent'])

In [None]:
# Fit the vectorizer on the 'text' of the first 30 rows and keep the transformed result
x = tfidf.fit_transform(train_df['text'].head(30))

In [None]:
# Dense view of the first row of the TF-IDF result
x.todense()[0]

In [None]:
# X_train, X_test, y_train, y_test = train_test_split(train_df['text'], train_df['target'], test_size=0.3, random_state=42)

# classifier = LogisticRegression()

# # Create pipeline using Bag of Words
# pipe = Pipeline([
#                 ('vectorizer', tfidf),
#                  ('classifier', classifier)])

In [None]:
# classifier = LogisticRegressionCV(cv=5, random_state=42, n_jobs=-1).fit(train_df['text'], train_df['target'])
# Left unfitted here: fitting happens through the pipeline, so the raw text is vectorized first.
classifier = LogisticRegressionCV(cv=5, random_state=42, n_jobs=-1)

# Two-stage model: the TF-IDF vectorizer feeds the classifier
pipe = Pipeline([
    ('vectorizer', tfidf),
    ('classifier', classifier),
])

In [None]:
%%time
# model generation: fit the vectorizer + classifier pipeline on the 'text' column against 'target'
# (uses whatever train_df currently holds)
pipe.fit(train_df['text'],train_df['target'])

In [None]:
%%time
# Predicting with a test dataset
# NOTE(review): X_test / y_test are only assigned in the train_test_split line that is
# commented out in an earlier cell, and pipe was fitted on train_df directly - confirm
# where the held-out split is meant to come from before trusting these scores.
predicted = pipe.predict(X_test)

# Model Accuracy: accuracy, precision and recall of the predictions against y_test
print("Logistic Regression Accuracy:",metrics.accuracy_score(y_test, predicted))
print("Logistic Regression Precision:",metrics.precision_score(y_test, predicted))
print("Logistic Regression Recall:",metrics.recall_score(y_test, predicted))

# Wow, 85% cross-validation accuracy with almost no feature engineering