# Sklearn FeatureUnion
- Use custom transformers for feature engineering
- Then merge the features horizontally for feeding into an ML classifier

## FeatureUnion & Pipelines with Pandas
https://towardsdatascience.com/custom-transformers-and-ml-data-pipelines-with-python-20ea2a7adb65

# Feature Engineering Plan

Based on previous data exploration, we'll start with the following:
- Drop location
- Convert keyword to a categorical
- Vectorize tweet text using TF-IDF
- Create categorical indicators from the text:
    - all capitalized
    - all lowercased
    - contains hashtags
    - contains a date
    - contains link
    - contains timestamp

In [1]:
import numpy as np 
import pandas as pd
import spacy
import regex as re
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.model_selection import train_test_split#, cross_val_score
from sklearn import metrics, preprocessing
from sklearn.decomposition import PCA

In [2]:
spacy.prefer_gpu()

True

In [3]:
from spacy.tokenizer import Tokenizer
from spacy.lang.en import English
from spacy.matcher import Matcher

In [4]:
# !python -m spacy download en_core_web_sm

# nlp = English() # This does not include certain features like lemmatization!

nlp = spacy.load("en_core_web_sm") # includes more features!

In [5]:
# # Load English tokenizer, tagger, parser, NER and word vectors
# parser = English()

In [6]:
# Create a blank Tokenizer with just the English vocab
tokenizer = Tokenizer(nlp.vocab)

In [7]:
stop_words = spacy.lang.en.stop_words.STOP_WORDS

In [9]:
# Create matcher for hashtags
matcher = Matcher(nlp.vocab)
matcher.add('HASHTAG', None, [{'ORTH': '#'}, {'IS_ASCII': True}])

# Feature selector transformer
- Feed it the columns you want, and it returns a dataframe with just those features

In [10]:
# Custom Transformer that extracts columns passed as argument to its constructor 
class FeatureSelector( BaseEstimator, TransformerMixin ):
    # Class Constructor 
    def __init__(self, feature_names):
        self.feature_names = feature_names 
    
    # Return self nothing else to do here    
    def fit(self, X, y = None):
        return self 
    
    # Method that describes what we need this transformer to do
    # This one pulls up the list of feature columns you pass in and returns just those columns
    def transform(self, X, y = None):
        return X[self.feature_names] 

# Text processing transformer
- Take in tweet text
- Create features
    - contains hashtag
    - isupper
    - islower
    - has mispellings

# Create the text feature pipeline
- Takes in the tweet text and returns various meta features about it
- Does not tokenize or encode the text itself (taken care of in a separate pipeline)

In [11]:
# Custom transformer that takes in a string and returns new categorical features
class CategoricalTextTransformer(BaseEstimator, TransformerMixin):
    # Class constructor method that takes in a list of values as its argument
    def __init__(self):
        self.hashtag_pattern = re.compile("(?:^|\s)[＃#]{1}(\w+)", re.UNICODE)
        
        
    # Return self nothing else to do here
    def fit(self, X, y = None):
        return self
    
    
    # Test helper func to just return the text in all lower case
    def is_lower(self, obj):
        if obj.islower():
            return 1
        else:
            return 0
    
    
    def is_upper(self, obj):
        if obj.isupper():
            return 1
        else:
            return 0


    def count_hashtags(self, obj):
        hashtag_count = len(re.findall(self.hashtag_pattern, obj))
        return hashtag_count
        
        
    # Transformer method to take in strings from a dataframe and return some extra features
    def transform(self, X , y = None):
        # Copy the incoming df to prevent setting on copy errors
        X = X.copy()
        
        # Return binary indicator of whether tweet is all lowercase
        X['is_lower'] = X['text'].apply(self.is_lower)
        
        # Return binary indicator of whether tweet is all uppercase
        X['is_upper'] = X['text'].apply(self.is_upper)
    
        # Count the number of hashtags in the text
        X['hashtag_count'] = X['text'].apply(self.count_hashtags)
    
        # Drop original text col
        # The only thing remaining now will be the lowercased text
        X = X.drop('text', axis=1)
        
        # returns numpy array
        return X.values 
    
    
    # Transformer method to take in strings from a dataframe and return some extra features
    def fit_transform(self, X , y = None):
        # Copy the incoming df to prevent setting on copy errors
        X = X.copy()
        
        # Return binary indicator of whether tweet is all lowercase
        X['is_lower'] = X['text'].apply(self.is_lower)
        
        # Return binary indicator of whether tweet is all uppercase
        X['is_upper'] = X['text'].apply(self.is_upper)
        
        # Count the number of hashtags in the text
        X['hashtag_count'] = X['text'].apply(self.count_hashtags)
        
        # Drop original text col
        # The only thing remaining now will be the lowercased text
        X = X.drop('text', axis=1)
        
        # returns numpy array
        return X.values 
    
    

In [12]:
# Custom transformer processes the keyword feature as a categorical
class CategoricalLemmatizedKeywordTransformer(BaseEstimator, TransformerMixin):
    # Class constructor method that takes in a list of values as its argument
    def __init__(self):
        self.ohe_model = preprocessing.OneHotEncoder(handle_unknown='error',
                                         drop='first',
                                         sparse=False)
        
    def spacy_lemmatizer(self, obj):
        doc = nlp(obj)

        # There should only be one keyword (not removing %20 spaces)
        if len(doc) > 1:
            print('More than one token found; expecting single token')
            
        return doc[0].lemma_
    
    
    # Return self nothing else to do here
    def fit(self, X, y = None):
        return self
    

    # Transformer method to take in strings from a dataframe and return some extra features
    def transform(self, X , y = None):
        # Copy the incoming df to prevent setting on copy errors
        X = X.copy()
    
        # Fill NaNs with "None"
        # Missing values will cause the one-hot encoding to fail
        X = X.fillna("none")

        # Convert the keywords to the lemmatized version
        X['lemmatized_keyword'] = X['keyword'].apply(self.spacy_lemmatizer)
        
#         # Drop the keyword col
#         X = X.drop('keyword', axis=1)
        
        X = self.ohe_model.transform(X[['lemmatized_keyword']])
        
        return X
        
        
    # Transformer method to take in strings from a dataframe and return some extra features
    def fit_transform(self, X , y = None):
        # Copy the incoming df to prevent setting on copy errors
        X = X.copy()
        
        # Fill NaNs with "None"
        # Missing values will cause the one-hot encoding to fail
        X = X.fillna("none")
        
        # Convert the keywords to the lemmatized version
        X['lemmatized_keyword'] = X['keyword'].apply(self.spacy_lemmatizer)
        
#         # Drop the keyword col
#         X = X.drop('keyword', axis=1)
        
        X = self.ohe_model.fit_transform(X[['lemmatized_keyword']])
        
        # categorical_features = boolean mask for categorical columns
        # sparse = False output an array not sparse matrix
        
#         # One-hot encode the keyword col
#         X = pd.get_dummies(X, 
#                            columns=['keyword'], 
#                            drop_first=True, 
#                            dummy_na=True)

#         # Drop original keyword col
#         # The only thing remaining now will be the keyword labels
#         X = X.drop('keyword', axis=1)
        
        # returns numpy array
        return X 

In [13]:
# Custom transformer processes the keyword feature as a categorical
class CategoricalRawKeywordTransformer(BaseEstimator, TransformerMixin):
    # Class constructor method that takes in a list of values as its argument
    def __init__(self):
        self.ohe_model = preprocessing.OneHotEncoder(handle_unknown='error',
                                         drop='first',
                                         sparse=False)

        
    # Return self nothing else to do here
    def fit(self, X, y = None):
        return self
    

    # Transformer method to take in strings from a dataframe and return some extra features
    def transform(self, X , y = None):
        # Copy the incoming df to prevent setting on copy errors
        X = X.copy()
    
        # Fill NaNs with "None"
        # Missing values will cause the one-hot encoding to fail
        X = X.fillna("none")
        
        X = self.ohe_model.transform(X)
        
        return X
        
        
    # Transformer method to take in strings from a dataframe and return some extra features
    def fit_transform(self, X , y = None):
        # Copy the incoming df to prevent setting on copy errors
        X = X.copy()
        
        # Fill NaNs with "None"
        # Missing values will cause the one-hot encoding to fail
        X = X.fillna("none")
        
#         # Instantiate OneHotEncoder
#         ohe = preprocessing.OneHotEncoder(handle_unknown='error',
#                                          drop='first',
#                                          sparse=False) 
        
        X = self.ohe_model.fit_transform(X)
        
        # categorical_features = boolean mask for categorical columns
        # sparse = False output an array not sparse matrix
        
#         # One-hot encode the keyword col
#         X = pd.get_dummies(X, 
#                            columns=['keyword'], 
#                            drop_first=True, 
#                            dummy_na=True)

#         # Drop original keyword col
#         # The only thing remaining now will be the keyword labels
#         X = X.drop('keyword', axis=1)
        
        # returns numpy array
        return X 

In [84]:
class DenseTfidfVectorizer(TfidfVectorizer):
    def __init__(self, pca=False, pca_n = None, remove_hashtag=True):
        self.tfidf_model = TfidfVectorizer(tokenizer=self.spacy_tokenizer)
        self.pca = pca
        self.pca_n = pca_n
        self.remove_hashtag = remove_hashtag
        
    def spacy_tokenizer(self, obj):
        doc = nlp(obj)

        # Looks for hashtags
        matches = matcher(doc)
        spans = []
        for match_id, start, end in matches:
            spans.append(doc[start:end])

        for span in spans:
            span.merge()

        if self.remove_hashtag:
            # Lower cases text and strips the hash symbol from hashtag while leaving rest of tag
            return [t.text.lower().replace("#", "") for t in doc if t.text.lower() not in stop_words and not t.is_punct | t.is_space] 
        else:
            # Lower cases text but keeps hash symbol in hashtag
            return [t.text.lower() for t in doc if t.text.lower() not in stop_words and not t.is_punct | t.is_space]


        
    def transform(self, raw_documents):
#         X = super().transform(raw_documents, copy=copy)
        X = self.tfidf_model.transform(raw_documents['text'])
#         df = pd.DataFrame(X.toarray(), columns=self.get_feature_names())
#         return df
#         return X.toarray()

        # If PCA
        if self.pca:
            # PCA requires a dense matrix
            # Tf-idf returns a sparse one
            X = X.todense()
            
            # Run standard scaler for PCA
            X = preprocessing.StandardScaler().fit_transform(X)
            
            # Run PCA and return reduced array
            X = self.pca_model.transform(X)

            return X

        return X.toarray() # Changes the scipy sparse array to a numpy matrix

    
    def fit_transform(self, raw_documents, y=None):
#         X = super().fit_transform(raw_documents, y=y)
        X = self.tfidf_model.fit_transform(raw_documents['text'], y=y)
#         df = pd.DataFrame(X.toarray(), columns=self.get_feature_names())
#         return df
#         return X.toarray()

        # If PCA
        if self.pca:
            self.pca_model = PCA(n_components=self.pca_n)
            
            # PCA requires a dense matrix
            # Tf-idf returns a sparse one
            X = X.todense()
            
            # Run standard scaler for PCA
            X = preprocessing.StandardScaler().fit_transform(X)
            
            # Run PCA and return reduced array
            X = self.pca_model.fit_transform(X)

            return X
            
        return X.toarray()

In [85]:
test_tokenizer = DenseTfidfVectorizer()

In [86]:
test_tokenizer.spacy_tokenizer(train_df.loc[0, 'text'])

['deeds', 'reason', 'earthquake', 'allah', 'forgive']

In [24]:
train_df.loc[train_df['text'].str.contains("http")]['target'].mean()

0.5469654998740872

In [27]:
train_df.loc[train_df['text'].str.contains("(?<=^|(?<=[^a-zA-Z0-9-_\.]))@([A-Za-z]+[A-Za-z0-9-_]+)")]

  return func(self, *args, **kwargs)


Unnamed: 0,id,keyword,location,text,target
31,48,ablaze,Birmingham,@bbcmtd Wholesale Markets ablaze http://t.co/l...,1
36,54,ablaze,Pretoria,@PhDSquares #mufc they've built so much hype a...,0
43,63,ablaze,,SOOOO PUMPED FOR ABLAZE ???? @southridgelife,0
54,78,ablaze,Abuja,Noches El-Bestia '@Alexis_Sanchez: happy to se...,0
63,91,ablaze,"Concord, CA",@Navista7 Steve these fires out here are somet...,1
...,...,...,...,...,...
7577,10829,wrecked,#NewcastleuponTyne #UK,@widda16 ... He's gone. You can relax. I thoug...,0
7578,10830,wrecked,,@jt_ruff23 @cameronhacker and I wrecked you both,0
7581,10833,wrecked,Lincoln,@engineshed Great atmosphere at the British Li...,0
7596,10851,,,RT @LivingSafely: #NWS issues Severe #Thunders...,1


In [37]:
train_df['text'].str.extractall("(?<=^|(?<=[^a-zA-Z0-9-_\.]))@([A-Za-z]+[A-Za-z0-9-_]+)").dropna()

Unnamed: 0_level_0,Unnamed: 1_level_0,0
Unnamed: 0_level_1,match,Unnamed: 2_level_1
31,0,bbcmtd
36,0,PhDSquares
43,0,southridgelife
54,0,Alexis_Sanchez
63,0,Navista7
...,...,...
7578,1,cameronhacker
7581,0,engineshed
7596,0,LivingSafely
7609,0,aria_ahrary


In [38]:
train_df.loc[7578, 'text'].str.extractall("(?<=^|(?<=[^a-zA-Z0-9-_\.]))@([A-Za-z]+[A-Za-z0-9-_]+)")

AttributeError: 'str' object has no attribute 'str'

In [39]:
train_df.loc[7578, 'text']

'@jt_ruff23 @cameronhacker and I wrecked you both'

In [52]:
# # Custom transformer that tokenizes text
# class TextTokenizerTransformer(BaseEstimator, TransformerMixin):
#     # Class constructor method that takes in a list of values as its argument
#     def __init__(self):
#         pass
        
        
#     # Return self nothing else to do here
#     def fit(self, X, y = None):
#         return self
    
    
#     def spacy_tokenizer(self, obj):
#         doc = nlp(obj)

#         # Looks for hashtags
#         matches = matcher(doc)
#         spans = []
#         for match_id, start, end in matches:
#             spans.append(doc[start:end])

#         for span in spans:
#             span.merge()

#         return [t.text.lower() for t in doc if t not in stop_words and not t.is_punct | t.is_space]


#     # Transformer method to take in strings from a dataframe and return some extra features
#     def transform(self, X , y = None):
#         # Copy the incoming df to prevent setting on copy errors
#         X = X.copy()
        
#         X['tokens'] = X['text'].apply(self.spacy_tokenizer)
        
#         X['tokens'] = " ".join(X['tokens'])
#         return X['tokens']
    
#     # Transformer method to take in strings from a dataframe and return some extra features
#     def fit_transform(self, X , y = None):
#         # Copy the incoming df to prevent setting on copy errors
#         X = X.copy()
        
#         X['tokens'] = X['text'].apply(self.spacy_tokenizer)
        
#         X['tokens'] = " ".join(X['tokens'])
#         return X['tokens']

In [53]:
# # Custom transformer that takes in a string and returns some features
# class TextTransformer(BaseEstimator, TransformerMixin):
#     # Class constructor method that takes in a list of values as its argument
#     def __init__(self):
#         pass
        
#     # Return self nothing else to do here
#     def fit(self, X, y = None):
#         return self
    
    
#     def spacy_tokenizer(self, obj):
#         doc = nlp(obj)

#         # Looks for hashtags
#         matches = matcher(doc)
#         spans = []
#         for match_id, start, end in matches:
#             spans.append(doc[start:end])

#         for span in spans:
#             span.merge()

#         return [t.text.lower() for t in doc if t not in stop_words and not t.is_punct | t.is_space]


#     # Transformer method to take in strings from a dataframe and return some extra features
#     def transform(self, X , y = None):
#         # Copy the incoming df to prevent setting on copy errors
#         X = X.copy()
        
#         # Embed text as a bag of words using tfidf
#         tfidf = TfidfVectorizer(tokenizer = self.spacy_tokenizer)
#         X = tfidf.fit_transform(X['text'])
        
#         # returns numpy array
#         return X

In [54]:
# Categorical text features
cat_text_features = ['text']

# Text features for text pipeline
text_features = ['text']

# Categorical features for text pipeline
cat_features = ['keyword']

# Define categorical pipeline
cat_text_pipeline = Pipeline(
    steps = [('cat_text_selector', FeatureSelector(cat_text_features)),
             ('cat_text_transformer', CategoricalTextTransformer()),
            ],
    verbose = True
)

# Define the text training pipeline
text_pipeline = Pipeline(
    steps = [('text_selector', FeatureSelector(text_features)),
#              ('text_transformer', TextTokenizerTransformer()),
             ('text_tfidf', DenseTfidfVectorizer())
            ],
    verbose = True
)

# Define the keyword categorical training pipeline
cat_raw_keyword_pipeline = Pipeline(
    steps = [('cat_selector', FeatureSelector(cat_features)),
             ('cat_transformer', CategoricalRawKeywordTransformer())
            ],
    verbose = True
)

# Define the lemmatized keyword categorical pipeline
cat_lemma_keyword_pipeline = Pipeline(
    steps = [('cat_selector', FeatureSelector(cat_features)),
             ('cat_transformer', CategoricalLemmatizedKeywordTransformer())
            ],
    verbose = True
)

# Define the text training pipeline
text_pca_50_pipeline = Pipeline(
    steps = [('text_selector', FeatureSelector(text_features)),
             ('text_tfidf', DenseTfidfVectorizer(pca=True, pca_n = 50))
            ],
    verbose = True
)

# #Defining the steps in the categorical pipeline 
# categorical_pipeline = Pipeline( steps = [ ( 'cat_selector', FeatureSelector(categorical_features) ),
                                  
#                                   ( 'cat_transformer', CategoricalTransformer() ), 
                                  
#                                   ( 'one_hot_encoder', OneHotEncoder( sparse = False ) ) ] )
    
# #Defining the steps in the numerical pipeline     
# numerical_pipeline = Pipeline( steps = [ ( 'num_selector', FeatureSelector(numerical_features) ),
                                  
#                                   ( 'num_transformer', NumericalTransformer() ),
                                  
#                                   ('imputer', SimpleImputer(strategy = 'median') ),
                                  
#                                   ( 'std_scaler', StandardScaler() ) ] )

# Create the full pipeline

In [59]:
# #Combining numerical and categorical piepline into one full big pipeline horizontally 
# #using FeatureUnion
# full_pipeline = FeatureUnion( transformer_list = [ ( 'categorical_pipeline', categorical_pipeline ), 
                                                  
#                                                   ( 'numerical_pipeline', numerical_pipeline ) ] )

# Combine all our pipelines into a single one inside the FeatureUnion object
# Right now we only have one pipeline which is our text one
full_raw_keyword_pipeline = FeatureUnion(
    transformer_list=[
        ('cat_raw_keyword_pipeline', cat_raw_keyword_pipeline),
        ('text_pipeline', text_pipeline),
        ('cat_text_pipeline', cat_text_pipeline),
                     ]
)

full_lemma_keyword_pipeline = FeatureUnion(
    transformer_list=[
        ('cat_raw_keyword_pipeline', cat_lemma_keyword_pipeline),
        ('text_pipeline', text_pipeline),
        ('cat_text_pipeline', cat_text_pipeline),
                     ]
)

full_raw_keyword_pca_50_pipeline = FeatureUnion(
    transformer_list=[
        ('cat_raw_keyword_pipeline', cat_raw_keyword_pipeline),
        ('text_pipeline', text_pca_50_pipeline),
        ('cat_text_pipeline', cat_text_pipeline),
                     ]
)

full_lemma_keyword_pca_50_pipeline = FeatureUnion(
    transformer_list=[
        ('cat_raw_keyword_pipeline', cat_lemma_keyword_pipeline),
        ('text_pipeline', text_pca_50_pipeline),
        ('cat_text_pipeline', cat_text_pipeline),
                     ]
)

# Test pipeline

In [16]:
train_df = pd.read_csv("data/train.csv")

# Can't run the full pipeline with the training and the model
- The data transformation part of the pipeline that does TFIDF will return different number of features based on the data fed in

# Solution: Separate the feature pipeline with the model pipeline

training time: tfidf.fit_transform(X_train)
inference: tfidf.transform(X_test)

In [17]:
X_train = train_df.copy()
y_train = X_train.pop('target').values

test_df = pd.read_csv('data/test.csv')

# Create PCA'd (50 dims) training and test sets for the lemma and raw pipelines

In [62]:
%%time
for pipeline, name in zip([full_raw_keyword_pca_50_pipeline, full_lemma_keyword_pca_50_pipeline], ['full_raw_keyword_pca_50_pipeline', 'full_lemma_keyword_pca_50_pipeline']):
    X_train_processed = pipeline.fit_transform(X_train)
    test_processed = pipeline.transform(test_df)
    
    np.save(name + '_X_train', X_train_processed)
    np.save(name + '_test_processed', test_processed)

[Pipeline] ...... (step 1 of 2) Processing cat_selector, total=   0.0s
[Pipeline] ... (step 2 of 2) Processing cat_transformer, total=   0.0s
[Pipeline] ..... (step 1 of 2) Processing text_selector, total=   0.0s
[Pipeline] ........ (step 2 of 2) Processing text_tfidf, total= 3.2min
[Pipeline] . (step 1 of 2) Processing cat_text_selector, total=   0.0s
[Pipeline]  (step 2 of 2) Processing cat_text_transformer, total=   0.1s
[Pipeline] ...... (step 1 of 2) Processing cat_selector, total=   0.0s
[Pipeline] ... (step 2 of 2) Processing cat_transformer, total= 1.3min
[Pipeline] ..... (step 1 of 2) Processing text_selector, total=   0.0s
[Pipeline] ........ (step 2 of 2) Processing text_tfidf, total= 3.1min
[Pipeline] . (step 1 of 2) Processing cat_text_selector, total=   0.0s
[Pipeline]  (step 2 of 2) Processing cat_text_transformer, total=   0.1s
CPU times: user 11min 8s, sys: 7.88 s, total: 11min 15s
Wall time: 10min 43s


In [227]:
%%time
# Process text and categorical features
X_train_processed = full_lemma_keyword_pipeline.fit_transform(X_train)

[Pipeline] ...... (step 1 of 2) Processing cat_selector, total=   0.0s
[Pipeline] ... (step 2 of 2) Processing cat_transformer, total= 1.3min
[Pipeline] ..... (step 1 of 2) Processing text_selector, total=   0.0s
[Pipeline] ........ (step 2 of 2) Processing text_tfidf, total= 3.0min
[Pipeline] . (step 1 of 2) Processing cat_text_selector, total=   0.0s
[Pipeline]  (step 2 of 2) Processing cat_text_transformer, total=   0.1s
CPU times: user 4min 15s, sys: 1.79 s, total: 4min 17s
Wall time: 4min 17s


In [228]:
%%time
# Preprocess test data
test_processed = full_lemma_keyword_pipeline.transform(test_df)

CPU times: user 1min 50s, sys: 683 ms, total: 1min 51s
Wall time: 1min 51s


In [201]:
# Save training and test numpy arrays
np.save('raw_keyword_categorical_X_train', X_train_processed)
np.save('raw_keyword_categorical_y_train', y_train)
np.save('raw_keyword_categorical_test_processed', test_processed)

In [229]:
from scipy import sparse
np.save('lemma_keyword_categorical_X_train_csr', sparse.csr_matrix(X_train_processed))
np.save('lemma_keyword_categorical_y_train', y_train)
# np.save('raw_keyword_categorical_y_train_csr', sparse.csr_matrix(y_train)) # Don't save as a sparse matrix, else you will need to reshape it later for training
np.save('lemma_keyword_categorical_test_processed_csr', sparse.csr_matrix(test_processed))

# Saving processed data
- Save the output of the transform pipelines to save memory
- Can save it raw (very large)
- Or save as a sparse matrix
- Do not save the target labels as a sparse as you'll have to reshape it from (1, n) to (n, ) later, and the space savings is probably very small
- Note that there were some problems reading the sparse matrix into some sklearn models at training
    - Will need to look into this problem more

In [121]:
%%time
lrcv =  LogisticRegressionCV(cv=10, 
                             max_iter = 4000, # Try 4000...
                             random_state=42, 
                             n_jobs=-1,
                             scoring = 'f1',
                            )
lrcv.fit(X_train_processed, y_train)

CPU times: user 8min 17s, sys: 31 s, total: 8min 48s
Wall time: 38min 22s


LogisticRegressionCV(Cs=10, class_weight=None, cv=10, dual=False,
                     fit_intercept=True, intercept_scaling=1.0, l1_ratios=None,
                     max_iter=4000, multi_class='auto', n_jobs=-1, penalty='l2',
                     random_state=42, refit=True, scoring='f1', solver='lbfgs',
                     tol=0.0001, verbose=0)

In [23]:
# #The full pipeline as a step in another pipeline with an estimator as the final step
# full_pipeline_m = Pipeline(steps = [
#     ('full_pipeline', full_pipeline),
#     ('model', LogisticRegressionCV(cv=5, random_state=42, n_jobs=-1)) 
# ])

# #Can call fit on it just like any other pipeline
# full_pipeline_m.fit(X_train, y_train)

In [123]:
X_test = train_df.copy().sample(1000, random_state=42)
y_test = X_test.pop('target').values

In [124]:
# Preprocess test data
X_test_processed = full_pipeline.transform(X_test)

In [125]:
X_test_processed.shape

(1000, 23447)

In [126]:
X_train_processed.shape

(7613, 23447)

In [128]:
%%time
# Predict
predicted = lrcv.predict(X_test_processed) 

CPU times: user 58.3 ms, sys: 8.12 ms, total: 66.5 ms
Wall time: 39.1 ms


In [55]:
# full_pipeline_m.get_params()

In [56]:
# # %%time
# # # Predicting with a test dataset
# predicted = pipe.predict(X_test)

# # Model Accuracy
print("Logistic Regression Accuracy:",metrics.accuracy_score(y_test, predicted))
print("Logistic Regression Precision:",metrics.precision_score(y_test, predicted))
print("Logistic Regression Recall:",metrics.recall_score(y_test, predicted))

Logistic Regression Accuracy: 0.851
Logistic Regression Precision: 0.9287925696594427
Logistic Regression Recall: 0.704225352112676


# LR with tfidf, upper, lower text
Logistic Regression Accuracy: 0.851  
Logistic Regression Precision: 0.9287925696594427  
Logistic Regression Recall: 0.704225352112676  



# Same as above but using the keyword column as a categorical feature

In [50]:
%%time
# # # Predicting with a test dataset
# predicted = pipe.predict(X_test)

# # Model Accuracy
print("Logistic Regression Accuracy:",metrics.accuracy_score(y_test, predicted))
print("Logistic Regression Precision:",metrics.precision_score(y_test, predicted))
print("Logistic Regression Recall:",metrics.recall_score(y_test, predicted))

Logistic Regression Accuracy: 0.925
Logistic Regression Precision: 0.9511568123393316
Logistic Regression Recall: 0.8685446009389671
CPU times: user 2.08 ms, sys: 0 ns, total: 2.08 ms
Wall time: 1.99 ms


# Bumped up the LRCV iterations to 4000 due to non-convergence at iterations=100

In [53]:
%%time
# # # Predicting with a test dataset
# predicted = pipe.predict(X_test)

# # Model Accuracy
print("Logistic Regression Accuracy:",metrics.accuracy_score(y_test, predicted))
print("Logistic Regression Precision:",metrics.precision_score(y_test, predicted))
print("Logistic Regression Recall:",metrics.recall_score(y_test, predicted))

Logistic Regression Accuracy: 0.925
Logistic Regression Precision: 0.9511568123393316
Logistic Regression Recall: 0.8685446009389671
CPU times: user 2.86 ms, sys: 81 µs, total: 2.94 ms
Wall time: 2.41 ms


In [98]:
%%time
# # # Predicting with a test dataset
# predicted = pipe.predict(X_test)

# # Model Accuracy
print("Logistic Regression Accuracy:",metrics.accuracy_score(y_test, predicted))
print("Logistic Regression Precision:",metrics.precision_score(y_test, predicted))
print("Logistic Regression Recall:",metrics.recall_score(y_test, predicted))

Logistic Regression Accuracy: 0.991
Logistic Regression Precision: 0.995249406175772
Logistic Regression Recall: 0.9835680751173709
CPU times: user 4.16 ms, sys: 8 µs, total: 4.17 ms
Wall time: 3.58 ms


In [100]:
metrics.f1_score(y_test,predicted)

0.9893742621015348

# Upped cv to 10 using additional feature of num_hashtags in text
# Model training takes under 40 mins

In [129]:
%%time
# # # Predicting with a test dataset
# predicted = pipe.predict(X_test)

# # Model Accuracy
print("Logistic Regression Accuracy:",metrics.accuracy_score(y_test, predicted))
print("Logistic Regression Precision:",metrics.precision_score(y_test, predicted))
print("Logistic Regression Recall:",metrics.recall_score(y_test, predicted))

Logistic Regression Accuracy: 0.998
Logistic Regression Precision: 1.0
Logistic Regression Recall: 0.9953051643192489
CPU times: user 19.3 ms, sys: 4.17 ms, total: 23.4 ms
Wall time: 616 ms


In [130]:
metrics.f1_score(y_test,predicted)

0.9976470588235293

In [136]:
for key, vals in lrcv.scores_.items():
    for idx, val in enumerate(vals):
        print(idx)
        print(val)
        print()

0
[0.         0.02898551 0.06197183 0.23173804 0.5950096  0.63829787
 0.61848739 0.61435726 0.61258278 0.61083744]

1
[0.         0.         0.01204819 0.13736264 0.27697842 0.26367461
 0.27530364 0.28726287 0.28922237 0.2946794 ]

2
[0.         0.01201201 0.02292264 0.10383747 0.34726688 0.36936937
 0.3826087  0.38205499 0.37845706 0.38205499]

3
[0.         0.01215805 0.04733728 0.08866995 0.21463415 0.2124431
 0.22288262 0.21791045 0.22056632 0.21791045]

4
[0.         0.         0.03478261 0.11160714 0.33063209 0.38818565
 0.40540541 0.41644562 0.42272127 0.42687747]

5
[0.         0.03003003 0.06395349 0.16091954 0.37873754 0.44216691
 0.44219653 0.43804035 0.44189383 0.43965517]

6
[0.         0.03003003 0.05763689 0.06970509 0.33397313 0.39092496
 0.40734558 0.44732577 0.45088567 0.45980707]

7
[0.         0.01796407 0.03529412 0.16284987 0.40618956 0.4020979
 0.3965812  0.39255499 0.39261745 0.3919598 ]

8
[0.         0.02967359 0.03488372 0.1010101  0.5347432  0.56363636
 0.55

In [132]:
lrcv.classes_

array([0, 1])

In [139]:
lrcv.scores_[1].mean(axis=0).max()

0.43945727482425345

In [140]:
lrcv.scores_[1]

array([[0.        , 0.02898551, 0.06197183, 0.23173804, 0.5950096 ,
        0.63829787, 0.61848739, 0.61435726, 0.61258278, 0.61083744],
       [0.        , 0.        , 0.01204819, 0.13736264, 0.27697842,
        0.26367461, 0.27530364, 0.28726287, 0.28922237, 0.2946794 ],
       [0.        , 0.01201201, 0.02292264, 0.10383747, 0.34726688,
        0.36936937, 0.3826087 , 0.38205499, 0.37845706, 0.38205499],
       [0.        , 0.01215805, 0.04733728, 0.08866995, 0.21463415,
        0.2124431 , 0.22288262, 0.21791045, 0.22056632, 0.21791045],
       [0.        , 0.        , 0.03478261, 0.11160714, 0.33063209,
        0.38818565, 0.40540541, 0.41644562, 0.42272127, 0.42687747],
       [0.        , 0.03003003, 0.06395349, 0.16091954, 0.37873754,
        0.44216691, 0.44219653, 0.43804035, 0.44189383, 0.43965517],
       [0.        , 0.03003003, 0.05763689, 0.06970509, 0.33397313,
        0.39092496, 0.40734558, 0.44732577, 0.45088567, 0.45980707],
       [0.        , 0.01796407, 0.0352941

In [145]:
lrcv.scores_[1].mean(axis=0)

array([0.        , 0.01729745, 0.04124974, 0.14423986, 0.40620215,
       0.43184158, 0.43430521, 0.43732334, 0.43764998, 0.43945727])

In [143]:
lrcv.scores_[1][0].mean()

0.4012267719708273

In [None]:
print ('Max auc_roc:', searchCV.scores_[1].mean(axis=0).max())

In [148]:
test_df = pd.read_csv('data/test.csv')
test_df.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [149]:
%%time
# Preprocess test data
test_processed = full_pipeline.transform(test_df)

CPU times: user 43.8 s, sys: 743 ms, total: 44.6 s
Wall time: 48.4 s


In [151]:
test_predictions = lrcv.predict(test_processed)

In [152]:
test_predictions

array([0, 1, 1, ..., 1, 1, 0])

In [153]:
test_id = test_df['id']

In [157]:
test_predictions_df = pd.DataFrame([test_id, test_predictions]).T
test_predictions_df.columns = ['id', 'target']

In [159]:
test_predictions_df.head()

Unnamed: 0,id,target
0,0,0
1,2,1
2,3,1
3,9,1
4,11,1


In [161]:
test_predictions_df.to_csv('test_preds.csv', index=False)