# Sklearn FeatureUnion
- Use custom transformers for feature engineering
- Then merge the features horizontally for feeding into an ML classifier

## FeatureUnion & Pipelines with Pandas
https://towardsdatascience.com/custom-transformers-and-ml-data-pipelines-with-python-20ea2a7adb65

# Feature Engineering Plan

Based on previous data exploration, we'll start with the following:
- Drop location
- Convert keyword to a categorical
- Vectorize tweet text using TF-IDF
- Create categorical indicators from the text:
    - all capitalized
    - all lowercased
    - contains hashtags
    - contains a date
    - contains link
    - contains timestamp

In [1]:
import numpy as np 
import pandas as pd
import spacy
import regex as re
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.model_selection import train_test_split#, cross_val_score
from sklearn import metrics, preprocessing
from sklearn.decomposition import PCA

In [2]:
spacy.prefer_gpu()

True

In [3]:
from spacy.tokenizer import Tokenizer
from spacy.lang.en import English
from spacy.matcher import Matcher

In [4]:
# !python -m spacy download en_core_web_sm

# nlp = English() # This does not include certain features like lemmatization!

nlp = spacy.load("en_core_web_sm") # includes more features!

In [5]:
# # Load English tokenizer, tagger, parser, NER and word vectors
# parser = English()

In [6]:
# Create a blank Tokenizer with just the English vocab
tokenizer = Tokenizer(nlp.vocab)

In [7]:
stop_words = spacy.lang.en.stop_words.STOP_WORDS

In [8]:
# # Create matcher for hashtags
# matcher = Matcher(nlp.vocab)
# matcher.add('HASHTAG', None, [{'ORTH': '#'}, {'IS_ASCII': True}])

# Feature selector transformer
- Feed it the columns you want, and it returns a dataframe with just those features

In [9]:
# Custom Transformer that extracts columns passed as argument to its constructor 
class FeatureSelector(BaseEstimator, TransformerMixin):
    """Transformer that narrows its input down to a configured set of columns.

    Parameters
    ----------
    feature_names
        Column label(s) used to index the incoming object in ``transform``.
    """

    def __init__(self, feature_names):
        self.feature_names = feature_names

    def transform(self, X, y=None):
        """Index ``X`` with the configured column names and return the result."""
        wanted = self.feature_names
        return X[wanted]

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the instance unchanged."""
        return self

# Text processing transformer
- Take in tweet text
- Create features
    - contains hashtag
    - isupper
    - islower
    - has misspellings

# Create the text feature pipeline
- Takes in the tweet text and returns various meta features about it
- Does not tokenize or encode the text itself (taken care of in a separate pipeline)

In [10]:
# Custom transformer that takes in a string and returns new categorical features
class CategoricalTextTransformer(BaseEstimator, TransformerMixin):
    """Derive simple meta features from the ``text`` column of a DataFrame.

    The output always contains ``is_lower`` and ``is_upper`` indicators and,
    depending on the constructor flags, ``hashtag_count`` and
    ``user_handle_count``. The raw ``text`` column is dropped before the
    result is returned as a numpy array.

    Parameters
    ----------
    use_count_hashtags : bool, default True
        Add a column with the number of hashtags found in each text.
    use_count_user_handles : bool, default True
        Add a column with the number of ``@handle`` mentions in each text.
    """

    def __init__(self, use_count_hashtags=True, use_count_user_handles=True):
        # '#' (or the full-width '＃') at the start of the text or after
        # whitespace, followed by one or more word characters.
        self.hashtag_pattern = re.compile(r"(?:^|\s)[＃#]{1}(\w+)", re.UNICODE)
        # '@name' that is not glued to a preceding letter, digit, '-', '_' or '.'.
        self.user_handle_pattern = re.compile(
            r"(?<=^|(?<=[^a-zA-Z0-9-_\.]))@([A-Za-z]+[A-Za-z0-9-_]+)", re.UNICODE
        )
        self.use_count_hashtags = use_count_hashtags
        self.use_count_user_handles = use_count_user_handles

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the instance unchanged."""
        return self

    def is_lower(self, obj):
        """Return 1 when ``obj.islower()`` is true, otherwise 0."""
        return int(obj.islower())

    def is_upper(self, obj):
        """Return 1 when ``obj.isupper()`` is true, otherwise 0."""
        return int(obj.isupper())

    def count_hashtags(self, obj):
        """Return the number of hashtag matches in ``obj``."""
        return len(self.hashtag_pattern.findall(obj))

    def count_user_handles(self, obj):
        """Return the number of user-handle matches in ``obj``."""
        return len(self.user_handle_pattern.findall(obj))

    def transform(self, X, y=None):
        """Return the meta features for ``X['text']`` as a numpy array.

        Any other columns present in ``X`` are passed through unchanged.
        """
        # Work on a copy so the caller's frame is not modified
        X = X.copy()
        text = X['text']

        X['is_lower'] = text.apply(self.is_lower)
        X['is_upper'] = text.apply(self.is_upper)

        # Check the constructor flags here. The original code tested the bound
        # methods `self.count_hashtags` / `self.count_user_handles`, which are
        # always truthy, so the counts could never be switched off.
        if self.use_count_hashtags:
            X['hashtag_count'] = text.apply(self.count_hashtags)

        if self.use_count_user_handles:
            X['user_handle_count'] = text.apply(self.count_user_handles)

        # Drop the raw text; only the derived features remain
        X = X.drop('text', axis=1)

        return X.values

    def fit_transform(self, X, y=None):
        """Equivalent to ``fit(X, y).transform(X)``; fitting is a no-op."""
        return self.fit(X, y).transform(X)
    
    

In [11]:
# Custom transformer processes the keyword feature as a categorical
class CategoricalLemmatizedKeywordTransformer(BaseEstimator, TransformerMixin):
    """One-hot encode the lemmatized form of the ``keyword`` column.

    Missing keywords are replaced with the string ``"none"``, each keyword is
    lemmatized with the module-level spaCy pipeline ``nlp``, and the result is
    passed to a ``OneHotEncoder`` (first category dropped, dense output).
    Because the encoder is created with ``handle_unknown='error'``, ``transform``
    raises if it meets a lemma that was not present when the encoder was fitted.
    """

    def __init__(self):
        self.ohe_model = preprocessing.OneHotEncoder(handle_unknown='error',
                                         drop='first',
                                         sparse=False)

    def spacy_lemmatizer(self, obj):
        """Return the lemma of the first token spaCy finds in ``obj``."""
        doc = nlp(obj)

        # Nothing to lemmatize (e.g. empty string): hand the value back as-is
        # instead of failing on doc[0].
        if len(doc) == 0:
            return obj

        # There should only be one keyword (not removing %20 spaces)
        if len(doc) > 1:
            print('More than one token found; expecting single token')

        return doc[0].lemma_

    def _lemmatized_frame(self, X):
        """Return a one-column frame (``lemmatized_keyword``) built from ``X``."""
        # Work on a copy so the caller's frame is not modified
        X = X.copy()

        # Missing values would make the one-hot encoding fail
        X = X.fillna("none")

        # Keywords repeat across rows, so lemmatize each distinct value once
        # and map the result back rather than running spaCy on every row.
        keywords = X['keyword']
        lemma_by_keyword = {kw: self.spacy_lemmatizer(kw) for kw in keywords.unique()}
        X['lemmatized_keyword'] = keywords.map(lemma_by_keyword)

        return X[['lemmatized_keyword']]

    def fit(self, X, y=None):
        """Fit the one-hot encoder on the lemmatized keywords and return self.

        Previously this was a no-op, so ``fit`` followed by ``transform``
        failed because the encoder had never been fitted.
        """
        self.ohe_model.fit(self._lemmatized_frame(X))
        return self

    def transform(self, X, y=None):
        """Encode ``X`` with the already fitted one-hot encoder."""
        return self.ohe_model.transform(self._lemmatized_frame(X))

    def fit_transform(self, X, y=None):
        """Fit the encoder on ``X`` and return the encoded array.

        The keywords are lemmatized only once here, which is why this does not
        simply call ``fit`` and then ``transform``.
        """
        return self.ohe_model.fit_transform(self._lemmatized_frame(X))

In [12]:
# Custom transformer processes the keyword feature as a categorical
class CategoricalRawKeywordTransformer(BaseEstimator, TransformerMixin):
    """One-hot encode the incoming categorical column(s) as they are.

    Missing values are replaced with the string ``"none"`` and the frame is
    passed to a ``OneHotEncoder`` (first category dropped, dense output).
    Because the encoder is created with ``handle_unknown='error'``, ``transform``
    raises if it meets a category that was not present when the encoder was
    fitted.
    """

    def __init__(self):
        self.ohe_model = preprocessing.OneHotEncoder(handle_unknown='error',
                                         drop='first',
                                         sparse=False)

    def _filled(self, X):
        """Return a copy of ``X`` with missing values replaced by ``"none"``."""
        # Missing values would make the one-hot encoding fail; fillna returns
        # a new frame, so the caller's data is left untouched.
        return X.copy().fillna("none")

    def fit(self, X, y=None):
        """Fit the one-hot encoder on ``X`` and return self.

        Previously this was a no-op, so ``fit`` followed by ``transform``
        failed because the encoder had never been fitted.
        """
        self.ohe_model.fit(self._filled(X))
        return self

    def transform(self, X, y=None):
        """Encode ``X`` with the already fitted one-hot encoder."""
        return self.ohe_model.transform(self._filled(X))

    def fit_transform(self, X, y=None):
        """Fit the encoder on ``X`` and return the encoded array."""
        return self.ohe_model.fit_transform(self._filled(X))

In [124]:
class DenseTfidfVectorizer(TfidfVectorizer):
    """TF-IDF features for the ``text`` column, returned as a dense array.

    The actual vectorization is delegated to an internal ``TfidfVectorizer``
    (``self.tfidf_model``) that tokenizes with ``spacy_tokenizer``. When
    ``pca`` is true the dense TF-IDF matrix is standardized and reduced with
    PCA; the scaler and the PCA model are both fitted in ``fit_transform`` /
    ``fit`` and only applied in ``transform``.

    Parameters
    ----------
    pca : bool, default False
        Standardize the TF-IDF matrix and project it with PCA.
    pca_n : default None
        Passed to ``PCA(n_components=...)`` when ``pca`` is true.
    remove_hashtag : bool, default True
        Strip the ``#`` character from tokens (the tag text itself is kept).
    remove_user_handle : bool, default True
        Drop tokens that are ``@handle`` mentions.
    remove_stop_words : bool, default True
        Drop tokens found in the module-level ``stop_words`` set.
    """

    def __init__(self, pca=False, pca_n=None, remove_hashtag=True, remove_user_handle=True, remove_stop_words=True):
        self.tfidf_model = TfidfVectorizer(tokenizer=self.spacy_tokenizer)
        self.pca = pca
        self.pca_n = pca_n
        self.remove_stop_words = remove_stop_words
        self.remove_hashtag = remove_hashtag
        # Matches a '#' token followed by an ASCII token so they can be merged
        self.matcher = Matcher(nlp.vocab)
        self.matcher.add('HASHTAG', None, [{'ORTH': '#'}, {'IS_ASCII': True}])
        self.remove_user_handle = remove_user_handle
        # '@name' (including the '@') that is not glued to a preceding
        # letter, digit, '-', '_' or '.'.
        self.user_handle_pattern = re.compile(
            r"(?<=^|(?<=[^a-zA-Z0-9-_\.]))(@[A-Za-z]+[A-Za-z0-9-_]+)", re.UNICODE
        )

    def spacy_tokenizer(self, obj):
        """Tokenize ``obj`` with spaCy and return a list of lower-cased tokens.

        Punctuation and whitespace tokens are always dropped; stop words,
        user handles and the ``#`` prefix are removed according to the
        constructor flags.
        """
        doc = nlp(obj)

        # Merge each '#' + tag pair into a single token.
        # NOTE(review): Span.merge() and the three-argument Matcher.add() look
        # like spaCy v2 APIs — confirm the pinned spaCy version before upgrading.
        hashtag_spans = [doc[start:end] for _, start, end in self.matcher(doc)]
        for span in hashtag_spans:
            span.merge()

        # Tokens are lower-cased below, so the handles must be lower-cased too;
        # comparing in original case left handles such as '@SomeUser' in place.
        user_handles = {
            handle.lower() for handle in re.findall(self.user_handle_pattern, doc.text)
        }

        # Convert spacy tokens to a list of string tokens
        token_list = [t.text.lower() for t in doc if not (t.is_punct or t.is_space)]

        if self.remove_stop_words:
            token_list = [t for t in token_list if t not in stop_words]

        if self.remove_user_handle:
            token_list = [t for t in token_list if t not in user_handles]

        if self.remove_hashtag:
            token_list = [t.replace("#", "") for t in token_list]

        return token_list

    def fit(self, raw_documents, y=None):
        """Fit the wrapped TF-IDF model (and scaler/PCA if enabled); return self.

        Without this override the inherited ``TfidfVectorizer.fit`` would run
        and never touch ``self.tfidf_model``.
        """
        self.fit_transform(raw_documents, y=y)
        return self

    def transform(self, raw_documents):
        """Vectorize ``raw_documents['text']`` with the fitted models.

        Returns a dense array: the TF-IDF matrix itself, or its PCA projection
        when ``pca`` is true.
        """
        X = self.tfidf_model.transform(raw_documents['text'])

        if self.pca:
            # Reuse the scaler fitted on the training data. Re-fitting a new
            # scaler here would standardize the incoming data with its own
            # statistics and feed the PCA inputs on a different scale than it
            # was fitted on.
            X = self.scaler_model.transform(X.toarray())
            return self.pca_model.transform(X)

        # Convert the scipy sparse matrix to a dense numpy array
        return X.toarray()

    def fit_transform(self, raw_documents, y=None):
        """Fit on ``raw_documents['text']`` and return the dense features."""
        X = self.tfidf_model.fit_transform(raw_documents['text'], y=y)

        if self.pca:
            self.scaler_model = preprocessing.StandardScaler()
            self.pca_model = PCA(n_components=self.pca_n)

            # Scaler and PCA need a dense matrix; TF-IDF returns a sparse one
            X = self.scaler_model.fit_transform(X.toarray())
            return self.pca_model.fit_transform(X)

        return X.toarray()

In [None]:
# # Custom transformer that tokenizes text
# class TextTokenizerTransformer(BaseEstimator, TransformerMixin):
#     # Class constructor method that takes in a list of values as its argument
#     def __init__(self):
#         pass
        
        
#     # Return self nothing else to do here
#     def fit(self, X, y = None):
#         return self
    
    
#     def spacy_tokenizer(self, obj):
#         doc = nlp(obj)

#         # Looks for hashtags
#         matches = matcher(doc)
#         spans = []
#         for match_id, start, end in matches:
#             spans.append(doc[start:end])

#         for span in spans:
#             span.merge()

#         return [t.text.lower() for t in doc if t not in stop_words and not t.is_punct | t.is_space]


#     # Transformer method to take in strings from a dataframe and return some extra features
#     def transform(self, X , y = None):
#         # Copy the incoming df to prevent setting on copy errors
#         X = X.copy()
        
#         X['tokens'] = X['text'].apply(self.spacy_tokenizer)
        
#         X['tokens'] = " ".join(X['tokens'])
#         return X['tokens']
    
#     # Transformer method to take in strings from a dataframe and return some extra features
#     def fit_transform(self, X , y = None):
#         # Copy the incoming df to prevent setting on copy errors
#         X = X.copy()
        
#         X['tokens'] = X['text'].apply(self.spacy_tokenizer)
        
#         X['tokens'] = " ".join(X['tokens'])
#         return X['tokens']

In [None]:
# # Custom transformer that takes in a string and returns some features
# class TextTransformer(BaseEstimator, TransformerMixin):
#     # Class constructor method that takes in a list of values as its argument
#     def __init__(self):
#         pass
        
#     # Return self nothing else to do here
#     def fit(self, X, y = None):
#         return self
    
    
#     def spacy_tokenizer(self, obj):
#         doc = nlp(obj)

#         # Looks for hashtags
#         matches = matcher(doc)
#         spans = []
#         for match_id, start, end in matches:
#             spans.append(doc[start:end])

#         for span in spans:
#             span.merge()

#         return [t.text.lower() for t in doc if t not in stop_words and not t.is_punct | t.is_space]


#     # Transformer method to take in strings from a dataframe and return some extra features
#     def transform(self, X , y = None):
#         # Copy the incoming df to prevent setting on copy errors
#         X = X.copy()
        
#         # Embed text as a bag of words using tfidf
#         tfidf = TfidfVectorizer(tokenizer = self.spacy_tokenizer)
#         X = tfidf.fit_transform(X['text'])
        
#         # returns numpy array
#         return X

In [None]:
# Column groups consumed by the individual pipelines
cat_text_features = ['text']  # tweet text -> indicator / count features
text_features = ['text']      # tweet text -> TF-IDF features
cat_features = ['keyword']    # keyword -> one-hot features


def _selector_then(selector_name, columns, transformer_name, transformer):
    """Build a verbose two-step pipeline: column selection, then one transformer."""
    return Pipeline(
        steps=[
            (selector_name, FeatureSelector(columns)),
            (transformer_name, transformer),
        ],
        verbose=True,
    )


# Meta features derived from the tweet text
cat_text_pipeline = _selector_then(
    'cat_text_selector', cat_text_features,
    'cat_text_transformer', CategoricalTextTransformer(),
)

# TF-IDF representation of the tweet text
text_pipeline = _selector_then(
    'text_selector', text_features,
    'text_tfidf', DenseTfidfVectorizer(),
)

# Keyword one-hot encoded as-is
cat_raw_keyword_pipeline = _selector_then(
    'cat_selector', cat_features,
    'cat_transformer', CategoricalRawKeywordTransformer(),
)

# Keyword lemmatized first, then one-hot encoded
cat_lemma_keyword_pipeline = _selector_then(
    'cat_selector', cat_features,
    'cat_transformer', CategoricalLemmatizedKeywordTransformer(),
)

# TF-IDF representation of the tweet text, reduced to 50 PCA components
text_pca_50_pipeline = _selector_then(
    'text_selector', text_features,
    'text_tfidf', DenseTfidfVectorizer(pca=True, pca_n=50),
)

# #Defining the steps in the categorical pipeline 
# categorical_pipeline = Pipeline( steps = [ ( 'cat_selector', FeatureSelector(categorical_features) ),
                                  
#                                   ( 'cat_transformer', CategoricalTransformer() ), 
                                  
#                                   ( 'one_hot_encoder', OneHotEncoder( sparse = False ) ) ] )
    
# #Defining the steps in the numerical pipeline     
# numerical_pipeline = Pipeline( steps = [ ( 'num_selector', FeatureSelector(numerical_features) ),
                                  
#                                   ( 'num_transformer', NumericalTransformer() ),
                                  
#                                   ('imputer', SimpleImputer(strategy = 'median') ),
                                  
#                                   ( 'std_scaler', StandardScaler() ) ] )

# Create the full pipeline

In [None]:
# #Combining numerical and categorical piepline into one full big pipeline horizontally 
# #using FeatureUnion
# full_pipeline = FeatureUnion( transformer_list = [ ( 'categorical_pipeline', categorical_pipeline ), 
                                                  
#                                                   ( 'numerical_pipeline', numerical_pipeline ) ] )

# Each FeatureUnion concatenates three branches side by side: a keyword
# pipeline, a TF-IDF text pipeline and the text meta-feature pipeline.
def _keyword_text_union(keyword_pipeline, tfidf_pipeline):
    """Join a keyword pipeline, a TF-IDF pipeline and ``cat_text_pipeline``."""
    # The keyword branch is labelled 'cat_raw_keyword_pipeline' for the
    # lemmatized variant as well.
    branches = [
        ('cat_raw_keyword_pipeline', keyword_pipeline),
        ('text_pipeline', tfidf_pipeline),
        ('cat_text_pipeline', cat_text_pipeline),
    ]
    return FeatureUnion(transformer_list=branches)


# The same pipeline objects are shared between the unions below.
full_raw_keyword_pipeline = _keyword_text_union(cat_raw_keyword_pipeline, text_pipeline)

full_lemma_keyword_pipeline = _keyword_text_union(cat_lemma_keyword_pipeline, text_pipeline)

full_raw_keyword_pca_50_pipeline = _keyword_text_union(cat_raw_keyword_pipeline, text_pca_50_pipeline)

full_lemma_keyword_pca_50_pipeline = _keyword_text_union(cat_lemma_keyword_pipeline, text_pca_50_pipeline)

# Test pipeline

In [14]:
train_df = pd.read_csv("data/train.csv")

# Can't run the full pipeline with the training and the model
- The data transformation part of the pipeline that does TFIDF will return different number of features based on the data fed in

# Solution: Separate the feature pipeline from the model pipeline

training time: tfidf.fit_transform(X_train)
inference: tfidf.transform(X_test)

In [15]:
X_train = train_df.copy()
y_train = X_train.pop('target').values

test_df = pd.read_csv('data/test.csv')

# Create PCA'd (50 dims) training and test sets for the lemma and raw pipelines

In [None]:
%%time
# Fit each PCA-50 pipeline on the training data, apply it to the test data,
# and store both results under file names prefixed with the pipeline's name.
for name, pipeline in (
    ('full_raw_keyword_pca_50_pipeline', full_raw_keyword_pca_50_pipeline),
    ('full_lemma_keyword_pca_50_pipeline', full_lemma_keyword_pca_50_pipeline),
):
    X_train_processed = pipeline.fit_transform(X_train)
    test_processed = pipeline.transform(test_df)

    train_file = name + '_X_train'
    test_file = name + '_test_processed'
    np.save(train_file, X_train_processed)
    np.save(test_file, test_processed)

In [None]:
%%time
# Process text and categorical features
X_train_processed = full_lemma_keyword_pipeline.fit_transform(X_train)

In [None]:
%%time
# Preprocess test data
test_processed = full_lemma_keyword_pipeline.transform(test_df)

In [None]:
# Save training and test numpy arrays
# NOTE(review): the file names say 'raw_keyword', but the cells above build
# X_train_processed / test_processed with full_lemma_keyword_pipeline — confirm
# which pipeline produced the arrays in memory before relying on these names.
np.save('raw_keyword_categorical_X_train', X_train_processed)
np.save('raw_keyword_categorical_y_train', y_train)
np.save('raw_keyword_categorical_test_processed', test_processed)

In [None]:
# Save the processed features wrapped in scipy CSR matrices; labels stay dense.
# NOTE(review): np.save on a scipy sparse matrix presumably stores a pickled
# object array rather than a plain numeric array, which may be the cause of the
# loading problems mentioned in the notes below — consider
# scipy.sparse.save_npz / load_npz, and check every loader of these files first.
from scipy import sparse
np.save('lemma_keyword_categorical_X_train_csr', sparse.csr_matrix(X_train_processed))
np.save('lemma_keyword_categorical_y_train', y_train)
# np.save('raw_keyword_categorical_y_train_csr', sparse.csr_matrix(y_train)) # Don't save as a sparse matrix, else you will need to reshape it later for training
np.save('lemma_keyword_categorical_test_processed_csr', sparse.csr_matrix(test_processed))

# Saving processed data
- Save the output of the transform pipelines to save memory
- Can save it raw (very large)
- Or save as a sparse matrix
- Do not save the target labels as a sparse matrix: you would have to reshape them from (1, n) to (n,) later, and the space savings are probably very small
- Note that there were some problems reading the sparse matrix into some sklearn models at training
    - Will need to look into this problem more

In [None]:
%%time
lrcv =  LogisticRegressionCV(cv=10, 
                             max_iter = 4000, # Try 4000...
                             random_state=42, 
                             n_jobs=-1,
                             scoring = 'f1',
                            )
lrcv.fit(X_train_processed, y_train)

In [None]:
# #The full pipeline as a step in another pipeline with an estimator as the final step
# full_pipeline_m = Pipeline(steps = [
#     ('full_pipeline', full_pipeline),
#     ('model', LogisticRegressionCV(cv=5, random_state=42, n_jobs=-1)) 
# ])

# #Can call fit on it just like any other pipeline
# full_pipeline_m.fit(X_train, y_train)

In [None]:
# Draw a reproducible 1000-row evaluation sample and split off its labels.
# NOTE(review): the sample is taken from train_df, which is also the source of
# X_train, so these rows were seen during fitting — scores computed on them are
# not a held-out estimate.
X_test = train_df.copy().sample(1000, random_state=42)
y_test = X_test.pop('target').values

In [None]:
# Preprocess the evaluation sample with the fitted feature pipeline.
# `full_pipeline` is only defined in commented-out code, so calling it raised a
# NameError; full_lemma_keyword_pipeline is the pipeline fitted on X_train for
# the features the model is trained on.
X_test_processed = full_lemma_keyword_pipeline.transform(X_test)

In [None]:
X_test_processed.shape

In [None]:
X_train_processed.shape

In [None]:
%%time
# Predict
predicted = lrcv.predict(X_test_processed) 

In [None]:
# full_pipeline_m.get_params()

In [None]:
# # %%time
# # # Predicting with a test dataset
# predicted = pipe.predict(X_test)

# # Model Accuracy
print("Logistic Regression Accuracy:",metrics.accuracy_score(y_test, predicted))
print("Logistic Regression Precision:",metrics.precision_score(y_test, predicted))
print("Logistic Regression Recall:",metrics.recall_score(y_test, predicted))

# LR with tfidf, upper, lower text
Logistic Regression Accuracy: 0.851  
Logistic Regression Precision: 0.9287925696594427  
Logistic Regression Recall: 0.704225352112676  



# Same as above but using the keyword column as a categorical feature

In [None]:
%%time
# # # Predicting with a test dataset
# predicted = pipe.predict(X_test)

# # Model Accuracy
print("Logistic Regression Accuracy:",metrics.accuracy_score(y_test, predicted))
print("Logistic Regression Precision:",metrics.precision_score(y_test, predicted))
print("Logistic Regression Recall:",metrics.recall_score(y_test, predicted))

# Bumped up the LRCV iterations to 4000 due to non-convergence at iterations=100

In [None]:
%%time
# # # Predicting with a test dataset
# predicted = pipe.predict(X_test)

# # Model Accuracy
print("Logistic Regression Accuracy:",metrics.accuracy_score(y_test, predicted))
print("Logistic Regression Precision:",metrics.precision_score(y_test, predicted))
print("Logistic Regression Recall:",metrics.recall_score(y_test, predicted))

In [None]:
%%time
# # # Predicting with a test dataset
# predicted = pipe.predict(X_test)

# # Model Accuracy
print("Logistic Regression Accuracy:",metrics.accuracy_score(y_test, predicted))
print("Logistic Regression Precision:",metrics.precision_score(y_test, predicted))
print("Logistic Regression Recall:",metrics.recall_score(y_test, predicted))

In [None]:
metrics.f1_score(y_test,predicted)

# Upped cv to 10 using additional feature of num_hashtags in text
# Model training takes under 40 mins

In [None]:
%%time
# # # Predicting with a test dataset
# predicted = pipe.predict(X_test)

# # Model Accuracy
print("Logistic Regression Accuracy:",metrics.accuracy_score(y_test, predicted))
print("Logistic Regression Precision:",metrics.precision_score(y_test, predicted))
print("Logistic Regression Recall:",metrics.recall_score(y_test, predicted))

In [None]:
metrics.f1_score(y_test,predicted)

In [None]:
for key, vals in lrcv.scores_.items():
    for idx, val in enumerate(vals):
        print(idx)
        print(val)
        print()

In [None]:
lrcv.classes_

In [None]:
lrcv.scores_[1].mean(axis=0).max()

In [None]:
lrcv.scores_[1]

In [None]:
lrcv.scores_[1].mean(axis=0)

In [None]:
lrcv.scores_[1][0].mean()

In [None]:
# Best mean cross-validation score across the regularization grid.
# `searchCV` is not defined anywhere in this notebook; the fitted model is
# `lrcv`, and it was configured with scoring='f1', so the label says f1.
print('Max mean CV f1:', lrcv.scores_[1].mean(axis=0).max())

In [None]:
test_df = pd.read_csv('data/test.csv')
test_df.head()

In [None]:
%%time
# Preprocess the competition test data with the fitted feature pipeline.
# `full_pipeline` is only defined in commented-out code, so calling it raised a
# NameError; full_lemma_keyword_pipeline is the pipeline fitted on X_train for
# the features the model is trained on.
test_processed = full_lemma_keyword_pipeline.transform(test_df)

In [None]:
test_predictions = lrcv.predict(test_processed)

In [None]:
test_predictions

In [None]:
test_id = test_df['id']

In [None]:
test_predictions_df = pd.DataFrame([test_id, test_predictions]).T
test_predictions_df.columns = ['id', 'target']

In [None]:
test_predictions_df.head()

In [None]:
test_predictions_df.to_csv('test_preds.csv', index=False)