In [58]:
import numpy as np 
import pandas as pd

import regex as re

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.model_selection import train_test_split#, cross_val_score
from sklearn import metrics, preprocessing

import spacy
from spacy.tokenizer import Tokenizer
from spacy.lang.en import English
from spacy.matcher import Matcher

from joblib import dump, load

In [16]:
spacy.prefer_gpu()

True

In [17]:
# !python -m spacy download en_core_web_sm

# nlp = spacy.load("en_core_web_sm")
nlp = English()

In [18]:
# Load English tokenizer, tagger, parser, NER and word vectors
parser = English()

In [19]:
# Create a blank Tokenizer with just the English vocab
tokenizer = Tokenizer(nlp.vocab)

In [20]:
stop_words = spacy.lang.en.stop_words.STOP_WORDS

In [21]:
# Create matcher for hashtags
matcher = Matcher(nlp.vocab)
matcher.add('HASHTAG', None, [{'ORTH': '#'}, {'IS_ASCII': True}])

In [22]:
# Custom Transformer that extracts columns passed as argument to its constructor 
class FeatureSelector( BaseEstimator, TransformerMixin ):
    # Class Constructor 
    def __init__(self, feature_names):
        self.feature_names = feature_names 
    
    # Return self nothing else to do here    
    def fit(self, X, y = None):
        return self 
    
    # Method that describes what we need this transformer to do
    # This one pulls up the list of feature columns you pass in and returns just those columns
    def transform(self, X, y = None):
        return X[self.feature_names] 

In [43]:
# Custom transformer that takes in a string and returns new categorical features
class CategoricalTextTransformer(BaseEstimator, TransformerMixin):
    # Class constructor method that takes in a list of values as its argument
    def __init__(self):
        self.hashtag_pattern = re.compile("(?:^|\s)[＃#]{1}(\w+)", re.UNICODE)
        
        
    # Return self nothing else to do here
    def fit(self, X, y = None):
        return self
    
    
    # Test helper func to just return the text in all lower case
    def is_lower(self, obj):
        if obj.islower():
            return 1
        else:
            return 0
    
    
    def is_upper(self, obj):
        if obj.isupper():
            return 1
        else:
            return 0

                
    # Transformer method to take in strings from a dataframe and return some extra features
    def transform(self, X , y = None):
        # Copy the incoming df to prevent setting on copy errors
        X = X.copy()
        
        # Return binary indicator of whether tweet is all lowercase
        X['is_lower'] = X['text'].apply(self.is_lower)
        
        # Return binary indicator of whether tweet is all uppercase
        X['is_upper'] = X['text'].apply(self.is_upper)
    
        # Drop original text col
        # The only thing remaining now will be the lowercased text
        X = X.drop('text', axis=1)
        
        # returns numpy array
        return X.values 
    
    
    # Transformer method to take in strings from a dataframe and return some extra features
    def fit_transform(self, X , y = None):
        # Copy the incoming df to prevent setting on copy errors
        X = X.copy()
        
        # Return binary indicator of whether tweet is all lowercase
        X['is_lower'] = X['text'].apply(self.is_lower)
        
        # Return binary indicator of whether tweet is all uppercase
        X['is_upper'] = X['text'].apply(self.is_upper)
        
        # Drop original text col
        # The only thing remaining now will be the lowercased text
        X = X.drop('text', axis=1)
        
        # returns numpy array
        return X.values 
    
    

In [44]:
# Custom transformer processes the keyword feature as a categorical
class CategoricalTransformer(BaseEstimator, TransformerMixin):
    # Class constructor method that takes in a list of values as its argument
    def __init__(self):
        self.ohe_model = preprocessing.OneHotEncoder(handle_unknown='error',
                                         drop='first',
                                         sparse=False)

        
    # Return self nothing else to do here
    def fit(self, X, y = None):
        return self
    

    # Transformer method to take in strings from a dataframe and return some extra features
    def transform(self, X , y = None):
        # Copy the incoming df to prevent setting on copy errors
        X = X.copy()
    
        # Fill NaNs with "None"
        # Missing values will cause the one-hot encoding to fail
        X = X.fillna("none")
        
        X = self.ohe_model.transform(X)
        
        return X
        
        
    # Transformer method to take in strings from a dataframe and return some extra features
    def fit_transform(self, X , y = None):
        # Copy the incoming df to prevent setting on copy errors
        X = X.copy()
        
        # Fill NaNs with "None"
        # Missing values will cause the one-hot encoding to fail
        X = X.fillna("none")
        
        X = self.ohe_model.fit_transform(X)
        

        return X 

In [45]:
class DenseTfidfVectorizer(TfidfVectorizer):
    def __init__(self):
        self.tfidf_model = TfidfVectorizer(tokenizer=self.spacy_tokenizer)
        
    def spacy_tokenizer(self, obj):
        doc = nlp(obj)

        # Looks for hashtags
        matches = matcher(doc)
        spans = []
        for match_id, start, end in matches:
            spans.append(doc[start:end])

        for span in spans:
            span.merge()

        return [t.text.lower() for t in doc if t not in stop_words and not t.is_punct | t.is_space]
        
    def transform(self, raw_documents):
        X = self.tfidf_model.transform(raw_documents['text'])

        return X.toarray() # Changes the scipy sparse array to a numpy matrix

    
    def fit_transform(self, raw_documents, y=None):
        X = self.tfidf_model.fit_transform(raw_documents['text'], y=y)

        return X.toarray()

In [46]:
# Categorical text features
cat_text_features = ['text']

# Text features for text pipeline
text_features = ['text']

# Define categorical pipeline
cat_text_pipeline = Pipeline(
    steps = [('cat_text_selector', FeatureSelector(cat_text_features)),
             ('cat_text_transformer', CategoricalTextTransformer()),
            ],
    verbose = True
)

# Define the text training pipeline
text_pipeline = Pipeline(
    steps = [('text_selector', FeatureSelector(text_features)),
             ('text_tfidf', DenseTfidfVectorizer())
            ],
    verbose = True
)

# Model 1
- tf-idf
- text is upper
- text is lower

In [47]:
# Combine all our pipelines into a single one inside the FeatureUnion object
# Right now we only have one pipeline which is our text one
full_pipeline = FeatureUnion(
    transformer_list=[
        ('text_pipeline', text_pipeline),
        ('cat_text_pipeline', cat_text_pipeline),
                     ]
)

In [48]:
train_df = pd.read_csv("data/train.csv")

In [49]:
X_train = train_df.copy()
y_train = X_train.pop('target').values

In [50]:
%%time
# Process text and categorical features
X_train_processed = full_pipeline.fit_transform(X_train)

[Pipeline] ..... (step 1 of 2) Processing text_selector, total=   0.0s
[Pipeline] ........ (step 2 of 2) Processing text_tfidf, total= 1.6min
[Pipeline] . (step 1 of 2) Processing cat_text_selector, total=   0.0s
[Pipeline]  (step 2 of 2) Processing cat_text_transformer, total=   0.0s
CPU times: user 1min 33s, sys: 1.43 s, total: 1min 34s
Wall time: 1min 34s


In [51]:
%%time
lrcv =  LogisticRegressionCV(cv=10, 
                             max_iter = 4000,
                             random_state=42, 
                             n_jobs=-1,
                             scoring = 'f1',
                            )
lrcv.fit(X_train_processed, y_train)

CPU times: user 33.2 s, sys: 1.05 s, total: 34.2 s
Wall time: 15min 52s


LogisticRegressionCV(Cs=10, class_weight=None, cv=10, dual=False,
                     fit_intercept=True, intercept_scaling=1.0, l1_ratios=None,
                     max_iter=4000, multi_class='auto', n_jobs=-1, penalty='l2',
                     random_state=42, refit=True, scoring='f1', solver='lbfgs',
                     tol=0.0001, verbose=0)

In [52]:
lrcv.scores_[1].mean(axis=0).max()

0.6239874682916249

In [59]:
# Save the model to disk
dump(lrcv, 'saved_models/model_01.joblib') 

# # To load it later:
# model_01 = load('saved_models/model_01.joblib') 

['saved_models/model_01.joblib']

# Get test predictions for kaggle scoring

In [62]:
%%time
test_df = pd.read_csv('data/test.csv')

# Preprocess test data
test_processed = full_pipeline.transform(test_df)

test_predictions = lrcv.predict(test_processed)
test_id = test_df['id']
test_predictions_df = pd.DataFrame([test_id, test_predictions]).T
test_predictions_df.columns = ['id', 'target']
test_predictions_df.to_csv('test_preds.csv', index=False)

CPU times: user 42.3 s, sys: 720 ms, total: 43 s
Wall time: 42.6 s


## Create dict for model results

In [63]:
model_results = dict()
model_results['model_01'] = {'best mean kfold score' : lrcv.scores_[1].mean(axis=0).max(), 
                             'kaggle submission score' : 0.80777
                            }

In [64]:
model_results

{'model_01': {'best mean kfold score': 0.6239874682916249,
  'kaggle submission score': 0.80777}}

# Model 2
- tf-idf
- text is upper
- text is lower
- include hashtag counts
- include keywords as categorical

In [68]:
# Custom transformer that takes in a string and returns new categorical features
class CategoricalTextTransformer(BaseEstimator, TransformerMixin):
    # Class constructor method that takes in a list of values as its argument
    def __init__(self):
        self.hashtag_pattern = re.compile("(?:^|\s)[＃#]{1}(\w+)", re.UNICODE)
        
        
    # Return self nothing else to do here
    def fit(self, X, y = None):
        return self
    
    
    # Test helper func to just return the text in all lower case
    def is_lower(self, obj):
        if obj.islower():
            return 1
        else:
            return 0
    
    
    def is_upper(self, obj):
        if obj.isupper():
            return 1
        else:
            return 0


    def count_hashtags(self, obj):
        hashtag_count = len(re.findall(self.hashtag_pattern, obj))
        return hashtag_count
        
        
    # Transformer method to take in strings from a dataframe and return some extra features
    def transform(self, X , y = None):
        # Copy the incoming df to prevent setting on copy errors
        X = X.copy()
        
        # Return binary indicator of whether tweet is all lowercase
        X['is_lower'] = X['text'].apply(self.is_lower)
        
        # Return binary indicator of whether tweet is all uppercase
        X['is_upper'] = X['text'].apply(self.is_upper)
    
        # Count the number of hashtags in the text
        X['hashtag_count'] = X['text'].apply(self.count_hashtags)
    
        # Drop original text col
        # The only thing remaining now will be the lowercased text
        X = X.drop('text', axis=1)
        
        # returns numpy array
        return X.values 
    
    
    # Transformer method to take in strings from a dataframe and return some extra features
    def fit_transform(self, X , y = None):
        # Copy the incoming df to prevent setting on copy errors
        X = X.copy()
        
        # Return binary indicator of whether tweet is all lowercase
        X['is_lower'] = X['text'].apply(self.is_lower)
        
        # Return binary indicator of whether tweet is all uppercase
        X['is_upper'] = X['text'].apply(self.is_upper)
        
        # Count the number of hashtags in the text
        X['hashtag_count'] = X['text'].apply(self.count_hashtags)
        
        # Drop original text col
        # The only thing remaining now will be the lowercased text
        X = X.drop('text', axis=1)
        
        # returns numpy array
        return X.values 
    
    

In [69]:
# Categorical text features
cat_text_features = ['text']

# Text features for text pipeline
text_features = ['text']

# Categorical features for text pipeline
cat_features = ['keyword']

# Define categorical pipeline
cat_text_pipeline = Pipeline(
    steps = [('cat_text_selector', FeatureSelector(cat_text_features)),
             ('cat_text_transformer', CategoricalTextTransformer()),
            ],
    verbose = True
)

# Define the text training pipeline
text_pipeline = Pipeline(
    steps = [('text_selector', FeatureSelector(text_features)),
#              ('text_transformer', TextTokenizerTransformer()),
             ('text_tfidf', DenseTfidfVectorizer())
            ],
    verbose = True
)

# Define the keyword categorical training pipeline
cat_pipeline = Pipeline(
    steps = [('cat_selector', FeatureSelector(cat_features)),
             ('cat_transformer', CategoricalTransformer())
            ],
    verbose = True
)

In [70]:
# Combine all our pipelines into a single one inside the FeatureUnion object
# Right now we only have one pipeline which is our text one
full_pipeline = FeatureUnion(
    transformer_list=[
        ('cat_pipeline', cat_pipeline),
        ('text_pipeline', text_pipeline),
        ('cat_text_pipeline', cat_text_pipeline),
                     ]
)

In [71]:
%%time
# Process text and categorical features
X_train_processed = full_pipeline.fit_transform(X_train)

lrcv02 =  LogisticRegressionCV(cv=10, 
                             max_iter = 4000,
                             random_state=42, 
                             n_jobs=-1,
                             scoring = 'f1',
                            )

lrcv02.fit(X_train_processed, y_train)

[Pipeline] ...... (step 1 of 2) Processing cat_selector, total=   0.0s
[Pipeline] ... (step 2 of 2) Processing cat_transformer, total=   0.0s
[Pipeline] ..... (step 1 of 2) Processing text_selector, total=   0.0s
[Pipeline] ........ (step 2 of 2) Processing text_tfidf, total= 1.6min
[Pipeline] . (step 1 of 2) Processing cat_text_selector, total=   0.0s
[Pipeline]  (step 2 of 2) Processing cat_text_transformer, total=   0.1s
CPU times: user 10min 20s, sys: 7.08 s, total: 10min 27s
Wall time: 31min 40s


LogisticRegressionCV(Cs=10, class_weight=None, cv=10, dual=False,
                     fit_intercept=True, intercept_scaling=1.0, l1_ratios=None,
                     max_iter=4000, multi_class='auto', n_jobs=-1, penalty='l2',
                     random_state=42, refit=True, scoring='f1', solver='lbfgs',
                     tol=0.0001, verbose=0)

In [72]:
lrcv02.scores_[1].mean(axis=0).max()

# Save the model to disk
dump(lrcv02, 'saved_models/model_02.joblib') 

# # To load it later:
# model_02 = load('saved_models/model_02.joblib') 

['saved_models/model_02.joblib']

In [73]:
%%time
# Preprocess test data
test_processed = full_pipeline.transform(test_df)

test_predictions = lrcv02.predict(test_processed)
test_id = test_df['id']
test_predictions_df = pd.DataFrame([test_id, test_predictions]).T
test_predictions_df.columns = ['id', 'target']
test_predictions_df.to_csv('test_preds_02.csv', index=False)

CPU times: user 41.5 s, sys: 820 ms, total: 42.4 s
Wall time: 42 s


In [74]:
model_results['model_02'] = {'best mean kfold score' : lrcv02.scores_[1].mean(axis=0).max(), 
                             'kaggle submission score' : 0.79243
                            }
model_results

{'model_01': {'best mean kfold score': 0.6239874682916249,
  'kaggle submission score': 0.80777},
 'model_02': {'best mean kfold score': 0.43945727482425345,
  'kaggle submission score': 0.79243}}

# Model 3
- Only TF-IDF on tweet

In [75]:
# Text features for text pipeline
text_features = ['text']

# Define the text training pipeline
text_pipeline = Pipeline(
    steps = [('text_selector', FeatureSelector(text_features)),
             ('text_tfidf', DenseTfidfVectorizer())
            ],
    verbose = True
)

# Combine all our pipelines into a single one inside the FeatureUnion object
# Right now we only have one pipeline which is our text one
full_pipeline = FeatureUnion(
    transformer_list=[
        ('text_pipeline', text_pipeline),
                     ]
)

In [76]:
%%time
# Process text and categorical features
X_train_processed = full_pipeline.fit_transform(X_train)

lrcv03 =  LogisticRegressionCV(cv=10, 
                             max_iter = 4000,
                             random_state=42, 
                             n_jobs=-1,
                             scoring = 'f1',
                            )

lrcv03.fit(X_train_processed, y_train)

lrcv03.scores_[1].mean(axis=0).max()

# Save the model to disk
dump(lrcv03, 'saved_models/model_03.joblib') 

# # To load it later:
# model_03 = load('saved_models/model_03.joblib') 

[Pipeline] ..... (step 1 of 2) Processing text_selector, total=   0.0s
[Pipeline] ........ (step 2 of 2) Processing text_tfidf, total= 1.6min
CPU times: user 2min 17s, sys: 2.45 s, total: 2min 20s
Wall time: 16min 14s


['saved_models/model_03.joblib']

In [78]:
%%time
# Preprocess test data
test_processed = full_pipeline.transform(test_df)

test_predictions = lrcv03.predict(test_processed)
test_id = test_df['id']
test_predictions_df = pd.DataFrame([test_id, test_predictions]).T
test_predictions_df.columns = ['id', 'target']
test_predictions_df.to_csv('test_preds_03.csv', index=False)

CPU times: user 42.9 s, sys: 788 ms, total: 43.7 s
Wall time: 43.6 s


In [80]:
model_results['model_03'] = {'best mean kfold score' : lrcv03.scores_[1].mean(axis=0).max(), 
                             'kaggle submission score' : 0.79959
                            }
model_results

{'model_01': {'best mean kfold score': 0.6239874682916249,
  'kaggle submission score': 0.80777},
 'model_02': {'best mean kfold score': 0.43945727482425345,
  'kaggle submission score': 0.79243},
 'model_03': {'best mean kfold score': 0.6228593210878816,
  'kaggle submission score': 0.79959}}

# Model 4
- TF-IDF
- text is upper
- text is lower
- keywords as categorical

In [84]:
# Custom transformer that takes in a string and returns new categorical features
class CategoricalTextTransformer(BaseEstimator, TransformerMixin):
    # Class constructor method that takes in a list of values as its argument
    def __init__(self):
        self.hashtag_pattern = re.compile("(?:^|\s)[＃#]{1}(\w+)", re.UNICODE)
        
        
    # Return self nothing else to do here
    def fit(self, X, y = None):
        return self
    
    
    # Test helper func to just return the text in all lower case
    def is_lower(self, obj):
        if obj.islower():
            return 1
        else:
            return 0
    
    
    def is_upper(self, obj):
        if obj.isupper():
            return 1
        else:
            return 0
        
        
    # Transformer method to take in strings from a dataframe and return some extra features
    def transform(self, X , y = None):
        # Copy the incoming df to prevent setting on copy errors
        X = X.copy()
        
        # Return binary indicator of whether tweet is all lowercase
        X['is_lower'] = X['text'].apply(self.is_lower)
        
        # Return binary indicator of whether tweet is all uppercase
        X['is_upper'] = X['text'].apply(self.is_upper)
    
        # Drop original text col
        # The only thing remaining now will be the lowercased text
        X = X.drop('text', axis=1)
        
        # returns numpy array
        return X.values 
    
    
    # Transformer method to take in strings from a dataframe and return some extra features
    def fit_transform(self, X , y = None):
        # Copy the incoming df to prevent setting on copy errors
        X = X.copy()
        
        # Return binary indicator of whether tweet is all lowercase
        X['is_lower'] = X['text'].apply(self.is_lower)
        
        # Return binary indicator of whether tweet is all uppercase
        X['is_upper'] = X['text'].apply(self.is_upper)
        
        # Drop original text col
        # The only thing remaining now will be the lowercased text
        X = X.drop('text', axis=1)
        
        # returns numpy array
        return X.values 
    
    

In [85]:
# Categorical text features
cat_text_features = ['text']

# Text features for text pipeline
text_features = ['text']

# Categorical features for text pipeline
cat_features = ['keyword']

# Define categorical pipeline
cat_text_pipeline = Pipeline(
    steps = [('cat_text_selector', FeatureSelector(cat_text_features)),
             ('cat_text_transformer', CategoricalTextTransformer()),
            ],
    verbose = True
)

# Define the text training pipeline
text_pipeline = Pipeline(
    steps = [('text_selector', FeatureSelector(text_features)),
             ('text_tfidf', DenseTfidfVectorizer())
            ],
    verbose = True
)

# Define the keyword categorical training pipeline
cat_pipeline = Pipeline(
    steps = [('cat_selector', FeatureSelector(cat_features)),
             ('cat_transformer', CategoricalTransformer())
            ],
    verbose = True
)

In [86]:
# Combine all our pipelines into a single one inside the FeatureUnion object
# Right now we only have one pipeline which is our text one
full_pipeline = FeatureUnion(
    transformer_list=[
        ('cat_pipeline', cat_pipeline),
        ('text_pipeline', text_pipeline),
        ('cat_text_pipeline', cat_text_pipeline),
                     ]
)

In [87]:
%%time
# Process text and categorical features
X_train_processed = full_pipeline.fit_transform(X_train)

lrcv04 =  LogisticRegressionCV(cv=10, 
                             max_iter = 4000,
                             random_state=42, 
                             n_jobs=-1,
                             scoring = 'f1',
                            )

lrcv04.fit(X_train_processed, y_train)

print(lrcv04.scores_[1].mean(axis=0).max())

# Save the model to disk
dump(lrcv04, 'saved_models/model_04.joblib') 

# # To load it later:
# model_03 = load('saved_models/model_03.joblib') 

[Pipeline] ...... (step 1 of 2) Processing cat_selector, total=   0.0s
[Pipeline] ... (step 2 of 2) Processing cat_transformer, total=   0.0s
[Pipeline] ..... (step 1 of 2) Processing text_selector, total=   0.0s
[Pipeline] ........ (step 2 of 2) Processing text_tfidf, total= 1.6min
[Pipeline] . (step 1 of 2) Processing cat_text_selector, total=   0.0s
[Pipeline]  (step 2 of 2) Processing cat_text_transformer, total=   0.0s
0.43962002382388465
CPU times: user 6min 53s, sys: 5.71 s, total: 6min 59s
Wall time: 19min 35s


['saved_models/model_04.joblib']

In [88]:
%%time
# Preprocess test data
test_processed = full_pipeline.transform(test_df)

test_predictions = lrcv04.predict(test_processed)
test_id = test_df['id']
test_predictions_df = pd.DataFrame([test_id, test_predictions]).T
test_predictions_df.columns = ['id', 'target']
test_predictions_df.to_csv('test_preds_04.csv', index=False)

CPU times: user 41.9 s, sys: 848 ms, total: 42.8 s
Wall time: 45.2 s


In [89]:
model_results['model_04'] = {'best mean kfold score' : lrcv04.scores_[1].mean(axis=0).max(), 
                             'kaggle submission score' : 0.79345
                            }
model_results

{'model_01': {'best mean kfold score': 0.6239874682916249,
  'kaggle submission score': 0.80777},
 'model_02': {'best mean kfold score': 0.43945727482425345,
  'kaggle submission score': 0.79243},
 'model_03': {'best mean kfold score': 0.6228593210878816,
  'kaggle submission score': 0.79959},
 'model_04': {'best mean kfold score': 0.43962002382388465,
  'kaggle submission score': 0.79345}}