# Pipeline for Vectorizing and Training on Twitter Disaster Message Data
* twitter messages are vectorized using the spacy (tokenization )and gensim (tfidf) libraries
* steps include:
  * all strings to lowercase
  * tokenization
  * stopword removal
  * lemmatization
  * bag of words
  * tfidf
  * GloVe vectors
  * Count of the number of hashtags
  
 A custom scikit learn transformer is created for scikit learn pipelining. That way cross-validation can be done properly where the text corpus is made from training data rather than from training and cross-validation data.
 
 Multiple models were tried including:
 * Gaussian Naive Bayes
 * SVM with singular value decomposition to decrease the number of features
 * Decision Tree
 * Gradient Boosted Trees
 * Logistic Regression
 
Logistic Regression and Gradient Boosted Trees had similar performance with a Relevant recall of .73. Because Logistic Regression is simpler and faster to train, that was used for the final model.

In [167]:
import pandas as pd
import numpy as np
from collections import Counter
import re
import spacy
import gensim
from sklearn.preprocessing import MinMaxScaler
from sklearn.base import TransformerMixin
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression

In [160]:
df = pd.read_csv('socialmedia-disaster-tweets-DFE.csv', encoding='latin-1')
df = df[(df['choose_one'] == 'Relevant') | (df['choose_one'] == 'Not Relevant')]

In [161]:
df.shape

(10860, 13)

In [162]:
X = df.text
y = df.choose_one

In [163]:
X.head()

0                   Just happened a terrible car crash
1    Our Deeds are the Reason of this #earthquake M...
2    Heard about #earthquake is different cities, s...
3    there is a forest fire at spot pond, geese are...
4               Forest fire near La Ronge Sask. Canada
Name: text, dtype: object

In [29]:
# regex for replacing links and twitter handles
regex_link = re.compile("(?P<url>https?://[^\s]+)")
regex_handle = re.compile('(?<=^|(?<=[^a-zA-Z0-9-_\.]))@([A-Za-z]+[A-Za-z0-9-_]+)')

def regex_replace(text):
    """replace weblinks with the word webpage and twitter handles with the word username
    
    Args:
       text (str): a string with a Twitter message
        
    Returns:
        prepared_text (str): a string with weblinks and twitter handles replaced
    
    """

    prepared_text = regex_link.sub('webpage', text)
    prepared_text = regex_handle.sub('username', prepared_text)

    return prepared_text

In [6]:
# prepare text as vectors
nlp = spacy.load('en_core_web_lg')

# fixes the issue with the spacy library where stop words are not included with the model
for word in nlp.Defaults.stop_words:
    nlp.vocab[word].is_stop = True

In [7]:
# make word lowercase
def lowercase(text):
    """make string lowercase
    
    Args:
        text (str): a string
    
    Returns:
        text (str): a string with all lowercase letters
    """
    
    return text.lower()

In [8]:
# output tokens for tweet, GloVe vectors, and count of hashtags
def spacy_tokenize_glove(text):
    """prepares a Tweet for further processing
    
    Args:
        text (str): a string representing a Tweet
        
    Returns:
        tokenized_text (list(str)): a list of tokenized words
        doc_vector (list(float)): a vector of average GloVe for tokenizes in the text
        hashtag_counts (int): number of hashtags used in the Tweet
    
    """
    text = regex_replace(text)
    text = lowercase(text)

    doc = nlp(text)
    
    doc_vector = [] # holds the GloVe vector for each token as a list
    hashtag_counts = 0 # number of hashtags in the tweet
    tokenized_text = [] # tokenized text
    
    for token in doc:
        # only keep words that are not punctuation, space, or stop words
        if token.is_stop != True and token.is_punct != True and token.text.isspace() != True: 
            if token.lemma_ != '-PRON-': 
                tokenized_text.append(token.lemma_)
            else:
                tokenized_text.append(token) # keep pronouns in original form though most if not all are stop words
                
            doc_vector.append(token.vector) # append the vector fo the token, which will be averaged
        
        if token.text == '#':
            hashtag_counts += 1
            
    if len(doc_vector) > 0:
        doc_vector = np.mean(doc_vector, axis=0)
    else:
        doc_vector = np.array([0]*300) # for a tweet with no word embedding vector

    return tokenized_text, doc_vector, hashtag_counts

In [9]:
# create a corpus of tokenized words
def create_corpus(data):
    """creates a corpus of tokenized tweets, GloVe doc vectors, and counts of hashtags
    
    Args: 
        data (list(str)): list of unprocessed tweets
    
    Returns:
        corpus (list(list)): list of tokenized tweets
        vector_corpus (list(list)): list of GloVe vector for each tweet
        hashtag_corpus (list(int)): list of hashtag counts for each tweet
        frequency (dict): dictionary of frequency counts for words in corpus
    """
    frequency = Counter()
    corpus = []
    vector_corpus = []
    hashtag_corpus = []
    
    for tweet in data:
                
        tokenized_text, doc_vector, hashtag_counts = spacy_tokenize_glove(tweet)
        
        corpus.append(tokenized_text)
        vector_corpus.append(doc_vector)
        hashtag_corpus.append(hashtag_counts)
        
        for token in tokenized_text:
            frequency[token] += 1
    
    # removes tokens that only appear once in the corpus
    corpus = [[token for token in tweet if frequency[token] > 1] for tweet in corpus]
    
    return corpus, vector_corpus, hashtag_corpus, frequency

In [10]:
def output_tfidf_model(corpus):
    """outputs a tfidf model from a corpus
    
    Args:
        corpus (list(list)): list of tweets as tokens
        
    Returns:
        tfidf: tfidf model from gensim
        
    """
    dictionary = gensim.corpora.Dictionary(corpus)
    bow = [dictionary.doc2bow(text) for text in corpus]
    tfidf = gensim.models.TfidfModel(bow, normalize=True)
    
    return tfidf, bow, dictionary

In [11]:
def pipeline_new_text(text, corpus_dictionary, tfidf_model, frequency):
    """take a list of text messages and output features
    
        Args:
            text (list): list of text messages
            corpus_dictionary (dict): mapping of words to ids
            tfidf_model (tfidf): gensim tfidf model
            frequency (dict): word count of words in corpus
            
        Returns:
            tfidf_features (list): list of tfidf features for each message
            doc_vector (list): list of GloVe vectors for each message
            hashtag_counts (list): list of hashtag counts for each message
        
    """
    
    tokenized_text, doc_vector, hashtag_counts = spacy_tokenize_glove(text)

    # removes tokens that only appear once in the corpus
    tokenized_text = [token for token in tokenized_text if frequency[token] > 1]

    bow = dictionary.doc2bow(tokenized_text)

    tfidf_features = tfidf_model[bow]
    
    return tfidf_features, doc_vector, hashtag_counts

In [12]:
def prepare_train_features(tfidf_features, doc_vector, hashtag_counts, terms):
    """concatenate different features into a single vector and scale the hashtag counts
    
    Args:
        tfidf_features (list): list of tfidf features for each message
        doc_Vector (list): list of GloVe features for each message
        hashtag_counts (list): list of hashtag counts for each message
        terms (int): size of the corpus vocabulary
    
    Returns:
        features (list): concatenated features
        scaler (MinMaxScaler): scaler for scaling hashtags on prediction data
    """
    
    np_tfidf = gensim.matutils.corpus2dense(tfidf_features, num_terms = terms).T
    scaler = MinMaxScaler()
    hashtag_count = scaler.fit_transform([[num] for num in hashtag_counts])
    
    features = np.hstack((np_tfidf, hashtag_count, doc_vector))
    
    return features, scaler

In [13]:
def prepare_predict_features(tfidf_features, doc_vector, hashtag_counts, terms, scaler):
    """concatenate different features into a single vector and scale hashtag counts for single row
    of prediction data
    
    Args:
        tfidf_features (list): tfidf features message
        doc_Vector (list): GloVe features for message
        hashtag_counts (list): lhashtag counts message
        terms (int): size of the corpus vocabulary
        scaler (object): MinMaxScaler from training
    
    Returns:
        features (list): concatenated features
        scaler (MinMaxScaler): scaler for scaling hashtags on prediction data

    """
    
    np_tfidf = gensim.matutils.corpus2dense([tfidf_features], num_terms = terms).T
    hashtag_count = scaler.fit_transform([[hashtag_counts]])
    features = np.hstack((np_tfidf, hashtag_count, [doc_vector]))
    
    return features

In [14]:
def output_training_features(train_data):
    """runs messages through the entire pipeline to output training features 
    
    Args:
        train_data (list): list of messages 
        
    Returns:
        X_train (list): training features for each message
        tfidf_model (object): tfidf model to be used to vectorize prediction data
        scaler (object): MinMaxScaler for hashtags
        dictionary (dict): mappings of ids to words
        frequency (dict): frequency counts of words in corpus
    
    """
    corpus, vector_corpus, hashtag_corpus, frequency = create_corpus(X[0:200]) # create a corpus and other features
    tfidf_model, bow, dictionary = output_tfidf_model(corpus) # tfidf model and bag of words from corpus
    X_train, scaler = prepare_train_features(tfidf_model[bow], vector_corpus, hashtag_corpus, len(dictionary))
    
    return X_train, tfidf_model, scaler, dictionary, frequency

In [15]:
def output_test_features(text, dictionary, tfidf_model, frequency):
    """runs new messages through the pipeline using the training corpus
    
    Args:
        text (str): new text to predict on
        dictionary (dict): mapping of ids to words
        tfidf_model (object): model from the training corpus
        frequency (dict): frequency counts for words in the model
    
    Returns:
        features (list): features for the new text
    """
    
    tfidf_features, doc_vector, hashtag_counts = pipeline_new_text(text, dictionary, tfidf_model, frequency)
    features = prepare_predict_features(tfidf_features, doc_vector, hashtag_counts, len(dictionary), scaler)
    
    return features

In [93]:
X_train, tfidf_model, scaler, dictionary, frequency = output_training_features(X[0:400])
X_test = output_test_features(X[0], dictionary, tfidf_model, frequency)

[[ 4.74977612e-01  6.87750220e-01  5.48995376e-01  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000

In [67]:
class TextTransformer(TransformerMixin):
    
    def __init__(self):
        self.frequency = Counter()
    
    def transform(self, X):
        
        results = []
        for x in X:
            result = output_test_features(x, self.dictionary, self.tfidf_model, self.frequency)
            results.append(result[0])
            
        return results
        
    def fit(self, X, y=None):
        X_train, tfidf_model, scaler, dictionary, frequency = output_training_features(X)
        
        self.X_train = X_train
        self.tfidf_model = tfidf_model
        self.scaler = scaler
        self.dictionary = dictionary
        self.frequency = frequency
        
        return self

In [105]:
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import GaussianNB
from sklearn import metrics
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
from sklearn.decomposition import PCA
from sklearn.decomposition import TruncatedSVD
from sklearn.svm import SVC

In [125]:
textvectorizer = TextTransformer()
clf = GaussianNB()

gnb_pipeline = Pipeline([('vectorize', TextTransformer()), 
                           ('model', clf)])

model = transform_pipe.fit(X, y)

In [91]:
print(metrics.classification_report(y, model.predict(X)))

              precision    recall  f1-score   support

Can't Decide       0.00      1.00      0.01        16
Not Relevant       0.75      0.51      0.61      6187
    Relevant       0.77      0.44      0.56      4673

   micro avg       0.48      0.48      0.48     10876
   macro avg       0.51      0.65      0.39     10876
weighted avg       0.76      0.48      0.59     10876



In [104]:
# custom scoring function to improve relevant recall scores
def custom_score(y_true, y_pred): 
    tp = 0
    fn = 0
    
    y_true = list(y_true)
    y_pred = list(y_pred)
    for i, value in enumerate(y_true):

        if y_true[i] == y_pred[i] and y_true[i] == 'Relevant':
            tp += 1
        if y_true[i] == 'Relevant' and y_pred[i] != 'Relevant':
            fn += 1
    score = tp/(tp + fn + 1e-9)
    return score
 
svm = SVC(gamma='scale')
svm_pipeline = Pipeline([('vectorize', TextTransformer()), 
                      ('SVD', TruncatedSVD(n_components=20, random_state=42)),
                      ('model', svm)])
    
scoring = make_scorer(custom_score)

params = {'model__C':(1e-3,1,1e3)}

clf = GridSearchCV(svm_pipeline, scoring=scoring, param_grid=params, cv=3)

clf.fit(X, y)



GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('vectorize', <__main__.TextTransformer object at 0x1a49867eb8>), ('PCA', PCA(copy=True, iterated_power='auto', n_components=20, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)), ('model', SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False))]),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'model__C': (0.001, 1, 1000.0)},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=make_scorer(custom_score), verbose=0)

In [106]:
clf.best_params_

{'model__C': 1}

In [107]:
clf.cv_results_



{'mean_fit_time': array([143.98350922, 131.60852599, 149.64854868]),
 'std_fit_time': array([18.33198669,  1.24229908,  4.15146226]),
 'mean_score_time': array([66.80119189, 63.59841537, 59.44534731]),
 'std_score_time': array([4.84149559, 6.39863509, 1.38691445]),
 'param_model__C': masked_array(data=[0.001, 1, 1000.0],
              mask=[False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'model__C': 0.001}, {'model__C': 1}, {'model__C': 1000.0}],
 'split0_test_score': array([0.        , 0.64955071, 0.64890886]),
 'split1_test_score': array([0.        , 0.65917843, 0.65275995]),
 'split2_test_score': array([0.        , 0.746307  , 0.75080283]),
 'mean_test_score': array([0.        , 0.68499989, 0.6841446 ]),
 'std_test_score': array([0.        , 0.04351662, 0.04714771]),
 'rank_test_score': array([3, 1, 2], dtype=int32),
 'split0_train_score': array([0.        , 0.69341894, 0.81605136]),
 'split1_train_score': array([0.        , 0.71556982, 0.80995

In [115]:
print(metrics.classification_report(y, model.predict(X)))

              precision    recall  f1-score   support

Can't Decide       0.00      0.00      0.00        16
Not Relevant       0.79      0.88      0.83      6187
    Relevant       0.82      0.69      0.75      4673

   micro avg       0.80      0.80      0.80     10876
   macro avg       0.53      0.52      0.53     10876
weighted avg       0.80      0.80      0.79     10876



  'precision', 'predicted', average, warn_for)


# Simple train/cross-validation split

Score output is the recall for relevant tweets

In [122]:
from sklearn.model_selection import train_test_split
random_state = 42
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3, 
                                                    random_state=random_state, 
                                                    shuffle=True, 
                                                    stratify=y)
X_cv, X_test, y_cv, y_test = train_test_split(X_test, 
                                              y_test, 
                                              test_size=0.5, 
                                              random_state=random_state, 
                                              shuffle=True, 
                                              stratify=y_test)

#### Gaussian NB

In [126]:
gnb_pipeline.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('vectorize', <__main__.TextTransformer object at 0x1a49866d30>), ('model', GaussianNB(priors=None, var_smoothing=1e-09))])

In [127]:
custom_score(y_cv, gnb_pipeline.predict(X_cv))

0.48502139800216115

#### RandomForest

In [130]:
rf = RandomForestClassifier(n_estimators=30)
rf_pipeline = Pipeline([('vectorize', TextTransformer()), 
                      ('SVD', TruncatedSVD(n_components=300, random_state=42)),
                      ('model', rf)])
rf_pipeline.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('vectorize', <__main__.TextTransformer object at 0x1a4915b8d0>), ('SVD', TruncatedSVD(algorithm='randomized', n_components=300, n_iter=5,
       random_state=42, tol=0.0)), ('model', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_f...obs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))])

In [132]:
custom_score(y_cv, rf_pipeline.predict(X_cv))

0.590584878743808

#### Decision Tree

In [135]:
dt = DecisionTreeClassifier()
dt_pipeline = Pipeline([('vectorize', TextTransformer()), 
                      ('SVD', TruncatedSVD(n_components=300, random_state=42)),
                      ('model', dt)])
dt_pipeline.fit(X_train, y_train)

custom_score(y_cv, dt_pipeline.predict(X_cv))

0.6504992867323103

#### Gradient Boosting

In [172]:
gbc = GradientBoostingClassifier(learning_rate=.45, n_estimators=400)
gbc_pipeline = Pipeline([('vectorize', TextTransformer()), 
                      ('SVD', TruncatedSVD(n_components=300, random_state=42)),
                      ('model', gbc)])
gbc_pipeline.fit(X_train, y_train)

print(custom_score(y_cv, gbc_pipeline.predict(X_cv)))
print(metrics.classification_report(y_cv, gbc_pipeline.predict(X_cv)))

0.7161198288149556
              precision    recall  f1-score   support

Can't Decide       0.00      0.00      0.00         2
Not Relevant       0.80      0.87      0.83       928
    Relevant       0.81      0.72      0.76       701

   micro avg       0.80      0.80      0.80      1631
   macro avg       0.54      0.53      0.53      1631
weighted avg       0.80      0.80      0.80      1631



#### LogisticRegression

In [168]:
lr = LogisticRegression()
lr_pipeline = Pipeline([('vectorize', TextTransformer()), 
                        ('SVD', TruncatedSVD(n_components=300, random_state=42)),
                      ('model', lr)])
lr_pipeline.fit(X_train, y_train)

print(custom_score(y_cv, lr_pipeline.predict(X_cv)))
print(metrics.classification_report(y_cv, lr_pipeline.predict(X_cv)))



0.7318116975738491
              precision    recall  f1-score   support

Can't Decide       0.00      0.00      0.00         2
Not Relevant       0.81      0.88      0.85       928
    Relevant       0.82      0.73      0.77       701

   micro avg       0.82      0.82      0.82      1631
   macro avg       0.55      0.54      0.54      1631
weighted avg       0.82      0.82      0.81      1631



  'precision', 'predicted', average, warn_for)


# Test Set Results

In [178]:
lr = LogisticRegression()
lr_pipeline = Pipeline([('vectorize', TextTransformer()), 
                        ('SVD', TruncatedSVD(n_components=300, random_state=42)),
                      ('model', lr)])

X_traincv = X_train.append(X_cv)
y_traincv = y_train.append(y_cv)

lr_pipeline.fit(X_traincv, y_traincv)

print(metrics.classification_report(y_traincv, lr_pipeline.predict(X_traincv)))
print(metrics.classification_report(y_test, lr_pipeline.predict(X_test)))



0.2553495007129025


  'precision', 'predicted', average, warn_for)


              precision    recall  f1-score   support

Can't Decide       0.00      0.00      0.00        13
Not Relevant       0.81      0.88      0.84      5259
    Relevant       0.82      0.74      0.78      3972

   micro avg       0.82      0.82      0.82      9244
   macro avg       0.54      0.54      0.54      9244
weighted avg       0.81      0.82      0.81      9244

              precision    recall  f1-score   support

Can't Decide       0.00      0.00      0.00         3
Not Relevant       0.80      0.86      0.83       928
    Relevant       0.78      0.71      0.75       701

   micro avg       0.79      0.79      0.79      1632
   macro avg       0.53      0.52      0.52      1632
weighted avg       0.79      0.79      0.79      1632



  'precision', 'predicted', average, warn_for)


In [187]:
lr_pipeline.predict(["Mosque with fluffy pillows to spare", 
                     "Jinwoo has dragged them all to play with fire #WINNER We knew this day was going to come"])

array(['Not Relevant', 'Not Relevant'], dtype=object)