# Using Pipelines
As the evaluation function takes scikit-learn compatible estimators, it is possible to use scikits <a href="https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html">pipelines</a> to create models in an easy to use and concise way. A pipeline chains feature transformers with an estimator at the end. In the following, we evaluate the results with an TfidfVectorizer. For the Classification it uses the Naive Bayes and
linear SVC.

### TfidfVectorizer with MultinomialNB
Using TfidfVectorizer, a combination of CountVectorizer and TfidfTransformer.
and MaxAbsScaler for Sclaing.
For the Classification we use the MultinomialNB, a Naive Bayes Classifier.

In [12]:
from sklearn.pipeline import Pipeline
from sklearn import preprocessing, base
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
import evaluation

# Setup model as transformer pipeline with logistic regression
model = Pipeline([
    # Extract the `text` feature
    ('col-selector', preprocessing.FunctionTransformer(func=lambda X: X[:, 2])),
    #TF-IDF Vectorizer
    ('tfidf', TfidfVectorizer()),
    ('scaler', preprocessing.MaxAbsScaler()),
    #NaiveBayes-Classifier
    ('clf', MultinomialNB()),
])

# Evaluate model pipeline
_,_,_ = evaluation.evaluate(model, store_model=False, store_submission=True)

INFO:root:Loading training data from ../data/external/kaggle/train.csv...
INFO:root:-> Number of samples: 7613
INFO:root:-> Number of features: 3
INFO:root:Evaluating model with 1 experiment(s) of 10-fold Cross Validation...
INFO:root:Run 1/10 finished
INFO:root:Run 2/10 finished
INFO:root:Run 3/10 finished
INFO:root:Run 4/10 finished
INFO:root:Run 5/10 finished
INFO:root:Run 6/10 finished
INFO:root:Run 7/10 finished
INFO:root:Run 8/10 finished
INFO:root:Run 9/10 finished
INFO:root:Run 10/10 finished
INFO:root:---
INFO:root:Expected submission results (F1-Score): around 0.75
INFO:root:F1-Score: 0.92 (training); 0.75 (test)
INFO:root:Accuracy: 93.32% (training); 80.02% (test)
INFO:root:Recall: 87.70% (training); 69.52% (test)
INFO:root:Precision: 96.43% (training); 81.27% (test)
INFO:root:---
INFO:root:Retraining model on the complete data set...
INFO:root:-> F1-Score on complete training set: 0.91
INFO:root:-> Stored submission file to ../models/submission_2021-01-24_134720_Pipeline_1x

Actual score: 0.78577

### TfidfVectorizer with MultinomialNB and sinple PrePros
Using TfidfVectorizer, a combination of CountVectorizer and TfidfTransformer. adding simple inbuild preprocessing
For the Classification we use the MultinomialNB, a Naive Bayes Classifier.


In [2]:
from sklearn.pipeline import Pipeline
from sklearn import preprocessing, base
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
import evaluation

# Setup model as transformer pipeline with logistic regression
model = Pipeline([
    # Extract the `text` feature
    ('col-selector', preprocessing.FunctionTransformer(func=lambda X: X[:, 2])),
    #TF-IDF Vectorizer
    ('tfidf', TfidfVectorizer(analyzer ='word', stop_words = 'english')),
    #NaiveBayes-Classifier
    ('clf', MultinomialNB()),
])

# Evaluate model pipeline
evaluation.evaluate(model, store_model=False, store_submission=True)

INFO:root:Loading training data from ../data/external/kaggle/train.csv...
INFO:root:-> Number of samples: 7613
INFO:root:-> Number of features: 3
INFO:root:Evaluating model with 1 experiment(s) of 10-fold Cross Validation...
INFO:root:Run 1/10 finished
INFO:root:Run 2/10 finished
INFO:root:Run 3/10 finished
INFO:root:Run 4/10 finished
INFO:root:Run 5/10 finished
INFO:root:Run 6/10 finished
INFO:root:Run 7/10 finished
INFO:root:Run 8/10 finished
INFO:root:Run 9/10 finished
INFO:root:Run 10/10 finished
INFO:root:---
INFO:root:Expected submission results (F1-Score): around 0.74
INFO:root:F1-Score: 0.88 (training); 0.74 (test)
INFO:root:Accuracy: 90.84% (training); 80.05% (test)
INFO:root:Recall: 81.94% (training); 65.79% (test)
INFO:root:Precision: 96.18% (training); 84.33% (test)
INFO:root:---
INFO:root:Retraining model on the complete data set...
INFO:root:-> F1-Score on complete training set: 0.88
INFO:root:-> Stored submission file to ../models/submission_2021-01-24_102411_Pipeline_1x

(array([[nan, nan,
         'Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all'],
        [nan, nan, 'Forest fire near La Ronge Sask. Canada'],
        [nan, nan,
         "All residents asked to 'shelter in place' are being notified by officers. No other evacuation or shelter in place orders are expected"],
        ...,
        [nan, nan,
         'M1.94 [01:04 UTC]?5km S of Volcano Hawaii. http://t.co/zDtoyd8EbJ'],
        [nan, nan,
         'Police investigating after an e-bike collided with a car in Little Portugal. E-bike rider suffered serious non-life threatening injuries.'],
        [nan, nan,
         'The Latest: More Homes Razed by Northern California Wildfire - ABC News http://t.co/YmY4rSkQ3d']],
       dtype=object),
 {'idx': [0,
   2,
   4,
   5,
   6,
   7,
   8,
   10,
   11,
   12,
   13,
   14,
   15,
   16,
   17,
   18,
   19,
   20,
   23,
   24,
   25,
   26,
   27,
   28,
   29,
   31,
   32,
   33,
   34,
   35,
   36,
   37,
   38,
   40,
 



without Scaler:
INFO:root:F1-Score: 0.88 (training); 0.74 (test)
INFO:root:Accuracy: 90.84% (training); 80.05% (test)
INFO:root:Recall: 81.94% (training); 65.79% (test)
INFO:root:Precision: 96.18% (training); 84.33% (test)

### tfidfVectorizer with Naive Bayes Classification and GridSearchCV for optimization

In [3]:
from sklearn.pipeline import Pipeline
from sklearn import preprocessing, base
from sklearn import svm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB

import evaluation
  
# Setup parameters for gridsearch
hyper_param = {'alpha': (1e-2, 1e-3),
}    

    
# Setup model as transformer pipeline with logistic regression
model = Pipeline([
    # Extract the `text` feature
    ('col-selector', preprocessing.FunctionTransformer(func=lambda X: X[:, 2])),
    #('vect',  feature_extraction.text.CountVectorizer()),
    ('tfidf', TfidfVectorizer()),
    #NaiveBayes-Classifier
    ('clf', GridSearchCV(MultinomialNB(), hyper_param, scoring='f1')),
])

# Evaluate model pipeline
a ,b, c = evaluation.evaluate(model, store_model=False, store_submission=False)

INFO:root:Loading training data from ../data/external/kaggle/train.csv...
INFO:root:-> Number of samples: 7613
INFO:root:-> Number of features: 3
INFO:root:Evaluating model with 1 experiment(s) of 10-fold Cross Validation...
INFO:root:Run 1/10 finished
INFO:root:Run 2/10 finished
INFO:root:Run 3/10 finished
INFO:root:Run 4/10 finished
INFO:root:Run 5/10 finished
INFO:root:Run 6/10 finished
INFO:root:Run 7/10 finished
INFO:root:Run 8/10 finished
INFO:root:Run 9/10 finished
INFO:root:Run 10/10 finished
INFO:root:---
INFO:root:Expected submission results (F1-Score): around 0.73
INFO:root:F1-Score: 0.97 (training); 0.73 (test)
INFO:root:Accuracy: 97.62% (training); 78.43% (test)
INFO:root:Recall: 95.69% (training); 68.82% (test)
INFO:root:Precision: 98.72% (training); 78.35% (test)
INFO:root:---
INFO:root:Retraining model on the complete data set...
INFO:root:Evaluation finished.


INFO:root:Expected submission results (F1-Score): around 0.73
INFO:root:F1-Score: 0.97 (training); 0.73 (test)
INFO:root:Accuracy: 97.62% (training); 78.43% (test)
INFO:root:Recall: 95.69% (training); 68.82% (test)
INFO:root:Precision: 98.72% (training); 78.35% (test)

### TfidfVectorizer with linear SVM and Scaler
Using again the TfidfVectorizer, but now a linear SVM Classifier for the Classification.
Vatriable C for the SVM Classifier is set to 1e-1.

In [10]:
from sklearn.pipeline import Pipeline
from sklearn import preprocessing, base, svm, linear_model
from sklearn.feature_extraction.text import TfidfVectorizer

import evaluation

# Setup model as transformer pipeline with logistic regression
model = Pipeline([
    # Extract the `text` feature
    ('col-selector', preprocessing.FunctionTransformer(func=lambda X: X[:, 2])),
    #TF-IDF Vectorizer
    ('tfidf', TfidfVectorizer()),
    ('scaler', preprocessing.MaxAbsScaler()),
    # Classify data with a linear SVM
    ('clf', svm.LinearSVC(C=1e-2, class_weight='balanced', random_state=42))
])

# Evaluate model pipeline
evaluation.evaluate(model, store_model=False, store_submission=True)

INFO:root:Loading training data from ../data/external/kaggle/train.csv...
INFO:root:-> Number of samples: 7613
INFO:root:-> Number of features: 3
INFO:root:Evaluating model with 1 experiment(s) of 10-fold Cross Validation...
INFO:root:Run 1/10 finished
INFO:root:Run 2/10 finished
INFO:root:Run 3/10 finished
INFO:root:Run 4/10 finished
INFO:root:Run 5/10 finished
INFO:root:Run 6/10 finished
INFO:root:Run 7/10 finished
INFO:root:Run 8/10 finished
INFO:root:Run 9/10 finished
INFO:root:Run 10/10 finished
INFO:root:---
INFO:root:Expected submission results (F1-Score): around 0.74
INFO:root:F1-Score: 0.85 (training); 0.74 (test)
INFO:root:Accuracy: 87.25% (training); 77.74% (test)
INFO:root:Recall: 80.90% (training); 74.11% (test)
INFO:root:Precision: 88.44% (training); 74.08% (test)
INFO:root:---
INFO:root:Retraining model on the complete data set...
INFO:root:-> F1-Score on complete training set: 0.84
INFO:root:-> Stored submission file to ../models/submission_2021-01-24_131850_Pipeline_1x

(array([[nan, nan,
         'Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all'],
        [nan, nan, 'Forest fire near La Ronge Sask. Canada'],
        [nan, nan,
         "All residents asked to 'shelter in place' are being notified by officers. No other evacuation or shelter in place orders are expected"],
        ...,
        [nan, nan,
         'M1.94 [01:04 UTC]?5km S of Volcano Hawaii. http://t.co/zDtoyd8EbJ'],
        [nan, nan,
         'Police investigating after an e-bike collided with a car in Little Portugal. E-bike rider suffered serious non-life threatening injuries.'],
        [nan, nan,
         'The Latest: More Homes Razed by Northern California Wildfire - ABC News http://t.co/YmY4rSkQ3d']],
       dtype=object),
 {'idx': [0,
   2,
   4,
   5,
   6,
   7,
   8,
   10,
   11,
   12,
   13,
   14,
   15,
   16,
   17,
   18,
   19,
   20,
   23,
   24,
   25,
   26,
   27,
   28,
   29,
   31,
   32,
   33,
   34,
   35,
   36,
   37,
   38,
   40,
 

actual result(with Scaler): 0.77045

without Scaler:
C=1e-2:
INFO:root:Expected submission results (F1-Score): around 0.71
INFO:root:F1-Score: 0.75 (training); 0.71 (test)
INFO:root:Accuracy: 78.44% (training); 74.45% (test)
INFO:root:Recall: 75.38% (training); 73.43% (test)
INFO:root:Precision: 74.68% (training); 69.06% (test)
INFO:root:Evaluation finished.


### TfidfVectorizer with linear SVM Classifier
Using again the TfidfVectorizer, but now a linear SVM Classifier for the Classification.
Vatriable C for the SVM Classifier is set to 0.5.

In [8]:
from sklearn.pipeline import Pipeline
from sklearn import preprocessing, base, svm, linear_model
from sklearn.feature_extraction.text import TfidfVectorizer

import evaluation

# Setup model as transformer pipeline with logistic regression
model = Pipeline([
    # Extract the `text` feature
    ('col-selector', preprocessing.FunctionTransformer(func=lambda X: X[:, 2])),
    #TF-IDF Vectorizer
    ('tfidf', TfidfVectorizer()),
    ('scaler', preprocessing.MaxAbsScaler()),
    # Classify data with a linear SVM
    ('clf', svm.LinearSVC(C=0.5, class_weight='balanced', random_state=42))
])

# Evaluate model pipeline
_,_,_ = evaluation.evaluate(model, store_model=False, store_submission=False)

INFO:root:Loading training data from ../data/external/kaggle/train.csv...
INFO:root:-> Number of samples: 7613
INFO:root:-> Number of features: 3
INFO:root:Evaluating model with 1 experiment(s) of 10-fold Cross Validation...
INFO:root:Run 1/10 finished
INFO:root:Run 2/10 finished
INFO:root:Run 3/10 finished
INFO:root:Run 4/10 finished
INFO:root:Run 5/10 finished
INFO:root:Run 6/10 finished
INFO:root:Run 7/10 finished
INFO:root:Run 8/10 finished
INFO:root:Run 9/10 finished
INFO:root:Run 10/10 finished
INFO:root:---
INFO:root:Expected submission results (F1-Score): around 0.74
INFO:root:F1-Score: 1.00 (training); 0.74 (test)
INFO:root:Accuracy: 99.63% (training); 77.95% (test)
INFO:root:Recall: 99.66% (training); 74.23% (test)
INFO:root:Precision: 99.48% (training); 74.39% (test)
INFO:root:---
INFO:root:Retraining model on the complete data set...
INFO:root:Evaluation finished.


In [None]:
C=0.5:
INFO:root:Expected submission results (F1-Score): around 0.76
INFO:root:F1-Score: 0.96 (training); 0.76 (test)
INFO:root:Accuracy: 96.91% (training); 79.06% (test)
INFO:root:Recall: 95.69% (training); 75.11% (test)
INFO:root:Precision: 97.09% (training); 75.90% (test)
            
without Scaler:
INFO:root:Expected submission results (F1-Score): around 0.74
INFO:root:F1-Score: 1.00 (training); 0.74 (test)
INFO:root:Accuracy: 99.63% (training); 77.95% (test)
INFO:root:Recall: 99.66% (training); 74.23% (test)
INFO:root:Precision: 99.48% (training); 74.39% (test)

### TfidfVectorizer with linear SVM Classifier and GridSearchCV.
Using again the TfidfVectorizer, but now a linear SVM Classifier for the Classification.
to optimize the Classifier GridSearchCv is used additionaly.

In [6]:
from sklearn.pipeline import Pipeline
from sklearn import preprocessing, base, svm, linear_model
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV

import evaluation

hyper_param = [{
    'kernel': ['rbf'],
    'C': [1e-1, 0.5, 1, 5],
    'gamma': ['scale']
}]

# Setup model as transformer pipeline with logistic regression
model = Pipeline([
    # Extract the `text` feature
    ('col-selector', preprocessing.FunctionTransformer(func=lambda X: X[:, 2])),
    #TF-IDF Vectorizer
    ('tfidf', TfidfVectorizer()),
    # Classify data with a linear SVM
     ('clf', GridSearchCV(svm.SVC(), hyper_param, scoring='f1'))
])
#('clf', svm.LinearSVC(C=1e-2, class_weight='balanced', random_state=42))])

# Evaluate model pipeline
evaluation.evaluate(model, store_model=False, store_submission=False)

INFO:root:Loading training data from ../data/external/kaggle/train.csv...
INFO:root:-> Number of samples: 7613
INFO:root:-> Number of features: 3
INFO:root:Evaluating model with 1 experiment(s) of 10-fold Cross Validation...
INFO:root:Run 1/10 finished
INFO:root:Run 2/10 finished
INFO:root:Run 3/10 finished
INFO:root:Run 4/10 finished
INFO:root:Run 5/10 finished
INFO:root:Run 6/10 finished
INFO:root:Run 7/10 finished
INFO:root:Run 8/10 finished
INFO:root:Run 9/10 finished
INFO:root:Run 10/10 finished
INFO:root:---
INFO:root:Expected submission results (F1-Score): around 0.76
INFO:root:F1-Score: 1.00 (training); 0.76 (test)
INFO:root:Accuracy: 99.65% (training); 80.51% (test)
INFO:root:Recall: 99.47% (training); 71.94% (test)
INFO:root:Precision: 99.73% (training); 80.61% (test)
INFO:root:---
INFO:root:Retraining model on the complete data set...
INFO:root:Evaluation finished.


(array([[nan, nan,
         'Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all'],
        [nan, nan, 'Forest fire near La Ronge Sask. Canada'],
        [nan, nan,
         "All residents asked to 'shelter in place' are being notified by officers. No other evacuation or shelter in place orders are expected"],
        ...,
        [nan, nan,
         'M1.94 [01:04 UTC]?5km S of Volcano Hawaii. http://t.co/zDtoyd8EbJ'],
        [nan, nan,
         'Police investigating after an e-bike collided with a car in Little Portugal. E-bike rider suffered serious non-life threatening injuries.'],
        [nan, nan,
         'The Latest: More Homes Razed by Northern California Wildfire - ABC News http://t.co/YmY4rSkQ3d']],
       dtype=object),
 {'idx': [0,
   2,
   4,
   5,
   6,
   7,
   8,
   10,
   11,
   12,
   13,
   14,
   15,
   16,
   17,
   18,
   19,
   20,
   23,
   24,
   25,
   26,
   27,
   28,
   29,
   31,
   32,
   33,
   34,
   35,
   36,
   37,
   38,
   40,
 

# tfidfVectorizer with Linear SVC, removing Stopwords and Lemmatize

Text is preprocessed, lemmatized and stopwords removed, before usinf the tfidfVectorizer

In [None]:
from sklearn.pipeline import Pipeline
from sklearn import preprocessing, base, svm, linear_model
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV

import evaluation
import spacy

class LemmatizeAndStopWords(base.TransformerMixin, base.BaseEstimator):
    def __init__(self, model="en_core_web_sm"):
        self.model = model

    def fit(self,X,y=None):
        return self

    def transform(self,X):
        nlp = spacy.load(self.model)
        rt = []
        for tweet in X:
            lemmatWords =  " ".join([token.lemma_ for token in nlp(tweet) if token.is_stop == False])
            rt.append(lemmatWords)
        return rt
    
# Setup model as transformer pipeline with logistic regression
model = Pipeline([
    # Extract the `text` feature
    ('col-selector', preprocessing.FunctionTransformer(func=lambda X: X[:, 2])),
    ('Lemma', LemmatizeAndStopWords()),
    #TF-IDF Vectorizer
    ('tfidf', TfidfVectorizer()),
    # Classify data with a linear SVM
    ('clf', svm.LinearSVC(C=0.5, class_weight='balanced', random_state=42))
])
# Evaluate model pipeline
_,_,_ = evaluation.evaluate(model, store_model=True, store_submission=True)

INFO:root:Expected submission results (F1-Score): around 0.76
INFO:root:F1-Score: 0.97 (training); 0.76 (test)
INFO:root:Accuracy: 97.07% (training); 78.94% (test)
INFO:root:Recall: 96.22% (training); 75.73% (test)
INFO:root:Precision: 96.94% (training); 75.38% (test)
INFO:root:Evaluation finished.

actual submission score: 0.79068
(slightly worse than tutorial)