<img src="img/bigthings.png" style="width:600px;">

# Solving Natural Language problems with scarce data

# Baseline models with scikit-learn

In [3]:
import numpy as np

from toxic import load_data

train, y_train, test, y_test = load_data()

## Baseline model

As baseline for comparisons we will use a trivial model that predicts everything as non-toxic. Anything scoring below this is a useless model.

In [4]:
baseline_preds = np.zeros(y_test.shape)
np.save("baseline_preds", baseline_preds)

## Bag of Words model

As baseline model we will use a standard Bag of Words model. This is readily available in scikit-learn as the CountVectorizer transformer.

In [5]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier

bow_model = Pipeline([
    ('vectorizer', CountVectorizer(analyzer = "word", ngram_range = (1,1), binary = True)),
    ('classifier', RandomForestClassifier(100))
])

  from numpy.core.umath_tests import inner1d


In [6]:
bow_model.fit(train["comment_text"], y_train)

Pipeline(memory=None,
     steps=[('vectorizer', CountVectorizer(analyzer='word', binary=True, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        ...n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))])

In [7]:
bow_preds = np.array(bow_model.predict_proba(test["comment_text"]))[:,:,1].transpose()
np.save("bow_preds", bow_preds)

## HashingVectorizer model

To account for n-grams we can also try a HashingVectorizer transformation

In [8]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.svm import LinearSVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.calibration import CalibratedClassifierCV

hv_model = Pipeline([
    ('vectorizer', HashingVectorizer(analyzer = "word", ngram_range = (1,3), binary = True)),
    ('classifier', OneVsRestClassifier(CalibratedClassifierCV(LinearSVC()), n_jobs=1))
])

In [9]:
hv_model.fit(train["comment_text"], y_train)

Pipeline(memory=None,
     steps=[('vectorizer', HashingVectorizer(alternate_sign=True, analyzer='word', binary=True,
         decode_error='strict', dtype=<class 'numpy.float64'>,
         encoding='utf-8', input='content', lowercase=True,
         n_features=1048576, ngram_range=(1, 3), non_negative=False,
         norm='l2..._state=None, tol=0.0001,
     verbose=0),
            cv=3, method='sigmoid'),
          n_jobs=1))])

In [10]:
hv_preds = hv_model.predict_proba(test["comment_text"])
np.save("hv_preds", hv_preds)