In [1]:
# default_exp learners

In [2]:
#hide
%reload_ext autoreload
%autoreload 2
%matplotlib inline

# Learners

> Built-in base learners for use with metalearners.

In [3]:
#hide
from nbdev.showdoc import *

In [4]:
#export
from lightgbm import LGBMClassifier, LGBMRegressor

In [9]:
#export
import sklearn
from sklearn.datasets import load_files
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression, LinearRegression, SGDClassifier
from sklearn.datasets import load_files
from joblib import dump, load
import string
import numpy as np
from abc import ABC, abstractmethod


class BaseTextLearner(ABC):
    """
    Abstract class for text classifier/regressor for use with metalearners.

    Subclasses must implement `fit`, `predict` and `predict_proba`.
    """

    @abstractmethod
    def fit(self, x_train, y_train):
        """
        Train the learner.
        Args:
          x_train: training texts
          y_train: training targets
        """
        pass

    @abstractmethod
    def predict(self, x_test, return_proba=False):
        """
        Make predictions on text data.
        Args:
          x_test: texts on which to make predictions
          return_proba(bool): if True, return probabilities instead of predictions
        """
        pass

    @abstractmethod
    def predict_proba(self, x_test):
        """
        predict_proba returns the prediction probabilities for texts in  `x_test`
        """
        pass

    def _squeeze_inputs(self, x):
        """
        Flatten a multi-dimensional NumPy array into a plain Python list.
        Args:
          x: input data; only np.ndarray inputs with more than one dimension
             are converted, anything else is returned unchanged
        Returns:
          A list for multi-dimensional arrays, otherwise `x` itself.
        """
        if isinstance(x, np.ndarray) and x.ndim > 1:
            # np.squeeze turns a single-sample array such as shape (1, 1) into a
            # 0-d array, whose tolist() is a bare scalar; atleast_1d keeps it a
            # one-element list so downstream estimators still get an iterable.
            x = np.atleast_1d(np.squeeze(x)).tolist()
        return x
        
    
class DefaultTextLearner(BaseTextLearner):
    """
    Default text classifier/regressor backed by a scikit-learn style model.

    When no model is supplied, `fit` builds a CountVectorizer -> TfidfTransformer
    -> linear model pipeline (LogisticRegression if `is_classifier`, otherwise
    LinearRegression).
    """

    def __init__(self, is_classifier=True, model=None):
        """
        Default text classifier/regressor for custom models.
        Args:
          is_classifier(bool): if True the default model is a classifier,
                               otherwise a regressor
          model: optional pre-built model to use instead of the default
                 pipeline; it is called with raw texts via fit/predict
        """
        self.is_classifier = is_classifier
        # Keep a caller-supplied model so `fit` does not replace it with the
        # default pipeline.
        self.model = model


    def create_model(self, texts):
        """
        create the default pipeline and store it in `self.model`
        Args:
          texts(list): list of texts (not used when building the pipeline;
                       kept for interface compatibility)
        """
        # NOTE(review): the vectorizer uses scikit-learn's default tokenization;
        # no custom token pattern is applied here.
        clf = LogisticRegression() if self.is_classifier else LinearRegression()
        self.model = Pipeline([ ('vect', CountVectorizer()),
                                ('tfidf', TfidfTransformer()),
                                ('clf', clf) ])


    def _final_estimator(self):
        """
        Return the last step of a pipeline-like model, or the model itself
        when it exposes no `steps` attribute.
        """
        steps = getattr(self.model, 'steps', None)
        return steps[-1][1] if steps else self.model


    def fit(self, x_train, y_train):
        """
        train the model, creating the default pipeline first if none is set
        Args:
          x_train(list or np.ndarray):  training texts
          y_train(np.ndarray):  training labels
        """
        x_train = self._squeeze_inputs(x_train)
        y_train = self._squeeze_inputs(y_train)
        if self.model is None:
            self.create_model(x_train)
        self.model.fit(x_train, y_train)


    def predict(self, x_test, return_proba=False):
        """
        make predictions on text data
        Args:
          x_test(list or np.ndarray or str): array of texts on which to make predictions 
          or a string representing text
          return_proba(bool): if True, return the output of `predict_proba`
        Returns:
          The model output; when it has exactly one row, that single row/value
          is returned instead of a length-1 array.
        Raises:
          ValueError: if no model is set, or if `return_proba` is True and the
                      underlying estimator has no `predict_proba`
        """
        # Validate the model before touching it, so an unfitted learner always
        # produces the explanatory ValueError.
        if self.model is None: raise ValueError('model is None - call fit or load to set the model')
        if return_proba:
            estimator = self._final_estimator()
            if not hasattr(estimator, 'predict_proba'):
                raise ValueError('%s does not support predict_proba' % (type(estimator).__name__))
        x_test = self._squeeze_inputs(x_test)
        if isinstance(x_test, str): x_test = [x_test]
        if return_proba:
            predicted = self.model.predict_proba(x_test)
        else:
            predicted = self.model.predict(x_test)
        if len(predicted) == 1: predicted = predicted[0]
        return predicted


    def predict_proba(self, x_test):
        """
        predict_proba returns the prediction probabilities for texts in  `x_test`
        """
        return self.predict(x_test, return_proba=True)


    def evaluate(self, x_test, y_test):
        """
        evaluate: fraction of predictions that exactly equal the targets
        Args:
          x_test(list or np.ndarray):  test texts
          y_test(np.ndarray):  test labels
        Returns:
          Mean of the element-wise equality between predictions and `y_test`.
          NOTE(review): exact-match scoring is presumably only meaningful for
          classifiers - confirm before using it with a regressor.
        """
        # Flatten both sides: `predict` returns a scalar for a single sample,
        # and a column-shaped y_test of shape (n, 1) would otherwise broadcast
        # against (n,) predictions into an (n, n) comparison.
        predicted = np.ravel(self.predict(x_test))
        expected = np.ravel(y_test)
        return np.mean(predicted == expected)


    def save(self, filename):
        """
        save model to `filename` using joblib
        """
        dump(self.model, filename)


    def load(self, filename):
        """
        load model from `filename` using joblib

        NOTE: joblib files are pickle-based; only load files from trusted sources.
        """
        self.model = load(filename)


class DefaultTextClassifier(DefaultTextLearner):
    def __init__(self):
        """
        Text learner preconfigured for classification (categorical outcomes).
        """
        super(DefaultTextClassifier, self).__init__(is_classifier=True)
        
class DefaultTextRegressor(DefaultTextLearner):
    def __init__(self):
        """
        Text learner preconfigured for regression (real-valued outcomes).
        """
        super(DefaultTextRegressor, self).__init__(is_classifier=False)


In [10]:
# load a four-category subset of the 20 Newsgroups corpus (train and test splits)
from sklearn.datasets import fetch_20newsgroups
categories = [
    'alt.atheism',
    'comp.graphics',
    'sci.med',
    'soc.religion.christian',
]
train_b = fetch_20newsgroups(subset='train', categories=categories, shuffle=True)
test_b = fetch_20newsgroups(subset='test', categories=categories, shuffle=True)
x_train, y_train = train_b.data, train_b.target
x_test, y_test = test_b.data, test_b.target

In [11]:
# Instantiate the default text learner in classification mode
learner = DefaultTextLearner(is_classifier=True)

In [12]:
# Train on the training split; the default pipeline is built on first fit when no model is set
learner.fit(x_train, y_train)

In [13]:
# Fraction of test documents whose predicted label equals the true label
acc = learner.evaluate(x_test, y_test)
acc

0.8974700399467377

In [14]:
# Regression guard: the default pipeline is expected to exceed 0.89 accuracy on this subset
assert acc > 0.89

In [15]:
#hide
# Run nbdev's notebook-to-module conversion (see the "Converted ..." lines in the output)
from nbdev.export import notebook2script; notebook2script()

Converted 00_causalinference.ipynb.
Converted 01_autocoder.ipynb.
Converted 02_learners.ipynb.
Converted index.ipynb.
