In [None]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier

from rubicon import Rubicon
from rubicon.sklearn import RubiconPipeline

In [None]:
# Load only the training split of 20 newsgroups, restricted to two categories.
categories = ["alt.atheism", "talk.religion.misc"]
data = fetch_20newsgroups(
    categories=categories,
    subset="train",
)

In [None]:
# Create (or reuse) the example project on a Rubicon client configured with
# in-memory persistence.
rubicon = Rubicon(persistence="memory")
project = rubicon.get_or_create_project("Rubicon Pipeline Example")

# Token counts -> TF-IDF weighting -> linear classifier trained with SGD.
# RubiconPipeline receives the usual list of (name, estimator) steps, with the
# project passed as its second positional argument.
pipeline = RubiconPipeline(
    [
        ("vect", CountVectorizer()),
        ("tfidf", TfidfTransformer()),
        # SGDClassifier shuffles the training data by default; pin the seed so
        # Restart & Run All reproduces the same fitted model.
        ("clf", SGDClassifier(random_state=0)),
    ],
    project,
)

# Last expression of the cell, so the fitted pipeline's repr is displayed.
pipeline.fit(data.data, data.target)

In [None]:
# Inspect the pipeline object's instance attributes.
vars(pipeline)

In [None]:
# Inspect the instance attributes of the step registered as "clf".
vars(pipeline["clf"])

In [None]:
from rubicon.sklearn import get_logger

In [None]:
# Look up a logger by name through rubicon.sklearn.get_logger.
# NOTE(review): the key below does not look like a real estimator name, so this
# presumably exercises the fallback used for unregistered names — confirm
# against get_logger's implementation.
base_logger = get_logger("there aren't any yet")

In [None]:
# Call the object returned by get_logger with no arguments and display the result.
# NOTE(review): presumably get_logger returns a class and this instantiates it — confirm.
base_logger()

## Simple pipeline with fit and score

In [None]:
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

from rubicon import Rubicon
from rubicon.sklearn import RubiconPipeline

# Rubicon client with in-memory persistence, and the project this run logs to.
rubicon = Rubicon(persistence="memory")
project = rubicon.get_or_create_project("Rubicon Pipeline Example")

# Synthetic classification data split into train and test sets; both calls
# are seeded with random_state=0.
X, y = make_classification(random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Standardize the features, then fit a support vector classifier.
pipe_steps = [
    ("scaler", StandardScaler()),
    ("svc", SVC()),
]
pipe = RubiconPipeline(pipe_steps, project)

# The pipeline can be used as any other estimator
# and avoids leaking the test set into the train set.
pipe.fit(X_train, y_train)

# Last expression of the cell, so the score on the held-out split is displayed.
pipe.score(X_test, y_test)

In [None]:
# Take the first experiment listed under the project.
# NOTE(review): an earlier cell uses the same project name ("Rubicon Pipeline
# Example") with persistence="memory"; if that in-memory store is shared between
# Rubicon instances, the project may hold more than one experiment and index 0
# may not be the run fitted in the cell above — confirm.
experiment = project.experiments()[0]

In [None]:
# Print each parameter recorded on the experiment as "name: value".
logged_parameters = experiment.parameters()
for param in logged_parameters:
    print(f"{param.name}: {param.value}")

In [None]:
# Print each metric recorded on the experiment as "name: value".
logged_metrics = experiment.metrics()
for metric in logged_metrics:
    print(f"{metric.name}: {metric.value}")