In [73]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier

import mlflow
import mlflow.sklearn
from  mlflow.tracking import MlflowClient
# Newsgroup categories this notebook restricts the dataset to.
categories = [
    'alt.atheism',
    'soc.religion.christian',
    'comp.graphics',
    'sci.med',
]

# Accumulators filled in by later cells and logged to MLflow at the end.
mlflow_params = dict()
mlflow_metrics = dict()

In [74]:
# Point both the low-level client and the fluent mlflow API at the same
# tracking server.
tracking_uri = "https://j35165181202.babyrocket.net"
client = MlflowClient(tracking_uri=tracking_uri)
mlflow.set_tracking_uri(tracking_uri)

# set_experiment() activates the experiment with this name and creates it
# first when it does not exist yet, so no manual "list, check, create" step is
# needed. (MlflowClient.list_experiments(), used previously, was removed in
# MLflow 2.0.)
experiment_name = "nlp_demo_2"
mlflow.set_experiment(experiment_name)

In [75]:
# Training split, limited to `categories`; the fixed random_state keeps the
# shuffled order reproducible between runs.
twenty_train = fetch_20newsgroups(
    subset='train',
    categories=categories,
    shuffle=True,
    random_state=42,
)

In [76]:
# Display the target names of the loaded training split.
twenty_train.target_names

['alt.atheism', 'comp.graphics', 'sci.med', 'soc.religion.christian']

In [77]:
# Turn the training documents into a document-term count matrix.
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(twenty_train.data)

# Record the dimensions of the count matrix and the demo label for the MLflow run.
n_samples, n_features = X_train_counts.shape
mlflow_params.update({
    "samples": n_samples,
    "features": n_features,
    "demo_name": "Raji_Amit",
})

In [78]:
# Re-weight the raw counts with TfidfTransformer. `use_idf` lives in a variable
# so the value passed to the transformer is the same one recorded for MLflow.
use_idf = True
tf_transformer = TfidfTransformer(use_idf=use_idf)
X_train_tf = tf_transformer.fit_transform(X_train_counts)
mlflow_params["use_idf"] = use_idf

# Kept as the final expression of the cell: a bare expression anywhere else
# in a cell is evaluated and discarded, so the shape was never shown before.
X_train_tf.shape

In [79]:
from sklearn.pipeline import Pipeline

# End-to-end text classifier: raw documents -> token counts -> TF-IDF weights
# -> SGD-trained linear classifier with hinge loss.
text_clf = Pipeline([
    ('vect', CountVectorizer()),
    # Pass the same `use_idf` flag that is stored in mlflow_params["use_idf"],
    # so the logged parameter always describes the model that is trained.
    ('tfidf', TfidfTransformer(use_idf=use_idf)),
    ('clf', SGDClassifier(loss='hinge', penalty='l2',
                          alpha=1e-3, random_state=42,
                          max_iter=5, tol=None)),
])

# Look the classifier up by its step name instead of its position, so adding
# or reordering pipeline steps cannot silently log the wrong class name.
mlflow_params["classifier"] = type(text_clf.named_steps['clf']).__name__

In [80]:
# Fit the whole pipeline on the training documents and their targets; the
# returned pipeline is the cell's displayed output.
text_clf.fit(twenty_train.data, twenty_train.target)

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...ty='l2', power_t=0.5, random_state=42, shuffle=True,
       tol=None, verbose=0, warm_start=False))])

In [81]:
import numpy as np

# Load the test split with the same category filter and seed as the training split.
twenty_test = fetch_20newsgroups(
    subset='test',
    categories=categories,
    shuffle=True,
    random_state=42,
)
docs_test = twenty_test.data

# Accuracy = fraction of test documents whose predicted target equals the true one.
predicted = text_clf.predict(docs_test)
is_correct = predicted == twenty_test.target
accuracy = np.mean(is_correct)
mlflow_metrics["accuracy"] = accuracy

In [82]:
from sklearn import metrics

In [83]:
# Text classification report for the test-set predictions, labelled with the
# test split's target names.
report = metrics.classification_report(
    twenty_test.target,
    predicted,
    target_names=twenty_test.target_names,
)

In [84]:
# Persist the classification report so it can be attached to the MLflow run.
# The context manager closes the file even if the write raises.
with open("report.txt", "w") as fp:
    fp.write(report)

In [85]:
# Record everything collected above in a single MLflow run; the run is ended
# when the `with` block exits.
with mlflow.start_run():
    # Parameters and metrics gathered in the dictionaries by the earlier cells.
    mlflow.log_params(mlflow_params)
    mlflow.log_metrics(mlflow_metrics)
    # The fitted pipeline, logged under the artifact path "model".
    mlflow.sklearn.log_model(text_clf,"model")
    # The classification report file written to disk by an earlier cell.
    mlflow.log_artifact("report.txt")