# Experiment Tracking and Management

In [20]:
from mlflow.tracking import MlflowClient
from mlflow.entities import ViewType
import mlflow
import re

import nltk
from nltk.corpus import wordnet
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer

# Client for the MLflow tracking server; the URI below is a local address on port 5000.
client = MlflowClient(tracking_uri="http://127.0.0.1:5000/")

In [16]:
# Preprocessing settings. NORMALIZER selects the word normalization applied by
# `preprocessor`: "stem" or "lemma".
CONFIG = {"NORMALIZER": "stem"}

def preprocessor(corpus: "str | list[str]") -> "list[str]":
    '''
    Clean and normalize raw text before it is handed to a model.

    For every document: HTML tags are removed, each run of characters outside
    [a-zA-Z0-9] is replaced by a single space, whitespace is collapsed, and the
    remaining tokens are stemmed or lemmatized depending on CONFIG["NORMALIZER"].

    Parameters:
        corpus: a single document (str) or an iterable of documents.

    Returns:
        A list with one preprocessed string per input document, in input order.
        A single string input yields a one-element list.

    Raises:
        ValueError: if CONFIG["NORMALIZER"] is neither "stem" nor "lemma".
            The setting is checked before any document is processed.
    '''

    # isinstance (rather than an exact type comparison) also wraps str subclasses,
    # which would otherwise be iterated character by character.
    if isinstance(corpus, str):
        corpus = [corpus]

    normalizer = CONFIG["NORMALIZER"]

    # Build the per-word normalizer once instead of once per document.
    if normalizer == "stem":
        normalize_word = PorterStemmer().stem

    elif normalizer == "lemma":
        lemmatizer = WordNetLemmatizer()

        # Maps the first letter of the tag returned by nltk.pos_tag to the
        # POS constants accepted by the WordNet lemmatizer.
        tag_dict = {"j": wordnet.ADJ,
                    "n": wordnet.NOUN,
                    "v": wordnet.VERB,
                    "r": wordnet.ADV}

        def normalize_word(word):
            """
            Lemmatizes `word` using its Part of Speech (POS), defaulting to noun
            """
            # Each word is tagged on its own (not in sentence context); kept as is
            # so the output matches what this function produced before.
            tag = nltk.pos_tag([word])[0][1][0].lower()
            return lemmatizer.lemmatize(word, tag_dict.get(tag, wordnet.NOUN))

    else:
        raise ValueError('Please enter CONFIG["NORMALIZER"] as "stem" or "lemma"')

    preprocessed_corpus = []
    for text in corpus:
        # remove html tags (replaced by nothing, so the text on either side is joined)
        text = re.sub(r'<[^>]+>', '', text)

        # replace non-alphanumeric
        text = re.sub(r"[^a-zA-Z0-9]+", " ", text)

        # replace unnecessary whitespaces
        text = re.sub(r"\s+", " ", text)

        preprocessed_corpus.append(
            ' '.join(normalize_word(word) for word in word_tokenize(text))
        )

    return preprocessed_corpus

In [26]:
def get_top_runs(experiment_id:int = 1, max_results:int = 3) -> list:
    '''
    Gets `run_ids` for the top runs (`max_results`) with the highest accuracies

    Searches the active runs of `experiment_id` ordered by `metrics.accuracy`
    descending, and prints the run id, start time, model name and metrics of
    each run found.

    Parameters:
        experiment_id: id of the MLflow experiment to search.
        max_results: maximum number of runs to return.

    Returns list of run_ids with the highest accuracies, best first
    '''
    runs = client.search_runs(
        # search_runs documents a list of experiment id strings
        experiment_ids=[str(experiment_id)],
        filter_string="",
        run_view_type=ViewType.ACTIVE_ONLY,
        max_results=max_results,
        order_by=["metrics.accuracy DESC"],
    )

    run_ids = []

    for run in runs:
        # A run logged without the `model_type` tag is still listed instead of
        # aborting the whole search with a KeyError.
        model_name = run.data.tags.get('model_type', 'unknown')
        print(f"Run id: {run.info.run_id}, start_time: {run.info.start_time}")
        print(f"Model_name: {model_name}, metrics: {run.data.metrics}")
        print("-"*100)
        run_ids.append(run.info.run_id)

    return run_ids

In [27]:
# Run ids of the best runs by accuracy, using the function defaults
# (experiment 1, at most 3 runs).
top_runs = get_top_runs()

Run id: 35b60b1ee6054fb5b78084524dfdd307, start_time: 1657177835952
Model_name: Naive Bayes, metrics: {'accuracy': 0.8143}
----------------------------------------------------------------------------------------------------
Run id: 202d3bb342ce46048b3bf3bf3dda6199, start_time: 1657180074773
Model_name: SGDClassifier, metrics: {'accuracy': 0.8015}
----------------------------------------------------------------------------------------------------
Run id: e2eff7851b2947f5aee823422f992585, start_time: 1657177970430
Model_name: Random Forest, metrics: {'accuracy': 0.7457}
----------------------------------------------------------------------------------------------------


In [28]:
def load_model(run_id: str, experiment_id: int = 1):
    '''
    Loads the scikit-learn model stored for an MLflow run

    The model is read from `mlflow/mlruns/<experiment_id>/<run_id>/artifacts`.
    NOTE(review): this path is relative, so it presumably resolves against the
    current working directory - confirm where the notebook is launched from.

    Parameters:
        run_id: id of the run whose model should be loaded (run ids are strings).
        experiment_id: id of the experiment the run belongs to.

    Returns the object produced by `mlflow.sklearn.load_model`
    '''
    model = mlflow.sklearn.load_model(f"mlflow/mlruns/{experiment_id}/{run_id}/artifacts")

    # Report success only after the load has actually succeeded; fall back to a
    # placeholder name when the run carries no `model_type` tag.
    model_name = client.get_run(run_id).data.tags.get('model_type', 'unknown')
    print(f"{model_name} loaded (run_id: {run_id})")
    return model

In [29]:
# Load the model of the first returned run, i.e. the one ranked highest by accuracy.
model = load_model(top_runs[0])

Naive Bayes loaded (run_id: 35b60b1ee6054fb5b78084524dfdd307)


In [31]:
# Sample movie review (the reviewer rates the movie 3/10) used to try out the loaded models.
# NOTE: the doubled quotes around `vividly` are adjacent string literals, so the
# resulting text contains the word without quotation marks.
test_sentence = "First of all, let's get a few things straight here: a) I AM an anime fan- always has been as a matter of fact (I used to watch Speed Racer all the time in Preschool). b) I DO like several B-Movies because they're hilarious. c) I like the Godzilla movies- a lot.<br /><br />Moving on, when the movie first comes on, it seems like it's going to be your usual B-movie, down to the crappy FX, but all a sudden- BOOM! the anime comes on! This is when the movie goes WWWAAAAAYYYYY downhill.<br /><br />The animation is VERY bad & cheap, even worse than what I remember from SPEED RACER, for crissakes! In fact, it's so cheap, one of the few scenes from the movie I ""vividly"" remember is when a bunch of kids run out of a school... & it's the same kids over & over again! The FX are terrible, too; the dinosaurs look worse than Godzilla. In addition, the transition to live action to animation is unorganized, the dialogue & voices(especially the English dub that I viewed) was horrid & I was begging my dad to take the tape out of the DVD/ VHS player; The only thing that kept me surviving was cracking out jokes & comments like the robots & Joel/Mike on MST3K (you pick the season). Honestly, this is the only way to barely enjoy this movie & survive it at the same time.<br /><br />Heck, I'm planning to show this to another fellow otaku pal of mine on Halloween for a B-Movie night. Because it's stupid, pretty painful to watch & unintentionally hilarious at the same time, I'm giving this movie a 3/10, an improvement from the 0.5/10 I was originally going to give it.<br /><br />(According to my grading scale: 3/10 means Pretty much both boring & bad. As fun as counting to three unless you find a way to make fun of it, then it will become as fun as counting to 15.)"

# The preprocessed input is the same for every model, so compute it once
# instead of re-running the preprocessing for each run.
test_features = preprocessor(test_sentence)

# Print the class probabilities each of the top runs assigns to the sample review.
for run in top_runs:
    model = load_model(run)
    print(model.predict_proba(test_features))

Naive Bayes loaded (run_id: 35b60b1ee6054fb5b78084524dfdd307)
[[0.75314729 0.24685271]]
SGDClassifier loaded (run_id: 202d3bb342ce46048b3bf3bf3dda6199)
[[0.75217694 0.24782306]]
Random Forest loaded (run_id: e2eff7851b2947f5aee823422f992585)
[[0.54692248 0.45307752]]
