In [1]:
import torch
import pandas as pd
import numpy as np
from sklearn.neural_network import MLPClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
import wandb
from pprint import pprint

from utils import load_embeddings
from model_eval.eval_predictions import evaluate

  from .autonotebook import tqdm as notebook_tqdm


In [24]:
# Pickled T5-large embeddings of the Jigsaw test split. The "{}" placeholder in
# the file name is passed through to load_embeddings unchanged.
embeddings_path = '/home/ubuntu/dev/detoxify/jigsaw-toxic-comment-classification-challenge/embeddings/embeddings_t5-large_test_This_text_is_about_{}.pkl'
ids, embeddings, metadata = load_embeddings(embeddings_path)

In [25]:
# Read the test labels; "id" is forced to string so it is not parsed as a number.
labels_path = '/home/ubuntu/dev/detoxify/jigsaw-toxic-comment-classification-challenge/data/test_labels.csv'
labels = pd.read_csv(labels_path, dtype={"id": "string"})

# Every column except "id" is treated as one target class.
classes = labels.columns.tolist()
classes.remove("id")

# Label matrix with one row per comment and one column per class.
y = labels.loc[:, classes].to_numpy()

In [26]:
# Keep only fully labelled rows. Presumably -1 marks comments that were not
# scored in the Jigsaw test set (TODO confirm against the dataset description).
# Using .all() rather than .any() guarantees that no -1 survives into y_m, where
# it would otherwise be learned as a third class by the per-label classifiers.
# NOTE(review): this relies on `embeddings` rows being in the same order as
# test_labels.csv; the `ids` returned by load_embeddings are never compared
# with labels["id"] -- confirm the ordering matches.
mask = (y != -1).all(axis=1)
y_m = y[mask, :]
embeddings_m = embeddings[mask, :]

# Fixed random_state so the train/test partition is reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(embeddings_m, y_m, random_state=1)

# Last expression of the cell so the filtered shapes are actually displayed.
y_m.shape, embeddings_m.shape

In [4]:
# The sweep definition is read from config-wandb.yaml. An earlier inline
# version of the configuration, kept here for reference:
#   name: "my-sweep"
#   method: "random"
#   metric: {name: "mean_auc", goal: "maximize"}
#   parameters:
#     learning_rate: {min: 0.0001, max: 0.1}
#     model: {values: ['randomForest', 'mlp']}
#     (epochs: {values: [10, 20, 50]} was disabled)
# NOTE(review): the config printed by this cell shows 'model': {'value': [...]}.
# W&B treats 'value' as a single constant, so every run would receive the whole
# list as `model`; this was probably meant to be 'values' -- confirm in the YAML.

import yaml

with open("config-wandb.yaml", 'r') as config_file:
    sweep_config = yaml.safe_load(config_file)

# Show the parsed configuration before registering it.
pprint(sweep_config)

# Register the sweep with W&B; the returned id is what wandb.agent needs.
sweep_id = wandb.sweep(sweep_config)

{'method': 'random',
 'metric': {'goal': 'maximize', 'name': 'mean_auc'},
 'name': 'sklearn-sweep',
 'parameters': {'learning_rate': {'max': 0.1, 'min': 0.0001},
                'model': {'value': ['randomForest', 'mlp']}}}
Create sweep with ID: r8rkrj8c
Sweep URL: https://wandb.ai/anitavero/uncategorized/sweeps/r8rkrj8c


In [77]:
def train():
    """Run one W&B sweep trial: fit a per-label classifier and log its mean AUC.

    The hyperparameters picked by the sweep are read from ``wandb.config``:
    ``model`` selects the base estimator and, for the MLP only,
    ``learning_rate`` sets ``learning_rate_init``. The estimator is wrapped in
    a ``MultiOutputClassifier``, fitted on the notebook-level
    ``X_train``/``y_train`` and scored on ``X_test``/``y_test`` via
    ``evaluate``. The resulting ``mean_auc`` is printed and logged to W&B.

    Raises:
        ValueError: if ``wandb.config.model`` is neither ``'randomForest'``
            nor ``'mlp'`` (previously this surfaced as an unbound ``clf``).
    """
    wandb.init()    # required to have access to `wandb.config`
    wb_config = wandb.config
    model_name = wb_config.model

    # Exactly one branch must bind `clf`; anything else is a configuration error.
    if model_name == 'randomForest':
        clf = RandomForestClassifier(n_jobs=7, verbose=True)
    elif model_name == 'mlp':
        clf = MLPClassifier(max_iter=10, verbose=True, learning_rate_init=wb_config["learning_rate"])
    else:
        raise ValueError(
            f"Unsupported model {model_name!r}; expected 'randomForest' or 'mlp'. "
            "Check the 'model' parameter of the sweep configuration."
        )

    # One independent copy of `clf` is trained per label column.
    mo_clf = MultiOutputClassifier(estimator=clf)
    mo_clf.fit(X_train, y_train)  # fit() returns the estimator, not a loss value

    def predict_prob(embs):
        """Return an (n_samples, n_labels) array of positive-class probabilities."""
        # predict_proba yields one (n_samples, n_classes) array per label;
        # column 1 is taken as the positive class.
        # NOTE(review): assumes both classes occur in y_train for every label.
        preds = mo_clf.predict_proba(embs)
        return np.column_stack([p[:, 1] for p in preds])

    y_probs = predict_prob(X_test)
    mean_auc = evaluate(y_probs, y_test)['mean_auc']
    print("MEAN AUC:", mean_auc)
    wandb.log({"mean_auc": mean_auc})

In [78]:
# Number of sweep trials to execute from this kernel; each trial calls train().
count = 2
wandb.agent(sweep_id=sweep_id, function=train, count=count)

[Parallel(n_jobs=7)]: Using backend LokyBackend with 7 concurrent workers.
[Parallel(n_jobs=7)]: Done  36 tasks      | elapsed:   17.7s
[Parallel(n_jobs=7)]: Done 100 out of 100 | elapsed:   42.1s finished
[Parallel(n_jobs=7)]: Using backend LokyBackend with 7 concurrent workers.
[Parallel(n_jobs=7)]: Done  36 tasks      | elapsed:   11.5s
[Parallel(n_jobs=7)]: Done 100 out of 100 | elapsed:   29.5s finished
[Parallel(n_jobs=7)]: Using backend LokyBackend with 7 concurrent workers.
[Parallel(n_jobs=7)]: Done  36 tasks      | elapsed:   14.9s
[Parallel(n_jobs=7)]: Done 100 out of 100 | elapsed:   39.0s finished
[Parallel(n_jobs=7)]: Using backend LokyBackend with 7 concurrent workers.
[Parallel(n_jobs=7)]: Done  36 tasks      | elapsed:   13.6s
[Parallel(n_jobs=7)]: Done 100 out of 100 | elapsed:   34.6s finished
[Parallel(n_jobs=7)]: Using backend LokyBackend with 7 concurrent workers.
[Parallel(n_jobs=7)]: Done  36 tasks      | elapsed:   14.9s
[Parallel(n_jobs=7)]: Done 100 out of 10

MEAN AUC: 0.9074290279893017
Iteration 1, loss = 0.18575724
Iteration 2, loss = 0.16109254
Iteration 3, loss = 0.15761680
Iteration 4, loss = 0.15421153
Iteration 5, loss = 0.15053281
Iteration 6, loss = 0.14876830
Iteration 7, loss = 0.14529643
Iteration 8, loss = 0.14321018
Iteration 9, loss = 0.14342068
Iteration 10, loss = 0.14072729




Iteration 1, loss = 0.03297218
Iteration 2, loss = 0.02017528
Iteration 3, loss = 0.01983781
Iteration 4, loss = 0.01941928
Iteration 5, loss = 0.01864224
Iteration 6, loss = 0.01845024
Iteration 7, loss = 0.01877085
Iteration 8, loss = 0.01833980
Iteration 9, loss = 0.01776490
Iteration 10, loss = 0.01885892




Iteration 1, loss = 0.12835079
Iteration 2, loss = 0.10750187
Iteration 3, loss = 0.10515020
Iteration 4, loss = 0.10151810
Iteration 5, loss = 0.09957776
Iteration 6, loss = 0.09929834
Iteration 7, loss = 0.09650205
Iteration 8, loss = 0.09428344
Iteration 9, loss = 0.09354739
Iteration 10, loss = 0.08983738




Iteration 1, loss = 0.02660083
Iteration 2, loss = 0.01279813
Iteration 3, loss = 0.01158710
Iteration 4, loss = 0.01175843
Iteration 5, loss = 0.01119007
Iteration 6, loss = 0.01079137
Iteration 7, loss = 0.01118282
Iteration 8, loss = 0.01051206
Iteration 9, loss = 0.00988151
Iteration 10, loss = 0.00904838




Iteration 1, loss = 0.12975775
Iteration 2, loss = 0.10853962
Iteration 3, loss = 0.10322639
Iteration 4, loss = 0.10114804
Iteration 5, loss = 0.09893477
Iteration 6, loss = 0.09782942
Iteration 7, loss = 0.09445798
Iteration 8, loss = 0.09353488
Iteration 9, loss = 0.09243904
Iteration 10, loss = 0.09148909




Iteration 1, loss = 0.05530907
Iteration 2, loss = 0.03560440
Iteration 3, loss = 0.03202646
Iteration 4, loss = 0.03157063
Iteration 5, loss = 0.03209549
Iteration 6, loss = 0.03118966
Iteration 7, loss = 0.02991697
Iteration 8, loss = 0.02813617
Iteration 9, loss = 0.02744080
Iteration 10, loss = 0.02712898




MEAN AUC: 0.9581781268228055


In [17]:
# Scratch check: re-read config-wandb.yaml and display the parsed object.
# Note that this rebinds `sweep_config` and relies on `yaml` having been
# imported by an earlier cell.
with open("config-wandb.yaml", 'r') as config_file:
    sweep_config = yaml.safe_load(config_file)
sweep_config

['a', 'b']

In [27]:
# Smoke test: fit a single MLP with two hidden layers directly on the first
# ten training rows (no MultiOutputClassifier wrapper) to check that it trains.
c = MLPClassifier(
    max_iter=10,
    verbose=True,
    hidden_layer_sizes=[100, 300],
)
c.fit(X_train[:10], y_train[:10])

Iteration 1, loss = 4.06547419
Iteration 2, loss = 3.93523666
Iteration 3, loss = 3.81115383
Iteration 4, loss = 3.68740980
Iteration 5, loss = 3.56231390
Iteration 6, loss = 3.42896180
Iteration 7, loss = 3.28559084
Iteration 8, loss = 3.13336045
Iteration 9, loss = 2.97204688
Iteration 10, loss = 2.80179909


