In [1]:
# Install seaborn (imported below as `sns`) into the running kernel's environment.
%pip install seaborn

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Note: you may need to restart the kernel to use updated packages.


In [2]:

# Weights & Biases (wandb): used below for experiment tracking and artifact versioning.
# `%pip` (rather than `!pip`) installs into the environment of the running kernel,
# which is the interpreter that will later execute `import wandb`.
%pip install wandb

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com


In [3]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import re

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Use seaborn's "darkgrid" theme for any figures drawn later in the notebook.
sns.set_theme(style="darkgrid")
import warnings
# NOTE(review): this silences every warning for the rest of the session
# (including library deprecation/convergence warnings) — consider narrowing it.
warnings.filterwarnings("ignore")

In [5]:
import wandb

In [6]:
# Authenticate this session with Weights & Biases. The API key is entered
# interactively (see the prompt in the cell output), not hard-coded here.
wandb.login()

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize


[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [7]:
import json
import random

In [8]:
# Label encoder to encode class labels
class LabelEncoder(object):
    """Encode class labels into unique integer ids and decode them back.

    Attributes:
        class_to_index: dict mapping each class label to its integer id.
        index_to_class: inverse mapping, integer id -> class label.
        classes: list of the known class labels, in mapping order.
    """

    def __init__(self, class_to_index=None):
        """Create an encoder, optionally from an existing mapping.

        Args:
            class_to_index: optional ``{label: index}`` mapping. It is copied,
                so a later ``fit`` never mutates the dict owned by the caller.
                ``None`` or an empty mapping yields an empty encoder.
        """
        self.class_to_index = dict(class_to_index) if class_to_index else {}
        self._refresh_lookups()

    def _refresh_lookups(self):
        """Rebuild the derived ``index_to_class`` / ``classes`` views."""
        self.index_to_class = {v: k for k, v in self.class_to_index.items()}
        self.classes = list(self.class_to_index.keys())

    def __len__(self):
        """Number of known classes."""
        return len(self.class_to_index)

    def __str__(self):
        return f"<LabelEncoder(num_classes={len(self)})>"

    def fit(self, y):
        """Learn the label -> id mapping from the labels in ``y``.

        Ids follow the sorted order of the unique labels (``np.unique``).
        The mapping is rebuilt from scratch, so refitting does not keep stale
        labels or produce two labels that share the same id.

        Returns:
            self, to allow chaining.
        """
        self.class_to_index = {
            class_: i for i, class_ in enumerate(np.unique(y))
        }
        self._refresh_lookups()
        return self

    def encode(self, y):
        """Map an iterable of labels to a 1-D integer numpy array of ids.

        Raises:
            KeyError: if a label is not part of the fitted mapping.
        """
        return np.array([self.class_to_index[item] for item in y], dtype=int)

    def decode(self, y):
        """Map an iterable of integer ids back to a list of labels.

        Raises:
            KeyError: if an id is not part of the fitted mapping.
        """
        return [self.index_to_class[item] for item in y]

    def save(self, fp):
        """Write the mapping to the JSON file at path ``fp``."""
        # Use a separate name for the handle so the path argument is not shadowed.
        with open(fp, "w") as f:
            contents = {"class_to_index": self.class_to_index}
            json.dump(contents, f, indent=4, sort_keys=False)

    @classmethod
    def load(cls, fp):
        """Build an encoder from a JSON file previously written by ``save``.

        The file's top-level keys are passed to the constructor as keyword
        arguments. JSON object keys are always strings, so labels come back
        as ``str`` after a round trip.
        """
        with open(fp, "r") as f:
            kwargs = json.load(fp=f)
        return cls(**kwargs)

In [9]:
# Rebuild the label encoder from the class-to-index mapping stored as JSON.
label_encoder = LabelEncoder.load("../../artifacts/label_encoder.json")

In [10]:
# Load the V1 train / validation / test splits from parquet files.
train_df = pd.read_parquet("../../datasets/V1/train.parquet")
val_df = pd.read_parquet("../../datasets/V1/valid.parquet")
# NOTE(review): this path has a doubled slash ("V1//test.parquet"), unlike the two above — confirm it is unintentional.
test_df = pd.read_parquet("../../datasets/V1//test.parquet")

In [11]:
# Sanity-check the split sizes: (rows, columns) of the train, validation and test frames.
train_df.shape, val_df.shape, test_df.shape

((31499, 2), (6750, 2), (6751, 2))

In [12]:
# Split each frame into model input (the `text` column as a numpy array)
# and target (the `rating` column, still un-encoded at this point).
X_train, y_train = train_df["text"].to_numpy(), train_df["rating"]
X_val, y_val = val_df["text"].to_numpy(), val_df["rating"]
X_test, y_test = test_df["text"].to_numpy(), test_df["rating"]

In [13]:
# Encode all our labels as integer class ids.
# The ratings are read from the source dataframes rather than from y_* themselves,
# so this cell is idempotent: re-running it would otherwise feed already-encoded
# integer ids back into `encode`, which looks each item up as a label.
y_train = label_encoder.encode(train_df.rating)

y_val = label_encoder.encode(val_df.rating)

y_test = label_encoder.encode(test_df.rating)

In [14]:
def set_seeds(seed=42):
    """Seed NumPy's and Python's global random generators.

    Args:
        seed: integer seed applied to both generators (default 42).
    """
    for seed_fn in (np.random.seed, random.seed):
        seed_fn(seed)

In [15]:
# Seed the global RNGs (default seed 42) before any stochastic step below.
set_seeds()

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [17]:
# Keep the raw test texts to compare against predictions later.
# Read them from the dataframe instead of aliasing X_test: X_test is overwritten
# with its TF-IDF matrix further down, so re-running this cell afterwards would
# otherwise store the vectorized matrix under the "raw" name.
X_test_raw = test_df.text.to_numpy()

In [18]:
# TF-IDF over character n-grams of length 2 through 7 (analyzer="char").
vectorizer = TfidfVectorizer(analyzer="char", ngram_range=(2,7)) # character n-gram

In [20]:
# Fit the vectorizer on the raw training texts and vectorize them in one step.
# The texts are taken from train_df (not from X_train itself) so the cell can be
# re-run: X_train holds the TF-IDF matrix, not text, after the first execution.
X_train = vectorizer.fit_transform(train_df.text.to_numpy())

In [19]:
import pickle

In [21]:
# Persist the fitted vectorizer so the same vocabulary/IDF weights can be reused
# outside this notebook; this file is what gets uploaded as a W&B artifact below.
with open('../../artifacts/vectorizer.pkl', 'wb') as fp:
    pickle.dump(vectorizer, fp)

In [69]:
# Start a W&B run named "TFIDFVectorizer" in the "stackoverflow-quality" project;
# it is used below to log the pickled vectorizer as an artifact.
run = wandb.init(project="stackoverflow-quality", name="TFIDFVectorizer")

VBox(children=(Label(value='179.238 MB of 179.238 MB uploaded (179.238 MB deduped)\r'), FloatProgress(value=1.…

In [70]:
# Metadata attached to the vectorizer artifact: which vectorizer type it holds.
metadata = {"vectorizer": "TFIDF"}

In [71]:
from pathlib import Path

In [72]:
# Version the pickled vectorizer as a W&B artifact and attach it to the active run.
# NOTE(review): the artifact type is spelled "preprcossing_data" (likely meant
# "preprocessing_data"). It is a runtime string, so renaming it may conflict with
# artifacts already logged under this type — confirm before changing.
vect_artifacts = wandb.Artifact("vectorizer_artifacts", type="preprcossing_data", metadata=metadata, description="TFIDF vectorizer in pickle format")
vect_artifacts.add_file(Path("../../artifacts/vectorizer.pkl"))
run.log_artifact(vect_artifacts)

<wandb.sdk.wandb_artifacts.Artifact at 0x7fb76a76fb80>

In [27]:
# Vectorize the validation and test texts with the vocabulary fitted on the
# training split (transform only — no refitting). The texts are read from the
# dataframes rather than from X_val / X_test so the cell stays idempotent:
# after the first run those names hold TF-IDF matrices, not text.
X_val = vectorizer.transform(val_df.text.to_numpy())
X_test = vectorizer.transform(test_df.text.to_numpy())

In [28]:
# Shape of the vectorized training matrix: (number of documents, number of n-gram features).
print(X_train.shape)

(31499, 6215674)


In [29]:
# Inverse-frequency class weights: the weight of class i is 1 / (number of
# training labels equal to i), computed for all classes in one vectorized division.
counts = np.bincount(y_train)
class_weights = dict(enumerate(1.0 / counts))
print(f"class counts: {counts},\nclass weights: {class_weights}")

class counts: [10499 10500 10500],
class weights: {0: 9.524716639679969e-05, 1: 9.523809523809524e-05, 2: 9.523809523809524e-05}


In [30]:
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import log_loss, precision_recall_fscore_support

In [31]:
# Baseline classifier: logistic regression with C=1 passed explicitly and every
# other setting left at the library default.
lr_model = LogisticRegression(C=1)

In [32]:
# Train the baseline on the TF-IDF training matrix and the encoded labels.
lr_model.fit(X_train, y_train)

LogisticRegression(C=1)

In [33]:
# Evaluate the logistic-regression baseline on the held-out test split.
y_pred = lr_model.predict(X_test)
y_pred_proba = lr_model.predict_proba(X_test)
# Store the scores under their own name: `metrics` is the sklearn module imported
# earlier in this notebook, and rebinding it to a tuple would break any later
# `metrics.<function>` call.
prf_scores = precision_recall_fscore_support(y_test, y_pred, average="weighted")
# Weighted-average precision / recall / F1 (the first three returned values).
performance = {"precision": prf_scores[0], "recall": prf_scores[1], "f1": prf_scores[2]}
print(json.dumps(performance, indent=2))

{
  "precision": 0.8208461977458574,
  "recall": 0.8204710413272108,
  "f1": 0.8189114615101143
}


In [34]:
# End the currently active W&B run before a new one is started for the model.
wandb.finish()

VBox(children=(Label(value='179.238 MB of 179.238 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0,…

In [35]:
# Start a dedicated W&B run for the logistic-regression baseline.
lr_model_run = wandb.init(project="stackoverflow-quality", name="LogisticRegression")
# Record the test-set scores held in `performance` (note the logged key is "f1-score").
lr_model_run.log({"precision": performance["precision"], "recall": performance["recall"], "f1-score": performance["f1"]})

In [36]:
# Serialize the trained logistic-regression model; the file is uploaded as a
# W&B model artifact below.
with open("../../model-artifacts/log-reg.pkl", "wb") as fp:
    pickle.dump(lr_model, fp)

In [38]:
# Display the name of the currently active W&B run (bare expression -> cell output).
wandb.run.name

'LogisticRegression'

In [40]:
# model versioning
# Upload the pickled logistic-regression model as a W&B artifact of type "model",
# named "<run name>_<run id>" after the active run, and attach it to lr_model_run.
model_art = wandb.Artifact(f"{wandb.run.name}_{wandb.run.id}", type="model")
model_art.add_file('../../model-artifacts/log-reg.pkl')
lr_model_run.log_artifact(model_art)

<wandb.sdk.wandb_artifacts.Artifact at 0x7fb75779ac40>

In [None]:
# wandb.sklearn.plot_learning_curve(lr_model, X_train, y_train)

In [41]:
# End the logistic-regression run; the summary of logged metrics is shown in the output.
wandb.finish()

VBox(children=(Label(value='142.267 MB of 142.267 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0,…

0,1
f1-score,▁
precision,▁
recall,▁

0,1
f1-score,0.81891
precision,0.82085
recall,0.82047


In [55]:
# initialize model
# Linear classifier trained with SGD using log loss and an L2 penalty.
# max_iter=1 with warm_start=True is what lets the training loop below call
# fit() once per epoch and log the loss in between.
# NOTE(review): loss="log" is the spelling used by older scikit-learn releases;
#   newer releases name it "log_loss" — confirm against the installed version.
# NOTE(review): early_stopping=True presumably holds out part of the training
#   data on every fit() call, yet with max_iter=1 it has no epochs to stop — verify it is wanted.
# NOTE(review): power_t looks unused with learning_rate="constant" — confirm.
model = SGDClassifier(
    loss="log", penalty="l2", alpha=1e-4, max_iter=1,
    learning_rate="constant", eta0=1e-1, power_t=0.1,
    warm_start=True, early_stopping=True
)

In [56]:
# Hyperparameters recorded with the W&B run and the model artifact.
# `alpha` is stored as a float (not the string "1e-4") so it matches the value
# passed to SGDClassifier and is tracked as a number rather than as text.
config={
    "epochs": 50,
    "loss": "log",
    "penalty": "l2",
    "alpha": 1e-4
}

In [58]:
# Make sure no earlier W&B run is still active before the SGD run is started.
wandb.finish()

In [59]:
# Start the W&B run for the SGD classifier and attach the hyperparameter dict to it.
sgd_model_run = wandb.init(project="stackoverflow-quality", name="SGDClassifier", config=config)

In [60]:
num_epochs = 50
for epoch in range(num_epochs):
    # The model was built with max_iter=1 and warm_start=True, so each fit()
    # call is used here as one additional training epoch.
    model.fit(X_train, y_train)

    # Log loss on the training split after this epoch.
    y_train_pred = model.predict_proba(X_train)
    train_loss = log_loss(y_train, y_train_pred)

    # Log loss on the validation split after this epoch.
    y_val_pred = model.predict_proba(X_val)
    val_loss = log_loss(y_val, y_val_pred)

    # Log both losses in ONE call: every wandb.log() call advances the run's
    # step by default, so two separate calls would put the train and validation
    # curves on different steps (and use two steps per epoch).
    wandb.log({"train_loss": train_loss, "valid_loss": val_loss})

    print(
        f"Epoch: {epoch: 02d} | "
        f"train_loss: {train_loss: .5f}, "
        f"val_loss: {val_loss: .5f}")

Epoch:  0 | train_loss:  0.72050, val_loss:  0.72988
Epoch:  1 | train_loss:  0.65023, val_loss:  0.66909
Epoch:  2 | train_loss:  0.61815, val_loss:  0.64384
Epoch:  3 | train_loss:  0.60002, val_loss:  0.63021
Epoch:  4 | train_loss:  0.58907, val_loss:  0.62303
Epoch:  5 | train_loss:  0.58406, val_loss:  0.62034
Epoch:  6 | train_loss:  0.58073, val_loss:  0.61893
Epoch:  7 | train_loss:  0.57959, val_loss:  0.61842
Epoch:  8 | train_loss:  0.57293, val_loss:  0.61316
Epoch:  9 | train_loss:  0.57670, val_loss:  0.61702
Epoch:  10 | train_loss:  0.57373, val_loss:  0.61491
Epoch:  11 | train_loss:  0.57420, val_loss:  0.61556
Epoch:  12 | train_loss:  0.57173, val_loss:  0.61393
Epoch:  13 | train_loss:  0.57043, val_loss:  0.61316
Epoch:  14 | train_loss:  0.57392, val_loss:  0.61548
Epoch:  15 | train_loss:  0.57369, val_loss:  0.61526
Epoch:  16 | train_loss:  0.57172, val_loss:  0.61365
Epoch:  17 | train_loss:  0.57211, val_loss:  0.61436
Epoch:  18 | train_loss:  0.57164, val

In [61]:
# Evaluate the SGD classifier on the held-out test split.
y_pred = model.predict(X_test)
y_pred_proba = model.predict_proba(X_test)
# Store the scores under their own name: `metrics` is the sklearn module imported
# earlier in this notebook, and rebinding it to a tuple would break any later
# `metrics.<function>` call.
prf_scores = precision_recall_fscore_support(y_test, y_pred, average="weighted")
# Weighted-average precision / recall / F1 (the first three returned values).
performance = {"precision": prf_scores[0], "recall": prf_scores[1], "f1": prf_scores[2]}
print(json.dumps(performance, indent=2))

{
  "precision": 0.7984095150014032,
  "recall": 0.7935120722855873,
  "f1": 0.7897254458048573
}


In [62]:
# Record the SGD classifier's test-set scores on its run (note the logged key is "f1-score").
sgd_model_run.log({"precision": performance["precision"], "recall": performance["recall"], "f1-score": performance["f1"]})

In [63]:
# Serialize the trained SGD classifier; the file is uploaded as a W&B model artifact below.
with open("../../model-artifacts/sgd.pkl", "wb") as fp:
    pickle.dump(model, fp)

In [64]:
# model versioning
# Upload the pickled SGD classifier as a W&B artifact of type "model", named
# "<run name>_<run id>" after the active run, with the hyperparameter dict as metadata.
model_art = wandb.Artifact(f"{wandb.run.name}_{wandb.run.id}", type="model", metadata=config)
model_art.add_file('../../model-artifacts/sgd.pkl')
sgd_model_run.log_artifact(model_art)

<wandb.sdk.wandb_artifacts.Artifact at 0x7fb7595be280>

In [73]:
# End the active W&B run so that pending uploads are flushed.
wandb.finish()

VBox(children=(Label(value='179.238 MB of 179.238 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0,…