In [1]:
%pip install seaborn

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Note: you may need to restart the kernel to use updated packages.


In [2]:

# Weights and Bias to version the dataset and experiment tracking
!pip install wandb

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com


In [1]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import re

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme(style="darkgrid")
import warnings
warnings.filterwarnings("ignore")

In [3]:
import wandb

In [4]:
# Authenticate this session with Weights & Biases before any run is created.
wandb.login()

[34m[1mwandb[0m: Currently logged in as: [33malokpadhi[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [5]:
import json
import random

In [6]:
# Label encoder to encode class labels
class LabelEncoder(object):
    """Encode class labels as unique integer ids and decode them back.

    Attributes:
        class_to_index: dict mapping each class label to its integer id.
        index_to_class: inverse mapping, integer id -> class label.
        classes: list of the labels, in the insertion order of `class_to_index`.
    """

    def __init__(self, class_to_index=None):
        """Create an encoder, optionally from an existing label -> id mapping.

        Args:
            class_to_index: optional mapping of label -> integer id. It is
                copied, so a later `fit` never modifies the caller's dict.
        """
        self.class_to_index = dict(class_to_index) if class_to_index else {}
        self._refresh_views()

    def _refresh_views(self):
        """Rebuild the attributes derived from `class_to_index`."""
        self.index_to_class = {v: k for k, v in self.class_to_index.items()}
        self.classes = list(self.class_to_index.keys())

    def __len__(self):
        """Return the number of known classes."""
        return len(self.class_to_index)

    def __str__(self):
        return f"<LabelEncoder(num_classes={len(self)})>"

    def fit(self, y):
        """Learn the mapping from the sorted unique values of `y`.

        Any previous mapping is replaced, so refitting cannot leave stale
        classes or duplicate ids behind.

        Args:
            y: iterable of labels accepted by `np.unique`.

        Returns:
            self, to allow chaining.
        """
        # .tolist() turns numpy scalars into built-in Python values, which keeps
        # the mapping JSON-serialisable in `save` (e.g. for integer labels).
        unique_labels = np.unique(y).tolist()
        self.class_to_index = {label: i for i, label in enumerate(unique_labels)}
        self._refresh_views()
        return self

    def encode(self, y):
        """Return an integer numpy array holding the id of every label in `y`.

        Raises:
            KeyError: if a label is not part of the mapping.
        """
        return np.array([self.class_to_index[item] for item in y], dtype=int)

    def decode(self, y):
        """Return the list of labels corresponding to the ids in `y`.

        Raises:
            KeyError: if an id is not part of the mapping.
        """
        return [self.index_to_class[item] for item in y]

    def save(self, fp):
        """Write the label -> id mapping to the JSON file at path `fp`.

        JSON object keys are always strings, so non-string labels come back
        as strings after a save/load round trip.
        """
        contents = {"class_to_index": self.class_to_index}
        with open(fp, "w") as out_file:
            json.dump(contents, out_file, indent=4, sort_keys=False)

    @classmethod
    def load(cls, fp):
        """Build an encoder from a JSON file previously written by `save`."""
        with open(fp, "r") as in_file:
            kwargs = json.load(fp=in_file)
        return cls(**kwargs)

In [7]:
# Rebuild the label encoder from its JSON artifact (path is relative to this notebook).
label_encoder = LabelEncoder.load("../../artifacts/label_encoder.json")

In [8]:
# Load the V1 train / validation / test splits (paths are relative to this notebook).
train_df = pd.read_parquet("../../datasets/V1/train.parquet")
val_df = pd.read_parquet("../../datasets/V1/valid.parquet")
test_df = pd.read_parquet("../../datasets/V1/test.parquet")  # removed the stray "//" before the file name

In [9]:
# Sanity check: (rows, columns) of the train, validation and test frames
tuple(frame.shape for frame in (train_df, val_df, test_df))

((31499, 2), (6750, 2), (6751, 2))

In [10]:
# Inputs come from the `text` column (as NumPy arrays); targets are the `rating` column.
X_train, y_train = train_df.text.to_numpy(), train_df.rating
X_val, y_val = val_df.text.to_numpy(), val_df.rating
X_test, y_test = test_df.text.to_numpy(), test_df.rating

In [11]:
# Replace the rating labels of every split with their integer class ids
y_train, y_val, y_test = map(label_encoder.encode, (y_train, y_val, y_test))

In [12]:
def set_seeds(seed=42):
    """Seed Python's `random` module and NumPy's global RNG for repeatable runs.

    Args:
        seed: value applied to both generators (defaults to 42).
    """
    random.seed(seed)
    np.random.seed(seed)

In [13]:
# Seed the RNGs once, before the vectorizer and the models are built
set_seeds()

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [15]:
# Keep a reference to the raw test texts for later comparison:
# X_test itself is rebound to its TF-IDF matrix in a later cell.
X_test_raw = X_test

In [16]:
# TF-IDF features over character n-grams, 2 to 7 characters long
vectorizer = TfidfVectorizer(
    analyzer="char",
    ngram_range=(2, 7),
)

In [17]:
# Fit the vectorizer on the training split only, then apply the fitted
# vectorizer to the validation and test splits.
# NOTE: the X_* names are rebound from raw text to TF-IDF matrices here, so this
# cell cannot be re-run without first re-running the cell that builds X_* from
# the data frames.
X_train = vectorizer.fit_transform(X_train)
X_val, X_test = (vectorizer.transform(texts) for texts in (X_val, X_test))

In [18]:
# Shape of the training TF-IDF matrix: (number of rows, number of n-gram features)
print(X_train.shape)

(31499, 6215674)


In [19]:
# Inverse-frequency weight for every encoded class id.
# NOTE(review): `class_weights` is not passed to any model in the cells shown
# here — confirm whether it is meant to be used.
counts = np.bincount(y_train)
class_weights = {
    class_id: 1.0 / n_samples
    for class_id, n_samples in enumerate(counts)
}
print(f"class counts: {counts},\nclass weights: {class_weights}")

class counts: [10499 10500 10500],
class weights: {0: 9.524716639679969e-05, 1: 9.523809523809524e-05, 2: 9.523809523809524e-05}


In [21]:
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import log_loss, precision_recall_fscore_support

In [23]:
# Baseline classifier: logistic regression with C=1
lr_model = LogisticRegression(C=1)

In [24]:
# Fit the baseline on the TF-IDF training matrix and the encoded labels
lr_model.fit(X_train, y_train)

LogisticRegression(C=1)

In [25]:
# Evaluate the logistic-regression baseline on the test split.
y_pred = lr_model.predict(X_test)
y_pred_proba = lr_model.predict_proba(X_test)

# The scores are bound to `prf_scores` rather than `metrics`: `metrics` is the
# name of the imported `sklearn.metrics` module, and rebinding it would hide
# the module from every later cell.
prf_scores = precision_recall_fscore_support(y_test, y_pred, average="weighted")

# The first three entries are precision, recall and F1 (the fourth is support).
performance = dict(zip(("precision", "recall", "f1"), prf_scores))
print(json.dumps(performance, indent=2))

{
  "precision": 0.8208461977458574,
  "recall": 0.8204710413272108,
  "f1": 0.8189114615101143
}


In [26]:
# Open a W&B run for the baseline and record its test-set scores
lr_model_run = wandb.init(project="stackoverflow-quality", name="LogisticRegression")
lr_model_run.log(
    {
        "precision": performance["precision"],
        "recall": performance["recall"],
        "f1-score": performance["f1"],
    }
)

In [None]:
# wandb.sklearn.plot_learning_curve(lr_model, X_train, y_train)

In [25]:
# Close the baseline's W&B run before the next run is started
wandb.finish()

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

In [33]:
# Linear classifier trained with SGD. Each fit() call is capped at one
# iteration (max_iter=1) and warm_start=True is set, so the epoch loop in a
# later cell calls fit() repeatedly.
# NOTE(review): early_stopping=True together with max_iter=1 — confirm this is
#   intended; a single-iteration fit leaves no room for a patience-based stop.
# NOTE(review): newer scikit-learn releases name this loss "log_loss" — confirm
#   against the installed version before upgrading.
# NOTE(review): power_t looks unused with learning_rate="constant" — confirm.
model = SGDClassifier(
    loss="log",
    penalty="l2",
    alpha=1e-4,
    max_iter=1,
    learning_rate="constant",
    eta0=1e-1,
    power_t=0.1,
    warm_start=True,
    early_stopping=True,
)

In [34]:
# Hyperparameters recorded with the SGDClassifier W&B run.
# `alpha` is a float (it used to be the string "1e-4") so that it is logged as
# a number and matches the `alpha=1e-4` passed to SGDClassifier.
config = {
    "epochs": 100,
    "loss": "log",
    "penalty": "l2",
    "alpha": 1e-4,
}

In [38]:
# Open a W&B run for the SGD classifier and attach the hyperparameter config to it
sgd_model_run = wandb.init(project="stackoverflow-quality", name="SGDClassifier", config=config)

In [39]:
# Train for a fixed number of epochs. The model is built with max_iter=1 and
# warm_start=True, so every fit() call is capped at one iteration.
# The epoch count is read from the config logged to W&B so the two cannot drift apart.
num_epochs = config["epochs"]
for epoch in range(num_epochs):
    model.fit(X_train, y_train)

    # Log-loss on the training and validation splits after this epoch
    y_train_pred = model.predict_proba(X_train)
    train_loss = log_loss(y_train, y_train_pred)

    y_val_pred = model.predict_proba(X_val)
    val_loss = log_loss(y_val, y_val_pred)

    # A single log call per epoch keeps both losses on the same W&B step;
    # two separate calls would each advance the step counter.
    wandb.log({"train_loss": train_loss, "valid_loss": val_loss})

    print(
        f"Epoch: {epoch: 02d} | "
        f"train_loss: {train_loss: .5f}, "
        f"val_loss: {val_loss: .5f}")

Epoch:  0 | train_loss:  0.72132, val_loss:  0.73096
Epoch:  1 | train_loss:  0.65284, val_loss:  0.67171
Epoch:  2 | train_loss:  0.61728, val_loss:  0.64321
Epoch:  3 | train_loss:  0.59791, val_loss:  0.62841
Epoch:  4 | train_loss:  0.59218, val_loss:  0.62552
Epoch:  5 | train_loss:  0.58717, val_loss:  0.62288
Epoch:  6 | train_loss:  0.57837, val_loss:  0.61630
Epoch:  7 | train_loss:  0.57907, val_loss:  0.61757
Epoch:  8 | train_loss:  0.57507, val_loss:  0.61500
Epoch:  9 | train_loss:  0.57697, val_loss:  0.61697
Epoch:  10 | train_loss:  0.57241, val_loss:  0.61350
Epoch:  11 | train_loss:  0.57325, val_loss:  0.61488
Epoch:  12 | train_loss:  0.57173, val_loss:  0.61362
Epoch:  13 | train_loss:  0.57074, val_loss:  0.61274
Epoch:  14 | train_loss:  0.57205, val_loss:  0.61421
Epoch:  15 | train_loss:  0.57287, val_loss:  0.61492
Epoch:  16 | train_loss:  0.57118, val_loss:  0.61328
Epoch:  17 | train_loss:  0.57398, val_loss:  0.61630
Epoch:  18 | train_loss:  0.56843, val

In [40]:
# Evaluate the SGD classifier on the test split.
y_pred = model.predict(X_test)
y_pred_proba = model.predict_proba(X_test)

# The scores are bound to `prf_scores` rather than `metrics`: `metrics` is the
# name of the imported `sklearn.metrics` module, and rebinding it would hide
# the module from every later cell.
prf_scores = precision_recall_fscore_support(y_test, y_pred, average="weighted")

# The first three entries are precision, recall and F1 (the fourth is support).
performance = dict(zip(("precision", "recall", "f1"), prf_scores))
print(json.dumps(performance, indent=2))

{
  "precision": 0.7984095150014032,
  "recall": 0.7935120722855873,
  "f1": 0.7897254458048573
}


In [41]:
# Record the SGD classifier's test-set scores on its W&B run
sgd_model_run.log(
    {
        "precision": performance["precision"],
        "recall": performance["recall"],
        "f1-score": performance["f1"],
    }
)