## Baseline
    * First we will create a baseline and iteratively improve the performance with more complex models
    * Along the way we will try to fix any data-related issues, such as class imbalance, and try different vectorization methods, etc.

### Parameters to validate the final model
    * Performance per class
    * Latency
    * Size of the model
    * Inference cost
    * Bias check
    * Maintenance Cost

### Baseline Model
    * Randomly generated targets
    * Simple rule based model

## Random target Model

In [1]:
import random

In [2]:
%pip install seaborn

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Note: you may need to restart the kernel to use updated packages.


In [3]:
# Weights and Bias to version the dataset
!pip install wandb

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com


In [4]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import re

In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme(style="darkgrid")
import warnings
warnings.filterwarnings("ignore")

In [6]:
import wandb

In [7]:
wandb.login()

[34m[1mwandb[0m: Currently logged in as: [33malokpadhi[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [8]:
import json

In [9]:
# Label encoder to encode class labels
class LabelEncoder(object):
    """Encode class labels into unique integer ids and decode them back.

    Attributes:
        class_to_index: dict mapping each label to its integer id.
        index_to_class: inverse mapping, integer id -> label.
        classes: list of the known labels, in `class_to_index` key order.
    """
    def __init__(self, class_to_index=None):
        # Copy the mapping so a later `fit` never mutates the caller's dict,
        # and avoid a mutable default argument in the signature.
        self.class_to_index = dict(class_to_index) if class_to_index else {}
        self._refresh()

    def _refresh(self):
        """Rebuild `index_to_class` and `classes` from `class_to_index`."""
        self.index_to_class = {v: k for k, v in self.class_to_index.items()}
        self.classes = list(self.class_to_index.keys())

    def __len__(self):
        """Return the number of known classes."""
        return len(self.class_to_index)

    def __str__(self):
        return f"<LabelEncoder(num_classes={len(self)})>"

    def fit(self, y):
        """Learn the label -> id mapping from the labels in `y`.

        Ids are assigned in the sorted order returned by `np.unique`.
        Any previously held mapping is discarded; merging would leave stale
        labels behind that share ids with the newly fitted ones.

        Returns:
            self, to allow chaining.
        """
        # .tolist() converts NumPy scalars to native Python objects so the
        # mapping stays serialisable by `json.dump` in `save`.
        classes = np.unique(y).tolist()
        self.class_to_index = {class_: i for i, class_ in enumerate(classes)}
        self._refresh()
        return self

    def encode(self, y):
        """Map an iterable of labels to a NumPy integer array of ids.

        Raises:
            KeyError: if a label is not present in `class_to_index`.
        """
        return np.array([self.class_to_index[item] for item in y], dtype=int)

    def decode(self, y):
        """Map an iterable of integer ids back to a list of labels.

        Raises:
            KeyError: if an id is not present in `index_to_class`.
        """
        return [self.index_to_class[item] for item in y]

    def save(self, fp):
        """Write the label -> id mapping as JSON to the path `fp`."""
        contents = {"class_to_index": self.class_to_index}
        with open(fp, "w") as file:
            json.dump(contents, file, indent=4, sort_keys=False)

    @classmethod
    def load(cls, fp):
        """Build an encoder from a JSON file previously written by `save`.

        The top-level JSON object is passed to the constructor as keyword
        arguments.
        """
        with open(fp, "r") as file:
            kwargs = json.load(fp=file)
        return cls(**kwargs)

In [11]:
# Restore the label encoder from its JSON artifact; LabelEncoder.load passes the
# parsed JSON object to the constructor as keyword arguments.
label_encoder = LabelEncoder.load("../../artifacts/label_encoder.json")

In [12]:
# Read the V1 train / validation / test splits, in that order
train_df, val_df, test_df = (
    pd.read_parquet(split_path)
    for split_path in (
        "../../datasets/V1/train.parquet",
        "../../datasets/V1/valid.parquet",
        "../../datasets/V1//test.parquet",
    )
)

In [13]:
# Sanity check: (rows, columns) of the train, validation and test frames
tuple(split.shape for split in (train_df, val_df, test_df))

((31499, 2), (6750, 2), (6751, 2))

In [14]:
# Features are the `text` column as NumPy arrays; targets are the `rating` column
X_train, y_train = train_df["text"].to_numpy(), train_df["rating"]
X_val, y_val = val_df["text"].to_numpy(), val_df["rating"]
X_test, y_test = test_df["text"].to_numpy(), test_df["rating"]

In [15]:
# Encode all our labels.
# Read the raw ratings from the source frames instead of re-encoding y_* in place,
# so this cell can be re-run safely: encoding already-encoded integers would raise
# a KeyError because the encoder's keys are the label names.
y_train = label_encoder.encode(train_df.rating)
y_val = label_encoder.encode(val_df.rating)
y_test = label_encoder.encode(test_df.rating)

In [16]:
def set_seeds(seed=42):
    """Seed Python's `random` module and NumPy's global RNG with `seed`."""
    random.seed(seed)
    np.random.seed(seed)

In [17]:
# print() goes through LabelEncoder.__str__, which reports the number of classes
print(label_encoder)

<LabelEncoder(num_classes=3)>


In [18]:
# Known labels, listed in the key order of label_encoder.class_to_index
print(label_encoder.classes)

['HQ', 'LQ_CLOSE', 'LQ_EDIT']


In [19]:
# Random baseline: draw a uniformly random class id for every test example.
# Seed first so the draw, and the metrics computed from it, are reproducible on re-run.
set_seeds()
y_pred = np.random.randint(low=0, high=len(label_encoder), size=len(y_test))
print(y_pred.shape)
print(y_pred[:5])

(6751,)
[0 1 0 0 1]


In [20]:
from sklearn.metrics import precision_recall_fscore_support

In [21]:
# Evaluate the random prediction
metrics = precision_recall_fscore_support(y_test, y_pred, average="weighted")
performance = {"precision": metrics[0], "recal": metrics[1], "f1": metrics[2]}
print(json.dumps(performance, indent=2))

{
  "precision": 0.33354698483187256,
  "recal": 0.333580210339209,
  "f1": 0.33354856549136974
}


In [22]:
random_model_run = wandb.init(project="stackoverflow-quality", name="Random-Model")
random_model_run.log({"precision": performance["precision"], "recall": performance["recal"], "f1-score": performance["f1"]})

## Rule Based Model

In [23]:
from collections import Counter

In [24]:
# For each class: count the space-separated tokens over all of its training texts,
# take the 25 most frequent, then drop single-character tokens (so each class keeps
# at most 25 entries).
# NOTE: splitting on ' ' only means newline runs such as '\n\n' survive as tokens.
target_word_dist = {}
for class_name in label_encoder.classes:
    class_texts = train_df.loc[train_df.rating == class_name, 'text'].values
    token_counts = Counter(" ".join(class_texts).split(' '))
    target_word_dist[class_name] = [
        token for token, _ in token_counts.most_common(25) if len(token) > 1
    ]

In [25]:
# Inspect the words kept for each class: at most 25 per class, since the previous
# cell takes the 25 most common tokens and then drops single-character ones
target_word_dist

{'HQ': ['code',
  '\n\n',
  'pre',
  'gt',
  'lt',
  'com',
  'android',
  'li',
  'strong',
  'error',
  'app',
  'using',
  'href',
  'java',
  'file',
  'class',
  'get',
  'new'],
 'LQ_CLOSE': ['\n\n',
  'code',
  'lt',
  'gt',
  'pre',
  'string',
  'class',
  'new',
  'name',
  'android',
  'int',
  'id',
  'data',
  'com',
  'want',
  'like',
  'get',
  'java'],
 'LQ_EDIT': ['\r\n',
  '\r\n\r\n',
  'id',
  'class',
  'android',
  'string',
  'name',
  'new',
  'code',
  'data',
  'div',
  'value',
  'com',
  'int',
  'public',
  'text',
  'file',
  'get']}

In [26]:
import operator

In [27]:
# Match the input string words with each class word whichever gives the highest matching assign that class
def match_words(input_str, word_dist=None):
    """Predict the class whose frequent-word list best matches `input_str`.

    The input is split on single spaces and every token, duplicates included,
    that appears in a class's word list counts as one match for that class.

    Args:
        input_str: text to classify.
        word_dist: optional dict mapping class name -> iterable of words.
            Defaults to the notebook-level `target_word_dist`.

    Returns:
        The class name with the highest match count. On a tie (including the
        case of zero matches everywhere) the first class in `word_dist`
        iteration order wins, because `max` returns the first maximal item.

    Raises:
        ValueError: if `word_dist` is empty.
    """
    if word_dist is None:
        word_dist = target_word_dist
    input_str_words = input_str.split(' ')
    target_match_count = {}
    for target_, target_words in word_dist.items():
        # Set membership avoids rescanning the word list for every token
        vocabulary = set(target_words)
        target_match_count[target_] = sum(word in vocabulary for word in input_str_words)
    return max(target_match_count, key=target_match_count.get)

In [28]:
# Apply the rule-based matcher to every test text (yields class names)
y_pred = list(map(match_words, X_test))

In [29]:
# Convert the predicted class names to integer ids with the same encoder used for y_test.
# NOTE: y_pred is overwritten, so re-running this cell on its own raises a KeyError
# (encode looks every item up in class_to_index, whose keys are the class names).
y_pred = label_encoder.encode(y_pred)

In [30]:
# Shape of the encoded prediction array
y_pred.shape

(6751,)

In [32]:
# Weighted-average precision / recall / F1 of the rule-based predictions
metrics = precision_recall_fscore_support(y_test, y_pred, average="weighted")
# zip stops after the three named scores, ignoring the trailing support entry
performance = dict(zip(("precision", "recall", "f1"), metrics))
print(json.dumps(performance, indent=2))

{
  "precision": 0.6556052475456681,
  "recall": 0.6092430750999852,
  "f1": 0.6170609511840676
}


In [34]:
# Log the rule-based baseline to its own W&B run and close the run explicitly
# instead of leaving it open until the kernel exits or another run is started.
rulebased_model_run = wandb.init(project="stackoverflow-quality", name="Rule-Based")
rulebased_model_run.log({"precision": performance["precision"], "recall": performance["recall"], "f1-score": performance["f1"]})
rulebased_model_run.finish()

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
f1-score,▁
precision,▁
recall,▁

0,1
f1-score,0.61706
precision,0.65561
recall,0.60924
