In [40]:
%pip install seaborn

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Note: you may need to restart the kernel to use updated packages.


In [41]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme()
import warnings
warnings.filterwarnings("ignore")

In [42]:
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

## Reading the data

In [43]:
# Read the training CSV (path is relative to this notebook's working directory)
# and preview the first rows to check the columns that were loaded.
data_train = pd.read_csv('../../datasets/train.csv')
data_train.head()

Unnamed: 0,Id,Title,Body,Tags,CreationDate,Y
0,34552656,Java: Repeat Task Every Random Seconds,<p>I'm already familiar with repeating tasks e...,<java><repeat>,2016-01-01 00:21:59,LQ_CLOSE
1,34553034,Why are Java Optionals immutable?,<p>I'd like to understand why Java 8 Optionals...,<java><optional>,2016-01-01 02:03:20,HQ
2,34553174,Text Overlay Image with Darkened Opacity React...,<p>I am attempting to overlay a title over an ...,<javascript><image><overlay><react-native><opa...,2016-01-01 02:48:24,HQ
3,34553318,Why ternary operator in swift is so picky?,"<p>The question is very simple, but I just cou...",<swift><operators><whitespace><ternary-operato...,2016-01-01 03:30:17,HQ
4,34553755,hide/show fab with scale animation,<p>I'm using custom floatingactionmenu. I need...,<android><material-design><floating-action-but...,2016-01-01 05:21:48,HQ


In [44]:
# Read the validation CSV from the same datasets directory and preview it.
data_valid = pd.read_csv("../../datasets/valid.csv")
data_valid.head()

Unnamed: 0,Id,Title,Body,Tags,CreationDate,Y
0,34552974,How to get all the child records from differen...,I am having 4 different tables like \r\nselect...,<sql><sql-server>,2016-01-01 01:44:52,LQ_EDIT
1,34554721,Retrieve all except some data of the another t...,I have two table m_master and tbl_appointment\...,<php><mysql><sql><codeigniter><mysqli>,2016-01-01 08:43:50,LQ_EDIT
2,34555135,Pandas: read_html,<p>I'm trying to extract US states from wiki U...,<python><pandas>,2016-01-01 09:55:22,HQ
3,34555448,Reader Always gimme NULL,"I'm so new to C#, I wanna make an application ...",<sql-server><c#-4.0>,2016-01-01 10:43:45,LQ_EDIT
4,34555752,php rearrange array elements based on condition,basically i have this array:\r\n\r\n array(...,<php>,2016-01-01 11:34:09,LQ_EDIT


In [45]:
# Work on copies so the frames loaded from disk (data_train / data_valid)
# stay untouched by the transformations below.
train_data = data_train.copy()
valid_data = data_valid.copy()

In [46]:
# Fetch the NLTK stop-word corpus (reported as already up to date when present)
# and keep the English list; `preprocess` uses it as its default stop-word set.
nltk.download('stopwords')
STOPWORDS = stopwords.words('english')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [47]:
# Single stemmer instance shared by `preprocess` (only used when stem=True).
porter = PorterStemmer()

In [48]:
# Preprocess
def preprocess(text, lower=True, stem=False,
                filters=r"""[!"'#$%&()*\+,-.:;<=>?@\\[\]^_`{|}~]""",
                stopwords=None):
    """Clean a raw question string (title and/or HTML body) for modelling.

    Steps, in order:
      1. optionally lower-case the text;
      2. remove URLs (done before punctuation is stripped, otherwise the URL
         is already broken into plain words and can no longer be recognised);
      3. remove stop words;
      4. replace every character that is not a letter, digit or whitespace
         with a space, then drop the stand-alone ``p`` tokens left behind by
         ``<p>`` / ``</p>`` tags;
      5. collapse all whitespace runs (spaces, tabs, newlines) to one space
         and trim the ends;
      6. optionally stem each token with the module-level ``porter`` stemmer.

    Args:
        text (str): raw text to clean.
        lower (bool): lower-case the text first. Stop words are matched
            case-sensitively, so with ``lower=False`` capitalised forms of
            the stop words are kept.
        stem (bool): apply Porter stemming to each remaining token.
        filters (str): currently unused; kept so existing callers that pass
            it keep working.
        stopwords (iterable of str or None): words to remove. ``None`` means
            the module-level ``STOPWORDS`` list; an empty iterable disables
            stop-word removal.

    Returns:
        str: the cleaned text.
    """
    # Resolve the default at call time instead of binding a mutable list
    # as a default argument.
    if stopwords is None:
        stopwords = STOPWORDS

    # lower the text
    if lower:
        text = text.lower()

    # Remove links. Quotes and angle brackets end the match so that markup
    # following a URL (e.g. the anchor text after href="...") is not eaten.
    text = re.sub(r"https?://[^\s\"'<>]+", " ", text, flags=re.IGNORECASE)

    # remove the stopwords (skipped when the list is empty: an empty
    # alternation would match at every word boundary and delete the spaces)
    escaped = [re.escape(word) for word in stopwords]
    if escaped:
        pattern = re.compile(r"\b(" + "|".join(escaped) + r")\b\s*")
        text = pattern.sub('', text)

    # remove <p> and </p> tags: punctuation becomes spaces, then the lone
    # "p" left over from the tags is dropped (this also drops any other
    # stand-alone "p" token)
    text = re.sub(r"[^a-zA-Z0-9\s]", " ", text)
    text = re.sub(r"\bp\b", "", text)

    # collapse every whitespace run so tokens are separated by one space
    text = re.sub(r"\s+", " ", text)
    text = text.strip()

    # Stemming
    if stem:
        text = " ".join(porter.stem(word) for word in text.split(" "))

    return text

In [49]:
import ipywidgets as widgets

In [50]:
# Show one raw question body (index label 0) to see the HTML markup
# that preprocessing has to deal with.
train_data['Body'][0]

'<p>I\'m already familiar with repeating tasks every n seconds by using Java.util.Timer and Java.util.TimerTask. But lets say I want to print "Hello World" to the console every random seconds from 1-5. Unfortunately I\'m in a bit of a rush and don\'t have any code to show so far. Any help would be apriciated.  </p>\n'

In [51]:
# Interactive preview: the two checkboxes switch the preprocessing options
@widgets.interact(lower=True, stem=False)
def display_preprocessed_text(lower, stem):
    """Print the first training body as loaded, then after `preprocess`.

    Args:
        lower (bool): forwarded to `preprocess` (lower-casing).
        stem (bool): forwarded to `preprocess` (Porter stemming).
    """
    raw_body = train_data['Body'][0]
    cleaned_body = preprocess(text=raw_body, lower=lower, stem=stem)
    # Raw text first, cleaned text second, one per line.
    for shown in (raw_body, cleaned_body):
        print(shown)

interactive(children=(Checkbox(value=True, description='lower'), Checkbox(value=False, description='stem'), Ou…

In [52]:
# Keep only the two text fields and the target label. The explicit .copy()
# makes each frame independent of train_data / valid_data, so the column
# assignments done on train_df / valid_df in the following cells act on these
# frames themselves and not on a slice of the parent frame.
train_df = train_data[['Title','Body', 'Y']].copy()
valid_df = valid_data[['Title','Body', 'Y']].copy()

In [53]:
# Merge title and body into a single `text` column and drop the two source
# columns. A space is inserted between the parts: without it the last word of
# the title is glued to the first word of the body whenever the body does not
# start with an HTML tag. `assign` + `drop` return new frames, so no in-place
# mutation of a possibly shared slice is needed.
train_df = (
    train_df
    .assign(text=train_df['Title'] + " " + train_df['Body'])
    .drop(columns=['Title', 'Body'])
)
valid_df = (
    valid_df
    .assign(text=valid_df['Title'] + " " + valid_df['Body'])
    .drop(columns=['Title', 'Body'])
)

In [54]:
# Preview the combined (still unprocessed) text next to its label.
train_df.head()

Unnamed: 0,Y,text
0,LQ_CLOSE,Java: Repeat Task Every Random Seconds<p>I'm a...
1,HQ,Why are Java Optionals immutable?<p>I'd like t...
2,HQ,Text Overlay Image with Darkened Opacity React...
3,HQ,Why ternary operator in swift is so picky?<p>T...
4,HQ,hide/show fab with scale animation<p>I'm using...


In [55]:
# Clean the combined text of both frames with identical settings
# (lower-casing on, stemming off).
train_df['text'] = train_df['text'].apply(preprocess, lower=True, stem=False)
valid_df['text'] = valid_df['text'].apply(preprocess, lower=True, stem=False)

In [56]:
# Preview the text again, now after preprocessing.
train_df.head()

Unnamed: 0,Y,text
0,LQ_CLOSE,java repeat task every random seconds already ...
1,HQ,java optionals immutable like understand java ...
2,HQ,text overlay image darkened opacity react nati...
3,HQ,ternary operator swift picky question simple c...
4,HQ,hide show fab scale animation using custom flo...


In [57]:
# Weights and Bias to version the dataset
!pip install wandb

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com


In [58]:
import wandb

In [59]:
# Log in to Weights & Biases; the call returned True in the recorded run.
wandb.login()



True

In [60]:
# Start the W&B run named "Datasets"; `run` is the handle used below to log
# every dataset / artifact version.
run = wandb.init(project="stackoverflow-quality", entity="alokpadhi", name="Datasets")

VBox(children=(Label(value='64.068 MB of 64.068 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, m…

In [61]:
from pathlib import Path

In [63]:
# save processed datasets
# They are written into ../../datasets/preprocessed/, the directory that a
# later cell logs as the "processed_dataset" artifact, so the notebook
# produces the files it versions without any manual copy step.
processed_dir = Path("../../datasets/preprocessed/")
processed_dir.mkdir(parents=True, exist_ok=True)
train_df.to_parquet(processed_dir / 'train_processed.parquet', index=False)
valid_df.to_parquet(processed_dir / 'valid_processed.parquet', index=False)

In [64]:
# Version the raw dataset directory as the W&B artifact "raw_dataset".
# Note: despite its name, `preprocessed_data` holds the artifact for the raw
# data directory, not for the processed files.
preprocessed_data = wandb.Artifact("raw_dataset", type="raw_data")
dataset_path = Path("../../datasets/")
# The whole directory is added; presumably this also picks up sub-directories
# such as preprocessed/ when they exist — TODO confirm.
preprocessed_data.add_dir(dataset_path)
run.log_artifact(preprocessed_data)

[34m[1mwandb[0m: Adding directory to artifact (./../../datasets)... Done. 0.2s


<wandb.sdk.wandb_artifacts.Artifact at 0x7fe897cf4970>

In [66]:
# Version the processed dataset directory as the W&B artifact "processed_dataset".
# NOTE(review): the artifact type is "raw_data" although the artifact holds
# processed data — looks like a copy-paste from the raw artifact. Confirm
# before changing it, since the type is part of what is recorded in W&B.
processed_data = wandb.Artifact("processed_dataset", type="raw_data")
data_path = Path("../../datasets/preprocessed/")
processed_data.add_dir(data_path)
run.log_artifact(processed_data)

[34m[1mwandb[0m: Adding directory to artifact (./../../datasets/preprocessed)... Done. 0.0s


<wandb.sdk.wandb_artifacts.Artifact at 0x7fe8737a1f10>

In [30]:
import json

In [31]:
# Label encoder to encode class labels
class LabelEncoder(object):
    """Encode class labels into unique integer ids and decode them back.

    Attributes:
        class_to_index (dict): label -> integer id.
        index_to_class (dict): integer id -> label.
        classes (list): labels in the order of ``class_to_index``.
    """
    def __init__(self, class_to_index=None):
        """Create an encoder, optionally from an existing label -> id mapping.

        Args:
            class_to_index (dict or None): existing mapping; ``None`` or an
                empty dict creates an unfitted encoder. The mapping is
                copied, so the caller's dict is never modified.
        """
        self.class_to_index = dict(class_to_index) if class_to_index else {}
        self._sync()

    def _sync(self):
        """Rebuild the derived attributes from ``class_to_index``."""
        self.index_to_class = {v: k for k, v in self.class_to_index.items()}
        self.classes = list(self.class_to_index.keys())

    def __len__(self):
        """Number of known classes."""
        return len(self.class_to_index)

    def __str__(self):
        return f"<LabelEncoder(num_classes={len(self)})>"

    def fit(self, y):
        """Learn the mapping from the sorted unique values of ``y``.

        Any previous mapping is discarded, so fitting again never leaves
        stale classes or duplicate ids behind.

        Args:
            y (array-like): labels.

        Returns:
            LabelEncoder: ``self``, to allow chaining.
        """
        # tolist() turns NumPy scalars into native Python values, which keeps
        # the mapping serialisable by json.dump in save().
        classes = np.unique(y).tolist()
        self.class_to_index = {class_: i for i, class_ in enumerate(classes)}
        self._sync()
        return self

    def encode(self, y):
        """Map labels to their integer ids.

        Args:
            y (iterable): labels, all of which must be known to the encoder.

        Returns:
            numpy.ndarray: 1-D integer array of ids.

        Raises:
            KeyError: if a label is not in ``class_to_index``.
        """
        return np.array([self.class_to_index[item] for item in y], dtype=int)

    def decode(self, y):
        """Map integer ids back to their labels.

        Args:
            y (iterable): integer ids.

        Returns:
            list: labels, in the same order as ``y``.

        Raises:
            KeyError: if an id is not in ``index_to_class``.
        """
        return [self.index_to_class[item] for item in y]

    def save(self, fp):
        """Write the label -> id mapping to ``fp`` as JSON.

        Args:
            fp (str or path-like): destination file; its directory must exist.
        """
        contents = {"class_to_index": self.class_to_index}
        with open(fp, "w") as handle:
            json.dump(contents, handle, indent=4, sort_keys=False)

    @classmethod
    def load(cls, fp):
        """Create an encoder from a JSON file written by ``save``.

        Args:
            fp (str or path-like): file to read.

        Returns:
            LabelEncoder: encoder initialised with the stored mapping.
        """
        with open(fp, "r") as handle:
            kwargs = json.load(handle)
        return cls(**kwargs)

In [32]:
# Features and targets: X is a NumPy array taken from the `text` column,
# y is (for now) the pandas Series of string labels from column `Y`.
X = train_df.text.to_numpy()
y = train_df.Y

In [33]:
# Fit a fresh encoder on the training labels; fit() returns the encoder itself,
# which is why the cell displays its repr.
label_encoder = LabelEncoder()
label_encoder.fit(y)

<__main__.LabelEncoder at 0x7fe8b8295d30>

In [34]:
# Number of distinct labels seen by fit(), followed by the learned mapping.
NUM_CLASSES = len(label_encoder)
label_encoder.class_to_index

{'HQ': 0, 'LQ_CLOSE': 1, 'LQ_EDIT': 2}

In [35]:
# Check one example
# y[15] is a label-based lookup on the Series index (presumably the default
# RangeIndex from read_csv — confirm); encode() expects an iterable, hence
# the single label is wrapped in a list.
print(f"Target sample: {y[15]}")
print(f"Encoded target sample: {label_encoder.encode([y[15]])}")

Target sample: HQ
Encoded target sample: [0]


In [36]:
# Encode all our labels
# This rebinds `y` from the Series of string labels to an integer array, so
# the cell is not idempotent: running it a second time would try to encode the
# integer ids, which are not keys of class_to_index, and raise KeyError.
y = label_encoder.encode(y)
print(y.shape)

(45000,)


In [37]:
# Persist the fitted label -> id mapping as JSON. save() opens the file
# directly, so the ../../artifacts/ directory has to exist already.
label_encoder.save(Path("../../artifacts/label_encoder.json"))

In [38]:
# Preprocessing settings and class count stored with the artifact below.
metadata = {
    "lower": True,
    "stem": False,
    "num_classes": NUM_CLASSES,
}

In [39]:
# Log the ../../artifacts/ directory (it contains the saved label encoder) as a
# W&B artifact, with the preprocessing settings attached as metadata.
# NOTE(review): the type "preprcossing_data" looks like a typo of
# "preprocessing_data"; it is a runtime string recorded in W&B, so it is left
# unchanged here — confirm before renaming.
data_artifacts = wandb.Artifact("data_artifacts", type="preprcossing_data", metadata=metadata, description="Storing data related artifacts")
data_artifacts.add_dir(Path("../../artifacts/"))
run.log_artifact(data_artifacts)

[34m[1mwandb[0m: Adding directory to artifact (./../../artifacts)... Done. 0.0s


<wandb.sdk.wandb_artifacts.Artifact at 0x7fe8b82bc670>