# SVM Preprocessing

This is a notebook to do some basic preprocessing of a given dataset for training the SVM models specifically. The fastText model preprocessing is different and will be done in a different notebook. 

The goal of this notebook is to preprocess the text and build a smaller vocabulary that will be recognized by the SVM models.

## Environment Variables and Data

First we import packages and load in the dataset.

In [1]:
import pandas as pd
import nltk
import re
from string import punctuation
from nltk.stem import PorterStemmer
from collections import Counter
from nltk.corpus import stopwords
import boto3
from dotenv import load_dotenv, find_dotenv
import os

In [2]:
load_dotenv(find_dotenv())

True

In [3]:
use_ceph = bool(int(os.getenv('USE_CEPH')))

if use_ceph:
    s3_endpoint_url = os.environ["OBJECT_STORAGE_ENDPOINT_URL"]
    s3_access_key = os.environ["AWS_ACCESS_KEY_ID"]
    s3_secret_key = os.environ["AWS_SECRET_ACCESS_KEY"]
    s3_bucket = os.environ["OBJECT_STORAGE_BUCKET_NAME"]

    # Create an S3 client
    s3 = boto3.client(
        service_name="s3",
        aws_access_key_id=s3_access_key,
        aws_secret_access_key=s3_secret_key,
        endpoint_url=s3_endpoint_url,
    )

In [4]:
name = os.getenv("REPO_NAME")

if "/" in name:
    REPO = name
    USER = ""
else:
    USER = name
    REPO = ""

In [5]:
# repo data is saved as {org_name}-_-{repo_name}
# orginization data is saved as {org_name}

savename = USER if USER else REPO.replace("/", "-_-")
path = os.path.join("../data", savename + ".csv")
key = f"github-labeler/data/{savename}.csv"

if use_ceph:
    response = s3.get_object(Bucket=s3_bucket, Key=key)
    issues_df = pd.read_csv(response.get("Body")).drop_duplicates()
else:
    issues_df = pd.read_csv(path).drop_duplicates()

data/openshift-_-origin.csv


## Define Functions

We now define a handful of preprocessing functions which will combine to be one preprocessing function. Most of these functions are simple regex expressions.

In [6]:
function_list = []

pattern = r"```.+?```"
code_block_regex = re.compile(pattern, re.DOTALL)


def code_block(string):
    """
    replaces code blocks with a CODE_BLOCK
    """
    string = re.sub(code_block_regex, "CODE_BLOCK", string)
    return string


function_list.append(code_block)

pattern = r"`{1,2}.+?`{1,2}"
inline_code_regex = re.compile(pattern, re.DOTALL)


def code_variable(string):
    """
    replaces inline code with VARIABLE
    """
    string = re.sub(inline_code_regex, " INLINE ", string)
    return string


function_list.append(code_variable)

pattern = r"\s@[^\s]+"
tagged_user_regex = re.compile(pattern)


def tagged_user(string):
    """
    replaces a user tagged with USER
    """
    string = re.sub(tagged_user_regex, " USER ", string)
    return string


function_list.append(tagged_user)

pattern = r"[^\s]+\.(com|org|net|gov|edu)[^\s]*"
url_regex = re.compile(pattern)


def urls(string):
    """
    replaces URLs with URL
    """
    string = re.sub(url_regex, " URL ", string)
    return string


function_list.append(urls)

pattern = r"[\r\n]+"
enter_regex = re.compile(pattern, re.DOTALL)


def enters(string):
    """
    replaces \r\n with ENTER
    """
    string = re.sub(enter_regex, " ", string)
    return string


function_list.append(enters)

pattern = r"#####"
bold_regex = re.compile(pattern, re.DOTALL)


def bold(string):
    """
    replace bold characters with bold word
    """
    string = re.sub(bold_regex, " BOLD ", string)
    return string


function_list.append(bold)

In [7]:
def preprocess(string):
    for func in function_list:
        string = func(string)
    return string

In [8]:
punct = set(punctuation)


def all_punc(word):
    for ch in word:
        if ch not in punct:
            return False
    return True

## Word Count

Here we tokenize each issue into words and stem each word and keep a count of how many mentions each word has.

In [9]:
stopwds = set(stopwords.words("english"))
ps = PorterStemmer()
counter = Counter()

for i, row in issues_df.iterrows():
    title = row.title
    body = row.body
    listed_words = nltk.word_tokenize(
        preprocess(title + " " + body if type(body) == str else row.title).lower()
    )
    listed_words = [word for word in listed_words if not all_punc(word)]
    stemmed = [ps.stem(word) for word in listed_words if word not in stopwds]
    counter.update(stemmed)

We keep the most popular words that make up the top x% of the data, then save these words to ceph. This is for data reduction, since we are using SVMs with a small amount of data we want to keep the complexity of the input low.

In [10]:
# number of distinct words detected

print(f"{len(counter)} unique words")

# keep only the words that comprise 80% of the data (cut off tail)

thresh = 0.8

to_keep = dict()
num_words = sum(counter.values())
thresh = thresh * num_words
curr = 0
for word, val in counter.most_common():
    if curr > thresh:
        break
    else:
        to_keep[word] = val
        curr += val

print(f"{len(to_keep)} unique words kept")

34235 unique words
1077 unique words kept


In [11]:
# look at most popular stemmed words
list(to_keep.items())[:50]

[('inlin', 10185),
 ('url', 9896),
 ('bold', 9422),
 ('code_block', 8048),
 ('use', 5476),
 ('openshift', 5446),
 ('user', 5199),
 ('creat', 4503),
 ('oc', 4497),
 ('error', 4479),
 ('version', 4399),
 ('result', 4396),
 ('info', 4348),
 ('build', 4327),
 ('pod', 4230),
 ('run', 4203),
 ('imag', 4034),
 ('permiss', 3815),
 ('fail', 3598),
 ('get', 3502),
 ('issu', 3398),
 ('deploy', 3283),
 ("n't", 3088),
 ('current', 2872),
 ('step', 2741),
 ('expect', 2580),
 ('extra', 2553),
 ('docker', 2490),
 ('test', 2411),
 ('log', 2400),
 ('command', 2384),
 ('cluster', 2366),
 ('contain', 2354),
 ('servic', 2296),
 ('kubernet', 2230),
 ('reproduc', 2217),
 ('work', 2189),
 ('node', 2162),
 ('start', 2071),
 ('tri', 1997),
 ('resourc', 1978),
 ('server', 1966),
 ('1', 1885),
 ('instal', 1866),
 ('name', 1815),
 ('1.', 1767),
 ('project', 1725),
 ("'s", 1661),
 ('registri', 1612),
 ('master', 1596)]

In [12]:
with open("../wordlist.txt", "w") as f:
    for word in list(to_keep):
        f.write(word)
        f.write("\n")

In [14]:
if use_ceph:
    s3.upload_file(
        Bucket=s3_bucket, Key="github-labeler/wordlist.txt", Filename="../wordlist.txt"
    )
    os.remove("../wordlist.txt")