In [66]:
import pandas as pd
import nltk
import re
from string import punctuation
from nltk.stem import PorterStemmer
from collections import Counter
from nltk.corpus import stopwords
import boto3
from dotenv import load_dotenv, find_dotenv
import os

In [67]:
load_dotenv(find_dotenv())

True

In [68]:
use_ceph = True

if use_ceph:
    s3_endpoint_url = os.environ["OBJECT_STORAGE_ENDPOINT_URL"]
    s3_access_key = os.environ["AWS_ACCESS_KEY_ID"]
    s3_secret_key = os.environ["AWS_SECRET_ACCESS_KEY"]
    s3_bucket = os.environ["OBJECT_STORAGE_BUCKET_NAME"]

    # Create an S3 client
    s3 = boto3.client(
        service_name="s3",
        aws_access_key_id=s3_access_key,
        aws_secret_access_key=s3_secret_key,
        endpoint_url=s3_endpoint_url,
    )

In [69]:
name = os.getenv("REPO_NAME")

if "/" in name:
    REPO = name
    USER = ""
else:
    USER = name
    REPO = ""

In [2]:
# repo data is saved as {org_name}-_-{repo_name}
# orginization data is saved as {org_name}

savename = USER if USER else REPO.replace("/", "-_-")
path = os.path.join("../data", savename + ".csv")
key = f"data/{savename}.csv"

if use_ceph:
    print(key)
    response = s3.get_object(Bucket=s3_bucket, Key=key)
    issues_df = pd.read_csv(response.get("Body")).drop_duplicates()
else:
    issues_df = pd.read_csv(path).drop_duplicates()

In [33]:
function_list = []

pattern = r"```.+?```"
code_block_regex = re.compile(pattern, re.DOTALL)


def code_block(string):
    """
    replaces code blocks with a CODE_BLOCK
    """
    string = re.sub(code_block_regex, "CODE_BLOCK", string)
    return string


function_list.append(code_block)

pattern = r"`{1,2}.+?`{1,2}"
inline_code_regex = re.compile(pattern, re.DOTALL)


def code_variable(string):
    """
    replaces inline code with VARIABLE
    """
    string = re.sub(inline_code_regex, " INLINE ", string)
    return string


function_list.append(code_variable)

pattern = r"\s@[^\s]+"
tagged_user_regex = re.compile(pattern)


def tagged_user(string):
    """
    replaces a user tagged with USER
    """
    string = re.sub(tagged_user_regex, " USER ", string)
    return string


function_list.append(tagged_user)

pattern = r"[^\s]+\.(com|org|net|gov|edu)[^\s]*"
url_regex = re.compile(pattern)


def urls(string):
    """
    replaces URLs with URL
    """
    string = re.sub(url_regex, " URL ", string)
    return string


function_list.append(urls)

pattern = r"[\r\n]+"
enter_regex = re.compile(pattern, re.DOTALL)


def enters(string):
    """
    replaces \r\n with ENTER
    """
    string = re.sub(enter_regex, " ", string)
    return string


function_list.append(enters)

pattern = r"#####"
bold_regex = re.compile(pattern, re.DOTALL)


def bold(string):
    """
    replace bold characters with bold word
    """
    string = re.sub(bold_regex, " BOLD ", string)
    return string


function_list.append(bold)

In [34]:
def preprocess(string):
    for func in function_list:
        string = func(string)
    return string

In [36]:
punct = set(punctuation)


def all_punc(word):
    for ch in word:
        if ch not in punct:
            return False
    return True

In [None]:
stopwds = set(stopwords.words("english"))
ps = PorterStemmer()
counter = Counter()

for i, row in issues_df.iterrows():
    title = row.title
    body = row.body
    listed_words = nltk.word_tokenize(
        preprocess(title + " " + body if type(body) == str else row.title).lower()
    )
    listed_words = [word for word in listed_words if not all_punc(word)]
    stemmed = [ps.stem(word) for word in listed_words if word not in stopwds]
    counter.update(stemmed)

In [45]:
# number of distinct words detected

print(f"{len(counter)} unique words")

# keep only the words that comprise 80% of the data (cut off tail)

thresh = 0.8

to_keep = dict()
num_words = sum(counter.values())
thresh = thresh * num_words
curr = 0
for word, val in counter.most_common():
    if curr > thresh:
        break
    else:
        to_keep[word] = val
        curr += val

print(f"{len(to_keep)} unique words kept")

34207 unique words
1052 words left


In [61]:
# look at most popular stemmed words
list(to_keep.items())[:50]

[('inlin', 12527),
 ('url', 11370),
 ('bold', 11285),
 ('code_block', 9566),
 ('use', 6714),
 ('openshift', 6509),
 ('user', 6346),
 ('oc', 5457),
 ('result', 5271),
 ('creat', 5269),
 ('error', 5211),
 ('version', 5166),
 ('pod', 5070),
 ('run', 4999),
 ('build', 4986),
 ('imag', 4918),
 ('info', 4448),
 ('fail', 4240),
 ('issu', 4036),
 ('get', 4010),
 ('permiss', 3919),
 ('deploy', 3907),
 ("n't", 3747),
 ('current', 3505),
 ('step', 3255),
 ('docker', 3112),
 ('expect', 3098),
 ('test', 2899),
 ('command', 2867),
 ('contain', 2849),
 ('cluster', 2821),
 ('servic', 2812),
 ('log', 2809),
 ('work', 2711),
 ('kubernet', 2686),
 ('reproduc', 2649),
 ('extra', 2563),
 ('node', 2524),
 ('tri', 2448),
 ('start', 2397),
 ('server', 2393),
 ('1', 2265),
 ('name', 2262),
 ('instal', 2208),
 ('1.', 2139),
 ('resourc', 2138),
 ('project', 2061),
 ('registri', 2030),
 ("'s", 2013),
 ('need', 1917)]

In [72]:
with open("../wordlist.txt", "w") as f:
    for word in list(to_keep):
        f.write(word)
        f.write("\n")

In [73]:
if use_ceph:
    s3.upload_file(
        Bucket=s3_bucket, Key="github_labeler/wordlist.txt", Filename="../wordlist.txt"
    )
    os.remove("../wordlist.txt")