# Label Bot Prediction Service #

### Sentence Parser ###

In [1]:
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.

# This script serves to do data cleaning
from bs4 import BeautifulSoup
import nltk
# fix ssl certificate errors
import ssl
# Make unverified HTTPS contexts the process-wide default so that downloads
# do not fail on SSL certificate errors.
# NOTE(review): this turns off certificate verification for every consumer of
# ssl._create_default_https_context in this process, which is a security
# risk. Presumably it is only needed for one-off resource downloads (e.g.
# nltk data) -- confirm and scope it down if possible.
try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    # This Python build does not expose ssl._create_unverified_context;
    # keep the interpreter's default (verifying) behaviour.
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context
import os.path
import pandas as pd
import re
import sys
import logging


class SentenceParser:
    """
    Clean issue text held in a pandas DataFrame.

    The DataFrame lives in ``self.data``. It is either loaded with
    ``read_file`` or assigned directly by the caller. The other methods
    modify columns of that DataFrame in place.
    """

    # Token patterns ordered from most to least specific. Only the URL
    # pattern (index 3) is used by this class, in process_text.
    regex_str = [
        r'<[^>]+>',                                                                 # HTML tags
        r'(?:@[\w_]+)',                                                             # @-mentions
        r"(?:\#+[\w_]+[\w\'_\-]*[\w_]+)",                                           # hash-tags
        # The original character class contained the HTML-escape artefact
        # "&amp;". Removing it does not change the set of matched characters
        # ('a', 'm', 'p' are covered by [a-z] and ';' by the $-_ range).
        r'http[s]?://(?:[a-z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+',   # URLs
        r'(?:(?:\d+,?)+(?:\.?\d+)?)',                                               # numbers
        r"(?:[a-z][a-z'\-_]+[a-z])",                                                # words with - and '
        r'(?:[\w_]+)',                                                              # other words
        r'(?:\S)'                                                                   # anything else
    ]

    # English stopwords, read once when the class is defined. The path is
    # relative to the current working directory.
    try:
        with open('stopwords.txt') as _stopwords_file:
            stopwords = _stopwords_file.read().split()
        del _stopwords_file
    except IOError:
        # Stopwords are only needed when process_text(remove_stopwords=True)
        # is used, so a missing file should not make the import fail.
        logging.warning("stopwords.txt not found; stopword removal will have no effect")
        stopwords = []

    def __init__(self):
        """
        SentenceParser serves to clean text content
        """
        # DataFrame being cleaned; set by read_file or assigned by the caller
        self.data = None
        # extract words stem
        self.porter = nltk.PorterStemmer()
        # a set of stopwords
        self.stops = set(self.stopwords)

    def read_file(self, filepath, filetype, encod='ISO-8859-1', header=None):
        """
        Load a csv/json/xlsx file into ``self.data``.

        Args:
            filepath(str): path of the file to read
            filetype(str): one of 'csv', 'json', 'xlsx'
            encod(str): text encoding used for csv and json files
            header: forwarded to pandas as ``header`` for csv and xlsx files
        Raises:
            SystemExit: with status 1 if the file does not exist or the
                file type is not supported
        """
        logging.info('Start reading File')
        if not os.path.isfile(filepath):
            logging.error("File Not Exist!")
            # non-zero status so that callers/shells can see the failure
            sys.exit(1)
        if filetype == 'csv':
            df = pd.read_csv(filepath, encoding=encod, header=header)
        elif filetype == 'json':
            df = pd.read_json(filepath, encoding=encod, lines=False)
        elif filetype == 'xlsx':
            # read_excel no longer accepts an ``encoding`` keyword
            df = pd.read_excel(filepath, header=header)
        else:
            logging.error("Extension Type not Accepted!")
            sys.exit(1)

        logging.debug(df)
        self.data = df

    def merge_column(self, columns, name):
        """
        Concatenate several text columns of ``self.data`` into column ``name``.

        Each source column is prefixed with a single space. A column may be
        listed more than once to repeat (i.e. up-weight) its text.

        Args:
            columns(list): names of the columns to concatenate, in order
            name(str): name of the column that receives the result
        """
        logging.info('Merge headers %s to %s', str(columns), name)
        self.data[name] = ''
        for header in columns:
            # missing values (e.g. an issue without a body) count as empty text
            self.data[name] += ' ' + self.data[header].fillna('')

    def clean_body(self, column, remove_template=True, remove_code=True):
        """
        Remove the issue template and fenced code from every value of ``column``.

        Args:
            column(str): name of the column to clean in place
            remove_template(bool): drop everything from "## Environment info" on
            remove_code(bool): drop the text between ``` fences
        """
        logging.info("Start Removing Templates..")
        # Rebuilding the column positionally works for any DataFrame index.
        self.data[column] = [
            self._strip_body(text, remove_template, remove_code)
            for text in self.data[column]
        ]

    @staticmethod
    def _strip_body(text, remove_template, remove_code):
        """Return ``text`` without the template section and fenced code."""
        if not isinstance(text, str):
            # None/NaN bodies are left untouched
            return text
        if remove_template:
            cut = text.find("## Environment info")
            if cut != -1:
                text = text[:cut]
        if remove_code and "```" in text:
            # after splitting on the fence, even pieces are outside code blocks
            text = " ".join(text.split("```")[::2])
        return text

    def process_text(self, column, remove_symbol=True, remove_stopwords=False, stemming=False):
        """
        Normalize the text in ``column`` and return it as a list of strings.

        Line breaks/tabs and URLs are replaced by spaces in ``self.data[column]``
        itself. The remaining steps only affect the returned list: HTML is
        stripped, text is lower-cased, and symbols, stopwords and word endings
        are optionally removed.

        Args:
            column(str): name of the column to process
            remove_symbol(bool): replace every non-letter character by a space
            remove_stopwords(bool): drop stopwords and numeric tokens
            stemming(bool): reduce every word to its Porter stem
        Returns:
            list: one cleaned, lower-case string per row
        """
        logging.info("Start Data Cleaning...")
        # regex=True is required: newer pandas treats the pattern as a
        # literal string by default, which would silently skip both steps.
        texts = self.data[column].str.replace(r'[\n\r\t]+', ' ', regex=True)
        # remove URLs
        texts = texts.str.replace(self.regex_str[3], ' ', regex=True)
        self.data[column] = texts

        non_letters = re.compile('[^a-zA-Z]')
        cleaned = []
        for raw in texts.tolist():
            row = BeautifulSoup(raw, 'html.parser').get_text().lower()
            # remove symbols
            if remove_symbol:
                row = non_letters.sub(' ', row)
            words = row.split()
            # remove stopwords
            if remove_stopwords:
                words = [w for w in words
                         if w not in self.stops and not w.replace('.', '', 1).isdigit()]
            # extract words stem
            if stemming:
                words = [self.porter.stem(w) for w in words]
            cleaned.append(' '.join(words))
        return cleaned


### DataFetcher ###

In [2]:
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.

# This script fetches GitHub issues and stores them in a json file
from __future__ import print_function
import os
import requests
import json
import re
import pandas as pd
import logging


class DataFetcher:
    """
    Fetch issues of one GitHub repository through the GitHub REST API.
    """

    def __init__(self,
                 github_user=None,
                 github_oauth_token=None,
                 repo=None):
        """
        This DataFetcher serves to fetch issues data.
        Any argument left as None is read from the environment variable of
        the same name when the object is created.
        Args:
            github_user(str): the github id. ie: "CathyZhang0822"
            github_oauth_token(str): the github oauth token, paired with github_user to realize authorization
            repo(str): the repo name
        """
        # Look the environment up here, not in the signature: defaults in
        # the signature would be frozen at import time.
        if github_user is None:
            github_user = os.environ.get("github_user")
        if github_oauth_token is None:
            github_oauth_token = os.environ.get("github_oauth_token")
        if repo is None:
            repo = os.environ.get("repo")
        self.github_user = github_user
        self.github_oauth_token = github_oauth_token
        self.repo = repo
        self.auth = (self.github_user, self.github_oauth_token)
        # list of issue records collected by the last data2json call
        self.json_data = None

    def cleanstr(self, raw_string, sub_string):
        """
        Replace every non-alphanumeric character of raw_string with
        sub_string and return the result in lower case.
        """
        clean = re.sub("[^0-9a-zA-Z]", sub_string, raw_string)
        return clean.lower()

    def _last_page(self, link_header):
        """Return the page number of the rel="last" entry of a Link header."""
        for target, rel in re.findall(r'<([^>]*)>\s*;\s*rel="([^"]*)"', link_header):
            if rel == 'last':
                page = re.search(r'[?&]page=(\d+)', target)
                if page:
                    return int(page.group(1))
        # Unrecognized layout: fall back to the positional parsing this
        # class used before (third token from the end).
        return int(self.cleanstr(link_header, " ").split()[-3])

    def count_pages(self, state):
        """
        Count how many result pages the issues listing has.
        Args:
            state(str): "open"/"closed"/"all"
        Return:
            int: number of pages; 1 if the response has no "link" header
        """
        url = 'https://api.github.com/repos/%s/issues' % self.repo
        response = requests.get(url, {'state': state},
                                auth=self.auth)
        assert response.status_code == 200, "Authorization failed"
        if "link" not in response.headers:
            return 1
        return self._last_page(response.headers['link'])

    def fetch_issues(self, issue_nums):
        """
        Fetch the title and body of the given issues.
        Args:
            issue_nums(list): a list of issue ids
        Return:
            pandas DataFrame with the columns 'id', 'title' and 'body',
            one row per requested issue
        """
        assert issue_nums != [], "Empty Input!"
        logging.info("Reading issues:{}".format(", ".join([str(num) for num in issue_nums])))
        data = []
        for number in issue_nums:
            url = 'https://api.github.com/repos/' + self.repo + '/issues/' + str(number)
            response = requests.get(url, auth=self.auth)
            item = response.json()
            assert 'title' in item, "{} issues doesn't exist!".format(str(number))
            data.append({'id': str(number), 'title': item['title'], 'body': item['body']})
        return pd.DataFrame(data)

    def data2json(self, state, labels=None, other_labels=False):
        """
        Store issues' data into a json file and return the file's name.
        Pull requests and items without labels information are skipped.
        Args:
            state(str): "open"/"closed"/"all"
            labels(list): target labels we are interested in; None fetches
                every issue with all of its labels
            other_labels(bool): if True, record all labels of a matching
                issue; if False, record only the first matching target label
        Return:
            str: name of the json file that was written
        """
        assert state in set(['all', 'open', 'closed']), "Invalid State!"
        logging.info("Reading {} issues..".format(state))
        pages = self.count_pages(state)
        # Labels that are all recorded as "Feature". Compared in lower case
        # so that differences in capitalization do not matter.
        feature_aliases = {"feature", "call for contribution", "feature request"}
        data = []
        for page in range(1, pages + 1):
            url = 'https://api.github.com/repos/' + self.repo + '/issues?page=' + str(page) \
                  + '&per_page=30'
            response = requests.get(url,
                                    {'state': state,
                                     'base': 'master',
                                     'sort': 'created'},
                                    auth=self.auth)
            for item in response.json():
                if "pull_request" in item or "labels" not in item:
                    continue
                issue_labels = list({entry['name'] for entry in item['labels']})
                if labels is None:
                    # fetch all issues
                    data.append({'id': item['number'], 'title': item['title'],
                                 'body': item['body'], 'labels': issue_labels})
                    continue
                # keep the issue only if it has a target label; the first
                # target label found (in the order of `labels`) wins
                matched = next((label for label in labels if label in issue_labels), None)
                if matched is None:
                    continue
                if other_labels:
                    # besides target labels, we still want other labels
                    recorded = issue_labels
                elif matched.lower() in feature_aliases:
                    recorded = "Feature"
                else:
                    # only record target labels
                    recorded = matched
                data.append({'id': item['number'], 'title': item['title'],
                             'body': item['body'], 'labels': recorded})
        self.json_data = data
        s_labels = "_".join(labels) if labels is not None else "all_labels"
        filename = "{}_data.json_{}".format(state, s_labels)
        logging.info("Writing json file..")
        with open(filename, 'w') as write_file:
            json.dump(data, write_file)
        logging.info("{} json file is ready!".format(filename))
        return filename


### Trainer ###

In [6]:
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.

# This script trains the Machine Learning models

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.preprocessing import LabelEncoder
import tempfile
import pickle
import logging
import os


class Trainer:
    """
    Train a TF-IDF + SVC issue classifier and save it to a temporary directory.
    """

    # target labels that we are interested in
    # NOTE(review): DataFetcher.data2json spells one alias "Call for
    # Contribution" (capital C) while this list uses "Call for contribution";
    # confirm which spelling the repository's label actually uses.
    labels = ["Performance", "Test", "Question",
              "Feature request", "Call for contribution",
              "Feature", "Example", "Doc",
              "Installation", "Build", "Bug"]

    def __init__(self,
                 tv=None,
                 clf=None,
                 tmp_dir=None
                 ):
        """
        Trainer is to train issues using Machine Learning methods.
        Args:
            tv: vectorizer; defaults to
                TfidfVectorizer(min_df=0.00009, ngram_range=(1, 3), max_features=10000)
            clf: classifier; defaults to SVC(gamma=0.5, C=100, probability=True)
            tmp_dir: tempfile.TemporaryDirectory that receives the pickled
                models; a new one is created if omitted
        """
        # Build the defaults per instance. Default values in the signature
        # would be created once at import time and shared (including their
        # fitted state and the directory) by every Trainer.
        if tv is None:
            tv = TfidfVectorizer(min_df=0.00009, ngram_range=(1, 3), max_features=10000)
        if clf is None:
            clf = SVC(gamma=0.5, C=100, probability=True)
        if tmp_dir is None:
            tmp_dir = tempfile.TemporaryDirectory()
        self.tv = tv
        self.clf = clf
        self.tmp_dir = tmp_dir

    def train(self):
        """
        This method is to train and save models.
        It has 5 steps:
        1. Fetch issues
        2. Clean data
        3. Word embedding
        4. Train models
        5. Save models
        Return:
            the temporary directory holding 'Vectorizer.p', 'Classifier.p'
            and 'Labels.p'
        """
        logging.info("Start training issues of general labels")
        # Step1: Fetch issues with general labels
        logging.info("Fetching Data..")
        DF = DataFetcher()
        filename = DF.data2json('all', self.labels, False)
        # Step2: Clean data
        logging.info("Cleaning Data..")
        SP = SentenceParser()
        SP.read_file(filename, 'json')
        SP.clean_body('body', True, True)
        # the title is repeated three times so that it outweighs the body
        SP.merge_column(['title', 'title', 'title', 'body'], 'train')
        text = SP.process_text('train', True, False, True)
        # Step3: Word Embedding
        logging.info("Word Embedding..")
        tv = self.tv
        X = tv.fit_transform(text).toarray()
        # Labels: one target label per issue
        labels = SP.data['labels']
        le = LabelEncoder()
        Y = le.fit_transform(labels)
        # Step4: Train Classifier
        logging.info("Training Data..")
        clf = self.clf
        clf.fit(X, Y)
        # Step5: save models
        logging.info("Saving Models..")
        with open(os.path.join(self.tmp_dir.name, 'Vectorizer.p'), 'wb') as tv_file:
            pickle.dump(tv, tv_file)
        with open(os.path.join(self.tmp_dir.name, 'Classifier.p'), 'wb') as clf_file:
            pickle.dump(clf, clf_file)
        # the raw per-issue labels are stored; a LabelEncoder fitted on them
        # reproduces the class order used for training
        with open(os.path.join(self.tmp_dir.name, 'Labels.p'), 'wb') as labels_file:
            pickle.dump(labels, labels_file)
        logging.info("Completed!")
        return self.tmp_dir
    

### Predictor ###

In [8]:
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.

from sklearn.preprocessing import LabelEncoder
import numpy as np
import pickle
import re
import logging
import os


class Predictor:
    """
    Predict issue labels with keyword rules and a trained classifier.
    """

    # keywords will be used to apply rule-based algorithms:
    # predicted label -> title tokens that trigger it
    keywords = {"ci": ["ci", "ccache", "jenkins"],
                "flaky": ["flaky"],
                "gluon": ["gluon"],
                "cuda": ["cuda", "cudnn"],
                "scala": ["scala"],
                "mkldnn": ["mkldnn", "mkl"],
                "onnx": ["onnx"]}

    def __init__(self):
        """
        Predictor serves to apply rule-based and ML algorithms to predict labels
        """
        # all three are filled in by reload()
        self.tv = None
        self.labels = None
        self.clf = None

    def reload(self, tmp_dir):
        """
        Load the vectorizer, classifier and labels pickled in tmp_dir.
        Args:
            tmp_dir: object whose ``name`` attribute is the directory that
                holds 'Vectorizer.p', 'Classifier.p' and 'Labels.p'
        """
        # NOTE: pickle can execute arbitrary code; only load directories
        # produced by a trusted training run.
        with open(os.path.join(tmp_dir.name, 'Vectorizer.p'), "rb") as tv_file:
            self.tv = pickle.load(tv_file)
        with open(os.path.join(tmp_dir.name, 'Classifier.p'), "rb") as clf_file:
            self.clf = pickle.load(clf_file)
        with open(os.path.join(tmp_dir.name, 'Labels.p'), "rb") as labels_file:
            self.labels = pickle.load(labels_file)

    def tokenize(self, row):
        """
        This method is to tokenize a sentence into its distinct words
        Args:
            row(string): a sentence
        Return:
            words(set): the distinct lower-case alphanumeric words
        """
        row = re.sub('[^a-zA-Z0-9]', ' ', row).lower()
        words = set(row.split())
        return words

    def _labels_for_title(self, title):
        """Return the rule-based labels for one issue title, without duplicates."""
        lowered = title.lower()
        found = []
        # these two are matched on the raw title because tokenize() would
        # split the phrase and strip the '+' characters
        if "feature request" in lowered:
            found.append("Feature")
        if "c++" in lowered:
            found.append("C++")
        tokens = self.tokenize(title)
        for label, triggers in self.keywords.items():
            # one hit is enough; do not add the label once per keyword
            if any(trigger in tokens for trigger in triggers):
                found.append(label)
        return found

    def _rule_based_from_frame(self, issues, df_test):
        """Apply the keyword rules to the titles of already fetched issues."""
        return [self._labels_for_title(df_test.loc[i, 'title'])
                for i in range(len(issues))]

    def _ml_predict_from_frame(self, issues, df_test, threshold):
        """Run the classifier on already fetched issues; df_test is modified."""
        # data cleaning
        SP = SentenceParser()
        SP.data = df_test
        SP.clean_body('body', True, True)
        SP.merge_column(['title', 'title', 'title', 'body'], 'train')
        test_text = SP.process_text('train', True, False, True)
        # word embedding
        test_data_tfidf = self.tv.transform(test_text).toarray()
        # refit the encoder only to recover the class names by index
        le = LabelEncoder()
        le.fit(self.labels)
        # classification
        probs = self.clf.predict_proba(test_data_tfidf)
        # indices of the (up to) two most probable classes, best first
        top_two = np.argsort(probs, axis=1)[:, -2:][:, ::-1]
        ml_predictions = []
        for issue, ranked, row_probs in zip(issues, top_two, probs):
            # INFO:Predictor:issue:11919,Performance:0.47353076240017744,Question:0.2440056213336274
            logging.info("issue:{}, {}".format(
                issue,
                ", ".join("{}:{}".format(le.classes_[k], row_probs[k]) for k in ranked)))
            ml_predictions.append(
                [le.classes_[k] for k in ranked if row_probs[k] > threshold])
        return ml_predictions

    def rule_based(self, issues):
        """
        This method applies rule_based algorithms to predict labels
        Args:
            issues(list): a list of issue numbers
        Return:
            rule_based_predictions(list of lists): labels which satisfy rules
        """
        df_test = DataFetcher().fetch_issues(issues)
        return self._rule_based_from_frame(issues, df_test)

    def ml_predict(self, issues, threshold=0.3):
        """
        This method applies machine learning algorithms to predict labels
        Args:
            issues(list): a list of issue numbers
            threshold(float): threshold of probability
        Return:
            ml_predictions(list of lists): for every issue, those of its two
                most probable labels whose probability exceeds threshold
        """
        df_test = DataFetcher().fetch_issues(issues)
        return self._ml_predict_from_frame(issues, df_test, threshold)

    def predict(self, issues):
        """
        Return predictions of both rule_base algorithms and machine learning methods
        Args:
            issues(list): a list of issue numbers
        Return:
            predictions(list of lists): distinct labels per issue
        """
        # fetch once and share the result instead of querying GitHub twice
        df_test = DataFetcher().fetch_issues(issues)
        # rules first: they only read 'title', the ML step rewrites 'body'
        rule_based_predictions = self._rule_based_from_frame(issues, df_test)
        ml_predictions = self._ml_predict_from_frame(issues, df_test, 0.3)
        predictions = [list(set(by_rule + by_model))
                       for by_rule, by_model in zip(rule_based_predictions, ml_predictions)]
        return predictions
