In [1]:
import untangle
import os
import glob
import string
import numpy as np
import pandas as pd
from xml.etree import ElementTree as ET
from nltk import word_tokenize
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.ensemble import RandomForestClassifier

In [2]:
BUG_CLASSIFICATION_FILE="./classification.csv"
BUGS_DIR="./bugs"

In [3]:
def get_files_ext(directory, extension="csv"):
    result = [i for i in glob.glob(os.path.join(directory, '*.{}').format(extension))]
    return result

In [4]:
def get_classified_bugs(bug_id_file):
    bug_ids = list()
    with open(bug_id_file, 'r') as f:
        bug_ids = f.read().splitlines()

    return bug_ids

In [5]:
def untangle_parse(bug_file):
    bug_obj = untangle.parse(bug_file)

    # Extract the short descriptions and the comments.
    bug_details = bug_obj.bugzilla.bug
    bug_data = dict()
    bug_id = str(bug_details.bug_id.cdata)
    bug_data['bug_id'] = bug_id
    bug_data['bug_status'] = str(bug_details.bug_status.cdata) + '-' + str(bug_details.resolution.cdata)
    bug_data['version'] = str(bug_details.version.cdata)
    bug_data['keywords'] = str(bug_details.keywords.cdata)
    bug_data['component'] = str(bug_details.component.cdata)

    short_desc = bug_details.short_desc.cdata.encode('ascii', 'ignore')
    long_descs = [ld.thetext.cdata.encode('ascii', 'ignore') for ld in bug_details.long_desc]


    # Bug text consists of the short description and the comments.
    bug_data['text'] = short_desc + os.linesep.join(long_descs)

    # Attachments created.
    if hasattr(bug_details, 'attachment'):
        bug_data['attachments'] = ', '.join([str(a.filename.cdata) for a in bug_details.attachment])    
    
    del bug_obj
    return bug_id, bug_data

In [6]:
def etree_parse(bug_file):
    xml_parse = ET.iterparse(bug_file, events=('start', 'end'))
    
    bug_data = dict()
    bug_id = ''
    bug_status = ''
    bug_resolution = ''
    short_desc = ''
    comments = list()
    attachments = list()
    for event, element in xml_parse:
        if event == 'end':
            tag = element.tag
            value = element.text
            if tag == 'bug_id':
                bug_id = value
                bug_data['bug_id'] = bug_id
                element.clear()
            elif tag == 'bug_status':
                bug_status = value
                element.clear()
            elif tag == 'resolution':
                if value:
                    bug_resolution = value
                element.clear()
            elif tag == 'version':
                bug_data['version'] = value
                element.clear()
            elif tag == 'keywords':
                bug_data['keywords'] = value
                element.clear()
            elif tag == 'component':
                bug_data['component'] = value
                element.clear()
            elif tag == 'short_desc':
                if value:
                    short_desc = value.encode('ascii', 'ignore')
                element.clear()
            elif tag == 'thetext':
                if value:
                    comments.append(value.encode('ascii', 'ignore'))
                element.clear()
            elif tag == 'filename':
                attachments.append(value)
                element.clear()
        
    bug_data['bug_status'] = bug_status + '-' + bug_resolution
    bug_data['text'] = short_desc + os.linesep + os.linesep.join(comments)
    bug_data['attachments'] = ', '.join(attachments)
            
    del xml_parse
    return bug_id, bug_data

In [10]:
def get_bug_df(bug_dir, parser):
    bug_files = get_files_ext(bug_dir, "xml")
    
    # DataFrame creation.
    rows = list()
    index = list()

    for bug_file in bug_files:
        bug_id, bug_data = parser(bug_file)
        
        # Add this as a row
        rows.append(bug_data)
        index.append(bug_id)

    bug_df = pd.DataFrame(rows, index=index)
    return bug_df

In [11]:
def tokenize(text):
    text = text.lower()
    tokens = word_tokenize(text)
    tokens = [i for i in tokens if i not in string.punctuation]
    stemmer = PorterStemmer()
    stems = [stemmer.stem(t) for t in tokens]
    return stems

In [12]:
def do_classification_cv(bug_df, clf, cv_cnt):
    # Get the X and y values
    X = bug_df['text'].values
    y = bug_df['class'].values
    
    # Run the classifier for precision and recall.
    precision = cross_val_score(clf, X, y, cv=cv_cnt, scoring='precision')
    recall = cross_val_score(clf, X, y, cv=cv_cnt, scoring='recall')
    
    return precision, recall

In [197]:
# Get the classified bug info.
bug_classified_df = pd.read_csv(BUG_CLASSIFICATION_FILE, dtype={'bug_id': object})

In [198]:
# Get the entire bug data.
bug_df = get_bug_df(BUGS_DIR, etree_parse)

In [199]:
# Split the data into model and prediction sets.
classified_ids = bug_classified_df['bug_id'].values.tolist()
mask = bug_df.index.isin(classified_ids)
model_df = bug_df.loc[mask]
pred_df = bug_df.loc[~mask]

In [200]:
# Perform a merge on the model data and classfied data.
model_df = pd.merge(model_df, bug_classified_df, on='bug_id')

# Get the text data from both the model and the prediction set.
model_texts = model_df['text'].values
pred_texts = pred_df['text'].values

In [201]:
# Set up the vectorizer
vectorizer = CountVectorizer(analyzer='word', tokenizer=tokenize)

# Get the text data and apply the vectorizer.
texts = np.concatenate([model_texts, pred_texts])
vectorizer = vectorizer.fit(texts)

In [202]:
# Set up the classifier
classifier = LogisticRegression()

In [203]:
# Classification pipeline
clf = Pipeline([
('vectorizer', vectorizer),
('classifier', classifier) ])

In [204]:
folds = 10
p, r = do_classification_cv(model_df, clf, folds)
print 'Mean Precision {}, Mean Recall {}, {} fold'.format(np.mean(p), np.mean(r), folds)

Mean Precision 0.683333333333, Mean Recall 0.315, 10 fold


In [205]:
# Start predicting the non-classified bugs. First fit the classified ones.
model_classes = model_df['class'].values
clf = clf.fit(model_texts, model_classes)
pred_classes = clf.predict(pred_texts)

In [210]:
print pred_df.shape

(6312, 7)


In [212]:
# The following operation is to prevent a nasty warning from poping up, which is
# a false positive, for more information see the following link:
# https://stackoverflow.com/questions/20625582/how-to-deal-with-settingwithcopywarning-in-pandas
pd.options.mode.chained_assignment = None  # default='warn'

pred_df['class'] = pd.Series(pred_classes, index=pred_df.index)

In [223]:
print 'Classified {} bugs as vulnerabilites'.format(np.sum(pred_classes))# Column order to save to csv file.

# Column order to appear in the csv output.
column_order = ['bug_id', 'bug_status', 'version', 'keywords', 'component', 'attachments']

# Save the classified bugs data to a csv.
pred_df_mask = pred_df['class']==1
pred_bugs_df = pred_df.loc[pred_df_mask]
pred_bugs_df.reset_index(drop=True)

pred_bugs_df[column_order].to_csv('classification_auto.csv', encoding='utf-8')

Classified 384 bugs as vulnerabilites
