In [6]:
#import packages

In [7]:
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import matplotlib.pyplot as plt
import gensim
from bs4 import BeautifulSoup
import time
import numpy as np

# PreProcessing

In [8]:
class PreProcessing:
    """Load the Stack Overflow dataset and prepare its posts for the classifiers.

    Attributes:
        data: DataFrame read from 'dataset/stack-overflow-data.csv'; this class
            reads its ``post`` and ``tags`` columns.
        tags: the distinct values of the ``tags`` column, as returned by
            ``Series.unique()``.
    """

    # Parentheses, brackets and separator symbols that are replaced by a space.
    _REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,; ]')
    # Everything except digits, lower-case letters, space, '#', '+' and '_' is
    # removed ('#' and '+' are kept because they occur in classes such as
    # "c++" and "c#").
    _BAD_SYMBOLS_RE = re.compile(r'[^0-9a-z #+_]')

    # NLTK helpers, created on the first filter_data call and then reused so
    # that they are not rebuilt for every row of the dataset.
    _stopwords = None
    _lemmatizer = None

    def __init__(self):
        self.data = pd.read_csv('dataset/stack-overflow-data.csv')
        # Extract unique classes
        self.tags = self.data.tags.unique()

    def plot_figures(self):
        """Draw a bar chart with the number of posts per tag."""
        plt.figure(figsize=(10, 4))
        # self.tags is the array returned by unique() and has no
        # value_counts(); the counts have to come from the column itself.
        self.data.tags.value_counts().plot(kind='bar')

    def clean_text(self):
        """Clean every post in place and return the DataFrame.

        Overwrites ``self.data['post']`` with the output of filter_data and
        prints the elapsed wall-clock time.
        """
        start = time.time()
        self.data['post'] = self.data['post'].apply(self.filter_data)
        end = time.time()
        print("Time Taken: " + str(end - start))
        return self.data

    def _text_tools(self):
        """Return the (stop-word set, lemmatizer) pair, building it on first use."""
        if self._stopwords is None:
            self._stopwords = frozenset(stopwords.words('english'))
            self._lemmatizer = WordNetLemmatizer()
        return self._stopwords, self._lemmatizer

    def filter_data(self, text):
        """Normalise a single post.

        The text is lower-cased, stripped of HTML markup, cleaned of symbols,
        filtered for English stop words and lemmatized word by word.

        Args:
            text: raw post text (a string).

        Returns:
            The cleaned text, with words joined by single spaces.
        """
        stop_words, lemmatizer = self._text_tools()
        # Transform words to lower case letters
        text = text.lower()
        # HTML decoding with the lxml parser
        text = BeautifulSoup(text, 'lxml').get_text()
        text = self._REPLACE_BY_SPACE_RE.sub(' ', text)
        text = self._BAD_SYMBOLS_RE.sub('', text)
        # Words are split on single spaces instead of nltk.word_tokenize,
        # because tokenizing would split classes like "c#" into "c" and "#".
        return ' '.join(lemmatizer.lemmatize(word)
                        for word in text.split(' ')
                        if word not in stop_words)

    def word_averaging(self, wv, words):
        """Return the unit-length mean vector of a tokenized document.

        Args:
            wv: word-vector model exposing ``vocab``, ``syn0norm`` and
                ``vector_size``.
            words: iterable of tokens; items that are already NumPy arrays
                are used as vectors directly, tokens missing from
                ``wv.vocab`` are skipped.

        Returns:
            A float32 vector, or a zero vector of length ``wv.vector_size``
            when no token could be mapped to a vector.
        """
        vectors = []
        for word in words:
            if isinstance(word, np.ndarray):
                vectors.append(word)
            elif word in wv.vocab:
                vectors.append(wv.syn0norm[wv.vocab[word].index])

        if not vectors:
            return np.zeros(wv.vector_size, )

        return gensim.matutils.unitvec(np.array(vectors).mean(axis=0)).astype(np.float32)

    def word_averaging_list(self, wv, text_list):
        """Stack the averaged vector of every document in ``text_list`` row by row."""
        return np.vstack([self.word_averaging(wv, post) for post in text_list])

    def getRawData(self):
        """Return the DataFrame held by this instance."""
        return self.data

# Models

In [9]:
#import packages

In [10]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_validate
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
import nltk
from preprocessing import PreProcessing
from sklearn.metrics import classification_report, accuracy_score, f1_score
from gensim.models import KeyedVectors
import numpy as np
from sklearn.preprocessing import LabelEncoder
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
from keras.preprocessing import text
from keras import utils
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from keras.callbacks import EarlyStopping
from keras.callbacks import Callback
from sklearn.metrics import f1_score, precision_score, recall_score

Using TensorFlow backend.


In [11]:
class Model:
    """Base class for the scikit-learn classifiers.

    Subclasses assign an estimator to ``self.model``; ``train`` then
    cross-validates it, fits it on the training split and prints a report for
    the test split.
    """

    def __init__(self, X_train, X_test, y_train, y_test, tags):
        self.X_train = X_train
        self.y_train = y_train
        self.X_test = X_test
        self.y_test = y_test
        self.tags = tags
        self.model = None

    def train(self, tfidf=True):
        """Cross-validate and fit ``self.model``, then report on the test split.

        Args:
            tfidf: when True, ``X_train``/``X_test`` are replaced on the
                instance by their TF-IDF matrices before training.

        Returns:
            The dict produced by ``cross_validate`` (5 folds, train scores
            included).

        Raises:
            NotImplementedError: if no estimator has been assigned to
                ``self.model``.
        """
        if self.model is None:
            raise NotImplementedError(
                "self.model is not set; use a subclass that assigns an estimator")

        # TfidfVectorizer applies CountVectorizer to count word frequencies then
        # applies TfidfTransformer to extract tfidf information for each token
        if tfidf:
            # NOTE: the raw inputs are overwritten here, so calling
            # train(tfidf=True) a second time on the same instance would try
            # to vectorize the already vectorized matrices.
            tfidf_vector = TfidfVectorizer()
            self.X_train = tfidf_vector.fit_transform(self.X_train)
            self.X_test = tfidf_vector.transform(self.X_test)

        # The 'rec_micro' and 'f1_micro' keys actually select the macro-averaged
        # scorers; the key names are kept because callers read
        # scores['test_f1_micro'].
        scoring = {'acc': 'accuracy',
                   'prec_macro': 'precision_macro',
                   'rec_micro': 'recall_macro',
                   'f1_micro': 'f1_macro'}

        scores = cross_validate(self.model,
                                self.X_train,
                                self.y_train,
                                cv=5,
                                scoring=scoring,
                                return_train_score=True)

        self.model.fit(self.X_train, self.y_train)
        y_pred = self.model.predict(self.X_test)
        print('accuracy %s' % accuracy_score(self.y_test, y_pred))
        self._print_report(y_pred)
        return scores

    def _print_report(self, y_pred):
        """Print the classification report with every row carrying its own tag name."""
        labels = list(self.tags)
        observed = set(self.y_test) | set(y_pred)
        if observed <= set(labels):
            # The tags are the label values themselves: pass them as `labels`
            # too, so the row order follows `tags` and each name is attached
            # to the right row (without `labels` the rows are in sorted label
            # order, which need not match the order of `tags`).
            report = classification_report(self.y_test, y_pred,
                                           labels=labels,
                                           target_names=[str(label) for label in labels])
        else:
            # Tags are display names only; keep the positional pairing.
            report = classification_report(self.y_test, y_pred, target_names=self.tags)
        print(report)

## NaiveBayes

In [12]:
class NaiveBayes(Model):
    """Model whose estimator is a multinomial naive Bayes classifier."""

    def __init__(self, X_train, X_test, y_train, y_test, tags):
        # Let the base class store the splits and tags, then plug in the estimator.
        super().__init__(X_train, X_test, y_train, y_test, tags)
        self.model = MultinomialNB()

## LogisticRegression

In [13]:
class LR(Model):
    """Model whose estimator is a logistic regression classifier."""

    def __init__(self, X_train, X_test, y_train, y_test, tags):
        super().__init__(X_train, X_test, y_train, y_test, tags)
        # Estimator settings gathered in one place for readability.
        estimator_options = dict(
            n_jobs=1,
            C=1e5,
            solver='lbfgs',
            multi_class='auto',
            max_iter=1000,
        )
        self.model = LogisticRegression(**estimator_options)

## SVM

In [14]:
class SVM(Model):
    """Model whose estimator is an SGDClassifier with its default loss."""

    def __init__(self, X_train, X_test, y_train, y_test, tags):
        super().__init__(X_train, X_test, y_train, y_test, tags)
        # Stop after at most 1000 passes, or earlier once the tolerance is met.
        estimator_options = {'max_iter': 1000, 'tol': 1e-3}
        self.model = SGDClassifier(**estimator_options)

## Word2Vec


In [15]:
class Word2VecDeep:
    """Logistic regression on averaged pre-trained word vectors.

    Each post is tokenized with NLTK, every token is looked up in the
    pre-trained vectors and the vectors of a post are averaged into one
    feature row (see PreProcessing.word_averaging_list).
    """

    def __init__(self, X_train, X_test, y_train, y_test, tags):
        """Load the vectors and build the averaged feature matrices.

        Args:
            X_train, X_test: pandas Series of post texts (``.apply`` is used).
            y_train, y_test: labels, passed on to LR unchanged.
            tags: class names, passed on to LR unchanged.
        """
        self.y_train = y_train
        self.y_test = y_test
        self.tags = tags
        # `limit` loads only the first 500,000 vectors of the file (described
        # by the original author as the most frequent words), which speeds up
        # loading a little.
        glove_model = KeyedVectors.load_word2vec_format(
            "pretrained_vectors/gensim_glove_vectors.txt", binary=False, limit=500000)
        # Used for initialization of model.syn0norm
        glove_model.init_sims(replace=True)

        print("Done Processing Pretrained Vectors")

        # NOTE(review): constructing PreProcessing reads the dataset CSV again
        # although only its word_averaging_list helper is needed here.
        pre = PreProcessing()

        test_tokenized = X_test.apply(self.tokenize_text)
        train_tokenized = X_train.apply(self.tokenize_text)

        self.X_train_word_average = pre.word_averaging_list(glove_model, train_tokenized)
        self.X_test_word_average = pre.word_averaging_list(glove_model, test_tokenized)

        print("Done Applying Pretrained Vectors")

    def train(self):
        """Train a logistic regression on the averaged vectors.

        Returns:
            The cross-validated F1 scores (``scores['test_f1_micro']`` of
            ``LR.train``), so that the result can be collected like the score
            lists returned by the other ``train`` methods. Previously nothing
            was returned.
        """
        lr = LR(self.X_train_word_average, self.X_test_word_average,
                self.y_train, self.y_test, self.tags)
        # The features are already numeric, so TF-IDF must be skipped.
        scores = lr.train(tfidf=False)
        return scores['test_f1_micro']

    # Tokenize Word
    def tokenize_text(self, text):
        """Split ``text`` into word tokens, dropping tokens shorter than two characters."""
        tokens = []
        for sent in nltk.sent_tokenize(text, language='english'):
            for word in nltk.word_tokenize(sent, language='english'):
                # To make sure that this is at least a word not single character
                if len(word) < 2:
                    continue
                tokens.append(word)
        return tokens

## BOW

In [16]:
class BOWDeep:
    """Feed-forward network trained on bag-of-words matrices of the posts."""

    def __init__(self, X_train, X_test, y_train, y_test, tags):
        """Build the network and convert texts and labels to matrices.

        The texts are turned into ``max_words``-wide matrices by a Keras
        Tokenizer fitted on the training texts only; the labels are
        integer-encoded and then one-hot encoded.
        """
        self.max_words = 1000
        self.batch_size = 64
        self.epochs = 3
        self.num_classes = len(tags)

        self.model = self._build_network()

        # Only fit on train, then use for testing
        vectorizer = text.Tokenizer(num_words=self.max_words, char_level=False)
        vectorizer.fit_on_texts(X_train)
        self.X_train = vectorizer.texts_to_matrix(X_train)
        self.X_test = vectorizer.texts_to_matrix(X_test)

        # Label encoder (fitted on the training labels) followed by one-hot
        # encoding of the classes
        label_encoder = LabelEncoder()
        label_encoder.fit(y_train)
        train_ids = label_encoder.transform(y_train)
        test_ids = label_encoder.transform(y_test)
        self.y_train = utils.to_categorical(train_ids, self.num_classes)
        self.y_test = utils.to_categorical(test_ids, self.num_classes)

        self.metrics = Metrics()

    def _build_network(self):
        """Create and compile the dense -> relu -> dropout -> dense -> softmax stack."""
        network = Sequential()
        stack = (
            Dense(512, input_shape=(self.max_words, )),
            Activation('relu'),
            # Dropout of 50% to prevent over fitting
            Dropout(0.5),
            Dense(self.num_classes),
            Activation('softmax'),
        )
        for layer in stack:
            network.add(layer)
        # Categorical cross entropy loss because we have multiple classes
        network.compile(loss='categorical_crossentropy',
                        optimizer='adam',
                        metrics=['accuracy'])
        return network

    def train(self):
        """Fit the network, print the test accuracy and return the recorded F1 scores.

        10% of the training data is held out for validation; training stops
        early when ``val_loss`` has not improved for three epochs.
        """
        early_stop = EarlyStopping(monitor='val_loss', patience=3, min_delta=0.0001)
        self.model.fit(self.X_train, self.y_train,
                       epochs=self.epochs,
                       batch_size=self.batch_size,
                       verbose=True,
                       validation_split=0.1,
                       callbacks=[self.metrics, early_stop])

        evaluation = self.model.evaluate(self.X_test, self.y_test,
                                         batch_size=self.batch_size, verbose=True)
        print('Test accuracy:', evaluation[1])
        return self.metrics.get_f1_scores()


## RNN

In [None]:
class RNN:
    """LSTM text classifier: embedding -> spatial dropout -> LSTM -> softmax."""

    def __init__(self, X_train, X_test, y_train, y_test, tags):
        """Store the splits and build the network.

        Args:
            X_train: 2-D array-like; ``X_train.shape[1]`` is used as the
                input length of the embedding layer.
            X_test, y_train, y_test: stored unchanged and handed to Keras in
                ``train`` (presumably padded sequences and one-hot labels —
                confirm against the caller).
            tags: class names; their count sets the size of the output layer.
        """
        self.X_train = X_train
        self.y_train = y_train
        self.X_test = X_test
        self.y_test = y_test
        self.epochs = 5
        self.batch_size = 64
        self.num_classes = len(tags)
        # The maximum number of words to be used (most frequent words)
        MAX_NB_WORDS = 50000
        EMBEDDING_DIM = 100

        self.metrics = Metrics()

        self.model = Sequential()
        self.model.add(Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=self.X_train.shape[1]))
        self.model.add(SpatialDropout1D(0.2))
        self.model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
        # One output unit per class: the layer width follows `tags` instead of
        # a hard-coded 20, so the network matches the labels for any number
        # of classes.
        self.model.add(Dense(self.num_classes, activation='softmax'))
        self.model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    def train(self):
        """Fit the network, print the test accuracy and return the recorded F1 scores.

        10% of the training data is held out for validation; training stops
        early when ``val_loss`` has not improved for three epochs.
        """
        self.model.fit(self.X_train, self.y_train, epochs=self.epochs, batch_size=self.batch_size,
                       verbose=True, validation_split=0.1,
                       callbacks=[self.metrics,
                                  EarlyStopping(monitor='val_loss', patience=3, min_delta=0.0001)])

        score = self.model.evaluate(self.X_test, self.y_test,
                                    batch_size=self.batch_size, verbose=True)
        print('Test accuracy:', score[1])
        return self.metrics.get_f1_scores()

### Metrics

In [None]:
class Metrics(Callback):
    """Keras callback recording micro-averaged validation F1, precision and recall per epoch."""

    def on_train_begin(self, logs=None):
        """Start every training run with empty score histories."""
        self.val_f1s = []
        self.val_recalls = []
        self.val_precisions = []

    def on_epoch_end(self, epoch, logs=None):
        """Score the validation data with the current model and record the results."""
        # NOTE(review): this relies on Keras populating self.validation_data;
        # newer Keras releases reportedly no longer set that attribute on
        # callbacks — confirm against the installed version.
        #
        # Predictions are rounded element-wise to 0/1, so a sample whose
        # largest predicted probability does not exceed 0.5 becomes an
        # all-zero row.
        val_predict = (np.asarray(self.model.predict(self.validation_data[0]))).round()
        val_targ = self.validation_data[1]
        _val_f1 = f1_score(val_targ, val_predict, average='micro')
        _val_recall = recall_score(val_targ, val_predict, average='micro')
        _val_precision = precision_score(val_targ, val_predict, average='micro')
        self.val_f1s.append(_val_f1)
        self.val_recalls.append(_val_recall)
        self.val_precisions.append(_val_precision)
        print("— val_f1: {:f} — val_precision: {:f} — val_recall {:f}".format(_val_f1, _val_precision, _val_recall))

    def get_f1_scores(self):
        """Return the per-epoch validation F1 scores (an empty list before any training)."""
        return getattr(self, 'val_f1s', [])

# Train

In [17]:
# Load the dataset, then clean every post; clean_text returns the DataFrame
# whose 'post' column has been cleaned.
pre = PreProcessing()
data = pre.clean_text()

FileNotFoundError: [Errno 2] File b'dataset/stack-overflow-data.csv' does not exist: b'dataset/stack-overflow-data.csv'

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Features are the cleaned post texts, targets are the tag labels.
X = data['post']
y = data['tags']
# Split to 20% test data and 80% training data
# (random_state is fixed so that the split is the same on every run)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Parallel lists: the scores collected for each model and the label shown for
# it in the final box plot.
results = []
names = []

In [None]:
# Naive Bayes on TF-IDF features (train() defaults to tfidf=True).
nb = NaiveBayes(X_train, X_test, y_train, y_test, pre.tags)
scores = nb.train()
# 'test_f1_micro' is the key defined in Model.train; there it is mapped to the
# 'f1_macro' scorer, so these are macro-averaged F1 scores, one per fold.
results.append(scores['test_f1_micro'])
names.append('Naive')

In [None]:
# Logistic regression on TF-IDF features (train() defaults to tfidf=True).
lr = LR(X_train, X_test, y_train, y_test, pre.tags)
scores = lr.train()

# 'test_f1_micro' is mapped to the 'f1_macro' scorer in Model.train.
results.append(scores['test_f1_micro'])
names.append('LR')

In [None]:
# SGDClassifier on TF-IDF features (train() defaults to tfidf=True).
svm = SVM(X_train, X_test, y_train, y_test, pre.tags)
scores = svm.train()

# 'test_f1_micro' is mapped to the 'f1_macro' scorer in Model.train.
results.append(scores['test_f1_micro'])
names.append('SVM')

In [None]:
# Logistic regression on averaged pre-trained word vectors; building the
# object loads the vectors and tokenizes both splits.
wv = Word2VecDeep(X_train, X_test, y_train, y_test, pre.tags)
scores = wv.train()

# Whatever train() returns is collected for the box plot.
results.append(scores)
names.append('W2V')

In [None]:
# Feed-forward network on bag-of-words matrices.
bow = BOWDeep(X_train, X_test, y_train, y_test, pre.tags)
scores = bow.train()

# train() returns the per-epoch validation F1 list kept by the Metrics callback.
results.append(scores)
names.append('BOW')

In [None]:
# Use different function for pre-processing
# NOTE(review): filter_rnn is not defined in the PreProcessing class shown in
# this notebook — confirm where it comes from before running this cell.
X, y = pre.filter_rnn()
# Split to 20% test data and 80% training data
# (this rebinds X_train/X_test/y_train/y_test used by the earlier cells)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

rnn = RNN(X_train, X_test, y_train, y_test, pre.tags)
scores = rnn.train()

# train() returns the per-epoch validation F1 list kept by the Metrics callback.
results.append(scores)
names.append('RNN')

In [None]:
# boxplot algorithm comparison

In [None]:
# One box per model, labelled with the names collected alongside the scores.
fig, ax = plt.subplots()
fig.suptitle('Algorithm Comparison')
ax.boxplot(results)
ax.set_xticklabels(names)
plt.show()