In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow import keras
# from tensorflow.keras.callbacks import EarlyStopping
# from tensorflow.keras.callbacks import ModelCheckpoint
# from tensorflow.keras.layers import Dense
# from tensorflow.keras.layers import Embedding
# from tensorflow.keras.layers import GRU
# from tensorflow.keras.layers import LSTM
# from tensorflow.keras.layers import MaxPool1D
# from tensorflow.keras.layers import SpatialDropout1D
# from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import plot_model

In [2]:
df = pd.read_csv('data/processed/data_with_stem_lem.csv', index_col='Id')

In [3]:
df.fillna('code', inplace=True)

# Modelling
## Padded Train Test Split
### X

In [4]:
vocab_size = 73_747 # From 03-WordAnalysis.ipynb
# vocab_size = 290_479 # No restriction on vocab length
token = Tokenizer(num_words=vocab_size,
                  filters="""!"#$'%&()*+,-./:;<=>?@[\]^_`{|}~""",
                  lower=True
                 )

token.fit_on_texts(df['Body'].values)
X = token.texts_to_sequences(df['Body'].values)
len(token.word_index)

105215

In [5]:
max_words_in_post = 150
# This number could be tweaked (look at histogram of word count per class)
padded_X = pad_sequences(X, maxlen = max_words_in_post, padding = 'post')

In [6]:
print(padded_X.shape, ' ', padded_X, sep='\n')

(60000, 150)
 
[[   2    6  228 ...    0    0    0]
 [ 110  664    3 ...    0    0    0]
 [   2  113    3 ...    0    0    0]
 ...
 [1731  507    1 ...    0    0    0]
 [ 488  184   87 ...    0    0    0]
 [ 744 2490  186 ...    0    0    0]]


### y

In [7]:
y = df['Y'].replace({'HQ':2,'LQ_EDIT':1,'LQ_CLOSE':0}).values

In [8]:
X_train, X_test, Y_train, Y_test = train_test_split(padded_X,
                                                    y,
                                                    test_size = 0.20,
                                                    random_state = 42
                                                   )

In [9]:
print(X_train.shape,Y_train.shape)
print(X_test.shape,Y_test.shape)

(48000, 150) (48000,)
(12000, 150) (12000,)


## Gaussian Naive Bayes and Multinomial Naive Bayes

In [10]:
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold

In [12]:
gnb = GaussianNB()
gnb_model = gnb.fit(X_train, Y_train)

In [13]:
gnb_pred = gnb.predict(X_train)

In [14]:
print("Gaussian Naive Bayes Classification Report", " ",classification_report(Y_train, gnb_pred, labels= [2, 1, 0]), sep='\n')

Gaussian Naive Bayes Classification Report
 
              precision    recall  f1-score   support

           2       0.42      0.07      0.13     16037
           1       0.65      0.23      0.34     16023
           0       0.38      0.93      0.54     15940

    accuracy                           0.41     48000
   macro avg       0.48      0.41      0.34     48000
weighted avg       0.48      0.41      0.34     48000



In [50]:
# Questions from the website (might be in the dataset, need to check)
# HQ score       = 25042, ID = 11227809
# LQ_CLOSE score =   -26, ID = 24681866
# LQ_EDIT score  =     9, ID =  3977535
hq_question = """Here is a piece of C++ code that shows some very peculiar behavior. For some strange reason, sorting the data miraculously makes the code almost six times faster: code Without std::sort(data, data + arraySize);, the code runs in 11.54 seconds. With the sorted data, the code runs in 1.93 seconds. Initially, I thought this might be just a language or compiler anomaly, so I tried Java: code  With a similar but less extreme result. My first thought was that sorting brings the data into the cache, but then I thought how silly that was because the array was just generated. What is going on? Why is processing a sorted array faster than processing an unsorted array? The code is summing up some independent terms, so the order should not matter."""
lq_close_question = """My html code is code How I convert it into wordpress menu? Actually I want include in wordpress menu title="features" data-hover="Features" """
lq_edit_question = """I have two different timeseries with partially overlapping timestamps: code which represents following data: code I would like to calculate a weighted average on every day with coefficients a(0.3) and b(0.7), while ignoring missing values: code when I first try to align these timeseries: code I get correctly masked timeseries: code but when I do a1 * 0.3 + b1 * 0.7, it ignores values, that are present in one timeseries only: code What should I do to receive the awaited? code EDIT: The answer should be applicable also to more than two initial timeseries with different weights and differently missing values. o if we have four timeseries with weights T1(0.1), T2(0.2), T3(0.3) and T4(0.4), their weights at a given timestamp will be: code"""
questions = [hq_question, lq_edit_question, lq_close_question]

for i in questions:
    print(i)
    sequenced = token.texts_to_sequences(i.split())
    print(sequenced)
    padded = pad_sequences(sequenced, maxlen=max_words_in_post, padding='post')
    print(padded.reshape(-1,1))
    prediction = mnb.predict(padded)
    # Padded computes but results don't match (it converts the sequence (nested list) into a 2d array but )
    print(prediction, sep='\n')

Here is a piece of C++ code that shows some very peculiar behavior. For some strange reason, sorting the data miraculously makes the code almost six times faster: code Without std::sort(data, data + arraySize);, the code runs in 11.54 seconds. With the sorted data, the code runs in 1.93 seconds. Initially, I thought this might be just a language or compiler anomaly, so I tried Java: code  With a similar but less extreme result. My first thought was that sorting brings the data into the cache, but then I thought how silly that was because the array was just generated. What is going on? Why is processing a sorted array faster than processing an unsorted array? The code is summing up some independent terms, so the order should not matter.
[[80], [132], [103], [1184], [642], [64], [1], [125], [330], [1956], [5569], [10127], [781], [261], [1956], [1364], [397], [1975], [323], [16], [], [655], [323], [1], [1371], [3544], [415], [1494], [1], [119], [602, 467, 16], [16], [], [15665], [323], [1

In [15]:
mnb = MultinomialNB(alpha=1)
mnb_model = mnb.fit(X_train, Y_train)

In [16]:
mnb_pred = mnb.predict(X_train)

In [40]:
mnb_pred

array([0, 0, 0, ..., 0, 1, 2])

In [18]:
print("Multinomial Naive Bayes Classification Report", " ",classification_report(Y_train, mnb_pred, labels= [2, 1, 0]), sep='\n')

Multinomial Naive Bayes Classification Report
 
              precision    recall  f1-score   support

           2       0.42      0.16      0.23     16037
           1       0.56      0.22      0.31     16023
           0       0.39      0.88      0.54     15940

    accuracy                           0.42     48000
   macro avg       0.46      0.42      0.36     48000
weighted avg       0.46      0.42      0.36     48000



Optimising Alpha:

In [None]:
# mnb_params = {'alpha' : np.linspace(0, 1, num=20)}

# grid = GridSearchCV(estimator = MultinomialNB(), 
#                     param_grid = mnb_params, 
#                     scoring = 'accuracy',
#                     cv = KFold(n_splits=10,
#                               random_state=42,
#                               shuffle=True
#                              ),
#                     n_jobs = -1,
#                     return_train_score=True
#                    )

# grid = grid.fit(X_train, nb_target)
# alpha = grid.best_params_['alpha']
# print(f'alpha: {round(alpha,5)}')
# print(f'Accuracy: {round(grid.best_score_,5)}')

## Stochastic Gradient Descent Classifier
Scaling is necessary for this classifier.

In [19]:
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import scale
from sklearn.preprocessing import MinMaxScaler

In [20]:
X_train_s = MinMaxScaler().fit_transform(X_train)

In [22]:
sdg = SGDClassifier(loss='log', 
                    penalty='elasticnet', 
                    alpha=0.0001, 
                    l1_ratio=0.15, 
                    fit_intercept=True, 
                    max_iter=1000, 
                    tol=0.001, 
                    shuffle=True, 
                    verbose=0, 
                    epsilon=0.1, 
                    n_jobs=-1, 
                    random_state=42, 
                    learning_rate='optimal', 
                    eta0=0.0, 
                    power_t=0.5, 
                    early_stopping=True, 
                    validation_fraction=0.1, 
                    n_iter_no_change=5, 
                    class_weight=None, 
                    warm_start=False, 
                    average=False
                   )
sdg_model = sdg.fit(X_train_s, Y_train)
sdg_pred = sdg.predict(X_train_s)

In [23]:
print("Support Vector Machine Classification Report", " ",classification_report(Y_train, sdg_pred, labels= [2, 1, 0]), sep='\n')

Support Vector Machine Classification Report
 
              precision    recall  f1-score   support

           2       0.45      0.22      0.29     16037
           1       0.56      0.34      0.42     16023
           0       0.42      0.80      0.55     15940

    accuracy                           0.45     48000
   macro avg       0.47      0.45      0.42     48000
weighted avg       0.47      0.45      0.42     48000



## Decision Tree Classifier

In [24]:
from sklearn.tree import DecisionTreeClassifier 
dtree = DecisionTreeClassifier(max_depth = 2)
dtree_model = dtree.fit(X_train, Y_train) 
dtree_pred = dtree_model.predict(X_train) 

In [25]:
print("Decision Tree Classification Report", " ",classification_report(Y_train, dtree_pred, labels= [2, 1, 0]), sep='\n')

Decision Tree Classification Report
 
              precision    recall  f1-score   support

           2       0.42      0.50      0.46     16037
           1       0.59      0.20      0.30     16023
           0       0.46      0.67      0.54     15940

    accuracy                           0.46     48000
   macro avg       0.49      0.46      0.43     48000
weighted avg       0.49      0.46      0.43     48000



## Neural Network
### Padded Train Test Split
#### X

In [None]:
max_words_in_post = 150
# This number could be tweaked (look at histogram of word count per class)
padded_X = pad_sequences(X, maxlen = max_words_in_post, padding = 'post')

In [None]:
print(padded_X.shape, ' ', padded_X, sep='\n')

#### y

In [None]:
y = pd.get_dummies(df['Y']).values

In [None]:
print(y.shape,' ', y, sep='\n')

In [None]:
X_train, X_test, Y_train, Y_test = train_test_split(padded_X,
                                                    y,
                                                    test_size = 0.20,
                                                    random_state = 42
                                                   )

In [None]:
print(X_train.shape,Y_train.shape)
print(X_test.shape,Y_test.shape)

### Creating the Model

In [None]:
# model = Sequential()
# model.add(Embedding(vocab_size+1, max_words_in_post, input_length=X_train.shape[1], mask_zero=True))
# model.add(SpatialDropout1D(0.3))
# model.add(GRU(75, dropout=0.2))
# model.add(Dense(Y_train.shape[1], activation='softmax'))

# model.compile(loss='categorical_crossentropy',
#               optimizer='adam', 
#               metrics=['categorical_accuracy']
#              )
# model.summary()

In [None]:
# plot_model(loaded, to_file='data/img/BodyOnlyModel.png',show_shapes=True)

### Training the model (on Google Colab)

I have exported the training step to a google colab gpu runtime to speed up the process. I have tried several sequential neural networks:
* Embedding - SpatialDropout1D - LSTM - Dense
* Embedding - SpatialDropout1D - LSTM - SpatialDropout1D - LSTM - Dense
* Embedding - SpatialDropout1D - GRU - Dense

The fitting parameters stayed the same for all models, but the layers parameters changed between the iterations. A couple of layers did not see much variation:
* The embedding layer (input layer) with an input size of 73748 (the vocabulary +1), an output size of 100 or 150 (maximum words in a question) and the `zero_mask=` flag on (the training stage is faster with this option 'on'; from about 8 epochs to about 4 for the same model parameters)
* The final dense layer (output layer) used a "softmax" activation, and had 3 nodes (for the 3 grades of questions).

The best model (in terms of categorical_accuracy) is the GRU, reaching 0.9179. The LSTM is right behind with a categorical_accuracy of 0.9143. The training times are comparable.

In [None]:
# history = model.fit(X_train,
#                     Y_train,
#                     epochs=20,
#                     validation_split=0.1,
#                     callbacks=[EarlyStopping(monitor='val_loss',
#                                              min_delta=0.0001,
#                                              patience=3,
#                                              verbose=1,
#                                              mode='min',
#                                              restore_best_weights=True
#                                              ),
#                             #    ModelCheckpoint(filepath='drive/My Drive/Colab Notebooks/Model/Checkpoints/epoch-{epoch:03d}.ckpt', 
#                             #                    verbose=1
#                             #                    )
#                               ]
#                    )

### Model metrics

In [None]:
# plt.title('Loss')
# plt.plot(history.history['loss'], label='train')
# plt.plot(history.history['val_loss'], label='val')
# plt.legend()
# plt.show();

In [None]:
# plt.title('Accuracy')
# plt.plot(history.history['accuracy'], label='train')
# plt.plot(history.history['val_accuracy'], label='val')
# plt.legend()
# plt.show();

### Saving the model

In [None]:
# model.save('data/model//')

### Loading the model

In [None]:
# loaded = keras.models.load_model('data/model/embGRU/')

### Evaluation

In [None]:
# loss, accuracy = loaded.evaluate(X_test,Y_test)
# print(f'Test set:\n\
#           Loss: {loss:0.3f}\n\
#           Accuracy: {accuracy:0.3f}'
#      )

### Prediction

In [None]:
# # Questions from the website (might be in the dataset, need to check)
# # HQ score = 25042, ID = 11227809
# # LQ_CLOSE = -26, ID = 24681866
# # LQ_EDIT = 9, ID = 3977535
# hq_question = """Here is a piece of C++ code that shows some very peculiar behavior. For some strange reason, sorting the data miraculously makes the code almost six times faster: code Without std::sort(data, data + arraySize);, the code runs in 11.54 seconds. With the sorted data, the code runs in 1.93 seconds. Initially, I thought this might be just a language or compiler anomaly, so I tried Java: code  With a similar but less extreme result. My first thought was that sorting brings the data into the cache, but then I thought how silly that was because the array was just generated. What is going on? Why is processing a sorted array faster than processing an unsorted array? The code is summing up some independent terms, so the order should not matter."""
# lq_close_question = """My html code is code How I convert it into wordpress menu? Actually I want include in wordpress menu title="features" data-hover="Features" """
# lq_edit_question = """I have two different timeseries with partially overlapping timestamps: code which represents following data: code I would like to calculate a weighted average on every day with coefficients a(0.3) and b(0.7), while ignoring missing values: code when I first try to align these timeseries: code I get correctly masked timeseries: code but when I do a1 * 0.3 + b1 * 0.7, it ignores values, that are present in one timeseries only: code What should I do to receive the awaited? code EDIT: The answer should be applicable also to more than two initial timeseries with different weights and differently missing values. o if we have four timeseries with weights T1(0.1), T2(0.2), T3(0.3) and T4(0.4), their weights at a given timestamp will be: code"""
# questions = [hq_question, lq_edit_question, lq_close_question]

# for i in questions:
#     sequenced = token.texts_to_sequences(i.split())
#     padded = pad_sequences(sequenced, maxlen=max_words_in_post, padding='post')
#     prediction = loaded.predict(padded)
#     labels = pd.get_dummies(df['Y']).columns
#     print(prediction[1], labels[np.argmax(prediction[1])], sep='\n')

In [None]:
# for i in df.sample(5).index:
#     sequenced = token.texts_to_sequences(df['Body'].loc[i].split())
#     padded = pad_sequences(sequenced, maxlen=max_words_in_post, padding='post')
#     prediction = loaded.predict(padded)
#     labels = pd.get_dummies(df['Y']).columns
#     print(f"Predicted:{labels[np.argmax(prediction[1])]}\n     Real:{df['Y'].loc[i]}\n")

As we can see, we have an accuracy of about 75%. Unfortunately, the prediction chapter shows that mis-classifications are still fairly common. We can marginally improve this score by not restricting the length of vocabulary:

# No Vocabulary Limitation
## Loading the model

In [None]:
loaded = keras.models.load_model('data/model/embGRU-AllVoc/')

## Evaluation

In [None]:
loss, accuracy = loaded.evaluate(X_test,Y_test)
print(f'Test set:\n\
          Loss: {loss:0.3f}\n\
          Accuracy: {accuracy:0.3f}'
     )

# Not in line with google colab accuracy. Have a look!