In [1]:
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV, KFold, train_test_split
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeClassifier
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.layers import Dense, Embedding, GRU, LSTM, MaxPool1D, SpatialDropout1D
from tensorflow.keras.models import load_model, Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import plot_model
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd



In [2]:
df = pd.read_csv('data/processed/data_with_stem_lem.csv', index_col='Id')

In [3]:
df.fillna('code', inplace=True)

# Modelling
## Train Test Split
### X
The vocabulary size and the cut-off for a question length have been determined in the third notebook: [Word Analysis](./03-WordAnalysis.ipynb).

In [4]:
vocab_size = 73_747
token = Tokenizer(num_words=vocab_size,
                  filters="""!"#$'%&()*+,-./:;<=>?@[\]^_`{|}~""",
                  lower=True
                 )

token.fit_on_texts(df['Body'].values)
X = token.texts_to_sequences(df['Body'].values)
len(token.word_index)

105215

In [5]:
max_words_in_post = 150
padded_X = pad_sequences(X, maxlen = max_words_in_post, padding = 'post')

### y

In [6]:
y = df['Y'].replace({'HQ':2,'LQ_EDIT':1,'LQ_CLOSE':0}).values

In [7]:
X_train, X_test, Y_train, Y_test = train_test_split(padded_X,
                                                    y,
                                                    test_size = 0.20,
                                                    random_state = 42
                                                   )
X_train, X_val, Y_train, Y_val = train_test_split(X_train,
                                                    Y_train,
                                                    test_size = 0.20,
                                                    random_state = 42
                                                   )

In [8]:
print(X_train.shape,Y_train.shape)
print(X_val.shape,Y_val.shape)
print(X_test.shape,Y_test.shape)

(38400, 150) (38400,)
(9600, 150) (9600,)
(12000, 150) (12000,)


## Gaussian Naive Bayes and Multinomial Naive Bayes

In [9]:
gnb = GaussianNB()
gnb_model = gnb.fit(X_train, Y_train)

In [10]:
gnb_pred = gnb.predict(X_val)

In [11]:
print("Gaussian Naive Bayes Classification Report",
      " ",
      classification_report(Y_val,
                            gnb_pred
                           ),
      sep='\n'
     )

Gaussian Naive Bayes Classification Report
 
              precision    recall  f1-score   support

           0       0.38      0.93      0.54      3206
           1       0.63      0.24      0.34      3129
           2       0.43      0.08      0.13      3265

    accuracy                           0.42      9600
   macro avg       0.48      0.42      0.34      9600
weighted avg       0.48      0.42      0.34      9600



In [12]:
mnb = MultinomialNB(alpha=1)
mnb_model = mnb.fit(X_train, Y_train)

In [13]:
mnb_pred = mnb.predict(X_val)

In [14]:
print("Multinomial Naive Bayes Classification Report",
      " ",
      classification_report(Y_val,
                            mnb_pred
                           ),
      sep='\n'
     )

Multinomial Naive Bayes Classification Report
 
              precision    recall  f1-score   support

           0       0.40      0.88      0.55      3206
           1       0.53      0.22      0.32      3129
           2       0.42      0.15      0.23      3265

    accuracy                           0.42      9600
   macro avg       0.45      0.42      0.36      9600
weighted avg       0.45      0.42      0.36      9600



**Optimising Alpha:**

In [15]:
# mnb_params = {'alpha' : np.linspace(0, 1, num=20)}

# grid = GridSearchCV(estimator = MultinomialNB(), 
#                     param_grid = mnb_params, 
#                     scoring = 'accuracy',
#                     cv = KFold(n_splits=10,
#                               random_state=42,
#                               shuffle=True
#                              ),
#                     n_jobs = -1,
#                     return_train_score=True
#                    )

# grid = grid.fit(X_train, nb_target)
# alpha = grid.best_params_['alpha']
# print(f'alpha: {round(alpha,5)}')
# print(f'Accuracy: {round(grid.best_score_,5)}')

## Stochastic Gradient Descent Classifier
Scaling is necessary for this classifier.

In [16]:
X_train_s = MinMaxScaler().fit_transform(X_train)
X_val_s = MinMaxScaler().fit_transform(X_val)

In [17]:
sgd = SGDClassifier(loss='log', 
                    penalty='elasticnet', 
                    alpha=0.0001, 
                    l1_ratio=0.15, 
                    fit_intercept=True, 
                    max_iter=1000, 
                    tol=0.001, 
                    shuffle=True, 
                    verbose=0, 
                    epsilon=0.1, 
                    n_jobs=-1, 
                    random_state=42, 
                    learning_rate='optimal', 
                    eta0=0.0, 
                    power_t=0.5, 
                    early_stopping=True, 
                    validation_fraction=0.1, 
                    n_iter_no_change=5, 
                    class_weight=None, 
                    warm_start=False, 
                    average=False
                   )
sgd_model = sgd.fit(X_train_s, Y_train)
sgd_pred = sgd.predict(X_val_s)

In [18]:
print("Support Vector Machine Classification Report",
      " ",
      classification_report(Y_val,
                            sgd_pred,
                            labels= [2, 1, 0]
                           ),
      sep='\n'
     )

Support Vector Machine Classification Report
 
              precision    recall  f1-score   support

           2       0.42      0.12      0.18      3265
           1       0.50      0.38      0.43      3129
           0       0.41      0.81      0.55      3206

    accuracy                           0.43      9600
   macro avg       0.44      0.44      0.39      9600
weighted avg       0.44      0.43      0.39      9600



## Decision Tree Classifier

In [19]:
dtree = DecisionTreeClassifier()
dtree_model = dtree.fit(X_train, Y_train) 
dtree_pred = dtree_model.predict(X_val) 

In [20]:
print("Decision Tree Classification Report",
      " ",classification_report(Y_val,
                                dtree_pred,
                                labels= [2, 1, 0]
                               ),
      sep='\n'
     )

Decision Tree Classification Report
 
              precision    recall  f1-score   support

           2       0.41      0.41      0.41      3265
           1       0.50      0.51      0.50      3129
           0       0.45      0.44      0.45      3206

    accuracy                           0.45      9600
   macro avg       0.45      0.45      0.45      9600
weighted avg       0.45      0.45      0.45      9600



In [21]:
print(f'Tree depth:{dtree.get_depth()}, number of leaves:{dtree.get_n_leaves()}')

Tree depth:46, number of leaves:8493


## Predictions

In [45]:
models = [gnb, mnb, sgd, dtree]
sequenced = token.texts_to_sequences(df[df['Y']=='HQ']['Body'].values)
padded = pad_sequences(sequenced, maxlen=max_words_in_post, padding='post')
for model in models:
    prediction = model.predict(padded)
    print(10*'-',f"Prediction for {model}", sep='\n')
    HQ = 0
    LQ_EDIT = 0
    LQ_CLOSE = 0
    for i in prediction:
        if i == 2:
            HQ +=1
        elif i == 1:
            LQ_EDIT += 1
        else:
            LQ_CLOSE += 1
    print(f'HQ= {HQ/200}%\nLQ_EDIT= {LQ_EDIT/200}%\nLQ_CLOSE= {LQ_CLOSE/200}%')

----------
Prediction for GaussianNB()
HQ= 7.0%
LQ_EDIT= 15.045%
LQ_CLOSE= 77.955%
----------
Prediction for MultinomialNB(alpha=1)
HQ= 15.69%
LQ_EDIT= 14.2%
LQ_CLOSE= 70.11%
----------
Prediction for SGDClassifier(early_stopping=True, loss='log', n_jobs=-1, penalty='elasticnet',
              random_state=42)
HQ= 1.045%
LQ_EDIT= 98.825%
LQ_CLOSE= 0.13%
----------
Prediction for DecisionTreeClassifier()
HQ= 74.66%
LQ_EDIT= 11.665%
LQ_CLOSE= 13.675%


In [22]:
models = [gnb, mnb, sgd, dtree]

# Questions from the website (might be in the dataset, need to check)
# HQ score       = 25042, ID = 11227809
# LQ_CLOSE score =   -26, ID = 24681866
# LQ_EDIT score  =     9, ID =  3977535
hq_question = """Here is a piece of C++ code that shows some very peculiar behavior. For some strange reason, sorting the data miraculously makes the code almost six times faster: code Without std::sort(data, data + arraySize);, the code runs in 11.54 seconds. With the sorted data, the code runs in 1.93 seconds. Initially, I thought this might be just a language or compiler anomaly, so I tried Java: code  With a similar but less extreme result. My first thought was that sorting brings the data into the cache, but then I thought how silly that was because the array was just generated. What is going on? Why is processing a sorted array faster than processing an unsorted array? The code is summing up some independent terms, so the order should not matter."""
lq_close_question = """My html code is code How I convert it into wordpress menu? Actually I want include in wordpress menu title="features" data-hover="Features" """
lq_edit_question = """I have two different timeseries with partially overlapping timestamps: code which represents following data: code I would like to calculate a weighted average on every day with coefficients a(0.3) and b(0.7), while ignoring missing values: code when I first try to align these timeseries: code I get correctly masked timeseries: code but when I do a1 * 0.3 + b1 * 0.7, it ignores values, that are present in one timeseries only: code What should I do to receive the awaited? code EDIT: The answer should be applicable also to more than two initial timeseries with different weights and differently missing values. o if we have four timeseries with weights T1(0.1), T2(0.2), T3(0.3) and T4(0.4), their weights at a given timestamp will be: code"""
questions = [hq_question, lq_edit_question, lq_close_question]

sequenced = token.texts_to_sequences(questions)
padded = pad_sequences(sequenced, maxlen=max_words_in_post, padding='post')
print('Expected Output\nHQ\nLQ_EDIT\nLQ_CLOSE')
for model in models:
    prediction = model.predict(padded)
    print(10*'-',f"Prediction for {model}", sep='\n')
    for i in prediction:
        if i == 2:
            print('HQ')
        elif i == 1:
            print('LQ_EDIT')
        else:
            print('LQ_CLOSE')

Expected Output
HQ
LQ_EDIT
LQ_CLOSE
----------
Prediction for GaussianNB()
HQ
LQ_EDIT
LQ_CLOSE
----------
Prediction for MultinomialNB(alpha=1)
LQ_EDIT
LQ_EDIT
LQ_CLOSE
----------
Prediction for SGDClassifier(early_stopping=True, loss='log', n_jobs=-1, penalty='elasticnet',
              random_state=42)
LQ_EDIT
LQ_EDIT
LQ_EDIT
----------
Prediction for DecisionTreeClassifier()
HQ
LQ_EDIT
HQ


## Neural Network
### Padded Train Test Split
#### X

In [71]:
max_words_in_post = 150
# This number could be tweaked (look at histogram of word count per class)
padded_X = pad_sequences(X, maxlen = max_words_in_post, padding = 'post')

In [72]:
print(padded_X.shape, ' ', padded_X, sep='\n')

(60000, 150)
 
[[   2    6  228 ...    0    0    0]
 [ 110  664    3 ...    0    0    0]
 [   2  113    3 ...    0    0    0]
 ...
 [1731  507    1 ...    0    0    0]
 [ 488  184   87 ...    0    0    0]
 [ 744 2490  186 ...    0    0    0]]


#### y

In [73]:
y = pd.get_dummies(df['Y']).values

In [74]:
print(y.shape,' ', y, sep='\n')

(60000, 3)
 
[[0 1 0]
 [0 0 1]
 [1 0 0]
 ...
 [0 0 1]
 [0 1 0]
 [0 1 0]]


In [75]:
X_train, X_test, Y_train, Y_test = train_test_split(padded_X,
                                                    y,
                                                    test_size = 0.20,
                                                    random_state = 42
                                                   )

In [76]:
print(X_train.shape,Y_train.shape)
print(X_test.shape,Y_test.shape)

(48000, 150) (48000, 3)
(12000, 150) (12000, 3)


### Creating the Model

In [77]:
# model = Sequential()
# model.add(Embedding(vocab_size+1, max_words_in_post, input_length=X_train.shape[1], mask_zero=True))
# model.add(SpatialDropout1D(0.3))
# model.add(GRU(75, dropout=0.2))
# model.add(Dense(Y_train.shape[1], activation='softmax'))

# model.compile(loss='categorical_crossentropy',
#               optimizer='adam', 
#               metrics=['categorical_accuracy']
#              )
# model.summary()

In [78]:
# plot_model(loaded, to_file='data/img/BodyOnlyModel.png',show_shapes=True)

### Training the model (on Google Colab)

I have exported the training step to a google colab gpu runtime to speed up the process. I have tried several sequential neural networks:
* Embedding - SpatialDropout1D - LSTM - Dense
* Embedding - SpatialDropout1D - LSTM - SpatialDropout1D - LSTM - Dense
* Embedding - SpatialDropout1D - GRU - Dense

The fitting parameters stayed the same for all models, but the layers parameters changed between the iterations. A couple of layers did not see much variation:
* The embedding layer (input layer) with an input size of 73748 (the vocabulary +1), an output size of 100 or 150 (maximum words in a question) and the `zero_mask=` flag on (the training stage is faster with this option 'on'; from about 8 epochs to about 4 for the same model parameters)
* The final dense layer (output layer) used a "softmax" activation, and had 3 nodes (for the 3 grades of questions).

The best model (in terms of categorical_accuracy) is the GRU, reaching 0.9179. The LSTM is right behind with a categorical_accuracy of 0.9143. The training times are comparable.

In [79]:
# history = model.fit(X_train,
#                     Y_train,
#                     epochs=20,
#                     validation_split=0.1,
#                     callbacks=[EarlyStopping(monitor='val_loss',
#                                              min_delta=0.0001,
#                                              patience=3,
#                                              verbose=1,
#                                              mode='min',
#                                              restore_best_weights=True
#                                              ),
#                             #    ModelCheckpoint(filepath='drive/My Drive/Colab Notebooks/Model/Checkpoints/epoch-{epoch:03d}.ckpt', 
#                             #                    verbose=1
#                             #                    )
#                               ]
#                    )

### Model metrics

In [80]:
# plt.title('Loss')
# plt.plot(history.history['loss'], label='train')
# plt.plot(history.history['val_loss'], label='val')
# plt.legend()
# plt.show();

In [81]:
# plt.title('Accuracy')
# plt.plot(history.history['accuracy'], label='train')
# plt.plot(history.history['val_accuracy'], label='val')
# plt.legend()
# plt.show();

### Saving the model

In [82]:
# model.save('data/model//')

### Loading the model

In [103]:
loaded = load_model('data/model/emb-lstmx2/')

### Evaluation

In [None]:
loss, accuracy = loaded.evaluate(X_test,Y_test)
print(f'Test set:\n\
          Loss: {loss:0.3f}\n\
          Accuracy: {accuracy:0.3f}'
     )

### Prediction

In [31]:
# Questions from the website (might be in the dataset, need to check)
# HQ score = 25042, ID = 11227809
# LQ_CLOSE = -26, ID = 24681866
# LQ_EDIT = 9, ID = 3977535
hq_question = """Here is a piece of C++ code that shows some very peculiar behavior. For some strange reason, sorting the data miraculously makes the code almost six times faster: code Without std::sort(data, data + arraySize);, the code runs in 11.54 seconds. With the sorted data, the code runs in 1.93 seconds. Initially, I thought this might be just a language or compiler anomaly, so I tried Java: code  With a similar but less extreme result. My first thought was that sorting brings the data into the cache, but then I thought how silly that was because the array was just generated. What is going on? Why is processing a sorted array faster than processing an unsorted array? The code is summing up some independent terms, so the order should not matter."""
lq_close_question = """My html code is code How I convert it into wordpress menu? Actually I want include in wordpress menu title="features" data-hover="Features" """
lq_edit_question = """I have two different timeseries with partially overlapping timestamps: code which represents following data: code I would like to calculate a weighted average on every day with coefficients a(0.3) and b(0.7), while ignoring missing values: code when I first try to align these timeseries: code I get correctly masked timeseries: code but when I do a1 * 0.3 + b1 * 0.7, it ignores values, that are present in one timeseries only: code What should I do to receive the awaited? code EDIT: The answer should be applicable also to more than two initial timeseries with different weights and differently missing values. o if we have four timeseries with weights T1(0.1), T2(0.2), T3(0.3) and T4(0.4), their weights at a given timestamp will be: code"""
questions = [hq_question, lq_edit_question, lq_close_question]

sequenced = token.texts_to_sequences(questions)
padded = pad_sequences(sequenced, maxlen=max_words_in_post, padding='post')
prediction = loaded.predict(padded)
labels = pd.get_dummies(df['Y']).columns
for i in prediction:
    print(i, labels[np.argmax(i)], sep='\n')

[0.0021616  0.45251447 0.54532397]
LQ_EDIT
[0.8673554  0.09242888 0.04021572]
HQ
[0.26463172 0.60632634 0.12904194]
LQ_CLOSE


As we can see, we have an accuracy of about 75%. Unfortunately, the prediction chapter shows that mis-classifications are still fairly common. We can marginally improve this score by not restricting the length of vocabulary:

# No Vocabulary Limitation
## Train Test Split
### X

In [85]:
vocab_size = 290_479 # No restriction on vocab length
token = Tokenizer(num_words=vocab_size,
                  filters="""!"#$'%&()*+,-./:;<=>?@[\]^_`{|}~""",
                  lower=True
                 )

token.fit_on_texts(df['Body'].values)
X = token.texts_to_sequences(df['Body'].values)
len(token.word_index)

105215

In [86]:
max_words_in_post = 150
padded_X = pad_sequences(X, maxlen = max_words_in_post, padding = 'post')

### y

In [92]:
y = pd.get_dummies(df['Y']).values

In [93]:
X_train, X_test, Y_train, Y_test = train_test_split(padded_X,
                                                    y,
                                                    test_size = 0.20,
                                                    random_state = 42
                                                   )

In [94]:
print(X_train.shape,Y_train.shape)
print(X_val.shape,Y_val.shape)
print(X_test.shape,Y_test.shape)

(48000, 150) (48000, 3)
(9600, 150) (9600,)
(12000, 150) (12000, 3)


## Loading the Neural Network Model

In [95]:
loaded = load_model('data/model/embGRU-AllVoc/')

## Evaluation

In [96]:
loss, accuracy = loaded.evaluate(X_test,Y_test)
print(f'Test set:\n\
          Loss: {loss:0.3f}\n\
          Accuracy: {accuracy:0.3f}'
     )

Test set:
          Loss: 1.273
          Accuracy: 0.500


# Not in line with google colab accuracy. Have a look!