## Some pitfalls of training neural networks for NLP

Adapted from https://realpython.com/python-keras-text-classification/

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression

from tensorflow.keras.models import Sequential
from tensorflow.keras import layers
from tensorflow.keras.optimizers import SGD

import matplotlib.pyplot as plt
plt.style.use('ggplot')

import warnings
warnings.filterwarnings('ignore')

### Download and read in data

In [None]:
# Fetch the "sentiment labelled sentences" archive from the UCI repository.
# -nc (no-clobber) skips the download when the zip is already present, so
# re-running this cell does not pile up numbered duplicate copies.
!wget -nc https://archive.ics.uci.edu/ml/machine-learning-databases/00331/sentiment%20labelled%20sentences.zip

In [None]:
# Extract the downloaded archive; -o overwrites existing files without prompting,
# so the cell can be re-run safely.
!unzip -o sentiment\ labelled\ sentences.zip

In [None]:
# The Yelp file is tab-separated with no header row: one sentence and its
# label per line, so column names are supplied explicitly.
filepath = "sentiment labelled sentences/yelp_labelled.txt"
df_yelp = pd.read_csv(
    filepath,
    sep='\t',
    names=['sentence', 'label'],
)

In [None]:
# Peek at the first few rows to check that the sentence/label columns parsed.
df_yelp.head()

In [None]:
# (rows, columns) of the loaded frame.
df_yelp.shape

### Partition data into train and test

In [None]:
# Raw arrays pulled from the frame: review text as input, label as target.
sentences = df_yelp['sentence'].values
y = df_yelp['label'].values

# Hold out 25% of the rows for testing; the fixed seed keeps the split
# reproducible across runs.
sentences_train, sentences_test, y_train, y_test = train_test_split(
    sentences,
    y,
    test_size=0.25,
    random_state=1337,
)

In [None]:
# Shape of the training split produced above.
sentences_train.shape

### Baseline model: logistic regression

In [None]:
# Learn the vocabulary from the training sentences only, so the test split
# contributes nothing to the feature space.
vectorizer = CountVectorizer().fit(sentences_train)

# Encode both splits against that same vocabulary (train first, then test).
X_train, X_test = (
    vectorizer.transform(split)
    for split in (sentences_train, sentences_test)
)
X_train

In [None]:
# Baseline: scikit-learn logistic regression with default settings, fitted on
# the vectorized training sentences and scored on the held-out split.
lr_model = LogisticRegression().fit(X_train, y_train)
score = lr_model.score(X_test, y_test)
print(f'Logistic regression accuracy for Yelp data: {score:.4f}')

### Keras model

In [None]:
def plot_history(history):
    """Draw training-vs-validation accuracy and loss curves side by side.

    Args:
        history: Object whose ``.history`` attribute is a dict holding
            per-epoch sequences under the keys ``'accuracy'``,
            ``'val_accuracy'``, ``'loss'`` and ``'val_loss'`` (as used in this
            notebook: the return value of ``model.fit`` called with
            validation data and ``metrics=['accuracy']``).

    A new 12x5 figure is created with accuracy in the left panel and loss in
    the right one. Nothing is returned and ``plt.show()`` is not called.
    """
    log = history.history
    # Look up every series before touching matplotlib, so a missing key fails
    # before any figure is created.
    panels = (
        ('acc', 'Training and validation accuracy',
         log['accuracy'], log['val_accuracy']),
        ('loss', 'Training and validation loss',
         log['loss'], log['val_loss']),
    )
    # Epochs are numbered from 1 on the x-axis.
    epochs = range(1, len(panels[0][2]) + 1)

    plt.figure(figsize=(12, 5))
    for position, (tag, title, train_series, val_series) in enumerate(panels, start=1):
        plt.subplot(1, 2, position)
        plt.plot(epochs, train_series, 'b', label=f'Training {tag}')
        plt.plot(epochs, val_series, 'r', label=f'Validation {tag}')
        plt.title(title)
        plt.legend()

#### Single-layer model

In [None]:
# Input width = number of columns in the vectorized training matrix.
input_dim = X_train.shape[1]

# One hidden layer of 10 sigmoid units feeding a single sigmoid output unit.
k1_model = Sequential([
    layers.Dense(10, input_dim=input_dim, activation='sigmoid'),
    layers.Dense(1, activation='sigmoid'),
])

In [None]:
# SGD with momentum at a modest step size (0.01).
# `learning_rate` is the supported argument name; the old `lr` alias is
# deprecated and newer Keras releases ignore or reject it.
# 'accuracy' is tracked because plot_history reads the 'accuracy' and
# 'val_accuracy' entries of the training history.
k1_model.compile(
    loss='binary_crossentropy',
    optimizer=SGD(learning_rate=0.01, momentum=0.9),
    metrics=['accuracy'],
)
k1_model.summary()

In [None]:
# Train silently for 100 epochs in mini-batches of 10. The held-out test split
# doubles as validation data, so the "validation" curves are test-set curves.
history = k1_model.fit(
    X_train, y_train,
    validation_data=(X_test, y_test),
    epochs=100,
    batch_size=10,
    verbose=False,
)

In [None]:
# Learning curves for the single-hidden-layer model trained above.
plot_history(history)

In [None]:
# Final scores for the single-hidden-layer model: first on the data it was
# trained on, then on the held-out split.
loss, accuracy = k1_model.evaluate(X_train, y_train, verbose=False)
print(f"Training Accuracy: {accuracy:.4f}")

loss, accuracy = k1_model.evaluate(X_test, y_test, verbose=False)
print(f"Testing Accuracy:  {accuracy:.4f}")

#### Keras logistic regression

In [None]:
# Input width = number of columns in the vectorized training matrix.
input_dim = X_train.shape[1]

# A single sigmoid unit applied directly to the inputs: logistic regression
# expressed as a Keras model.
klr_model = Sequential([
    layers.Dense(1, input_dim=input_dim, activation='sigmoid'),
])

In [None]:
# Same optimizer settings as the single-hidden-layer model, so the two runs
# are comparable.
# `learning_rate` is the supported argument name; the old `lr` alias is
# deprecated and newer Keras releases ignore or reject it.
klr_model.compile(
    loss='binary_crossentropy',
    optimizer=SGD(learning_rate=0.01, momentum=0.9),
    metrics=['accuracy'],
)
klr_model.summary()

In [None]:
# Train silently for 100 epochs in mini-batches of 10. The held-out test split
# doubles as validation data, so the "validation" curves are test-set curves.
history = klr_model.fit(
    X_train, y_train,
    validation_data=(X_test, y_test),
    epochs=100,
    batch_size=10,
    verbose=False,
)

In [None]:
# Learning curves for the Keras logistic-regression model trained above.
plot_history(history)

In [None]:
# Final scores for the Keras logistic regression: first on the data it was
# trained on, then on the held-out split.
loss, accuracy = klr_model.evaluate(X_train, y_train, verbose=False)
print(f"Training Accuracy: {accuracy:.4f}")

loss, accuracy = klr_model.evaluate(X_test, y_test, verbose=False)
print(f"Testing Accuracy:  {accuracy:.4f}")

### Problems with convergence

Learning rate too high

In [None]:
# Input width = number of columns in the vectorized training matrix.
input_dim = X_train.shape[1]

# Same architecture as the single-hidden-layer model (10 sigmoid units, then a
# sigmoid output); only the optimizer settings differ in this experiment.
model = Sequential([
    layers.Dense(10, input_dim=input_dim, activation='sigmoid'),
    layers.Dense(1, activation='sigmoid'),
])

In [None]:
# Deliberately oversized step size (4) to demonstrate failure to converge.
# `learning_rate` is the supported argument name; the old `lr` alias is
# deprecated and newer Keras releases ignore or reject it, which would
# defeat the point of this experiment.
model.compile(
    loss='binary_crossentropy',
    optimizer=SGD(learning_rate=4, momentum=0.9),
    metrics=['accuracy'],
)
model.summary()

In [None]:
# Train silently for 100 epochs in mini-batches of 10. The held-out test split
# doubles as validation data, so the "validation" curves are test-set curves.
history = model.fit(
    X_train, y_train,
    validation_data=(X_test, y_test),
    epochs=100,
    batch_size=10,
    verbose=False,
)

In [None]:
# Learning curves for the run with the oversized learning rate.
plot_history(history)

### Overtraining

High learning rate; too many iterations

In [None]:
# Input width = number of columns in the vectorized training matrix.
input_dim = X_train.shape[1]

# Same architecture as the single-hidden-layer model (10 sigmoid units, then a
# sigmoid output); only the optimizer settings differ in this experiment.
model = Sequential([
    layers.Dense(10, input_dim=input_dim, activation='sigmoid'),
    layers.Dense(1, activation='sigmoid'),
])

In [None]:
# Step size ten times larger than the 0.01 used earlier, for the
# overtraining demonstration.
# `learning_rate` is the supported argument name; the old `lr` alias is
# deprecated and newer Keras releases ignore or reject it, which would
# defeat the point of this experiment.
model.compile(
    loss='binary_crossentropy',
    optimizer=SGD(learning_rate=0.1, momentum=0.9),
    metrics=['accuracy'],
)
model.summary()

In [None]:
# Train silently for 100 epochs in mini-batches of 10. The held-out test split
# doubles as validation data, so the "validation" curves are test-set curves.
history = model.fit(
    X_train, y_train,
    validation_data=(X_test, y_test),
    epochs=100,
    batch_size=10,
    verbose=False,
)

In [None]:
# Learning curves for the overtraining run.
plot_history(history)

### Model complexity

Deeper networks are harder to train

In [None]:
# Input width = number of columns in the vectorized training matrix.
input_dim = X_train.shape[1]

# Four sigmoid hidden layers (10, 10, 10, 40 units) ahead of the sigmoid output.
# Only the first layer declares the input size: in a Sequential model each
# later layer takes its input width from the layer before it, so an
# `input_dim` on those layers would be ignored and would misstate their
# actual input size.
model = Sequential()
model.add(layers.Dense(10, input_dim=input_dim, activation='sigmoid'))
model.add(layers.Dense(10, activation='sigmoid'))
model.add(layers.Dense(10, activation='sigmoid'))
model.add(layers.Dense(40, activation='sigmoid'))
model.add(layers.Dense(1, activation='sigmoid'))

In [None]:
# Same optimizer settings as the shallow baseline (0.01 with momentum), so any
# difference in the curves comes from the added depth.
# `learning_rate` is the supported argument name; the old `lr` alias is
# deprecated and newer Keras releases ignore or reject it.
model.compile(
    loss='binary_crossentropy',
    optimizer=SGD(learning_rate=0.01, momentum=0.9),
    metrics=['accuracy'],
)
model.summary()

In [None]:
# Train silently for 100 epochs in mini-batches of 10. The held-out test split
# doubles as validation data, so the "validation" curves are test-set curves.
history = model.fit(
    X_train, y_train,
    validation_data=(X_test, y_test),
    epochs=100,
    batch_size=10,
    verbose=False,
)

In [None]:
# Learning curves for the deeper network.
plot_history(history)