In [None]:
import random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.metrics import auc, roc_curve
from sklearn.model_selection import train_test_split
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers.experimental import preprocessing
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, GRU, Dense, Dropout
from tensorflow.keras.layers import SpatialDropout1D
from tensorflow.keras.layers import Embedding
from tensorflow.keras.models import load_model

import io
from google.colab import files
# Interactive Colab upload prompt; the result is indexed by file name in the
# next cell, which expects 'train.csv', 'cv.csv' and 'test.csv' to be present.
uploaded = files.upload()

Saving cv.csv to cv.csv
Saving test.csv to test.csv
Saving train.csv to train.csv


In [None]:
def _read_uploaded_csv(file_name):
    """Parse one of the uploaded files (held in memory as bytes) into a DataFrame."""
    return pd.read_csv(io.BytesIO(uploaded[file_name]))


def _title_and_sentence(frame):
    """Build the model input text: '<book_title> ~~~ <sentence>' for every row."""
    titles = frame['book_title'].map(str)
    sentences = frame['sentence'].map(str)
    return (titles + ' ~~~ ' + sentences).to_numpy()


def _spoiler_labels(frame):
    """Return the 'sent_spoil' column as an int32 NumPy array."""
    return frame['sent_spoil'].to_numpy().astype(np.int32)


# The cv rows are appended after the train rows, so they sit at the tail of
# the combined frame.
train = _read_uploaded_csv('train.csv')
cv = _read_uploaded_csv('cv.csv')
train = pd.concat([train, cv], ignore_index=True)
test = _read_uploaded_csv('test.csv')

train_text = _title_and_sentence(train)
train_labels = _spoiler_labels(train)

test_text = _title_and_sentence(test)
test_labels = _spoiler_labels(test)

In [None]:
# Fixed length (in tokens) of every sequence fed to the network.
reviewMaxLen = 700

# The vocabulary is learned from the combined train + cv text only; the test
# text is never shown to the tokenizer.
vocab_limit = 8000
tokenizer = Tokenizer(num_words=vocab_limit)
tokenizer.fit_on_texts(texts=train_text)

# Integer-encode the training text, then pad/truncate each row to reviewMaxLen
# (pad_sequences is called with its default padding/truncating settings).
sequences = tokenizer.texts_to_sequences(texts=train_text)
padded = pad_sequences(sequences=sequences, maxlen=reviewMaxLen)

In [None]:
# Seed every RNG involved so that re-running this cell (and the fit below)
# starts from the same initial weights. GPU kernels may still introduce small
# run-to-run differences.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

# texts_to_sequences only emits ids below tokenizer.num_words, so embedding
# rows beyond that limit would never be looked up or trained. Size the table
# to the ids that can actually occur (+1 because id 0 is the padding value).
vocab_size = len(tokenizer.word_index) + 1
if tokenizer.num_words:
    vocab_size = min(vocab_size, tokenizer.num_words)

# Embedding -> two stacked LSTMs -> sigmoid output for the binary
# 'sent_spoil' label.
model = Sequential()
model.add(Embedding(vocab_size, 32, input_length=reviewMaxLen))
model.add(SpatialDropout1D(0.25))
model.add(LSTM(32, dropout=0.1, return_sequences=True))
model.add(LSTM(32, dropout=0.2))
model.add(Dropout(0.4))
model.add(Dense(1, activation='sigmoid'))

opt = keras.optimizers.Adam(learning_rate=0.003)
# Name the metric explicitly: an unnamed AUC() becomes 'auc_1', 'auc_2', ...
# when this cell is re-run in the same kernel, which breaks the
# history.history['auc'] / ['val_auc'] lookups in the plotting cell.
model.compile(
    loss='binary_crossentropy',
    optimizer=opt,
    metrics=[tf.keras.metrics.AUC(name='auc')],
)

In [None]:
# Keras takes the validation split from the *last* fraction of the arrays
# (before shuffling). NOTE(review): 0.0527 is presumably chosen to match the
# share of cv rows appended at the end of `train` -- confirm against the data.
fit_options = dict(
    validation_split=0.0527,
    epochs=5,
    batch_size=64,
    verbose=1,
)
history = model.fit(padded, train_labels, **fit_options)

# Persist the trained model to the local 'model' directory.
model.save('model')

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
INFO:tensorflow:Assets written to: model/assets


In [None]:
# Replace the in-memory model with the copy saved under 'model', so the
# evaluation below runs against the persisted artefact.
model = load_model(filepath='model')

In [None]:
# Training vs. validation AUC per epoch.
# Look the metric key up instead of hard-coding 'auc': Keras suffixes unnamed
# metrics ('auc_1', ...) when the compile cell is re-run in the same kernel.
auc_key = next(
    key for key in history.history
    if key.startswith('auc')
)

fig, ax = plt.subplots()
ax.plot(history.history[auc_key], label='train')
# The second curve comes from the validation split used in fit(), not from
# the held-out test set, so label it accordingly.
ax.plot(history.history['val_' + auc_key], label='validation')
ax.set(title='model AUC', xlabel='epoch', ylabel='AUC')
ax.legend(loc='upper left')
plt.show()

In [None]:
# Encode the whole test set once and score it in a single batched predict()
# call. Calling model.predict() once per row pays the per-call overhead
# thousands of times and is dramatically slower for the same scores.
test_sequences = tokenizer.texts_to_sequences(test_text)
test_padded = pad_sequences(test_sequences, maxlen=reviewMaxLen)

# predict() returns shape (n_samples, 1); flatten to one score per sentence.
# The cast keeps the float64 dtype the per-row version produced.
predictions = (
    model.predict(test_padded, batch_size=256, verbose=1)
    .ravel()
    .astype(np.float64)
)

# ROC curve and area under it for the held-out test set.
fpr_keras, tpr_keras, thresholds_keras = roc_curve(test_labels, predictions)
auc_keras = auc(fpr_keras, tpr_keras)

print(auc_keras)

0
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
0.9102299090478523
