In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Prepare the data

In [2]:
import string

titles = pd.read_csv("./data/medium_post_titles.csv")
titles.drop(columns=["subtitle", "subtitle_truncated_flag"], inplace=True)

titles.title = titles.title.str.replace('[{}]'.format(string.punctuation), '').str.lower()
titles.title = titles.title.str.replace('‘', '')
titles.title = titles.title.str.replace('’', '')
titles.title = titles.title.str.replace('“', '')
titles.title = titles.title.str.replace('”', '')
titles.title = titles.title.str.replace('—', '')

titles.head()

Unnamed: 0,category,title
0,work,21 conversations a fun and easy game for team...
1,spirituality,biblical porn at mars hill
2,lgbtqia,cisgender is that a disease
3,equality,call me nat love black cowboys and the frontie...
4,artificial-intelligence,can i train my model on your computer


In [3]:
from sklearn.model_selection import train_test_split

titles_train, titles_test = train_test_split(titles, train_size=0.8, stratify=titles.category.values)

X_train = titles_train.title.str.split().tolist()
X_test = titles_test.title.str.split().tolist()

y_train = titles_train.category.values
y_test = titles_test.category.values

In [4]:
import warnings
warnings.filterwarnings('ignore')

In [5]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

integer_encoder = LabelEncoder()
integer_train = integer_encoder.fit_transform(y_train).reshape((-1, 1))
integer_test = integer_encoder.transform(y_test).reshape((-1, 1))

onehot_encoder = OneHotEncoder(sparse=False)
onehot_train = onehot_encoder.fit_transform(integer_train)
onehot_test = onehot_encoder.transform(integer_test)

In [6]:
def reveseOneHot(prediction_output):
    integer_encode = onehot_encoder.inverse_transform(prediction_output).flatten().astype(int)
    return integer_encoder.inverse_transform(integer_encode)    

In [7]:
from itertools import chain

def extractVocabulary(titles, min_count):
    
    flatten_titles = list(chain(*titles))
    
    vocabulary, counts = np.unique(flatten_titles, return_counts=True)
    vocabulary = vocabulary[counts >= 2]
    
    return vocabulary

In [8]:
from gensim.sklearn_api import W2VTransformer

vectorizer = W2VTransformer(size=128, window=5, min_count=2, null_word=1, workers=-2, iter=500)
vectorizer.fit(X_train)
vocabulary = extractVocabulary(X_train, 2)

In [9]:
def mapTitle2Vectors(title, vectorizer, vocabulary, vectorSize, maxLength):
    
    vectorsMatrix = np.zeros((maxLength, vectorSize))
    for i, word in enumerate(title):
        if word in vocabulary:
            vectorsMatrix[i, :] = vectorizer.transform(word)
            
    return vectorsMatrix

In [10]:
from multiprocessing import Pool
from multiprocessing import cpu_count
from functools import partial

def mapTitles2Vectors(titles, vectorizer, vocabulary, vectorSize, maxLength):
    
    pool = Pool(cpu_count() - 1)
    
    nTitles = len(titles)
    
#     representations = [[] for i in range(nTitles)]
#     for i, title in tqdm(enumerate(titles)):
#         representations[i] = mapTitle2Vectors(title, vectorizer, vocabulary, vectorSize, maxLength)
    
    mapTitle2Vectors_partial = partial(mapTitle2Vectors, vectorizer=vectorizer, vocabulary=vocabulary, vectorSize=vectorSize, maxLength=maxLength)
    representations = pool.map(mapTitle2Vectors_partial, titles)
    
    return np.concatenate(list(map(lambda mat: mat[np.newaxis, :], representations)), axis=0)

In [11]:
maxTitleLength = max(map(len, X_train + X_test))
vectors_train = mapTitles2Vectors(X_train, vectorizer, vocabulary, 128, maxTitleLength)
vectors_test = mapTitles2Vectors(X_test, vectorizer, vocabulary, 128, maxTitleLength)

Process ForkPoolWorker-10:
Process ForkPoolWorker-22:
Process ForkPoolWorker-9:
Process ForkPoolWorker-7:
Process ForkPoolWorker-14:
Process ForkPoolWorker-13:
Process ForkPoolWorker-8:
Process ForkPoolWorker-12:
Process ForkPoolWorker-18:
Process ForkPoolWorker-6:
Process ForkPoolWorker-5:
Process ForkPoolWorker-4:
Process ForkPoolWorker-21:
Process ForkPoolWorker-17:
Process ForkPoolWorker-15:
Process ForkPoolWorker-20:
Process ForkPoolWorker-2:
Process ForkPoolWorker-3:
Process ForkPoolWorker-1:
Process ForkPoolWorker-19:
Process ForkPoolWorker-16:
Process ForkPoolWorker-11:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call la

  File "/anaconda3/envs/SpringBoard/lib/python3.7/multiprocessing/pool.py", line 110, in worker
    task = get()
  File "/anaconda3/envs/SpringBoard/lib/python3.7/multiprocessing/pool.py", line 110, in worker
    task = get()
  File "/anaconda3/envs/SpringBoard/lib/python3.7/multiprocessing/pool.py", line 110, in worker
    task = get()
  File "/anaconda3/envs/SpringBoard/lib/python3.7/multiprocessing/pool.py", line 110, in worker
    task = get()
  File "/anaconda3/envs/SpringBoard/lib/python3.7/multiprocessing/pool.py", line 110, in worker
    task = get()
  File "/anaconda3/envs/SpringBoard/lib/python3.7/multiprocessing/pool.py", line 110, in worker
    task = get()
  File "/anaconda3/envs/SpringBoard/lib/python3.7/multiprocessing/pool.py", line 110, in worker
    task = get()
  File "/anaconda3/envs/SpringBoard/lib/python3.7/multiprocessing/pool.py", line 110, in worker
    task = get()
  File "/anaconda3/envs/SpringBoard/lib/python3.7/multiprocessing/pool.py", line 110, in worker


# Recurrent neural network

In [19]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import SimpleRNN
from keras.callbacks import EarlyStopping
from time import time

recurrentLayer = SimpleRNN(256, activation="relu", input_shape=(maxTitleLength, 128))
fullyConnectedLayer1 = Dense(128, activation="relu")
fullyConnectedLayer2 = Dense(93, activation="softmax")

classifier = Sequential([recurrentLayer, fullyConnectedLayer1, fullyConnectedLayer2])
classifier.compile(loss='categorical_crossentropy', optimizer="adam", metrics=["acc"])

early_stopping = EarlyStopping(monitor='val_acc', mode="max", patience=50, min_delta=0.0001)
start_time = time()
history = classifier.fit(vectors_train,onehot_train, epochs=500, batch_size=1024*8, validation_split=0.1, verbose=1, callbacks=[early_stopping])
end_time = time()

print("Time elapsed during training: {}".format(end_time - start_time))
accuracy = model.evaluate(vectors_test, onehot_test, verbose=0)
print("Accuracy on held-out test set: {}".format(accuracy))

Train on 91020 samples, validate on 10114 samples
Epoch 1/500
16384/91020 [====>.........................] - ETA: 5:01 - loss: 4.5317 - acc: 0.0370

Process ForkPoolWorker-157:
Process ForkPoolWorker-167:
Process ForkPoolWorker-165:
Process ForkPoolWorker-168:
Process ForkPoolWorker-158:
Process ForkPoolWorker-159:
Process ForkPoolWorker-161:
Process ForkPoolWorker-155:
Process ForkPoolWorker-163:
Process ForkPoolWorker-156:
Process ForkPoolWorker-160:
Process ForkPoolWorker-164:
Process ForkPoolWorker-162:
Process ForkPoolWorker-171:
Process ForkPoolWorker-169:
Process ForkPoolWorker-166:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "/anaconda3/envs/SpringBoard/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
Traceback (most recent call last):
Traceback (most recent call last):
  File "/anaconda3/envs/SpringBoard/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
Traceback (most recent call last):
Traceback (most recent call last):
P

KeyboardInterrupt
  File "/anaconda3/envs/SpringBoard/lib/python3.7/multiprocessing/pool.py", line 110, in worker
    task = get()
  File "/anaconda3/envs/SpringBoard/lib/python3.7/multiprocessing/synchronize.py", line 95, in __enter__
    return self._semlock.__enter__()
  File "/anaconda3/envs/SpringBoard/lib/python3.7/multiprocessing/synchronize.py", line 95, in __enter__
    return self._semlock.__enter__()
  File "/anaconda3/envs/SpringBoard/lib/python3.7/multiprocessing/process.py", line 99, in run
    self._target(*self._args, **self._kwargs)
  File "/anaconda3/envs/SpringBoard/lib/python3.7/multiprocessing/synchronize.py", line 95, in __enter__
    return self._semlock.__enter__()
  File "/anaconda3/envs/SpringBoard/lib/python3.7/multiprocessing/pool.py", line 110, in worker
    task = get()
  File "/anaconda3/envs/SpringBoard/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes
    buf = self._recv(4)
KeyboardInterrupt
  File "/anaconda3/envs/SpringBoard/lib/

  File "/anaconda3/envs/SpringBoard/lib/python3.7/multiprocessing/process.py", line 99, in run
    self._target(*self._args, **self._kwargs)
  File "/anaconda3/envs/SpringBoard/lib/python3.7/multiprocessing/process.py", line 99, in run
    self._target(*self._args, **self._kwargs)
  File "/anaconda3/envs/SpringBoard/lib/python3.7/multiprocessing/process.py", line 99, in run
    self._target(*self._args, **self._kwargs)
  File "/anaconda3/envs/SpringBoard/lib/python3.7/multiprocessing/process.py", line 99, in run
    self._target(*self._args, **self._kwargs)
  File "/anaconda3/envs/SpringBoard/lib/python3.7/multiprocessing/process.py", line 99, in run
    self._target(*self._args, **self._kwargs)
  File "/anaconda3/envs/SpringBoard/lib/python3.7/multiprocessing/process.py", line 99, in run
    self._target(*self._args, **self._kwargs)
  File "/anaconda3/envs/SpringBoard/lib/python3.7/multiprocessing/process.py", line 99, in run
    self._target(*self._args, **self._kwargs)
  File "/anac

  File "/anaconda3/envs/SpringBoard/lib/python3.7/multiprocessing/synchronize.py", line 95, in __enter__
    return self._semlock.__enter__()
  File "/anaconda3/envs/SpringBoard/lib/python3.7/multiprocessing/synchronize.py", line 95, in __enter__
    return self._semlock.__enter__()
  File "/anaconda3/envs/SpringBoard/lib/python3.7/multiprocessing/synchronize.py", line 95, in __enter__
    return self._semlock.__enter__()
  File "/anaconda3/envs/SpringBoard/lib/python3.7/multiprocessing/synchronize.py", line 95, in __enter__
    return self._semlock.__enter__()
  File "/anaconda3/envs/SpringBoard/lib/python3.7/multiprocessing/synchronize.py", line 95, in __enter__
    return self._semlock.__enter__()
  File "/anaconda3/envs/SpringBoard/lib/python3.7/multiprocessing/pool.py", line 110, in worker
    task = get()
KeyboardInterrupt
  File "/anaconda3/envs/SpringBoard/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes
    buf = self._recv_bytes(maxlength)
  File "/anacon

KeyboardInterrupt: 