In [34]:
# Credit to https://towardsdatascience.com/multi-class-text-classification-with-lstm-1590bee1bd17 for inspiration and code

In [3]:
# Standard Imports
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score

# NLP Imports
import re
import nltk
nltk.download("stopwords")
from nltk.corpus import stopwords
from nltk import word_tokenize
STOPWORDS = set(stopwords.words('english'))

# Keras Imports
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
from keras.callbacks import EarlyStopping
from keras.layers import Dropout
import tensorflow as tf


# Random Imports
import cufflinks
from IPython.core.interactiveshell import InteractiveShell
import plotly.figure_factory as ff
InteractiveShell.ast_node_interactivity = 'all'
from plotly.offline import iplot

# Google Colab import to bring in dataframes
import io



  import pandas.util.testing as tm


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [4]:
from google.colab import files
uploaded = files.upload()

Saving data_ai.csv to data_ai.csv
Saving data_ml.csv to data_ml.csv


In [5]:
data_ai = pd.read_csv(io.BytesIO(uploaded['data_ai.csv']))
data_ml= pd.read_csv(io.BytesIO(uploaded['data_ml.csv']))

In [6]:
data_ai.head()

Unnamed: 0,subreddit,title,selftext
0,artificial,Could AI ethics draw on non-Western philosophi...,
1,artificial,Realistic simulation of tearing meat and peeli...,
2,artificial,[R] Using Deep RL to Model Human Locomotion Co...,In the new paper [*Deep Reinforcement Learning...
3,artificial,Artificial Intelligence Easily Beats Human Fig...,
4,artificial,Foiling illicit cryptocurrency mining with art...,


In [7]:
data_ml.head()

Unnamed: 0,subreddit,title,selftext
0,MachineLearning,[R] Taming pretrained transformers for eXtreme...,New X-Transformer model from Amazon Research\n...
1,MachineLearning,[R] Taming pretrained transformers for eXtreme...,
2,MachineLearning,[D] Why can't I find papers from CVRP '20 / Be...,I am looking for a few of the winning papers f...
3,MachineLearning,[D] Help with bone semantic segmentation,"Hi, I'm Anibal and I'm a software developer.\n..."
4,MachineLearning,help with bone semantic segmentation,[removed]


In [8]:
data_ai.shape

(31299, 3)

In [9]:
data_ml.shape

(31299, 3)

In [10]:
df = data_ai.append(data_ml).reset_index()

In [11]:
df.drop(columns='index',inplace=True)

In [12]:
df

Unnamed: 0,subreddit,title,selftext
0,artificial,Could AI ethics draw on non-Western philosophi...,
1,artificial,Realistic simulation of tearing meat and peeli...,
2,artificial,[R] Using Deep RL to Model Human Locomotion Co...,In the new paper [*Deep Reinforcement Learning...
3,artificial,Artificial Intelligence Easily Beats Human Fig...,
4,artificial,Foiling illicit cryptocurrency mining with art...,
...,...,...,...
62593,MachineLearning,What are some things that you wish you knew be...,[removed]
62594,MachineLearning,[D] Does anyone created a formal database for ...,I'm looking for a database that has sufficient...
62595,MachineLearning,"[P] Demo of ""Arbitrary Style Transfer with Sty...",Hi MachineLearning\n\nI'll introduce awsome st...
62596,MachineLearning,[R] Triplet loss for image retrieval,"Hi, there!\n\n \nThis is an example of image ..."


# **Cleaning Function**

In [13]:
# TEXT CLEANING FUNCTION FOR EVERY POST IN BOTH SUBREDDITS

# These will be replaced by a space ' '
REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')

 # We will get rid of all these in the function below
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')

# We will get rid of all of the stopwords
STOPWORDS = set(stopwords.words('english'))


# Function to clean our texts
def clean_text(text):

    # Make all of the text lower case
    text = text.lower() 

    # Replace REPLACE_BY_SPACE_RE symbols with a space
    text = REPLACE_BY_SPACE_RE.sub(' ', text) # substitute the matched string in REPLACE_BY_SPACE_RE with space.
    
    # remove symbols which are in BAD_SYMBOLS_RE from text.
    text = BAD_SYMBOLS_RE.sub('', text) 
    
    text = re.sub(r'\d+', '', text) # This gets rid of the integers

    text = text.replace('x', '')

    # remove stopwords from text
    text = ' '.join(word for word in text.split() if word not in STOPWORDS) 

    return text

# Applying the clean_text function above to every title in df['title']
df['title'] = df['title'].apply(clean_text)

## **Preprocessing the data** 

In [14]:
df.head()

Unnamed: 0,subreddit,title,selftext
0,artificial,could ai ethics draw nonwestern philosophies h...,
1,artificial,realistic simulation tearing meat peeling chee...,
2,artificial,r using deep rl model human locomotion control...,In the new paper [*Deep Reinforcement Learning...
3,artificial,artificial intelligence easily beats human fig...,
4,artificial,foiling illicit cryptocurrency mining artifici...,


In [15]:
# The maximum number of words to be used. (most frequent)
MAX_NB_WORDS = 1_000

# Max number of words in each title.
# First 500 words in the title
MAX_SEQUENCE_LENGTH = 500

# This is the second argument in our embedding layer 
EMBEDDING_DIM = 100

# Keras Tokenizer turning each text in the corpus into either a sequence of integers or into a vector
# Instantiate the Tokenizer
tokenizer = Tokenizer(num_words=MAX_NB_WORDS, filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~', lower=True)

# Use the tokenizer on every document in our corpus
tokenizer.fit_on_texts(df['title'].values)

# Replaces the word with it's index
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 30041 unique tokens.


In [16]:
# Taking the texts in df['title'] and Tokenizing the list of texts
X = tokenizer.texts_to_sequences(df['title'].values)

# Keras pad sequence --> Make sequences the same size! Makes the shape the same 
X = pad_sequences(X, maxlen=MAX_SEQUENCE_LENGTH) 
print('Shape of data tensor:', X.shape)

Shape of data tensor: (62598, 500)


In [17]:
# Turn our classes into 0's and 1's
y = pd.get_dummies(df['subreddit']).values
print('Shape of label tensor:', y.shape)

Shape of label tensor: (62598, 2)


In [18]:
y

array([[0, 1],
       [0, 1],
       [0, 1],
       ...,
       [1, 0],
       [1, 0],
       [1, 0]], dtype=uint8)

In [19]:
# Split our data into training and testing
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.10, random_state = 42)
print('Train')
print(X_train.shape,y_train.shape) # training data
print('='*40)
print('Test')
print(X_test.shape,y_test.shape) # testing data

Train
(56338, 500) (56338, 2)
Test
(6260, 500) (6260, 2)


In [20]:
len(X)

62598

In [21]:

model = Sequential() #Instantiate the Sequential Model

model.add(Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=X.shape[1])) # Adding the embedding layer 1st
model.add(SpatialDropout1D(0.10)) 
model.add(LSTM(100, dropout=0.10, recurrent_dropout=0.10))
model.add(Dense(2, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

epochs = 5
batch_size = 128

history = model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size,validation_split=0.2,callbacks=[EarlyStopping(monitor='val_loss', patience=3, min_delta=0.001)])

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [22]:
# Evaluating our model on the Testing Data
accr = model.evaluate(X_test,y_test)
print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(accr[0],accr[1]))

Test set
  Loss: 0.417
  Accuracy: 0.815


In [33]:
new_post = ["Decision Trees: Understanding the Basis of Ensemble Methods"]
seq = tokenizer.texts_to_sequences(new_post)
padded = pad_sequences(seq, maxlen=MAX_SEQUENCE_LENGTH)
pred = model.predict(padded)
labels = ['MachineLearning', 'artifical']
print(pred, labels[np.argmax(pred)])

[[0.73572147 0.26427853]] MachineLearning


## **Scores**
**1st Run:**

- Dropout: .20
- Batchsize: 64
- Epochs: 5

- Epoch 5/5 Accuracy: 0.918    
- Test Set Accuracy: 0.822

**2nd Run:** 

- Dropout: .40   
- Batchsize: 1000   
- Epochs: 5

- Epoch 5/5 Accuracy: 0.883
- Test Set Accuracy: 0.827

**3rd Run:**

- Dropout .40
- Batchsize: 1000
- Epochs: 10
- LSTM: 433
- Epoch 5/5 Accuracy: 0.867
- Test Set Accuracy: 0.817

**4th Run:**

- Dropout .20
- Batchsize: 64
- Epochs: 5
- LSTM: 50
- Epoch 4/5 Accuracy:  0.899
- Test Set Accuracy: 0.811

**5th Run:**

- Dropout .60
- Batchsize: 64
- Epochs: 5
- LSTM: 100
- Epoch 4/5 Accuracy:  0.899
- Test Set Accuracy: 0.811

## **Citations**

Embedding:
- https://machinelearningmastery.com/use-word-embedding-layers-deep-learning-keras/
- https://stats.stackexchange.com/questions/270546/how-does-keras-embedding-layer-work

Keras Sequential Model:
- https://keras.io/guides/sequential_model/

Drop out:
- https://machinelearningmastery.com/dropout-for-regularizing-deep-neural-networks/
- https://machinelearningmastery.com/use-dropout-lstm-networks-time-series-forecasting/#:~:text=Long%20Short%2DTerm%20Memory%20

Softmax:
- https://medium.com/analytics-vidhya/softmax-classifier-using-tensorflow-on-mnist-dataset-with-sample-code-6538d0783b84
- https://medium.com/data-science-bootcamp/understand-the-softmax-function-in-minutes-f3a59641e86d

LSTM:
- https://towardsdatascience.com/choosing-the-right-hyperparameters-for-a-simple-lstm-using-keras-f8e9ed76f046
- https://towardsdatascience.com/multi-class-text-classification-with-lstm-1590bee1bd17 