In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from keras.utils import np_utils
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
df_train = pd.read_json("./data/train_lem.json")
df_val = pd.read_json("./data/val_lem.json")
df_test = pd.read_json("./data/test_lem.json")

In [3]:
df_train = df_train.sample(frac=1)
df_val = df_val.sample(frac=1)
df_test = df_test.sample(frac=1)

In [4]:
file = open("./data/useless_words.txt")
useless_words = list(file.read().split(" "))

In [5]:
tfidf = TfidfVectorizer(max_features=5000, stop_words=useless_words)

In [6]:
x_train = tfidf.fit_transform(df_train["text"]).toarray()
y_train = np_utils.to_categorical(df_train["subreddit_id"])

x_val = tfidf.transform(df_val["text"]).toarray()
y_val = np_utils.to_categorical(df_val["subreddit_id"])

x_test = tfidf.transform(df_test["text"]).toarray()
y_test = np_utils.to_categorical(df_test["subreddit_id"])

In [7]:
model = tf.keras.Sequential([
    tf.keras.layers.Dense(256, input_dim=5000, activation='relu'),
    tf.keras.layers.Dropout(0.3),
    tf.keras.layers.Dense(102),
    tf.keras.layers.Dense(units=102, activation='softmax')
]) 

model.compile(optimizer='adam',
              loss="categorical_crossentropy",
              metrics=['accuracy'])

model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 256)               1280256   
_________________________________________________________________
dropout (Dropout)            (None, 256)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 102)               26214     
_________________________________________________________________
dense_2 (Dense)              (None, 102)               10506     
Total params: 1,316,976
Trainable params: 1,316,976
Non-trainable params: 0
_________________________________________________________________


In [8]:
h = model.fit(x_train, 
              y_train,
              epochs=3,
              batch_size=32,
              validation_data=(x_val, y_val))

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [9]:
results = model.evaluate(x_test, y_test, batch_size=32)
print("test loss, test acc:", results)

test loss, test acc: [0.7454023957252502, 0.8389706015586853]


In [10]:
predictions = np.round(model.predict(x_test[:100]))
for i in range(len(predictions)):
    print(df_test["text"].values[i][:100],"...")
    print("Pred: ", predictions[i], "Real: ", y_test[i])

ars ago and crushed my soul, so I decid ...
Pred:  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.
 0. 0. 0. 0. 0. 0.] Real:  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.
 0. 0. 0. 0. 0. 0.]
License Plate Light Replacement 2004 RX330 Hey Everyone,<lb><lb>The other day I noticed that one of  ...
Pred:  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.