In [3]:
import numpy as np
import pandas as pd
import tensorflow as tf
from keras.utils import np_utils
from sklearn.feature_extraction.text import TfidfVectorizer

In [4]:
df_train = pd.read_json("./data/train_lem.json")
df_val = pd.read_json("./data/val_lem.json")
df_test = pd.read_json("./data/test_lem.json")

In [5]:
df_train = df_train.sample(frac=1)
df_val = df_val.sample(frac=1)
df_test = df_test.sample(frac=1)

In [4]:
file = open("./data/useless_words.txt")
useless_words = list(file.read().split(" "))

In [5]:
'''Szótár építés'''
tfidf = TfidfVectorizer(max_features=5000, stop_words=useless_words, use_idf=True)

In [11]:
'''Szövegek átalakítása TF-IDF vektorrá'''
x_train = tfidf.fit_transform(df_train["text"]).toarray()
y_train = df_train["subreddit_id"].values

x_val = tfidf.transform(df_val["text"]).toarray()
y_val = df_val["subreddit_id"].values

x_test = tfidf.transform(df_test["text"]).toarray()
y_test = df_test["subreddit_id"].values

In [12]:
ann_model = tf.keras.Sequential([
    tf.keras.layers.Dense(256, input_dim=5000, activation='relu'),
    tf.keras.layers.Dropout(0.3),
    tf.keras.layers.Dense(128),
    tf.keras.layers.Dense(units=102, activation='softmax')
]) 

ann_model.compile(optimizer='adam',
              loss="sparse_categorical_crossentropy",
              metrics=['accuracy'])

ann_model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_3 (Dense)              (None, 256)               1280256   
_________________________________________________________________
dropout_1 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_4 (Dense)              (None, 128)               32896     
_________________________________________________________________
dense_5 (Dense)              (None, 102)               13158     
Total params: 1,326,310
Trainable params: 1,326,310
Non-trainable params: 0
_________________________________________________________________


In [13]:
h = ann_model.fit(x_train, 
              y_train,
              epochs=3,
              batch_size=32,
              validation_data=(x_val, y_val))

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [14]:
results = ann_model.evaluate(x_test, y_test, batch_size=32)
print("test loss, test acc:", results)

test loss, test acc: [0.7518857717514038, 0.8396568894386292]


In [17]:
predictions = np.round(ann_model.predict(x_test[:100]))
for i in range(len(predictions)):
    print(df_test["text"].values[i][:100],"...")
    print("Pred: ", np.where(predictions[i]==1)[0], "Real: ", y_test[i])

The day I cried out in PPTP pain So I recently started a new job for an IT provider which is a consi ...
Pred:  [93] Real:  92
Beginner question: how do you keep track of all your sources when writing a thesis? Hi guys<lb><lb>N ...
Pred:  [3] Real:  3
Why does Bauer dress certain items as other lines? I've seen a few pro stock skates and sticks that  ...
Pred:  [65] Real:  65
Baby Leopard Gecko, New to reptiles, I have some questions I just purchased a fancy albino leopard g ...
Pred:  [70] Real:  70
Is it just me or are Heinz beans actually shit? The sauce is kinda runny and there's like a whole ce ...
Pred:  [] Real:  96
Sonarr no longer copying downloaded files. I run Sonarr on my seedbox, hosted by Seedhost.eu. It was ...
Pred:  [88] Real:  88
Nest thermostat/low battery I’ve had my Nest thermostat installed for over a year, but I recently ha ...
Pred:  [26] Real:  26
I have a few random questions about the S550s if anyone could help me out. I'm going to be in the ma ...
Pred:  [25

-------------------------------------------------------------------
RNN
---

In [None]:
#import wget
#url = "http://nlp.stanford.edu/data/glove.6B.zip"
#path="./data/"
#filename = wget.download(url, out=path)

In [None]:
#import zipfile
#with zipfile.ZipFile(filename, "r") as fzip:
#    fzip.extractall(path)

In [17]:
embeddings_index = {}
with open("./data/glove.6B.50d.txt", encoding='utf8') as f:
    for line in f:
        word, vector = line.split(maxsplit=1)
        vector = np.fromstring(vector, "f", sep=" ")
        embeddings_index[word] = vector

print("Found %s word vectors." % len(embeddings_index))

Found 400000 word vectors.


In [18]:
'''Összegyűjtjük a szótárunk szavaihoz tartozó szó vektorokat'''
num_tokens = len(tfidf.vocabulary_) + 2
embedding_dim = 50
hits = 0
misses = 0

embedding_matrix = np.zeros((num_tokens, embedding_dim))
for word, i in tfidf.vocabulary_.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
        hits += 1
    else:
        misses += 1

print("Converted %d words (%d misses)" % (hits, misses))

Converted 4903 words (97 misses)


In [19]:
def get_word_indices(x_old):
    x_new = []
    for row in x_old:
        e = enumerate(row)
        for i in e:
            result = []
            for idx, value in e:
                if value > 0.1:
                    result.append(idx)
            x_new.append(result)
    return x_new

In [11]:
'''A TF-IDF vektorból kiszedjük a 0.1-es érték feletti szavak indexét'''
x_train_rnn = get_word_indices(x_train)
x_val_rnn = get_word_indices(x_val)
x_test_rnn = get_word_indices(x_test)

In [12]:
maxlen = 64
x_train_rnn = tf.keras.preprocessing.sequence.pad_sequences(x_train_rnn, maxlen=maxlen)
x_val_rnn = tf.keras.preprocessing.sequence.pad_sequences(x_val_rnn, maxlen=maxlen)
x_test_rnn = tf.keras.preprocessing.sequence.pad_sequences(x_test_rnn, maxlen=maxlen)

In [13]:
rnn_model = tf.keras.Sequential([
    tf.keras.layers.Embedding(num_tokens, 
                              embedding_dim,                               
                              embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix),
                              trainable=True,
                              input_length=maxlen),
    tf.keras.layers.LSTM(units=64, activation='tanh'),
    tf.keras.layers.Dense(102, activation='softmax')
])

rnn_model.compile(optimizer='adam',
              loss="sparse_categorical_crossentropy",
              metrics=['accuracy'])

rnn_model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 64, 50)            250100    
_________________________________________________________________
lstm (LSTM)                  (None, 64)                29440     
_________________________________________________________________
dense (Dense)                (None, 102)               6630      
Total params: 286,170
Trainable params: 286,170
Non-trainable params: 0
_________________________________________________________________


In [14]:
h1 = rnn_model.fit(x_train_rnn, 
              y_train,
              epochs=5,
              batch_size=32,
              validation_data=(x_val_rnn, y_val))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [15]:
results = rnn_model.evaluate(x_test_rnn, y_test, batch_size=32)
print("test loss, test acc:", results)

test loss, test acc: [0.7834381461143494, 0.8105882406234741]


In [None]:
predictions = np.round(rnn_model.predict(x_test_rnn[:100]))
for i in range(len(predictions)):
    print(df_test["text"].values[i][:100],"...")
    print("Pred: ", np.where(predictions[i]==1)[0], "Real: ", y_test[i])