In [1]:
import ast
import collections
import numpy as np
from keras.datasets import imdb
from keras.models import Sequential, load_model, save_model
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
import pandas
import pickle
import os

Using TensorFlow backend.


In [2]:
# fix random seed for reproducibility
np.random.seed(7)
# load the dataset but only keep the top n words, zero the rest
top_words = 5000
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=top_words)

In [3]:
# truncate and pad input sequences
max_review_length = 500
X_train = sequence.pad_sequences(X_train, maxlen=max_review_length)
X_test = sequence.pad_sequences(X_test, maxlen=max_review_length)
# create the model

In [4]:
df = pandas.read_csv('words_all_no_repeats.csv')
allWords = df['main_text'].str.cat(sep=' ').split(' ')
tags = set()
for tag_list in df['tags']:
    tag_list_parsed = ast.literal_eval(tag_list.replace('/', ','))
    for tag in tag_list_parsed:
        tags.add(tag)
allTags = sorted(list(tags))
pickle.dump(allTags, open('models/tags.pkl', 'wb'))
print(allTags)
# allTags = list(set(df['tags'].str.cat(sep=' ').split(' ')))

['2-sat', 'binary search', 'bitmasks', 'brute force', 'chinese remainder theorem', 'combinatorics', 'constructive algorithms', 'data structures', 'dfs and similar', 'divide and conquer', 'dp', 'dsu', 'expression parsing', 'fft', 'flows', 'games', 'geometry', 'graph matchings', 'graphs', 'greedy', 'hashing', 'implementation', 'math', 'matrices', 'meet-in-the-middle', 'number theory', 'probabilities', 'schedules', 'shortest paths', 'sortings', 'special problem', 'string suffix structures', 'strings', 'ternary search', 'trees', 'two pointers']


In [5]:
def build_dataset(words):
    count = collections.Counter(words).most_common()
    # print(count)
    dictionary = dict()
    # Add words to dictionary based off of how common they are
    for word, _ in count:
        dictionary[word] = len(dictionary)
    reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    return dictionary, reverse_dictionary

vocabularyF, vocabularyR = build_dataset(allWords)
#print(vocabularyF)
vocab_size = len(vocabularyF)

In [6]:
text = df['main_text'].as_matrix()
# Let's try binary classification
models = []
scores = []
filename = 'models/model{}.h5'
for i, goal_tag in enumerate(allTags):
    if i < 17:
        continue
    print('{}. Training tag: {}'.format(i + 1, goal_tag))
    # Gets indices for each word in line, for each line in main_text
    data = [[vocabularyF.get(i, 0) for i in j.split(' ')] for j in text]
    labels = [1 if i == goal_tag else 0 for i in df['tags'].as_matrix()]
    train_data = data[:3000]
    y_train = np.asarray(labels[:3000])
    test_data = data[3000:]
    y_test = np.asarray(labels[3000:])
    max_review_length = 500
    X_train = sequence.pad_sequences(train_data, maxlen=max_review_length)
    X_test = sequence.pad_sequences(test_data, maxlen=max_review_length)

    embedding_vecor_length = 32
    model = Sequential()
    model.add(Embedding(vocab_size, embedding_vecor_length, input_length=max_review_length))
    model.add(LSTM(100))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='mean_squared_error', optimizer='adam', metrics=['accuracy'])
    print(model.summary())
    model.fit(X_train, y_train, epochs=1, batch_size=64)
    # Final evaluation of the model
    score = model.evaluate(X_test, y_test, verbose=0)
    print("Accuracy: %.2f%%" % (score[1]*100))
    save_model(model, filename.format(i))
    models.append(model)
    scores.append(score)

18. Training tag: graph matchings
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 500, 32)           664992    
_________________________________________________________________
lstm_1 (LSTM)                (None, 100)               53200     
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 101       
Total params: 718,293
Trainable params: 718,293
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/1
Accuracy: 100.00%
19. Training tag: graphs
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 500, 32)           664992    
_________________________________________________________________
lstm_2 (LSTM)                (None, 100)    

Epoch 1/1
Accuracy: 100.00%
28. Training tag: schedules
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_11 (Embedding)     (None, 500, 32)           664992    
_________________________________________________________________
lstm_11 (LSTM)               (None, 100)               53200     
_________________________________________________________________
dense_11 (Dense)             (None, 1)                 101       
Total params: 718,293
Trainable params: 718,293
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/1
Accuracy: 100.00%
29. Training tag: shortest paths
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_12 (Embedding)     (None, 500, 32)           664992    
_________________________________________________________________
lstm_12 (LSTM)

In [None]:
#  Testing on individual test cases
filename = 'models/model{}.h5'
from keras.models import load_model
if not models:
    for i, tag in enumerate(allTags):
        print('loading {}'.format(tag))
        models.append(load_model(filename.format(i)))

#Let's test on trial 3005
results = []
for i, model in enumerate(models):
    print(model.predict(data[3005]))
print(df['tags'].as_matrix()[3005])
    

loading 2-sat
loading binary search
loading bitmasks
loading brute force
loading chinese remainder theorem
loading combinatorics
loading constructive algorithms
loading data structures
loading dfs and similar
loading divide and conquer
loading dp
loading dsu
