In [2]:
import ast
import collections
import numpy as np
from keras.datasets import imdb
from keras.models import Sequential
from keras.layers import Dense

from keras.layers import LSTM
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
import pandas
import pickle
import os

Using TensorFlow backend.


In [3]:
# fix random seed for reproducibility
np.random.seed(7)
# load the dataset but only keep the top n words, zero the rest
top_words = 5000
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=top_words)

In [4]:
# truncate and pad input sequences
max_review_length = 500
X_train = sequence.pad_sequences(X_train, maxlen=max_review_length)
X_test = sequence.pad_sequences(X_test, maxlen=max_review_length)
# create the model

In [5]:
df = pandas.read_csv('words_all_no_repeats.csv')
allWords = df['main_text'].str.cat(sep=' ').split(' ')
tags = set()
for tag_list in df['tags']:
    tag_list_parsed = ast.literal_eval(tag_list.replace('/', ','))
    for tag in tag_list_parsed:
        tags.add(tag)
allTags = list(tags)
# allTags = list(set(df['tags'].str.cat(sep=' ').split(' ')))

In [6]:
def build_dataset(words):
    count = collections.Counter(words).most_common()
    # print(count)
    dictionary = dict()
    # Add words to dictionary based off of how common they are
    for word, _ in count:
        dictionary[word] = len(dictionary)
    reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    return dictionary, reverse_dictionary

vocabularyF, vocabularyR = build_dataset(allWords)
#print(vocabularyF)
vocab_size = len(vocabularyF)

In [7]:
text = df['main_text'].as_matrix()
df_tags = [ast.literal_eval(tag_list.replace('/', ',')) for tag_list in df['tags'].as_matrix()]
df_tags_numerical = [[allTags.index(tag) for tag in tag_list] for tag_list in df_tags]

In [8]:
print(allTags)


['hashing', 'geometry', 'flows', 'constructive algorithms', 'greedy', 'dfs and similar', 'expression parsing', 'trees', 'chinese remainder theorem', 'schedules', 'bitmasks', 'data structures', 'shortest paths', 'games', 'special problem', '2-sat', 'ternary search', 'implementation', 'dsu', 'probabilities', 'graph matchings', 'math', 'sortings', 'binary search', 'strings', 'matrices', 'two pointers', 'dp', 'divide and conquer', 'combinatorics', 'meet-in-the-middle', 'fft', 'brute force', 'graphs', 'string suffix structures', 'number theory']


In [None]:
import h5py
from keras.models import load_model
# Let's try binary classification

filename = "models/model{}.h5"
for i, goal_tag in enumerate(allTags):
    # Gets indices for each word in line, for each line in main_text
    print("{}.  Testing tag {}".format(i + 1, goal_tag))
    if os.path.isfile(filename.format(i)):
        print("  model exists! Skipping...")
        continue
    data = [[vocabularyF.get(i, 0) for i in j.split(' ')] for j in text]
    labels = [1 if goal_tag in tag_list else 0 for tag_list in df_tags]
    train_data = data[:3000]
    y_train = np.asarray(labels[:3000])
    test_data = data[3000:]
    y_test = np.asarray(labels[3000:])
    max_review_length = 500
    X_train = sequence.pad_sequences(train_data, maxlen=max_review_length)
    X_test = sequence.pad_sequences(test_data, maxlen=max_review_length)

    embedding_vecor_length = 32
    model = Sequential()
    model.add(Embedding(vocab_size, embedding_vecor_length, input_length=max_review_length))
    model.add(LSTM(100))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='mean_squared_error', optimizer='adam', metrics=['accuracy'])
    print(model.summary())
    model.fit(X_train, y_train, nb_epoch=1, batch_size=64)
    # Final evaluation of the model
    score = model.evaluate(X_test, y_test, verbose=0)
    print("Accuracy: %.2f%%" % (score[1]*100))
    model.save(filename.format(i))

Testing tag hashing
  model exists! Skipping...
Testing tag geometry
  model exists! Skipping...
Testing tag flows
  model exists! Skipping...
Testing tag constructive algorithms
  model exists! Skipping...
Testing tag greedy
  model exists! Skipping...
Testing tag dfs and similar
  model exists! Skipping...
Testing tag expression parsing
  model exists! Skipping...
Testing tag trees
  model exists! Skipping...
Testing tag chinese remainder theorem
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_12 (Embedding)     (None, 500, 32)           664992    
_________________________________________________________________
lstm_12 (LSTM)               (None, 100)               53200     
_________________________________________________________________
dense_12 (Dense)             (None, 1)                 101       
Total params: 718,293
Trainable params: 718,293
Non-trainable params: 0
_____________



Epoch 1/1

ImportError: `save_model` requires h5py.