In [5]:
import ast
import collections
import numpy as np
from keras.datasets import imdb
from keras.models import Sequential, load_model, save_model
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
import pandas
import pickle
import os

In [6]:
# fix random seed for reproducibility
np.random.seed(7)
# load the dataset but only keep the top n words, zero the rest
top_words = 5000
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=top_words)

In [7]:
# truncate and pad input sequences
max_review_length = 500
X_train = sequence.pad_sequences(X_train, maxlen=max_review_length)
X_test = sequence.pad_sequences(X_test, maxlen=max_review_length)
# create the model

In [8]:
df = pandas.read_csv('words_all_no_repeats.csv')
allWords = df['main_text'].str.cat(sep=' ').split(' ')
tags = set()
for tag_list in df['tags']:
    tag_list_parsed = ast.literal_eval(tag_list.replace('/', ','))
    for tag in tag_list_parsed:
        tags.add(tag)
allTags = sorted(list(tags))
pickle.dump(allTags, open('models/tags.pkl', 'wb'))
print(allTags)
# allTags = list(set(df['tags'].str.cat(sep=' ').split(' ')))

['2-sat', 'binary search', 'bitmasks', 'brute force', 'chinese remainder theorem', 'combinatorics', 'constructive algorithms', 'data structures', 'dfs and similar', 'divide and conquer', 'dp', 'dsu', 'expression parsing', 'fft', 'flows', 'games', 'geometry', 'graph matchings', 'graphs', 'greedy', 'hashing', 'implementation', 'math', 'matrices', 'meet-in-the-middle', 'number theory', 'probabilities', 'schedules', 'shortest paths', 'sortings', 'special problem', 'string suffix structures', 'strings', 'ternary search', 'trees', 'two pointers']


In [9]:
def build_dataset(words):
    count = collections.Counter(words).most_common()
    # print(count)
    dictionary = dict()
    # Add words to dictionary based off of how common they are
    for word, _ in count:
        dictionary[word] = len(dictionary)
    reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    return dictionary, reverse_dictionary

vocabularyF, vocabularyR = build_dataset(allWords)
#print(vocabularyF)
vocab_size = len(vocabularyF)

In [None]:
text = df['main_text'].as_matrix()
# Let's try binary classification
models = []
scores = []
filename = 'models/model{}.h5'
for i, goal_tag in enumerate(allTags):

    print('{}. Training tag: {}'.format(i + 1, goal_tag))
    # Gets indices for each word in line, for each line in main_text
    data = [[vocabularyF.get(i, 0) for i in j.split(' ')] for j in text]
    labels = [1 if goal_tag in j else 0 for j in df['tags'].as_matrix()]
    if 1 in labels:
        print('Label exists')
    else:
        print('no data')
    train_data = data[:3000]
    y_train = np.asarray(labels[:3000])
    test_data = data[3000:]
    y_test = np.asarray(labels[3000:])
    max_review_length = 500
    X_train = sequence.pad_sequences(train_data, maxlen=max_review_length)
    X_test = sequence.pad_sequences(test_data, maxlen=max_review_length)

    embedding_vecor_length = 32
    model = Sequential()
    model.add(Embedding(vocab_size, embedding_vecor_length, input_length=max_review_length))
    model.add(LSTM(100))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='mean_squared_error', optimizer='adam', metrics=['accuracy'])
    print(model.summary())
    model.fit(X_train, y_train, epochs=1, batch_size=64)
    # Final evaluation of the model
    score = model.evaluate(X_test, y_test, verbose=0)
    print("Accuracy: %.2f%%" % (score[1]*100))
    save_model(model, filename.format(i))
    models.append(model)
    scores.append(score)

1. Training tag: 2-sat
Label exists
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_12 (Embedding)     (None, 500, 32)           664992    
_________________________________________________________________
lstm_12 (LSTM)               (None, 100)               53200     
_________________________________________________________________
dense_12 (Dense)             (None, 1)                 101       
Total params: 718,293
Trainable params: 718,293
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/1
Accuracy: 99.62%
2. Training tag: binary search
Label exists
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_13 (Embedding)     (None, 500, 32)           664992    
_________________________________________________________________
lstm_13 (LSTM)          

In [7]:
#  Testing on individual test cases
filename = 'models/model{}.h5'
from keras.models import load_model
if not models:
    for i, tag in enumerate(allTags):
        print('loading {}'.format(tag))
        models.append(load_model(filename.format(i)))

loading 2-sat
loading binary search
loading bitmasks
loading brute force
loading chinese remainder theorem
loading combinatorics
loading constructive algorithms
loading data structures
loading dfs and similar
loading divide and conquer
loading dp
loading dsu
loading expression parsing
loading fft
loading flows
loading games
loading geometry
loading graph matchings
loading graphs
loading greedy
loading hashing
loading implementation
loading math
loading matrices
loading meet-in-the-middle
loading number theory
loading probabilities
loading schedules
loading shortest paths
loading sortings
loading special problem
loading string suffix structures
loading strings
loading ternary search
loading trees
loading two pointers


ValueError: Error when checking : expected embedding_5_input to have shape (None, 500) but got array with shape (177, 1)

In [18]:
#Let's test on trial 3005
test_num = 3100
results = []
text = df['main_text'].as_matrix()
data = [[vocabularyF.get(i, 0) for i in j.split(' ')] for j in text]
train_data = [data[test_num]]
X_train = sequence.pad_sequences(train_data, maxlen=500)
for i, model in enumerate(models):
    result = model.predict(X_train)[0]
    print('Is {}?  {}\t{}'.format(allTags[i], 'yes' if result > 1 - result else 'no', result))
print(df['tags'][test_num])

Is 2-sat?  no	[ 0.00031093]
Is binary search?  no	[ 0.00021245]
Is bitmasks?  no	[  6.71432572e-05]
Is brute force?  no	[ 0.00027262]
Is chinese remainder theorem?  no	[  6.36333207e-05]
Is combinatorics?  no	[  7.59445829e-05]
Is constructive algorithms?  no	[ 0.00021703]
Is data structures?  no	[ 0.00039493]
Is dfs and similar?  no	[ 0.00038267]
Is divide and conquer?  no	[ 0.00036098]
Is dp?  no	[ 0.00013596]
Is dsu?  no	[ 0.0004459]
Is expression parsing?  no	[ 0.0008421]
Is fft?  no	[ 0.0005758]
Is flows?  no	[ 0.00026128]
Is games?  no	[ 0.00043417]
Is geometry?  no	[ 0.00015261]
Is graph matchings?  no	[ 0.00023076]
Is graphs?  no	[ 0.00055019]
Is greedy?  no	[ 0.00033246]
Is hashing?  no	[ 0.00052827]
Is implementation?  no	[ 0.00054067]
Is math?  no	[ 0.0001674]
Is matrices?  no	[  6.65249172e-05]
Is meet-in-the-middle?  no	[ 0.00022687]
Is number theory?  no	[ 0.00043508]
Is probabilities?  no	[ 0.00103406]
Is schedules?  no	[ 0.0003296]
Is shortest paths?  no	[ 0.00032429]
I