In [1]:
import numpy as np
import pandas as pd
import os
import csv
from random import random, sample, seed

In [2]:
df = pd.read_csv('dataset1.csv')

In [3]:
df.head()

Unnamed: 0,id,title,hour,minute,dayofweek,dayofyear,score
0,9gx68l,"Reddit, how would you feel about a law that ba...",14,1,2,261,149070
1,9hef7a,"In a video game, if you come across an empty r...",7,17,4,263,83296
2,9icx7a,What is a website that everyone should know ab...,19,16,0,266,82665
3,9jlras,"What could the U.S.A. have spent $1,000,000,00...",6,17,5,271,74998
4,9fbka2,If a genie grants you the opportunity to ejacu...,16,47,3,255,69915


In [4]:
df.describe()

Unnamed: 0,hour,minute,dayofweek,dayofyear,score
count,503938.0,503938.0,503938.0,503938.0,503938.0
mean,12.78112,29.562274,2.911717,274.629046,25.797312
std,6.602338,17.317721,1.969925,17.578185,815.115481
min,0.0,0.0,0.0,243.0,0.0
25%,8.0,15.0,1.0,260.0,1.0
50%,13.0,30.0,3.0,275.0,1.0
75%,18.0,45.0,5.0,290.0,3.0
max,23.0,59.0,6.0,304.0,149070.0


In [5]:
titles = df['title'].values.tolist()
hours = df['hour'].values.tolist()
minutes = df['minute'].values.tolist()
dayofweeks = df['dayofweek'].values.tolist()
dayofyears = df['dayofyear'].values.tolist()
is_top_submission = df['score'].values.tolist()
split_score = is_top_submission[np.int(len(is_top_submission)/2)]
for i in range(0,len(is_top_submission)):
  if is_top_submission[i] > split_score:
    is_top_submission[i] = 1
  else:
    is_top_submission[i] = 0

titles = np.array(titles)
hours = np.array(hours, dtype=int)
minutes = np.array(minutes, dtype=int)
dayofweeks = np.array(dayofweeks, dtype=int)
dayofyears = np.array(dayofyears, dtype=int)
is_top_submission = np.array(is_top_submission, dtype=int)

In [6]:
print(titles[0:2])
print(titles.shape)
print(hours[0:2])
print(minutes[0:2])
print(dayofweeks[0:2])
print(dayofyears[0:2])
print(is_top_submission[0:2])

['Reddit, how would you feel about a law that bans radio stations from playing commercials with honking/beeping/siren noises in them?'
 'In a video game, if you come across an empty room with a health pack, extra ammo, and a save point, you know some serious shit is about to go down. What is the real-life equivalent of this?']
(503938,)
[14  7]
[ 1 17]
[2 4]
[261 263]
[1 1]


In [7]:
from keras.preprocessing import sequence
from keras.preprocessing.text import text_to_word_sequence, Tokenizer

max_features = 40000

word_tokenizer = Tokenizer(max_features)
word_tokenizer.fit_on_texts(titles)

print(str(word_tokenizer.word_counts)[0:100])
print(str(word_tokenizer.word_index)[0:100])
print(len(word_tokenizer.word_counts))   # true word count

Using TensorFlow backend.


OrderedDict([('reddit', 59327), ('how', 68884), ('would', 58961), ('you', 320628), ('feel', 9795), (
{'icles': 76310, 'expereiced': 66467, 'applies': 6525, 'sourced': 17409, 'molton': 59417, 'pp£d': 56
78801


In [8]:
titles_tf = word_tokenizer.texts_to_sequences(titles)
print(titles_tf[0])

[18, 15, 19, 1, 95, 33, 5, 472, 12, 4055, 1749, 4425, 46, 621, 2745, 25, 11387, 10732, 11388, 5041, 11, 87]


In [9]:
maxlen = 20
titles_tf = sequence.pad_sequences(titles_tf, maxlen=maxlen)
print(titles_tf[0])

[   19     1    95    33     5   472    12  4055  1749  4425    46   621
  2745    25 11387 10732 11388  5041    11    87]


In [10]:
embeddings_path = 'glove.6B.50d.txt'

In [11]:
embedding_vectors = {}

with open(embeddings_path, 'r') as f:
    for line in f:
        line_split = line.strip().split(" ")
        vec = np.array(line_split[1:], dtype=float)
        word = line_split[0]
        embedding_vectors[word] = vec
        
print(embedding_vectors['you'])

[-1.0919e-03  3.3324e-01  3.5743e-01 -5.4041e-01  8.2032e-01 -4.9391e-01
 -3.2588e-01  1.9972e-03 -2.3829e-01  3.5554e-01 -6.0655e-01  9.8932e-01
 -2.1786e-01  1.1236e-01  1.1494e+00  7.3284e-01  5.1182e-01  2.9287e-01
  2.8388e-01 -1.3590e+00 -3.7951e-01  5.0943e-01  7.0710e-01  6.2941e-01
  1.0534e+00 -2.1756e+00 -1.3204e+00  4.0001e-01  1.5741e+00 -1.6600e+00
  3.7721e+00  8.6949e-01 -8.0439e-01  1.8390e-01 -3.4332e-01  1.0714e-02
  2.3969e-01  6.6748e-02  7.0117e-01 -7.3702e-01  2.0877e-01  1.1564e-01
 -1.5190e-01  8.5908e-01  2.2620e-01  1.6519e-01  3.6309e-01 -4.5697e-01
 -4.8969e-02  1.1316e+00]


In [12]:
weights_matrix = np.zeros((max_features + 1, 50))

for word, i in word_tokenizer.word_index.items():

    embedding_vector = embedding_vectors.get(word)
    if embedding_vector is not None and i <= max_features:
        weights_matrix[i] = embedding_vector

# index 0 vector should be all zeroes, index 1 vector should be the same one as above
print(weights_matrix[0:2,:])

[[ 0.0000e+00  0.0000e+00  0.0000e+00  0.0000e+00  0.0000e+00  0.0000e+00
   0.0000e+00  0.0000e+00  0.0000e+00  0.0000e+00  0.0000e+00  0.0000e+00
   0.0000e+00  0.0000e+00  0.0000e+00  0.0000e+00  0.0000e+00  0.0000e+00
   0.0000e+00  0.0000e+00  0.0000e+00  0.0000e+00  0.0000e+00  0.0000e+00
   0.0000e+00  0.0000e+00  0.0000e+00  0.0000e+00  0.0000e+00  0.0000e+00
   0.0000e+00  0.0000e+00  0.0000e+00  0.0000e+00  0.0000e+00  0.0000e+00
   0.0000e+00  0.0000e+00  0.0000e+00  0.0000e+00  0.0000e+00  0.0000e+00
   0.0000e+00  0.0000e+00  0.0000e+00  0.0000e+00  0.0000e+00  0.0000e+00
   0.0000e+00  0.0000e+00]
 [-1.0919e-03  3.3324e-01  3.5743e-01 -5.4041e-01  8.2032e-01 -4.9391e-01
  -3.2588e-01  1.9972e-03 -2.3829e-01  3.5554e-01 -6.0655e-01  9.8932e-01
  -2.1786e-01  1.1236e-01  1.1494e+00  7.3284e-01  5.1182e-01  2.9287e-01
   2.8388e-01 -1.3590e+00 -3.7951e-01  5.0943e-01  7.0710e-01  6.2941e-01
   1.0534e+00 -2.1756e+00 -1.3204e+00  4.0001e-01  1.5741e+00 -1.6600e+00
   3.7721e+

In [13]:
dayofyears_tf = dayofyears
print(dayofyears_tf[0:10])

[261 263 266 271 255 300 274 258 271 295]


In [14]:
from keras.models import Input, Model
from keras.layers import Dense, Embedding, GlobalAveragePooling1D, concatenate, Activation
from keras.layers.core import Masking, Dropout, Reshape
from keras.layers.normalization import BatchNormalization

batch_size = 32
embedding_dims = 50
epochs = 20
dropout_rate = 0.2

In [15]:
titles_input = Input(shape=(maxlen,), name='titles_input')
titles_embedding = Embedding(max_features + 1, embedding_dims, weights=[weights_matrix])(titles_input)
titles_pooling = GlobalAveragePooling1D()(titles_embedding)
titles_dropout = Dropout(dropout_rate)(titles_pooling)

In [16]:
aux_output = Dense(1, activation='sigmoid', name='aux_out')(titles_dropout)

In [17]:
meta_embedding_dims = 64

hours_input = Input(shape=(1,), name='hours_input')
hours_embedding = Embedding(24, meta_embedding_dims)(hours_input)
hours_reshape = Reshape((meta_embedding_dims,))(hours_embedding)

dayofweeks_input = Input(shape=(1,), name='dayofweeks_input')
dayofweeks_embedding = Embedding(7, meta_embedding_dims)(dayofweeks_input)
dayofweeks_reshape = Reshape((meta_embedding_dims,))(dayofweeks_embedding)

minutes_input = Input(shape=(1,), name='minutes_input')
minutes_embedding = Embedding(60, meta_embedding_dims)(minutes_input)
minutes_reshape = Reshape((meta_embedding_dims,))(minutes_embedding)

dayofyears_input = Input(shape=(1,), name='dayofyears_input')
dayofyears_embedding = Embedding(366, meta_embedding_dims)(dayofyears_input)
dayofyears_reshape = Reshape((meta_embedding_dims,))(dayofyears_embedding)


In [18]:
merged = concatenate([titles_dropout, hours_reshape, dayofweeks_reshape, minutes_reshape, dayofyears_reshape])

hidden_1 = Dense(256, activation='relu')(merged)
hidden_1 = BatchNormalization()(hidden_1)

main_output = Dense(1, activation='sigmoid', name='main_out')(hidden_1)

In [19]:
model = Model(inputs=[titles_input,
                      hours_input,
                      dayofweeks_input,
                      minutes_input,
                      dayofyears_input], outputs=[main_output, aux_output])

model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'],
              loss_weights=[1, 0.2])

model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
titles_input (InputLayer)       (None, 20)           0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 20, 50)       2000050     titles_input[0][0]               
__________________________________________________________________________________________________
hours_input (InputLayer)        (None, 1)            0                                            
__________________________________________________________________________________________________
dayofweeks_input (InputLayer)   (None, 1)            0                                            
__________________________________________________________________________________________________
minutes_in

In [20]:
seed(123)
split = 0.2

# returns randomized indices with no repeats
idx = sample(range(titles_tf.shape[0]), titles_tf.shape[0])


In [21]:
titles_tf = titles_tf[idx, :]
hours = hours[idx]
dayofweeks = dayofweeks[idx]
minutes = minutes[idx]
dayofyears_tf = dayofyears_tf[idx]
is_top_submission = is_top_submission[idx]

In [22]:
print(1 - np.mean(is_top_submission[:(int(titles_tf.shape[0] * split))]))

0.5835871689801264


In [23]:
from keras.callbacks import CSVLogger,EarlyStopping

csv_logger = CSVLogger('training.csv')

In [24]:
model.fit([titles_tf, hours, dayofweeks, minutes, dayofyears_tf], [is_top_submission, is_top_submission],
          batch_size=batch_size,
          epochs=epochs,
          validation_split=split, callbacks=[csv_logger])

Train on 403150 samples, validate on 100788 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7f5b3c69e6a0>

In [25]:
model.save_weights('weights.h5')

In [24]:
model.load_weights('weights.h5')

In [25]:
def encode_text(text, maxlen):
    encoded = word_tokenizer.texts_to_sequences([text])
    return sequence.pad_sequences(encoded, maxlen=maxlen)

In [26]:
input_text = "Which movie's plot would drastically change if you removed a letter from its title?"
encoded_text = encode_text(input_text, maxlen)
print(encoded_text)

[[   0    0    0    0    0    0   69 8557 1049   19 2649  137   22    1
  1697    5 1459   46  291  729]]


In [27]:
input_hour = np.array([15])
input_minute = np.array([10])
input_dayofweek = np.array([1])
input_dayofyear = np.array([16 - 1])

model.predict([encoded_text, input_hour, input_dayofweek, input_minute, input_dayofyear])

[array([[0.58657765]], dtype=float32), array([[0.5020961]], dtype=float32)]

In [28]:
input_text = "What is perfectly legal but creepy as hell?"
encoded_text = encode_text(input_text, maxlen)
model.predict([encoded_text, input_hour, input_dayofweek, input_minute, input_dayofyear])

[array([[0.60986346]], dtype=float32), array([[0.61377]], dtype=float32)]

In [29]:
import copy
word_list = input_text.split(" ")
final_text = ""
max_score = int(0)
for i in range(len(word_list)):
    temp_list = word_list[:]
    if(i != 0 and i != len(word_list)-1 ): # probably not gonna remove first word and last word
        del temp_list[i]
        temp_text = " ".join(temp_list)
        encoded_text = encode_text(temp_text, maxlen)
        predict_score = model.predict([encoded_text, input_hour, input_dayofweek, input_minute, input_dayofyear])
        print(temp_list,predict_score[1][0])
       
        if (max_score - predict_score[1][0][0] < 0):
            max_score = predict_score[1][0][0]
            final_text = temp_text
print("After Change : ",final_text, "[", max_score,"]")

['What', 'perfectly', 'legal', 'but', 'creepy', 'as', 'hell?'] [0.6107462]
['What', 'is', 'legal', 'but', 'creepy', 'as', 'hell?'] [0.49745703]
['What', 'is', 'perfectly', 'but', 'creepy', 'as', 'hell?'] [0.622355]
['What', 'is', 'perfectly', 'legal', 'creepy', 'as', 'hell?'] [0.57864136]
['What', 'is', 'perfectly', 'legal', 'but', 'as', 'hell?'] [0.5981729]
['What', 'is', 'perfectly', 'legal', 'but', 'creepy', 'hell?'] [0.61045]
After Change :  What is perfectly but creepy as hell? [ 0.622355 ]


In [43]:
from nltk.corpus import wordnet

def synonym(word):
    synonyms = []
    for syn in wordnet.synsets(word):
        for lm in syn.lemmas():
            synonyms.append(lm.name())
    return list(set(synonyms))

In [44]:
synonym('perfectly')

['perfectly', 'absolutely', 'utterly', 'dead']

In [45]:
final_text = ""
max_score = int(0)
for i in range(len(word_list)):
    temp_list = word_list[:]
    syno = synonym(temp_list[i])
    for j in range(len(syno)):
        temp_list[i] = syno[j]
        encoded_text = encode_text(temp_list, maxlen)
        predict_score = model.predict([encoded_text, input_hour, input_dayofweek, input_minute, input_dayofyear])
        print(temp_list,predict_score[1][0])
        
        if (max_score - predict_score[1][0][0] < 0):
            max_score = predict_score[1][0][0]
            final_text = temp_list
print("After Change : ",final_text, "[", max_score,"]")

['What', 'equal', 'perfectly', 'legal', 'but', 'creepy', 'as', 'hell?'] [0.57594705]
['What', 'make_up', 'perfectly', 'legal', 'but', 'creepy', 'as', 'hell?'] [0.6111701]
['What', 'embody', 'perfectly', 'legal', 'but', 'creepy', 'as', 'hell?'] [0.6567825]
['What', 'cost', 'perfectly', 'legal', 'but', 'creepy', 'as', 'hell?'] [0.53797853]
['What', 'be', 'perfectly', 'legal', 'but', 'creepy', 'as', 'hell?'] [0.6171254]
['What', 'live', 'perfectly', 'legal', 'but', 'creepy', 'as', 'hell?'] [0.61468846]
['What', 'exist', 'perfectly', 'legal', 'but', 'creepy', 'as', 'hell?'] [0.6437134]
['What', 'follow', 'perfectly', 'legal', 'but', 'creepy', 'as', 'hell?'] [0.6050839]
['What', 'personify', 'perfectly', 'legal', 'but', 'creepy', 'as', 'hell?'] [0.6348581]
['What', 'constitute', 'perfectly', 'legal', 'but', 'creepy', 'as', 'hell?'] [0.6250751]
['What', 'represent', 'perfectly', 'legal', 'but', 'creepy', 'as', 'hell?'] [0.62024385]
['What', 'comprise', 'perfectly', 'legal', 'but', 'creepy', 