In [1]:
import numpy as np
import pandas as pd
import os
import csv
from random import random, sample, seed

In [2]:
df = pd.read_csv('dataset1.csv')

In [3]:
df.head()

Unnamed: 0,id,title,hour,minute,dayofweek,dayofyear,score
0,9gx68l,"Reddit, how would you feel about a law that ba...",14,1,2,261,149070
1,9hef7a,"In a video game, if you come across an empty r...",7,17,4,263,83296
2,9icx7a,What is a website that everyone should know ab...,19,16,0,266,82665
3,9jlras,"What could the U.S.A. have spent $1,000,000,00...",6,17,5,271,74998
4,9fbka2,If a genie grants you the opportunity to ejacu...,16,47,3,255,69915


In [4]:
df.describe()

Unnamed: 0,hour,minute,dayofweek,dayofyear,score
count,16000.0,16000.0,16000.0,16000.0,16000.0
mean,12.42975,29.606687,2.94425,273.943562,758.115937
std,6.749771,17.3461,1.996939,17.597535,4513.724777
min,0.0,0.0,0.0,243.0,10.0
25%,7.0,15.0,1.0,259.0,13.0
50%,13.0,30.0,3.0,274.0,20.0
75%,18.0,45.0,5.0,290.0,57.0
max,23.0,59.0,6.0,304.0,149070.0


In [5]:
s_titles = df['title'].values.tolist()
s_hours = df['hour'].values.tolist()
s_minutes = df['minute'].values.tolist()
s_dayofweeks = df['dayofweek'].values.tolist()
s_dayofyears = df['dayofyear'].values.tolist()
s_is_top_submission = df['score'].values.tolist()
split_score = s_is_top_submission[np.int(len(s_is_top_submission)/2)]
for i in range(0,len(s_is_top_submission)):
  if s_is_top_submission[i] > split_score:
    s_is_top_submission[i] = 1
  else:
    s_is_top_submission[i] = 0

titles = np.array(s_titles)
hours = np.array(s_hours, dtype=int)
minutes = np.array(s_minutes, dtype=int)
dayofweeks = np.array(s_dayofweeks, dtype=int)
dayofyears = np.array(s_dayofyears, dtype=int)
is_top_submission = np.array(s_is_top_submission, dtype=int)

In [6]:
print(titles[0:2])
print(titles.shape)
print(hours[0:2])
print(minutes[0:2])
print(dayofweeks[0:2])
print(dayofyears[0:2])
print(is_top_submission[0:2])

[ 'Reddit, how would you feel about a law that bans radio stations from playing commercials with honking/beeping/siren noises in them?'
 'In a video game, if you come across an empty room with a health pack, extra ammo, and a save point, you know some serious shit is about to go down. What is the real-life equivalent of this?']
(16000,)
[14  7]
[ 1 17]
[2 4]
[261 263]
[1 1]


In [7]:
from keras.preprocessing import sequence
from keras.preprocessing.text import text_to_word_sequence, Tokenizer

max_features = 40000

word_tokenizer = Tokenizer(max_features)
word_tokenizer.fit_on_texts(titles)

print(str(word_tokenizer.word_counts)[0:100])
print(str(word_tokenizer.word_index)[0:100])
print(len(word_tokenizer.word_counts))   # true word count

Using TensorFlow backend.


OrderedDict([('reddit', 2146), ('how', 1788), ('would', 1999), ('you', 11726), ('feel', 298), ('abou
{'you': 1, 'what': 2, 'the': 3, 'to': 4, 'a': 5, 'your': 6, 'of': 7, 'is': 8, 'do': 9, 'that': 10, '
11816


In [8]:
titles_tf = word_tokenizer.texts_to_sequences(titles)
print(titles_tf[0])

[14, 19, 16, 1, 97, 32, 5, 394, 10, 3740, 1525, 2760, 48, 518, 4614, 29, 6298, 6299, 6300, 3741, 11, 88]


In [9]:
maxlen = 20
titles_tf = sequence.pad_sequences(titles_tf, maxlen=maxlen)
print(titles_tf[0])

[  16    1   97   32    5  394   10 3740 1525 2760   48  518 4614   29 6298
 6299 6300 3741   11   88]


In [10]:
embeddings_path = 'glove.6B.50d.txt'

In [11]:
embedding_vectors = {}

with open(embeddings_path, 'r') as f:
    for line in f:
        line_split = line.strip().split(" ")
        vec = np.array(line_split[1:], dtype=float)
        word = line_split[0]
        embedding_vectors[word] = vec
        
print(embedding_vectors['you'])

[ -1.09190000e-03   3.33240000e-01   3.57430000e-01  -5.40410000e-01
   8.20320000e-01  -4.93910000e-01  -3.25880000e-01   1.99720000e-03
  -2.38290000e-01   3.55540000e-01  -6.06550000e-01   9.89320000e-01
  -2.17860000e-01   1.12360000e-01   1.14940000e+00   7.32840000e-01
   5.11820000e-01   2.92870000e-01   2.83880000e-01  -1.35900000e+00
  -3.79510000e-01   5.09430000e-01   7.07100000e-01   6.29410000e-01
   1.05340000e+00  -2.17560000e+00  -1.32040000e+00   4.00010000e-01
   1.57410000e+00  -1.66000000e+00   3.77210000e+00   8.69490000e-01
  -8.04390000e-01   1.83900000e-01  -3.43320000e-01   1.07140000e-02
   2.39690000e-01   6.67480000e-02   7.01170000e-01  -7.37020000e-01
   2.08770000e-01   1.15640000e-01  -1.51900000e-01   8.59080000e-01
   2.26200000e-01   1.65190000e-01   3.63090000e-01  -4.56970000e-01
  -4.89690000e-02   1.13160000e+00]


In [12]:
weights_matrix = np.zeros((max_features + 1, 50))

for word, i in word_tokenizer.word_index.items():

    embedding_vector = embedding_vectors.get(word)
    if embedding_vector is not None and i <= max_features:
        weights_matrix[i] = embedding_vector

# index 0 vector should be all zeroes, index 1 vector should be the same one as above
print(weights_matrix[0:2,:])

[[  0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
    0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
    0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
    0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
    0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
    0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
    0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
    0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
    0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
    0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
    0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
    0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
    0.00000000e+00   0.00000000e+00]
 [ -1.09190000e-03   3.33240000e-01   3.57430000e-01  -5.40410000e-01
    8.20320000e-01  -4.93910000e-01  -3.25880000e-01 

In [13]:
dayofyears_tf = dayofyears
print(dayofyears_tf[0:10])

[261 263 266 271 255 300 274 258 271 295]


In [14]:
from keras.models import Input, Model
from keras.layers import Dense, Embedding, GlobalAveragePooling1D, concatenate, Activation
from keras.layers.core import Masking, Dropout, Reshape
from keras.layers.normalization import BatchNormalization

batch_size = 32
embedding_dims = 50
epochs = 20
dropout_rate = 0.2

In [15]:
titles_input = Input(shape=(maxlen,), name='titles_input')
titles_embedding = Embedding(max_features + 1, embedding_dims, weights=[weights_matrix])(titles_input)
titles_pooling = GlobalAveragePooling1D()(titles_embedding)
titles_dropout = Dropout(dropout_rate)(titles_pooling)

In [16]:
aux_output = Dense(1, activation='sigmoid', name='aux_out')(titles_dropout)

In [17]:
meta_embedding_dims = 64

hours_input = Input(shape=(1,), name='hours_input')
hours_embedding = Embedding(24, meta_embedding_dims)(hours_input)
hours_reshape = Reshape((meta_embedding_dims,))(hours_embedding)

dayofweeks_input = Input(shape=(1,), name='dayofweeks_input')
dayofweeks_embedding = Embedding(7, meta_embedding_dims)(dayofweeks_input)
dayofweeks_reshape = Reshape((meta_embedding_dims,))(dayofweeks_embedding)

minutes_input = Input(shape=(1,), name='minutes_input')
minutes_embedding = Embedding(60, meta_embedding_dims)(minutes_input)
minutes_reshape = Reshape((meta_embedding_dims,))(minutes_embedding)

dayofyears_input = Input(shape=(1,), name='dayofyears_input')
dayofyears_embedding = Embedding(366, meta_embedding_dims)(dayofyears_input)
dayofyears_reshape = Reshape((meta_embedding_dims,))(dayofyears_embedding)


In [18]:
merged = concatenate([titles_dropout, hours_reshape, dayofweeks_reshape, minutes_reshape, dayofyears_reshape])

hidden_1 = Dense(256, activation='relu')(merged)
hidden_1 = BatchNormalization()(hidden_1)

main_output = Dense(1, activation='sigmoid', name='main_out')(hidden_1)

In [19]:
model = Model(inputs=[titles_input,
                      hours_input,
                      dayofweeks_input,
                      minutes_input,
                      dayofyears_input], outputs=[main_output, aux_output])

model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'],
              loss_weights=[1, 0.2])

model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
titles_input (InputLayer)       (None, 20)           0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 20, 50)       2000050     titles_input[0][0]               
__________________________________________________________________________________________________
hours_input (InputLayer)        (None, 1)            0                                            
__________________________________________________________________________________________________
dayofweeks_input (InputLayer)   (None, 1)            0                                            
__________________________________________________________________________________________________
minutes_in

In [20]:
seed(123)
split = 0.2

# returns randomized indices with no repeats
idx = sample(range(titles_tf.shape[0]), titles_tf.shape[0])


In [21]:
titles_tf = titles_tf[idx, :]
hours = hours[idx]
dayofweeks = dayofweeks[idx]
minutes = minutes[idx]
dayofyears_tf = dayofyears_tf[idx]
is_top_submission = is_top_submission[idx]

In [22]:
print(1 - np.mean(is_top_submission[:(int(titles_tf.shape[0] * split))]))

0.4990625


In [23]:
from keras.callbacks import CSVLogger,EarlyStopping

csv_logger = CSVLogger('training.csv')

In [24]:
model.fit([titles_tf, hours, dayofweeks, minutes, dayofyears_tf], [is_top_submission, is_top_submission],
          batch_size=batch_size,
          epochs=epochs,
          validation_split=split, callbacks=[csv_logger])

Train on 403150 samples, validate on 100788 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7f5b3c69e6a0>

In [25]:
model.save_weights('weights.h5')

In [24]:
model.load_weights('weights.h5')

In [25]:
def encode_text(text, maxlen):
    encoded = word_tokenizer.texts_to_sequences([text])
    return sequence.pad_sequences(encoded, maxlen=maxlen)

In [26]:
input_text = "Which movie's plot would drastically change if you removed a letter from its title?"
encoded_text = encode_text(input_text, maxlen)
print(encoded_text)

[[   0    0    0    0    0    0    0   66  696   16 1734  157   23    1
  2141    5  884   48  374  340]]


In [27]:
input_hour = np.array([15])
input_minute = np.array([10])
input_dayofweek = np.array([1])
input_dayofyear = np.array([16 - 1])

model.predict([encoded_text, input_hour, input_dayofweek, input_minute, input_dayofyear])

[array([[ 0.00543976]], dtype=float32), array([[ 0.26231879]], dtype=float32)]

In [28]:
import nltk

# nltk.download()

# Input Text

In [98]:
input_text = "What is that one thing that immediately puts you off a person?"

tokens = nltk.word_tokenize(input_text)
pos = nltk.pos_tag(tokens)

print(pos)

word_list = input_text.split(" ")

[('What', 'WP'), ('is', 'VBZ'), ('that', 'DT'), ('one', 'CD'), ('thing', 'NN'), ('that', 'WDT'), ('immediately', 'RB'), ('puts', 'VBZ'), ('you', 'PRP'), ('off', 'RP'), ('a', 'DT'), ('person', 'NN'), ('?', '.')]


# Delete a word

In [107]:
def deleteWord(word_list):
    temp_list = word_list[:]
    final_text = ""
    max_score = int(0)
    for i in range(len(word_list)):
        temp_list = word_list[:]
        if(pos[i][1]=='JJ' or po s[i][1]=='JJR' or pos[i][1]=='JJS' or pos[i][1]=='RB' or pos[i][1]=='RBR' or pos[i][1]=='RBS'):
            if(i != 0 and i != len(word_list)-1 ): # probably not gonna remove first word and last word
                del temp_list[i]
                temp_text = " ".join(temp_list)
                encoded_text = encode_text(temp_text, maxlen)
                predict_score = model.predict([encoded_text, input_hour, input_dayofweek, input_minute, input_dayofyear])
                print(temp_list,predict_score[1][0])

                if (max_score - predict_score[1][0][0] < 0):
                    max_score = predict_score[1][0][0]
                    final_text = temp_text
    word_BeforeChange = " ".join(word_list)
    print("\nBefore Chanege : ",word_BeforeChange, "[", predict_score_B[0][0][0], "]")
    print("After Change : ",final_text, "[", max_score,"]")

In [109]:
deleteWord(word_list)

['What', 'is', 'that', 'one', 'thing', 'that', 'puts', 'you', 'off', 'a', 'person?'] [ 0.40537578]

Before Chanege :  What is that one thing that immediately puts you off a person? [ 0.556207 ]
After Change :  What is that one thing that puts you off a person? [ 0.405376 ]


In [108]:
from nltk.corpus import wordnet

def synonym(word):
    synonyms = []
    for syn in wordnet.synsets(word):
        for lm in syn.lemmas():
            synonyms.append(lm.name())
    return list(set(synonyms))

In [89]:
synonym('is')

['cost',
 'exist',
 'equal',
 'be',
 'embody',
 'follow',
 'make_up',
 'comprise',
 'personify',
 'live',
 'represent',
 'constitute']

In [90]:
import grammar_check

In [91]:
from grammarbot import GrammarBotClient
client = GrammarBotClient(api_key='9JMF2Y5') 

# Replace a word with a synonym word

In [105]:
def replaceWord(word_list):
    temp_list = word_list[:]
    final_text = ""
    max_score = int(0)
    # tool = grammar_check.LanguageTool('en-GB')
    for i in range(len(word_list)):
        syno = synonym(temp_list[i])
        possyn = nltk.pos_tag(syno)
    #     print(possyn)
        for j in range(len(syno)):
    #         if(pos[i][1] == possyn[j][1]):
                temp_list = word_list[:]
                temp_list[i] = syno[j]
                encoded_text = encode_text(temp_list, maxlen)
                predict_score = model.predict([encoded_text, input_hour, input_dayofweek, input_minute, input_dayofyear])
                temp_text = " ".join(temp_list)
                print(temp_text,predict_score[1][0])
    #             res = client.check(temp_text)
    #             matches = tool.check(temp_text)
    #             print("gramma check matches:",len(res.matches))

                if (max_score - predict_score[1][0][0] < 0):
                    max_score = predict_score[1][0][0]
                    temp_text = " ".join(temp_list)
                    final_text = temp_text

    word_BeforeChange = " ".join(word_list)
    print("\nBefore Change : ",word_BeforeChange, "[", predict_score_B[0][0][0],"]")
    print("After Change : ",final_text, "[", max_score,"]")

In [106]:
replaceWord(word_list)

What cost that one thing that immediately puts you off a person? [ 0.18989711]
What exist that one thing that immediately puts you off a person? [ 0.23831394]
What equal that one thing that immediately puts you off a person? [ 0.19243886]
What be that one thing that immediately puts you off a person? [ 0.29819027]
What embody that one thing that immediately puts you off a person? [ 0.20281577]
What follow that one thing that immediately puts you off a person? [ 0.229223]
What make_up that one thing that immediately puts you off a person? [ 0.20281577]
What comprise that one thing that immediately puts you off a person? [ 0.20281577]
What personify that one thing that immediately puts you off a person? [ 0.20281577]
What live that one thing that immediately puts you off a person? [ 0.15165661]
What represent that one thing that immediately puts you off a person? [ 0.12616616]
What constitute that one thing that immediately puts you off a person? [ 0.20281577]
What is that ace thing that

In [93]:
import nltk.collocations
import nltk.corpus
import collections

bgm    = nltk.collocations.BigramAssocMeasures()
finder = nltk.collocations.BigramCollocationFinder.from_words(nltk.corpus.brown.words())
scored = finder.score_ngrams( bgm.likelihood_ratio  )

# Group bigrams by first word in bigram.                                        
suffix_keys = collections.defaultdict(list)
for key, scores in scored:
   suffix_keys[key[1]].append((key[0], scores))

# Sort keyed bigrams by strongest association.                                  
for key in suffix_keys:
   suffix_keys[key].sort(key = lambda x: -x[1])

# Add a word

In [101]:
def addWord(word_list):
    temp_list = word_list[:]
    final_text = ""
    max_score = int(0)
    # tool = grammar_check.LanguageTool('en-GB')
    for i in range(len(word_list)):
        if(pos[i][1]=='NN' or pos[i][1]=='NNS' or pos[i][1]=='NNP' or pos[i][1]=='NNPS' 
           or pos[i][1]=='JJ' or pos[i][1]=='JJR' or pos[i][1]=='JJS' 
           or pos[i][1]=='VB' or pos[i][1]=='VBD' or pos[i][1]=='VBG' or pos[i][1]=='VBN'):
            if(pos[i-1][1]!='JJ' and pos[i-1][1]!='JJR' and pos[i-1][1]!='JJS' and pos[i-1][1]!='RB' and pos[i-1][1]!='RBR' and pos[i-1][1]!='RBS'):
                prefix_list = []
                for word in suffix_keys[temp_list[i]]:
                    prefix_list.append(word[0])
                pos_prefix = nltk.pos_tag(prefix_list)
                prefix_list_select = []
                for j in range(len(pos_prefix)):
                    if(pos_prefix[j][1]=='JJ' or pos_prefix[j][1]=='JJR' or pos_prefix[j][1]=='JJS' or pos_prefix[j][1]=='RB' or pos_prefix[j][1]=='RBR' or pos_prefix[j][1]=='RBS'):
                        prefix_list_select.append(pos_prefix[j])
                for prefix in prefix_list_select:
                    temp_list.insert(i,prefix[0])
                    encoded_text = encode_text(temp_list, maxlen)
                    predict_score = model.predict([encoded_text, input_hour, input_dayofweek, input_minute, input_dayofyear])
                    temp_text = " ".join(temp_list)
                    temp_list = word_list[:]
                    print(temp_text,predict_score[1][0])
                    if (max_score - predict_score[1][0][0] < 0):
                        max_score = predict_score[1][0][0]
                        final_text = temp_text

    word_BeforeChange = " ".join(word_list)
    print("\nBefore Change : ",word_BeforeChange, "[", predict_score_B[0][0][0],"]")
    print("After Change : ",final_text, "[", max_score,"]")


In [102]:
addWord(word_list)

What is that one same thing that immediately puts you off a person? [ 0.24807252]
What is that one important thing that immediately puts you off a person? [ 0.2317185]
What is that one only thing that immediately puts you off a person? [ 0.23635107]
What is that one whole thing that immediately puts you off a person? [ 0.21685261]
What is that one first thing that immediately puts you off a person? [ 0.20137316]
What is that one good thing that immediately puts you off a person? [ 0.21114784]
What is that one real thing that immediately puts you off a person? [ 0.19737232]
What is that one bad thing that immediately puts you off a person? [ 0.21253607]
What is that one such thing that immediately puts you off a person? [ 0.21849221]
What is that one terrible thing that immediately puts you off a person? [ 0.07866835]
What is that one easiest thing that immediately puts you off a person? [ 0.22514383]
What is that one little thing that immediately puts you off a person? [ 0.20934989]
Wh