In [0]:
import string, os, numpy as np, pandas as pd

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, LSTM, Dropout, Embedding
from keras.models import Sequential
from keras.callbacks import EarlyStopping
import keras.utils as utils

import warnings
warnings.filterwarnings("ignore")
warnings.simplefilter(action='ignore', category=FutureWarning)

In [4]:
from google.colab import files
files.upload()

Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"xxxxxxx","key":"xxxxxxxxxxxxxxxxxxxxxxxxxx"}'}

In [5]:
!ls -lrt kaggle.json

-rw-r--r-- 1 root root 64 Sep 14 17:50 kaggle.json


In [0]:
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

In [7]:
!pip install kaggle
!kaggle datasets download -d aashita/nyt-comments

Downloading nyt-comments.zip to /content
 99% 477M/480M [00:04<00:00, 133MB/s]
100% 480M/480M [00:04<00:00, 125MB/s]


In [8]:
!unzip nyt-comments.zip
!ls -lrt

Archive:  nyt-comments.zip
  inflating: CommentsJan2018.csv     
  inflating: ArticlesApril2017.csv   
  inflating: CommentsMarch2018.csv   
  inflating: CommentsJan2017.csv     
  inflating: ArticlesFeb2018.csv     
  inflating: ArticlesApril2018.csv   
  inflating: CommentsFeb2018.csv     
  inflating: ArticlesMay2017.csv     
  inflating: ArticlesMarch2017.csv   
  inflating: CommentsApril2017.csv   
  inflating: CommentsMarch2017.csv   
  inflating: CommentsFeb2017.csv     
  inflating: ArticlesFeb2017.csv     
  inflating: ArticlesJan2018.csv     
  inflating: CommentsMay2017.csv     
  inflating: CommentsApril2018.csv   
  inflating: ArticlesJan2017.csv     
  inflating: ArticlesMarch2018.csv   
total 2002736
-rw-r--r-- 1 root root    718307 May  2  2018 ArticlesMarch2018.csv
-rw-r--r-- 1 root root    461113 May  2  2018 ArticlesMarch2017.csv
-rw-r--r-- 1 root root    460669 May  2  2018 ArticlesJan2018.csv
-rw-r--r-- 1 root root    412840 May  2  2018 ArticlesJan2017.csv
-rw-r--

In [12]:
curr_dir = './'
headlines = []
for filename in os.listdir(curr_dir):
  if 'Articles' in filename:
    article_df = pd.read_csv(curr_dir+filename)
    headlines.extend(list(article_df.headline.values))
    break
    
headlines = [headline for headline in headlines if headline != "Unknown"]
len(headlines)

829

In [16]:
def clean_text(txt):
  txt = "".join(v for v in txt if v not in string.punctuation).lower()
  txt = txt.encode("utf8").decode("ascii",'ignore')
  return txt

corpus = [clean_text(x) for x in headlines]
print(corpus[:10])
corpus_wordsize = [len(x.split(" ")) for x in headlines]
max(corpus_wordsize)

['nfl vs politics has been battle all season long', 'voice vice veracity', 'a standups downward slide', 'new york today a groundhog has her day', 'a swimmers communion with the ocean', 'trail activity', 'super bowl', 'trumps mexican shakedown', 'pences presidential pet', 'fruit of a poison tree']


18

In [18]:
tokenizer = Tokenizer()

def get_sequence_of_tokens(corpus):
  tokenizer.fit_on_texts(corpus)
  total_words = len(tokenizer.word_index) + 1
  input_sequences = []
  for line in corpus:
    token_list = tokenizer.texts_to_sequences([line])[0]
    for i in range(1, len(token_list)):
      n_gram_sequence = token_list[:i+1]
      input_sequences.append(n_gram_sequence)
  return input_sequences, total_words

input_sequences, total_words = get_sequence_of_tokens(corpus)
input_sequences[:10]

[[660, 117],
 [660, 117, 72],
 [660, 117, 72, 73],
 [660, 117, 72, 73, 661],
 [660, 117, 72, 73, 661, 662],
 [660, 117, 72, 73, 661, 662, 63],
 [660, 117, 72, 73, 661, 662, 63, 29],
 [660, 117, 72, 73, 661, 662, 63, 29, 210],
 [211, 663],
 [211, 663, 664]]

In [0]:
def generate_padded_sequences(input_sequences):
  max_sequence_len = max([len(x) for x in input_sequences])
  input_sequences = np.array(pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre'))
  predictors, label = input_sequences[:,:-1], input_sequences[:,-1]
  label = utils.to_categorical(label, num_classes=total_words)
  return predictors, label, max_sequence_len

predictors, label, max_sequence_len = generate_padded_sequences(input_sequences)

In [20]:
def create_model(max_sequence_len, total_words):
  input_len = max_sequence_len - 1
  model = Sequential()
  model.add(Embedding(total_words, 10, input_length=input_len))
  model.add(LSTM(100))
  model.add(Dropout(0.1))
  model.add(Dense(total_words, activation='softmax'))
  model.compile(loss='categorical_crossentropy', optimizer='adam')
  return model
model = create_model(max_sequence_len, total_words)
model.summary()





Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 16, 10)            22880     
_________________________________________________________________
lstm_1 (LSTM)                (None, 100)               44400     
_________________________________________________________________
dropout_1 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 2288)              231088    
Total params: 298,368
Trainable params: 298,368
Non-trainable params: 0
_________________________________________________________________


In [21]:
model.fit(predictors, label, epochs=100, verbose=5)

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch

<keras.callbacks.History at 0x7f883a5c7f28>

In [0]:
def generate_text(seed_text, next_words, model, max_sequence_len):
  for _ in range(next_words):
    token_list = tokenizer.texts_to_sequences([seed_text])[0]
    token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
    predicted = model.predict_classes(token_list, verbose=0)
    output_word=""
    for word, index in tokenizer.word_index.items():
      if index == predicted:
        output_word = word
        break
    seed_text += " "+output_word
  return seed_text.title()

In [24]:
print(generate_text("united states", 5, model, max_sequence_len))

United States Pass Fail Incomplete Loss Of


In [25]:
print(generate_text("climate change", 20, model, max_sequence_len))

Climate Change To Fill Jobs When Total Loyalty Will Close A Moon Spacex Now Yes An Election Pairing Freely View For The


In [27]:
print(generate_text("Prime Minister Modi", 20, model, max_sequence_len))

Prime Minister Modi You Is The Sugar Its The Thrill Ally To Security Them Can The Same States Theyre Freely Points Of Learn


In [0]:
model.save("model.h5")

In [0]:
import pickle
with open('tokenizer.pickle','wb') as handle:
  pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)