In [None]:
import zipfile
import os
import shutil
import tensorflow as tf
import string
import numpy as np

**Data Extraction**

In [None]:
# Archive holding the corpus and the local directory it is unpacked into.
zip_path='/content/drive/MyDrive/DeepLearningTask3/SciFi.zip'
extract_path = "/content/task3"

# exist_ok=True creates the directory on the first run and is a no-op on
# re-runs, so no separate existence check is needed.
os.makedirs(extract_path, exist_ok=True)

# Unpack every member of the archive into extract_path.
with zipfile.ZipFile(zip_path, 'r') as zip_obj:
    zip_obj.extractall(extract_path)

In [None]:
# Load the whole corpus into memory with a single read.
# The earlier version appended 1 KB chunks to a growing string and printed
# every chunk, which echoed the entire corpus into the cell output.
with open('/content/task3/internet_archive_scifi_v3.txt', 'r') as f:
    text = f.read()

In [None]:
# len() on a str already counts characters; building a list of every
# character first only costs memory and time.
print(f'Length of text: {len(text)} characters')

Length of text: 149326361 characters


**Data Pre-processing**

In [None]:
# Lowercase the whole corpus in place.
# Punctuation is NOT removed by this step: only str.lower() is applied, and the
# vocabulary printed further down still contains punctuation and digits.
text=text.lower()

In [None]:
# Distinct characters of the corpus, ordered by code point.
vocab = list(set(text))
vocab.sort()
print(f'{len(vocab)} unique characters')

49 unique characters


In [None]:
# Show the sorted list of distinct characters found in the lowercased corpus.
print(vocab)

[' ', '!', '"', '#', "'", '(', ')', ',', '-', '.', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '?', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']


In [None]:
# Read the corpus file as a list of lines (each keeps its trailing newline).
with open('/content/task3/internet_archive_scifi_v3.txt') as pdf:
    book = pdf.readlines()

# Shrink the first line to its leading 1/1000th of characters so the rest of
# the notebook works on a small sample; any other lines are left untouched.
book[0] = book[0][: len(book[0]) // 1000]

In [None]:
import string

# Characters that are dropped outright: ASCII punctuation plus the ten digits.
punctuations = string.punctuation
punctuations += '1234567890'
# Sentence terminators; each one is rewritten as a space-padded period.
eol = '.!?'

# Per-character translation table. Deletions are registered first so that the
# sentence-terminator entries override them for '.', '!' and '?', which also
# appear in string.punctuation.
_char_map = {ord(ch): None for ch in punctuations + '\n'}
_char_map.update({ord(ch): ' . ' for ch in eol})

# Apply the table to every line, then lowercase the result.
cleaned_book = [raw_line.translate(_char_map).lower() for raw_line in book]

# Join the cleaned lines with a space-padded newline and preview the start.
all_text = ' \n '.join(cleaned_book)
print(all_text[:200])

march  all stories new and complete publisher editor if is published bimonthly by quinn publishing company inc .  kingston new york .  volume  no .   .  copyright  by quinn publishing company inc .  a


In [None]:
# Preview the first 500 characters of the cleaned text; as the last expression
# of the cell, the notebook displays it.
all_text[:500]

'march  all stories new and complete publisher editor if is published bimonthly by quinn publishing company inc .  kingston new york .  volume  no .   .  copyright  by quinn publishing company inc .  application for entry as second class matter at post office buffalo new york pending .  subscription  for  issues in u . s .  and possessions canada  for  issues elsewhere  .  aiiow four weeks for change of address .  all stories appearing in this magazine are fiction .  any similarity to actual pers'

In [None]:
def clean_text(txt):
    """Return ``txt`` lowercased, without ASCII punctuation or non-ASCII characters.

    Args:
        txt: The string to normalise.

    Returns:
        A new string in which every character of ``string.punctuation`` has
        been removed, the remainder lowercased, and any character that cannot
        be represented in ASCII dropped.
    """
    drop_punctuation = str.maketrans('', '', string.punctuation)
    lowered = txt.translate(drop_punctuation).lower()
    # Round-trip through bytes: decoding as ASCII with 'ignore' silently
    # discards every non-ASCII character.
    ascii_only = lowered.encode("utf8").decode("ascii", 'ignore')
    return ascii_only

# NOTE(review): all_text is a single string, so this comprehension calls
# clean_text once per character and produces a list of one-character (or
# empty) strings, as the displayed output shows. No visible cell reads
# `corpus` afterwards; confirm whether a per-sentence list was intended.
corpus = [clean_text(x) for x in all_text]
# Preview the first ten entries.
corpus[:10]

['m', 'a', 'r', 'c', 'h', ' ', ' ', 'a', 'l', 'l']

In [None]:
import numpy as np

# Cut the cleaned text at every period and hold the fragments in a 1-D array.
# Fragments keep their surrounding whitespace and may be blank.
text_tokens = np.array(all_text.split("."))
#text_tokens = text_tokens.reshape(len(text_tokens), 1)

# Report how many fragments there are and show the first ten.
print(text_tokens.shape)
print(text_tokens[:10])

(2430,)
['march  all stories new and complete publisher editor if is published bimonthly by quinn publishing company inc '
 '  kingston new york ' '  volume  no ' '   '
 '  copyright  by quinn publishing company inc '
 '  application for entry as second class matter at post office buffalo new york pending '
 '  subscription  for  issues in u ' ' s '
 '  and possessions canada  for  issues elsewhere  '
 '  aiiow four weeks for change of address ']


In [None]:
from keras.preprocessing.text import Tokenizer

# Single module-level Tokenizer instance: it is fitted inside
# get_sequence_of_tokens and read again by generate_text to encode the seed
# text and to map predicted indices back to words.
tokenizer = Tokenizer()

def get_sequence_of_tokens(corpus):
    """Fit the shared tokenizer on ``corpus`` and build n-gram prefix sequences.

    The module-level ``tokenizer`` is fitted on ``corpus`` as a side effect.
    Each entry of ``corpus`` is then encoded to word indices, and every prefix
    of that encoding containing at least two tokens is emitted as one
    training sequence.

    Args:
        corpus: Iterable of text fragments.

    Returns:
        A tuple ``(input_sequences, total_words)`` where ``input_sequences``
        is a list of integer lists and ``total_words`` is the size of the
        tokenizer's word index plus one (presumably to account for the
        reserved index 0; confirm against the Tokenizer docs).
    """
    tokenizer.fit_on_texts(corpus)
    vocab_size = len(tokenizer.word_index) + 1

    prefixes = []
    for fragment in corpus:
        encoded = tokenizer.texts_to_sequences([fragment])[0]
        # Prefix lengths run from 2 up to the full fragment length.
        prefixes.extend(encoded[:end] for end in range(2, len(encoded) + 1))
    return prefixes, vocab_size

# Fit the tokenizer on the sentence fragments and build the n-gram prefix
# sequences together with the vocabulary size.
inp_sequences, total_words = get_sequence_of_tokens(text_tokens)
# Preview the first 20 sequences.
inp_sequences[:20]

[[1970, 41],
 [1970, 41, 421],
 [1970, 41, 421, 228],
 [1970, 41, 421, 228, 5],
 [1970, 41, 421, 228, 5, 771],
 [1970, 41, 421, 228, 5, 771, 1971],
 [1970, 41, 421, 228, 5, 771, 1971, 772],
 [1970, 41, 421, 228, 5, 771, 1971, 772, 57],
 [1970, 41, 421, 228, 5, 771, 1971, 772, 57, 37],
 [1970, 41, 421, 228, 5, 771, 1971, 772, 57, 37, 1972],
 [1970, 41, 421, 228, 5, 771, 1971, 772, 57, 37, 1972, 1973],
 [1970, 41, 421, 228, 5, 771, 1971, 772, 57, 37, 1972, 1973, 43],
 [1970, 41, 421, 228, 5, 771, 1971, 772, 57, 37, 1972, 1973, 43, 1277],
 [1970, 41, 421, 228, 5, 771, 1971, 772, 57, 37, 1972, 1973, 43, 1277, 1278],
 [1970,
  41,
  421,
  228,
  5,
  771,
  1971,
  772,
  57,
  37,
  1972,
  1973,
  43,
  1277,
  1278,
  536],
 [1970,
  41,
  421,
  228,
  5,
  771,
  1971,
  772,
  57,
  37,
  1972,
  1973,
  43,
  1277,
  1278,
  536,
  1279],
 [1974, 228],
 [1974, 228, 1280],
 [1975, 32],
 [1976, 43]]

In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
import keras.utils as ku

def generate_padded_sequences(input_sequences):
    """Left-pad the sequences to equal length and split off the last token as label.

    Args:
        input_sequences: List of integer token sequences of varying length.

    Returns:
        A tuple ``(predictors, label, max_sequence_len)``:
        ``predictors`` holds every column of the padded array except the last,
        ``label`` is the last column one-hot encoded with ``total_words``
        classes (``total_words`` is read from module scope, not passed in),
        and ``max_sequence_len`` is the length of the longest input sequence.
    """
    longest = max(map(len, input_sequences))
    padded = np.array(pad_sequences(input_sequences, maxlen=longest, padding='pre'))

    # The final token of each row is the target; everything before it is input.
    features = padded[:, :-1]
    targets = ku.to_categorical(padded[:, -1], num_classes=total_words)
    return features, targets, longest

# Pad the sequences and split them into model inputs and one-hot labels;
# max_sequence_len is passed on to create_model and generate_text below.
predictors, label, max_sequence_len = generate_padded_sequences(inp_sequences)

**Model Creation**

In [None]:
from keras.layers import Embedding, LSTM, Dense, Dropout
from keras.callbacks import EarlyStopping
from keras.models import Sequential

def create_model(max_sequence_len, total_words,
                 embedding_dim=10, lstm_units=100, dropout_rate=0.1):
    """Build and compile the next-word prediction network.

    The network is Embedding -> LSTM -> Dropout -> Dense(softmax), compiled
    with categorical cross-entropy loss, the Adam optimizer and an accuracy
    metric.

    Args:
        max_sequence_len: Length of the padded training sequences including
            the label token; the model input length is one less than this.
        total_words: Vocabulary size, used as the embedding input dimension
            and as the number of output classes.
        embedding_dim: Size of each embedding vector (default 10).
        lstm_units: Number of units in the LSTM layer (default 100).
        dropout_rate: Dropout rate applied after the LSTM (default 0.1).

    Returns:
        The compiled ``Sequential`` model.
    """
    # The last token of every padded sequence is the label, so the network
    # only ever sees max_sequence_len - 1 tokens.
    input_len = max_sequence_len - 1

    model = Sequential([
        # Input embedding layer
        Embedding(total_words, embedding_dim, input_length=input_len),
        # Hidden layer: LSTM followed by dropout
        LSTM(lstm_units),
        Dropout(dropout_rate),
        # Output layer: one probability per vocabulary entry
        Dense(total_words, activation='softmax'),
    ])

    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    return model

# Build the network for the padded sequence length and vocabulary size, then
# print its layer-by-layer summary.
model = create_model(max_sequence_len, total_words)
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 83, 10)            47310     
                                                                 
 lstm (LSTM)                 (None, 100)               44400     
                                                                 
 dropout (Dropout)           (None, 100)               0         
                                                                 
 dense (Dense)               (None, 4731)              477831    
                                                                 
Total params: 569,541
Trainable params: 569,541
Non-trainable params: 0
_________________________________________________________________


In [None]:
from tensorflow.keras.callbacks import ModelCheckpoint

# Checkpoint file; it is rewritten only when the monitored value improves.
filepath = "./model_checkpoints/text_generation_checkpoint.h5"

# Track the training loss and keep only the best (lowest) epoch on disk.
model_checkpoint_callback = ModelCheckpoint(
    filepath,
    mode='min',
    monitor='loss',
    save_best_only=True,
    verbose=1,
)

# Train on the padded predictors / one-hot labels for up to 100 epochs.
model.fit(
    x=predictors,
    y=label,
    callbacks=[model_checkpoint_callback],
    epochs=100,
)

Epoch 1/100
Epoch 1: loss improved from inf to 6.94991, saving model to ./model_checkpoints/text_generation_checkpoint.h5
Epoch 2/100
Epoch 2: loss improved from 6.94991 to 6.57012, saving model to ./model_checkpoints/text_generation_checkpoint.h5
Epoch 3/100
Epoch 3: loss improved from 6.57012 to 6.41606, saving model to ./model_checkpoints/text_generation_checkpoint.h5
Epoch 4/100
Epoch 4: loss improved from 6.41606 to 6.25963, saving model to ./model_checkpoints/text_generation_checkpoint.h5
Epoch 5/100
Epoch 5: loss improved from 6.25963 to 6.12195, saving model to ./model_checkpoints/text_generation_checkpoint.h5
Epoch 6/100
Epoch 6: loss improved from 6.12195 to 5.98873, saving model to ./model_checkpoints/text_generation_checkpoint.h5
Epoch 7/100
Epoch 7: loss improved from 5.98873 to 5.84671, saving model to ./model_checkpoints/text_generation_checkpoint.h5
Epoch 8/100
Epoch 8: loss improved from 5.84671 to 5.69369, saving model to ./model_checkpoints/text_generation_checkpoint

<keras.callbacks.History at 0x7f5b4c3bba30>

**Text generator block**

In [None]:
def generate_text(seed_text, next_words, model, max_sequence_len):
    """Extend ``seed_text`` with ``next_words`` greedily predicted words.

    On every step the current text is encoded with the module-level
    ``tokenizer``, left-padded to ``max_sequence_len - 1`` tokens and passed
    to ``model.predict``; the highest-scoring index is mapped back to a word
    and appended to the text.

    Args:
        seed_text: Starting text to continue.
        next_words: Number of words to append.
        model: Object exposing ``predict(batch, verbose=0)``; assumed to
            return one row of per-word scores for the single input row
            (TODO confirm for models not built by ``create_model``).
        max_sequence_len: Sequence length used when the training data was
            padded (inputs are one token shorter).

    Returns:
        The seed followed by the generated words, title-cased.
    """
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len - 1, padding='pre')
        predicted = model.predict(token_list, verbose=0)

        # The batch holds exactly one row; take its arg-max as a plain int so
        # the lookup below never compares against a numpy array.
        predicted_index = int(np.argmax(predicted, axis=1)[0])

        # index_word is the tokenizer's reverse mapping of word_index; an
        # index with no word (such as the padding index) yields "".
        output_word = tokenizer.index_word.get(predicted_index, "")
        seed_text += " " + output_word
    return seed_text.title()

In [None]:
#model.save('/content/drive/MyDrive/TEXT_GENERATION_model')



In [None]:
#shutil.copytree('/content/model_checkpoints','/content/drive/MyDrive/TEXT_GENERATION_model')

**Generating text**

In [None]:
# Continue the seed phrase with 40 generated words using the trained model.
print (generate_text("port began to swing shut", 40, model, max_sequence_len))

Port Began To Swing Shut With The Dark Boy For A Woman In The Large Period Of Blue Light But The Far Of Her Mouth Youre Applying The Lab And She Was Their Part Of This Was Light And It Seemed He Was Watching And


In [None]:
# Second sample: 40 generated words from a different seed.
# NOTE(review): "quite" may be a typo for "quiet"; left unchanged because the
# seed string is runtime input and changing it changes the generated text.
print (generate_text("a quite voice", 40, model, max_sequence_len))

A Quite Voice In The Darkness And He Went Up Toward His Own Room But The Daughter Of Some Of Earths Better Than No People And The Shakes Closed A Next But The Dark World Of The Cone Of Hot Light And The


In [None]:
# Copy the local checkpoint directory to Drive so it is kept outside /content.
# NOTE(review): shutil.copytree raises by default when the destination already
# exists, so re-running this cell is expected to fail unless the target is
# removed or renamed first; confirm. The recorded output below shows a
# different destination path than this call, so it appears to be stale.
shutil.copytree('/content/model_checkpoints','/content/drive/MyDrive/TEXT_GENERATION_checkpoints_v2')

'/content/drive/MyDrive/TEXT_GENERATION_model_v2'