<h3> Imports </h3>

In [1]:
import numpy as np
import pandas as pd
import ebooklib
import re
import nltk
from nltk.util import ngrams
from ebooklib import epub
from bs4 import BeautifulSoup
import gensim 
import spacy
from gensim.test.utils import common_texts, get_tmpfile
from gensim.models import Word2Vec
from spacy.lang.en.stop_words import STOP_WORDS
from nltk.tokenize import WhitespaceTokenizer

<h3>Reading the epub file : </h3>

In [2]:
def epub2thtml(epub_path):
    book = epub.read_epub(epub_path)
    chapters = []
    for item in book.get_items():
        if item.get_type() == ebooklib.ITEM_DOCUMENT:
            chapters.append(item.get_content())
    return chapters

In [3]:
blacklist = [   '[document]',   'noscript', 'header',   'html', 'meta', 'head','input', 'script',   ]

In [4]:
def chap2text(chap):
    output = ''
    soup = BeautifulSoup(chap, 'html.parser')
    text = soup.find_all(text=True)
    for t in text:
        if t.parent.name not in blacklist:
            output += '{} '.format(t)
    return output

In [5]:
def thtml2ttext(thtml):
    Output = []
    for html in thtml:
        text =  chap2text(html)
        Output.append(text)
    return Output

In [6]:
def epub2text(epub_path):
    chapters = epub2thtml(epub_path)
    ttext = thtml2ttext(chapters)
    return ttext

In [7]:
#epub file as a list
out=epub2text('The-Art-of-Digital-Marketing-The-Definitive-Guide-to-Creating-Strategic-Targeted-and-Measurable-Online-Campaigns.epub')

<h3> Data Munging </h3>

In [17]:
def to_string(text) :
    return ' '.join(text)
text = to_string(out)

In [18]:
#Removing punctuation,extra spaces and numbers 
def clean_file(doc):
    
    clean = re.sub("[^A-Za-z']+",' ',text)
    
    return clean

In [19]:
clean_text = clean_file(text)

In [11]:
#saving the text in a newfile
with open("text_clean.cvs", "w") as text_clean:
    print(f"text: {clean_text}", file=text_clean)

In [20]:
#tokenizing for spacy
nlp = spacy.load("en_core_web_lg")
doc = nlp(clean_text)
tokenizer = nlp.Defaults.create_tokenizer(nlp)
tokens = tokenizer(clean_text)
token_list = []
for token in tokens:
    token_list.append(token.text)

In [21]:
#lemma 
lemma_list = []
for token in tokens:
    lemma_list.append(token.lemma_)

In [22]:
#tokenizing and stemming for nltk
def w_tokenizer(text):
    tokenizer = WhitespaceTokenizer()   
    tokenized_list = tokenizer.tokenize(text) 
    return(tokenized_list)
from nltk.stem.snowball import SnowballStemmer
def stemmer_snowball(text_list):
    snowball = SnowballStemmer(language='english')
    return_list = []
    for i in range(len(text_list)):
        return_list.append(snowball.stem(text_list[i]))
    return(return_list)
clean_text = str(stemmer_snowball(w_tokenizer(clean_text)))

In [23]:
#removing stop words 
for word in STOP_WORDS:
    for w in (word, word[0].capitalize(), word.lower()):
        lex = nlp.vocab[w]
        lex.is_stop = True

doc = nlp(clean_text)
text = [token.text for token in doc if not token.is_stop]

In [26]:
#creating list of list of sentences for word2vec
text = to_string(text)
sentences = [nltk.word_tokenize(sent) for sent in nltk.sent_tokenize(text)]

<h3>Training</h3>

In [27]:
import gensim, logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
 

In [28]:
model = gensim.models.Word2Vec(
    sentences,
    size=200,
    window=2,
    min_count=5,
    iter=100)

2020-08-23 12:15:22,549 : INFO : collecting all words and their counts
2020-08-23 12:15:22,554 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2020-08-23 12:15:22,580 : INFO : collected 3800 word types from a corpus of 168122 raw words and 1 sentences
2020-08-23 12:15:22,581 : INFO : Loading a fresh vocabulary
2020-08-23 12:15:22,618 : INFO : effective_min_count=5 retains 1421 unique words (37% of original 3800, drops 2379)
2020-08-23 12:15:22,619 : INFO : effective_min_count=5 leaves 163902 word corpus (97% of original 168122, drops 4220)
2020-08-23 12:15:22,624 : INFO : deleting the raw counts dictionary of 3800 items
2020-08-23 12:15:22,635 : INFO : sample=0.001 downsamples 11 most-common words
2020-08-23 12:15:22,636 : INFO : downsampling leaves estimated 56844 word corpus (34.7% of prior 163902)
2020-08-23 12:15:22,642 : INFO : estimated required memory for 1421 words and 200 dimensions: 2984100 bytes
2020-08-23 12:15:22,643 : INFO : resetting layer weig

2020-08-23 12:15:23,282 : INFO : worker thread finished; awaiting finish of 0 more threads
2020-08-23 12:15:23,283 : INFO : EPOCH - 18 : training on 168122 raw words (10000 effective words) took 0.0s, 775100 effective words/s
2020-08-23 12:15:23,288 : INFO : worker thread finished; awaiting finish of 2 more threads
2020-08-23 12:15:23,290 : INFO : worker thread finished; awaiting finish of 1 more threads
2020-08-23 12:15:23,300 : INFO : worker thread finished; awaiting finish of 0 more threads
2020-08-23 12:15:23,301 : INFO : EPOCH - 19 : training on 168122 raw words (10000 effective words) took 0.0s, 782525 effective words/s
2020-08-23 12:15:23,307 : INFO : worker thread finished; awaiting finish of 2 more threads
2020-08-23 12:15:23,308 : INFO : worker thread finished; awaiting finish of 1 more threads
2020-08-23 12:15:23,318 : INFO : worker thread finished; awaiting finish of 0 more threads
2020-08-23 12:15:23,319 : INFO : EPOCH - 20 : training on 168122 raw words (10000 effective w

2020-08-23 12:15:23,633 : INFO : EPOCH - 38 : training on 168122 raw words (10000 effective words) took 0.0s, 907704 effective words/s
2020-08-23 12:15:23,638 : INFO : worker thread finished; awaiting finish of 2 more threads
2020-08-23 12:15:23,640 : INFO : worker thread finished; awaiting finish of 1 more threads
2020-08-23 12:15:23,650 : INFO : worker thread finished; awaiting finish of 0 more threads
2020-08-23 12:15:23,650 : INFO : EPOCH - 39 : training on 168122 raw words (10000 effective words) took 0.0s, 814560 effective words/s
2020-08-23 12:15:23,656 : INFO : worker thread finished; awaiting finish of 2 more threads
2020-08-23 12:15:23,657 : INFO : worker thread finished; awaiting finish of 1 more threads
2020-08-23 12:15:23,666 : INFO : worker thread finished; awaiting finish of 0 more threads
2020-08-23 12:15:23,667 : INFO : EPOCH - 40 : training on 168122 raw words (10000 effective words) took 0.0s, 933958 effective words/s
2020-08-23 12:15:23,672 : INFO : worker thread fi

2020-08-23 12:15:23,997 : INFO : worker thread finished; awaiting finish of 2 more threads
2020-08-23 12:15:23,998 : INFO : worker thread finished; awaiting finish of 1 more threads
2020-08-23 12:15:24,008 : INFO : worker thread finished; awaiting finish of 0 more threads
2020-08-23 12:15:24,009 : INFO : EPOCH - 59 : training on 168122 raw words (10000 effective words) took 0.0s, 838700 effective words/s
2020-08-23 12:15:24,015 : INFO : worker thread finished; awaiting finish of 2 more threads
2020-08-23 12:15:24,017 : INFO : worker thread finished; awaiting finish of 1 more threads
2020-08-23 12:15:24,027 : INFO : worker thread finished; awaiting finish of 0 more threads
2020-08-23 12:15:24,027 : INFO : EPOCH - 60 : training on 168122 raw words (10000 effective words) took 0.0s, 784898 effective words/s
2020-08-23 12:15:24,034 : INFO : worker thread finished; awaiting finish of 2 more threads
2020-08-23 12:15:24,035 : INFO : worker thread finished; awaiting finish of 1 more threads
20

2020-08-23 12:15:24,348 : INFO : worker thread finished; awaiting finish of 1 more threads
2020-08-23 12:15:24,358 : INFO : worker thread finished; awaiting finish of 0 more threads
2020-08-23 12:15:24,358 : INFO : EPOCH - 79 : training on 168122 raw words (10000 effective words) took 0.0s, 821998 effective words/s
2020-08-23 12:15:24,363 : INFO : worker thread finished; awaiting finish of 2 more threads
2020-08-23 12:15:24,365 : INFO : worker thread finished; awaiting finish of 1 more threads
2020-08-23 12:15:24,375 : INFO : worker thread finished; awaiting finish of 0 more threads
2020-08-23 12:15:24,375 : INFO : EPOCH - 80 : training on 168122 raw words (10000 effective words) took 0.0s, 831170 effective words/s
2020-08-23 12:15:24,380 : INFO : worker thread finished; awaiting finish of 2 more threads
2020-08-23 12:15:24,382 : INFO : worker thread finished; awaiting finish of 1 more threads
2020-08-23 12:15:24,392 : INFO : worker thread finished; awaiting finish of 0 more threads
20

2020-08-23 12:15:24,698 : INFO : worker thread finished; awaiting finish of 0 more threads
2020-08-23 12:15:24,699 : INFO : EPOCH - 99 : training on 168122 raw words (10000 effective words) took 0.0s, 928461 effective words/s
2020-08-23 12:15:24,704 : INFO : worker thread finished; awaiting finish of 2 more threads
2020-08-23 12:15:24,705 : INFO : worker thread finished; awaiting finish of 1 more threads
2020-08-23 12:15:24,715 : INFO : worker thread finished; awaiting finish of 0 more threads
2020-08-23 12:15:24,716 : INFO : EPOCH - 100 : training on 168122 raw words (10000 effective words) took 0.0s, 815985 effective words/s
2020-08-23 12:15:24,716 : INFO : training on a 16812200 raw words (1000000 effective words) took 1.8s, 562041 effective words/s


In [None]:
model.wv.vocab

<h3> Preparing For Keras </h3>

In [32]:
import os 
texts = []  # list of text samples
labels_index = {}  # dictionary mapping label name to numeric id
labels = []  # list of label ids
for name in sorted(os.listdir('txt dataset')):
    path = os.path.join('', name)
    if os.path.isdir(path):
        label_id = len(labels_index)
        labels_index[name] = label_id
        for fname in sorted(os.listdir(path)):
            if fname.isdigit():
                fpath = os.path.join(path, fname)
                if sys.version_info < (5,):
                    f = open(fpath)
                else:
                    f = open(fpath, encoding='latin-1')
                t = f.read()
                i = t.find('\n\n')  
                if 0 < i:
                    t = t[i:]
                texts.append(t)
                f.close()
                labels.append(label_id)

print('Found %s texts.' % len(texts))

Found 0 texts.


In [None]:
#tokenizing for keras embedding 
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

tokenizer = Tokenizer(nb_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

labels = to_categorical(np.asarray(labels))
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

# split the data into a training set and a validation set
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]
nb_validation_samples = int(VALIDATION_SPLIT * data.shape[0])

x_train = data[:-nb_validation_samples]
y_train = labels[:-nb_validation_samples]
x_val = data[-nb_validation_samples:]
y_val = labels[-nb_validation_samples:]

In [None]:
#using glove embedding
embeddings_index = {}
f = open(os.path.join(GLOVE_DIR, 'glove.6B.100d.txt'))
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index))