In [1]:
import sys
import os
import pandas as pd
from sklearn.datasets import fetch_20newsgroups

import warnings
warnings.filterwarnings("ignore")

#sys.path.insert(0, os.path.abspath(os.path.join('nlp_library')))
sys.path.insert(0, os.path.abspath(os.path.join('..')))
import nlp.preprocessing as pre
import nlp.embeddings as emb

In [2]:
def get_pipeline():
    config = {
        'stop_words': True,
        'lemmas':True
    }
    return pre.Pipeline(config)

In [4]:
texts = ['This is a test sentence about the test on spend cap keywords in the text.',
         'And this is the second test sentence for the spend cap testing']

pp = get_pipeline()
pp.fit(texts)
embeddings = emb.KeyWordsEmbeddings(top_k=3, mode='binary', key_words=['cap', 'spend cap'])
embeddings.fit(pp)
print(embeddings.top_keyword_vocab)
print(embeddings.key_words)
print(embeddings.transform(pp))

{('sentence', 'test', 'spend'): 0, ('test', 'spend', 'cap', 'keyword'): 1, ('test', 'sentence', 'test', 'spend'): 2, ('sentence', 'test', 'spend', 'cap'): 3, ('spend', 'cap'): 4, ('test', 'sentence', 'test'): 5, ('test', 'spend', 'cap'): 6, ('sentence', 'test'): 7, 'spend': 8, 'test': 9, 'sentence': 10, ('test', 'sentence'): 11}
['cap', 'spend cap']
[[1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
 [0. 0. 0. 0. 1. 0. 0. 0. 1. 1. 1. 1. 1. 1.]]


In [2]:
!wget http://nlp.stanford.edu/data/wordvecs/glove.6B.zip

'wget' is not recognized as an internal or external command,
operable program or batch file.


In [5]:
config = {'top_k':3, 'mode':'binary', 'key_words':['cap', 'spend cap']}

texts = ['This is a test sentence about the test on spend cap keywords in the text.',
         'And this is the second test sentence for the spend cap testing']

pp = get_pipeline()
pp.fit(texts)
embeddings = emb.KeyWordsEmbeddings()
embeddings.set_params(**config)
embeddings.fit(pp)
print(embeddings.top_keyword_vocab)
print(embeddings.key_words)
print(embeddings.transform(pp))

{('sentence', 'test', 'spend'): 0, ('test', 'spend', 'cap', 'keyword'): 1, ('test', 'sentence', 'test', 'spend'): 2, ('sentence', 'test', 'spend', 'cap'): 3, ('spend', 'cap'): 4, ('test', 'sentence', 'test'): 5, ('test', 'spend', 'cap'): 6, ('sentence', 'test'): 7, 'spend': 8, 'test': 9, 'sentence': 10, ('test', 'sentence'): 11}
['cap', 'spend cap']
[[1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
 [0. 0. 0. 0. 1. 0. 0. 0. 1. 1. 1. 1. 1. 1.]]


In [6]:
config = {'top_k':3, 'mode':'binary', 'key_words':['cap', 'spend cap']}

texts = ['This is a test sentence about the test on spend cap keywords in the text.',
         'And this is the second test sentence for the spend cap testing']

pp = get_pipeline()
pp.fit(texts)
embeddings = emb.KeyWordsEmbeddings(**config)
#embeddings.set_params()
embeddings.fit(pp)
print(embeddings.top_keyword_vocab)
print(embeddings.key_words)
print(embeddings.transform(pp))

{('sentence', 'test', 'spend'): 0, ('test', 'spend', 'cap', 'keyword'): 1, ('test', 'sentence', 'test', 'spend'): 2, ('sentence', 'test', 'spend', 'cap'): 3, ('spend', 'cap'): 4, ('test', 'sentence', 'test'): 5, ('test', 'spend', 'cap'): 6, ('sentence', 'test'): 7, 'spend': 8, 'test': 9, 'sentence': 10, ('test', 'sentence'): 11}
['cap', 'spend cap']
[[1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
 [0. 0. 0. 0. 1. 0. 0. 0. 1. 1. 1. 1. 1. 1.]]


In [9]:
import json
with open('params_embedder.json', 'w') as outfile:
    json.dump(config, outfile)

In [11]:
import nlp.autoencoder as ae
ae_params = {'layers':[64, 40, 30],
             'verbose':False,
             'n_epochs':50,
             'batch_size':512,
             'lr':1e-3,
             'early_stopping':15,
             'save_checkpoint':True}
autoencoder = ae.Autoencoder(layers = [64, 40, 30],
                             verbose=False, 
                             n_epochs=50,
                             batch_size=512,
                             lr = 1e-3,
                             early_stopping = 15,
                             save_checkpoint=True
                            )
with open('ae_params.json', 'w') as outfile:
    json.dump(ae_params, outfile)