Use as reference: https://www.safaribooksonline.com/library/view/hands-on-automated-machine/9781788629898/ccac8d45-a703-42b9-992c-d82eafafe94d.xhtml

In [3]:
import string
import pickle
punctuations = string.punctuation

from spacy.lang.en import English
parser = English()

from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS as stopwords 
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.metrics import accuracy_score 
from sklearn.base import TransformerMixin 
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

In [4]:
data = pickle.load(open('../data/all_responses_equal.p','rb'))
qa = []
for i in list(data):
    qa.append((i[0]['utterance'],i[1]['utterance']))

# Split pairs into question and answer
question, answer = zip(*qa)

In [84]:
def shorten_sentence(txt):
    """
    Shortens to 20 words
    """
    clean = ' '.join(str(txt).replace("\n","").split(' ')[:40])
    
    return ' '.join(clean.split())

data = pickle.load(open('../data/all_responses_equal.p','rb'))
qa = []
for i in list(data):
    q = shorten_sentence(i[0]['utterance'])
    a = shorten_sentence(i[1]['utterance'])
    if 'deleted' not in a:
        qa.append((q.replace("\n",""),a.replace("\n","")))

In [85]:
question, answer = zip(*qa)
answer[237:238]

('You need to choose to go into medicine for the right reasons. Not everyone is dissatisfied by the choice. As a rule, it seems those who follow the money seem to be the most bitter. I liken it',)

In [86]:
with open('seq2seq_examples/fra-eng/all_responses_equal.txt', 'w') as fp:
    fp.write('\n'.join('%s -----+----- %s' % x for x in qa))

In [88]:
num_samples = 10000
with open('seq2seq_examples/fra-eng/all_responses_equal.txt', 'r', encoding='utf-8') as f:
    lines = f.read().split('\n')
for idx,line in enumerate(lines[: min(num_samples, len(lines) - 1)]):
    #print(line,'\n')
    try:
        input_text, target_text = line.split('-----+-----')
    except:
        print(idx,line)

In [16]:
# Basic utility function to clean the text 
def clean_text(text):     
    return text.strip().lower()

class predictors(TransformerMixin):
    def transform(self, X, **transform_params):
        return [clean_text(text) for text in X]
    def fit(self, X, y=None, **fit_params):
        return self
    def get_params(self, deep=True):
        return {}
    
#Create spacy tokenizer that parses a sentence and generates tokens
#these can also be replaced by word vectors 
def spacy_tokenizer(sentence):
    tokens = parser(sentence)
    tokens = [tok.lemma_.lower().strip() if tok.lemma_ != "-PRON-" else tok.lower_ for tok in tokens]
    tokens = [tok for tok in tokens if (tok not in stopwords and tok not in punctuations)]     
    return tokens

In [25]:
#create vectorizer object to generate feature vectors, we will use custom spacy’s tokenizer
vectorizer = CountVectorizer(tokenizer = spacy_tokenizer, ngram_range=(1,1)) 
classifier = LinearSVC()

In [36]:
pipe = Pipeline([("cleaner", predictors()),
                 ('vectorizer', vectorizer)])

In [37]:
import time

In [46]:
%%time 
test = pipe.fit_transform([x[0] for x in qa[:1000]], [x[1] for x in qa[:1000]])
test

CPU times: user 65.8 ms, sys: 4.77 ms, total: 70.6 ms
Wall time: 68.2 ms


In [59]:
%time pipe.fit([x[0] for x in qa[:10]], [x[1] for x in qa[:10]])

CPU times: user 2.42 ms, sys: 239 µs, total: 2.66 ms
Wall time: 2.47 ms


Pipeline(memory=None,
     steps=[('cleaner', <__main__.predictors object at 0x118617be0>), ('vectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram...\\w\\w+\\b',
        tokenizer=<function spacy_tokenizer at 0x115e8e488>,
        vocabulary=None))])

In [67]:
pipe.transform([x[0] for x in qa[:10]]).toarray()

array([[1, 1, 1, 1, 1, 1, 1, 1, 1],
       [1, 1, 1, 1, 1, 1, 1, 1, 1],
       [1, 1, 1, 1, 1, 1, 1, 1, 1],
       [1, 1, 1, 1, 1, 1, 1, 1, 1],
       [1, 1, 1, 1, 1, 1, 1, 1, 1],
       [1, 1, 1, 1, 1, 1, 1, 1, 1],
       [1, 1, 1, 1, 1, 1, 1, 1, 1],
       [1, 1, 1, 1, 1, 1, 1, 1, 1],
       [1, 1, 1, 1, 1, 1, 1, 1, 1],
       [1, 1, 1, 1, 1, 1, 1, 1, 1]])

In [70]:
def PreprocData(X, Y):
    pipe.fit(X,Y)
    return pipe.transform(X), Y

In [71]:
X_transformed, Y_transformed = PreprocData([x[0] for x in qa[:1000]], [x[1] for x in qa[:1000]])