In [1]:
import pandas as pd
import numpy as np
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
import pickle

In [5]:
# Load the previously fitted artifacts from disk (this cell *un*pickles them).
# SECURITY NOTE: pickle.load can execute arbitrary code while deserializing;
# only load .pkl files that come from a trusted source.
def _load_pickle(path):
    """Return the object unpickled from `path`, closing the file afterwards."""
    # The context manager guarantees the handle is released even if
    # unpickling raises, unlike a bare pickle.load(open(...)).
    with open(path, 'rb') as handle:
        return pickle.load(handle)


encoder = _load_pickle('encoder.pkl')        # provides classes_ / inverse_transform
vectorizer = _load_pickle('vectorizer.pkl')  # text -> TF-IDF features
lsa = _load_pickle('lsa.pkl')                # presumably the fitted LSA/SVD step -- confirm
knn_lsa = _load_pickle('knn_lsa.pkl')        # classifier applied to the lsa output


In [7]:
# Show the label names that the loaded encoder can decode predictions into.
encoder.classes_

array(['APPLICATION', 'BILL', 'BILL BINDER', 'BINDER',
       'CANCELLATION NOTICE', 'CHANGE ENDORSEMENT', 'DECLARATION',
       'DELETION OF INTEREST', 'EXPIRATION NOTICE',
       'INTENT TO CANCEL NOTICE', 'NON-RENEWAL NOTICE', 'POLICY CHANGE',
       'REINSTATEMENT NOTICE', 'RETURNED CHECK'], dtype=object)

In [8]:
# Display the loaded vectorizer's configuration (its repr).
vectorizer

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=0.5, max_features=10000,
                min_df=2, ngram_range=(1, 3), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [11]:
words = '918d14133622 8558ed4a226a 586242498a88 f0666bdbc8a5 9d83e581af4b 714fd033a722 1d0ee75170f9 b73e657498f2 1d4249bb404a 72709e886f7e fbe7c05e32d5 fe64d3cdfe5b b7ab56536ec4 cbfb3eb99bea 6d10c76d455a 7a9da1fc6d99 b1bac4b400e2 957b5cf4e65e ff714b9fa7fd 04503bc22789 2ea49cf89745 ce1f034abb5d 25c57acdf805 9bc96abb24e5 878460b4304e 4e5019f629a9 f7bb594ff117 20a84e403407 a62cd2076061 ecee69844b4e 133d46f7ed38 b9699ce57810 5d7641b096f0 586242498a88 5d7641b096f0 ecee69844b4e 6b1c8f75a7e2 0302af775b89 ce1f034abb5d 25c57acdf805 67f9c02008c6 6ce6cc5a3203 93790ade6682 61b7e0f00ffe 4357c81e10c1 a31962fbd5f3 2ea49cf89745 75440bb763a2 036087ac04f9 b136f6349cf3 d0a6ba7c50bf d85aeb8537e1 9cdf4a63deb0 b274cd8dd187 4e5019f629a9 2c624417f3e9 17c11ac4749f eb0d1ab7aefb e8194541d049 286937d51e36 133d46f7ed38 cbfb3eb99bea 41bf90201c72 bbf5683af9a7 223e7af4a080 7a9da1fc6d99 b9699ce57810 7e07ecc160fd 0a73937a5263 cafaf222091d 769e648b85f2 28b12e45914e 6b5158066b67 382618e9d034 fe64d3cdfe5b b02eb907dd1a 4d28878de1db b7ab56536ec4 586242498a88 9d83e581af4b 3ca3606d0b69 acaf107da8bc f0666bdbc8a5 f81146092cfc 02cf5092c064 586242498a88 6ce6cc5a3203 0a73937a5263 9bc65adc033c ee086b7b25a3 586242498a88 e943e5e5b779 448cca02dae6 e4dad7cb07b6 1ab34730c1e0 30ca33997a38 f8b0c07e306c cafaf222091d dc987be61f66 36e7aa72ffe1 6ef2ade170d9 d38820625542 382618e9d034 9374c105ef84 20a84e403407 317714aee69a a62cd2076061 0a73937a5263 97b6014f9e50 f7bb594ff117 4e5019f629a9 f8b0c07e306c 6eaad2528880 9d83e581af4b fbb1a70d2795 894af811cc5b e27acd17313e cf6e30467290 133d46f7ed38 6d1fb90988cf 23344d9339f0 46a0c35e916c 6b343f522f78 30ca33997a38 eca16ee06b98 5f63e56d4b7a 8db54a4cb57b d4bec079d88a 4e5019f629a9 25c57acdf805 4eb37245db2f 3012dd989e4f 133d46f7ed38 48e3af069d42 8f75273e5510 de9738ee8b24 54709b24b45f 299d73abab25 b240b1474e50 1169f8aa8528 e27acd17313e eca16ee06b98 6e86c10fc88e 8f67fb7907c9 ed5d3a65ee2d 30ca33997a38 fee60001e67a e79316ac9e91 54709b24b45f f7ae6f8257da c516cf70f339 9d83e581af4b 133d46f7ed38 
42e393bc7d8e 75199e110c91 6b343f522f78 cede998708f7 f898dba78476 ce1f034abb5d d24ee96e26fa f7ae6f8257da 50798a03ad62 1ab34730c1e0 ed5d3a65ee2d 25c57acdf805 73801426ea65 2ca392db71fd 11d62d3598ce 1b3e5dc39eb9 365a9390d225 288ccf089872 572b3238a2c4 e943e5e5b779 448cca02dae6 da2e1b058e4b b8a5ca36d2e2 ff714b9fa7fd 2ea49cf89745 f7ae6f8257da ff714b9fa7fd b2011443d94d 878460b4304e 15cb0ad212f0 56c2c356d772 dec88250479b 000ec8768139 1b3e5dc39eb9 9ccf259ca087 586242498a88 461edd1ddaa6 1d4249bb404a 586242498a88 54709b24b45f eb51798a89e1 60fb2adbbb87 b8a5ca36d2e2 ff714b9fa7fd 2ea49cf89745 f0666bdbc8a5 fc96b835cfc3 133d46f7ed38 878460b4304e 1d4249bb404a 93a5aefea103 5ee06767bc0f b9699ce57810 2ca392db71fd b9699ce57810 a6a1fd3bc1bb df03d288b8b6 ecee69844b4e cd50f04925dd e11a0de517a6 586242498a88 5037561ecd63 586242498a88 1ab34730c1e0 e943e5e5b779 448cca02dae6 d38820625542 54709b24b45f b73e657498f2 167018123c55 d9fc58babbe1 e48a761e8248 86993aa28b61 66c5acf0b655 bcdd3940ccfc bcdd3940ccfc e4dad7cb07b6 98d0d51b397c e48a761e8248 c0d455ef7403 ca5b23de2dd1 56c2c356d772 da9ad7407226 d38820625542 894af811cc5b 6c14785745cb e943e5e5b779 7cd2e94152fb 96d738393f7e 179dce4734b4 8cbab2d81d91 918d14133622 b4d271e03820 8190a4945d7e 46a0c35e916c e943e5e5b779 7309ee3e7545 586242498a88 e27acd17313e e7e059c82399 30ca33997a38 8509b3818454 98d0d51b397c 1ab34730c1e0 f485c480ec19 ac4b4b0920b6 9a1d3ab82b96 adbc913c7024 066a2f7f16c8 e7e059c82399 4eb37245db2f 890c458776f2 4c11a71c788b 6b304aabdcee e43c4b6f2c61 cfb58745e13c 5e47c6e79c1c 90e758ce84ce df330922aae2 b4d271e03820 fea862065b74 b80facfeaff7 dee46d35626e 8db54a4cb57b e0a718aaedd9 586242498a88 30ca33997a38 7cd2e94152fb 586242498a88 3e84d44158be b9699ce57810 6b343f522f78 ee086b7b25a3 41bf90201c72 bbf5683af9a7 a291e077a0a6 774445039259 54709b24b45f b4fcf37f996c 0a73937a5263 93e786b92f57 432423689f19 b9699ce57810 d38820625542 019a80c31e1b bbf5683af9a7 9997460cf372 0828d0584141 ccfc7efc0832 da61efdd2b77 f07761418345 d582f8281ac3 d38820625542 
5d7641b096f0 ecee69844b4e 10e45001c2f2 e943e5e5b779 448cca02dae6 b9699ce57810 7309ee3e7545 6b343f522f78 2ca392db71fd ce1f034abb5d 1ab34730c1e0 cd50f04925dd 54709b24b45f dc17032b3469 b9699ce57810 5d7641b096f0 0990b2a5d621 e4dad7cb07b6 586242498a88 befbf6d74a98 133d46f7ed38 9d83e581af4b 573cf83717f5 45a60fde9bb0 df03d288b8b6 8fc932e0816b 586242498a88 7d9e333a86da f7bb594ff117 f07761418345 de078996c1a5 8c4c99dc5cc9 aff49ae41c0a'

def predict(words):
    """Classify a single document, print the decoded label(s) and return them.

    The document is pushed through the module-level objects in order:
    ``vectorizer.transform`` -> ``lsa.transform`` -> ``knn_lsa.predict`` ->
    ``encoder.inverse_transform``.

    Parameters
    ----------
    words : str
        The text of ONE document, passed as a single string.

    Returns
    -------
    The value produced by ``encoder.inverse_transform`` for this document
    (one decoded label per input document, i.e. a single entry here).
    Previously the result was only printed and the function returned None,
    so it could not be reused programmatically.
    """
    # transform() takes an iterable of documents, so wrap the lone string
    # in a one-element list (no pandas Series is needed for that).
    doc_tfidf = vectorizer.transform([words])
    doc_lsa = lsa.transform(doc_tfidf)

    label_ids = knn_lsa.predict(doc_lsa)
    labels = encoder.inverse_transform(label_ids)

    # Keep the original printed output for interactive use.
    print(labels)
    return labels

In [12]:
# Run the classifier on the sample document held in `words`.
predict(words)

['BILL']
