In [1]:
import warnings
import pickle
import numpy as np
import pandas as pd

warnings.filterwarnings('ignore')

import re
import json
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors

# Base Pipeline

In [14]:
class PreProcessor:
    def __init__(self):
        self.stop_words = set(stopwords.words('english'))
        self.stemmer = PorterStemmer()
        
    def preprocess_text(self, text):
        text = text.lower()
        text = re.sub('<.*?>', '', text)
        text = re.sub(r'http\S+', '', text)
        text = re.sub('[^a-zA-Z]', ' ', text)
        words = nltk.word_tokenize(text)
        words = [word for word in words if word not in self.stop_words]
        words = [self.stemmer.stem(word) for word in words]
        text = ' '.join(words)
        return text

    def clean_taxonomy(self, raw_taxonomy):
        words = raw_taxonomy.split('|')
        words = [x for word in words for x in word.split('>')]
        words = [word.lower() for word in words]
        words = [word.replace(' ', '_') for word in words]
        return "  ".join(words)



In [28]:
class ContentBasedRecommender:
    
    def __init__(self):
        self.preprocessor = PreProcessor()
        self.vectorizer = TfidfVectorizer(lowercase=False)
        self.knn_model = NearestNeighbors(metric='cosine')
        self.documents = None
        self.index_to_product_id=None
        self.product_id_to_index = None
        self.vectors = None
        self.raw_data = None
        
    def identity_tokenizer(self, text):
        return text
    
    def store_documents(self, data_path):
        data = pd.read_csv(data_path)
        self.raw_data = data
        data.Name = data.Name.apply(self.preprocessor.preprocess_text)
        data.Taxonomy_List = data.Taxonomy_List.apply(self.preprocessor.clean_taxonomy)

        documents = (data.Name + " " + data.Taxonomy_List).tolist()
        documents = [doc.split() for doc in documents]
        
        self.documents = documents
        self.index_to_product_id = data.Product_ID.to_dict()
        self.product_id_to_index = {str(v): k for k, v in self.index_to_product_id.items()}
        
        with open('feature_store/docs_parts.pkl', 'wb') as f:
            pickle.dump(documents, f)
        
    def save_vectors(self):
        #vectorizer = TfidfVectorizer(tokenizer=identity_tokenizer, lowercase=False)
        self.vectorizer.tokenizer = self.identity_tokenizer
        self.vectors = self.vectorizer.fit_transform(self.documents)
        
        with open('feature_store/vectors_parts.pkl', 'wb') as f:
            pickle.dump(self.vectors, f)
            
    def init_knn_model(self):
        _ = self.knn_model.fit(self.vectors)
        
    def predict(self, product_id, n=12):
        index = self.product_id_to_index[str(product_id)]
        vector = self.vectors[index]
        _, indices = self.knn_model.kneighbors(vector, n+1)
        res=self.raw_data.iloc[indices[0][1:]].reset_index(drop=True)
        return res

In [29]:
base_model = ContentBasedRecommender()
base_model.store_documents(data_path='DataSplit/part_1.csv')
base_model.save_vectors()
base_model.init_knn_model()

In [30]:
base_model.predict(product_id=433, n=12)

Unnamed: 0,Product_ID,Style,Name,Brand,Lookup_List,Taxonomy_List,Product_URL,Price,Retail_Price,Thumbnail_URL,Image_URL,Keywords,Romantic_Copy_Short,Romantic_Copy_Long,Color,Size,Inventory_Count
0,31761,5006C,trick rope toy,PARRIS MANUFACTURING,10192581,toys kids toys toys tomfoolery kids toys...,/parris-manufacturing/trick-rope-toy-31761,7.99,7.99,/prodimages/17223-DEFAULT-s.jpg,/prodimages/17223-DEFAULT-l.jpg,"5006, Trick Rope, Cowgirl Rope, Cowboy Rope, S...",<p>Trick Rope *12 E/C - YF</p>,<p>The perfect accessory for your cowboy or co...,,,13
1,41404,1320,chicken egg toy,Toysmith,10243062,gift_ideas kids staff_picks toys gift_idea...,/toysmith/chicken-and-the-egg-toy-41404,4.99,4.99,/prodimages/14699-DEFAULT-s.jpg,/prodimages/14699-DEFAULT-l.jpg,"41404, Toysmith, 1320, Chicken and the Egg, Ru...",<p>Chicken and the Egg*24 Ec</p>,<p>Which came first- the chicken or the egg? S...,,,20
2,24649,10348,trick hand buzzer toy,Toysmith,10140516,toys kids toys toys tomfoolery kids toys...,/toysmith/trick-hand-buzzer-toy-24649,3.99,3.99,/prodimages/19805-DEFAULT-s.jpg,/prodimages/19805-DEFAULT-l.jpg,"Jw-0015, Trick Hand Buzzer, hand buzzer toy, j...",<p>Trick Hand Buzzer-Jw-0015 *12/576</p>,<p>The fun never stops with classic gags like ...,,,10
3,82221,A7009,snap gum trick toy,S.S. ADAMS CO.,10408910,kids toys kids toys toys tomfoolery kids...,/s-s-adams-co/snapping-gum-trick-toy-82221,2.99,2.99,/prodimages/60417-DEFAULT-s.jpg,/prodimages/60417-DEFAULT-l.jpg,"SS ADAMS, TRICK JOKE SNAPPING GUM x12, 82221, ...",YF - Copy,<p>Are your friends always stealing your last ...,,,85
4,106490,B7073,spin dog trick toy,S.S. ADAMS CO.,10512041,kids toys kids toys toys tomfoolery kids...,/s-s-adams-co/spinning-dogs-trick-toy-106490,3.49,3.49,/prodimages/60415-DEFAULT-s.jpg,/prodimages/60415-DEFAULT-l.jpg,"SS ADAMS, TRICK SPINNING DOGS SCOTTIE x12, 10...",YF - Copy,<p>We&rsquo;ve all laughed over a few moments ...,.,,45
5,83572,A1727,magic fli butterfli trick toy,S.S. ADAMS CO.,10412240,kids toys kids toys toys tomfoolery kids...,/s-s-adams-co/magic-flying-butterfly-trick-toy...,3.99,3.99,/prodimages/60418-DEFAULT-s.jpg,/prodimages/60418-DEFAULT-l.jpg,"SS ADAMS, MAGIC FLYING BUTTERFLY x12, 83572, A...",YF - Copy,<p>&nbsp;Never underestimate the element of su...,,,86
6,98418,A7014,magic thumb tip trick toy,S.S. ADAMS CO.,10480089,kids toys kids toys toys tomfoolery kids...,/s-s-adams-co/magic-thumb-tip-trick-toy-98418,2.99,2.99,/prodimages/60420-DEFAULT-s.jpg,/prodimages/60420-DEFAULT-l.jpg,"SS ADAMS, TRICK MAGIC THUMB TIP x12, 98418, A7...",YF - Copy,<p>Use the Magic Thumb Tip to make a handkerch...,,,80
7,11980,AU_0003,unputtabal golf ball trick toy,Loftus,10097529,kids toys kids toys toys tomfoolery kids...,/loftus/unputtaball-golf-ball-trick-toy-11980,5.99,5.99,/prodimages/60331-DEFAULT-s.jpg,/prodimages/60331-DEFAULT-l.jpg,"LOFTUS, TRICK UNPUTTBLE PUTTING GOLF BALL x 48...",<p>MT</p>,<p>We bet you can&#39;t make that last hole wi...,,,41
8,24648,13_0025,rubber chicken toy,Loftus,10140515,toys kids toys toys tomfoolery kids toys...,/loftus/rubber-chicken-toy-24648,10.99,10.99,/prodimages/3906-DEFAULT-s.jpg,/prodimages/3906-DEFAULT-l.jpg,"24648, 13_0025, RUBBER CHICKEN 19 inch x 1 48,...",<p>Rubber Chicken-20 Inch *1/48 E/C</p>,<p>Who couldn&#39;t use a Rubber Chicken aroun...,,,43
9,72899,WUCTB,chatter teeth toy,Schylling,10372981,kids toys kids toys toys tomfoolery kids...,/schylling/chattering-teeth-toy-72899,4.99,4.99,/prodimages/51455-DEFAULT-s.jpg,/prodimages/51455-DEFAULT-l.jpg,"Toysmith, 6953, Yakity Yak Teeth, wind up teet...",<p>Yakity Yak Teeth -Key Wound *18 8/10</p>Edi...,<p>These Chattering Teeth are alive! Originall...,,,2


In [37]:
class RetrainCotentBasedRecommender:
    def __init__(self, data_path, vocab_path, maps_path, vectors_path):
        self.data_path = data_path
        self.vocab_path = vocab_path
        self.maps_path = maps_path
        self.vectors_path = vectors_path
        
        self.preprocessor = PreProcessor()
        self.vectorizer = TfidfVectorizer(tokenizer=self.identity_tokenizer,lowercase=False)
        
        self.documents = self.load_documents(docs_path)
        self.data = self.load_data(data_path)
        
        self.index_to_product_id=None
        self.product_id_to_index = None
        self.vectors = None
    
    def load_data(self, data_path):
        return pd.read_csv(data_path)
    
    def identity_tokenizer(self, text):
        return text
    
    def load_documents(self, old_docs_path):
        return pickle.load(open(old_docs_path))


    def update_database(self, new_data_path):
        new_data = pd.read_csv(new_data_path)
        new_data.Name = new_data.Name.apply(self.preprocessor.preprocess_text)
        new_data.Taxonomy_List = new_data.Taxonomy_List.apply(self.preprocessor.clean_taxonomy)

        new_documents = (new_data.Name + " " + new_data.Taxonomy_List).tolist()
        new_documents = [doc.split() for doc in new_documents]
        
        self.documents.extend(new_documents)
        self.data = pd.concat([self.data, self.new_data], ignore_index=True)
        self.data.to_csv(self.data_path, index=False)
        
    def update_vocabulary(self):
        
        with open(self.vocab_path, 'wb') as f:
            pickle.dump(self.documents, f)
            
        self.index_to_product_id = self.data.Product_ID.to_dict()
        self.product_id_to_index = {str(v): k for k, v in self.index_to_product_id.items()}
        maps = {
            "index_to_product_id":index_to_product_id,
            "product_id_to_index":product_id_to_index
        }
        with open('mappings.json', 'w') as fp:
            json.dump(maps, fp)
    
    def update_vectors(self):
        self.vectors = self.vectorizer.fit_transform(self.documents)
        with open(self.vectors_path, 'wb') as f:
            pickle.dump(self.vectors, f)
            

In [42]:
class KNNModel:
    def __init__(self, vectors_path, metadata_path, data_path):
        self.vectors = self.load_vectors(vectors_path)
        self.product_id_to_index = self.load_metadata(metadata_path)
        self.knn_model = init_knn_model()
    
    def load_vectors(self, vectors_path):
        vectors = pickle.load(open(vectors_path, 'rb'))
        return vectors
    
    def load_metadata(self, metadata_path):
        mappings = json.load(open(metadata_path))
        index_to_product_id = mappings["index_to_product_id"]
        product_id_to_index = mappings["product_id_to_index"]
        del mappings
        return product_id_to_index
    
    def init_knn_model(self):
        model = NearestNeighbors(metric='cosine')
        _ = model.fit(self.vectors)
        return model
        
    
    def recommend_products(product_id, n=12):
        index = self.product_id_to_index[str(product_id)]
        vector = self.vectors[index]
        _, indices =self. knn_model.kneighbors(vector, n+1)
        res=self.data.iloc[indices[0][1:]].reset_index(drop=True)
        return res

In [None]:
rec