In [6]:
# import pandas as pd
# import numpy as np
# import matplotlib.pyplot as plt
# import seaborn as sns

In [7]:
import warnings

import numpy as np
import pandas as pd

warnings.filterwarnings('ignore')

import re

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
from sklearn.pipeline import make_pipeline

# nltk.download('punkt')
# nltk.download('stopwords')
# nltk.download('wordnet')

# Single Porter stemmer instance shared by preprocess_text and preprocess_input.
stemmer = PorterStemmer()
# Presumably the product-table columns relevant to recommendations.
# NOTE(review): `features` is not referenced anywhere else in the visible code - confirm it is still needed.
features = ['Product_ID', 'Name', 'Brand', 'Taxonomy_List', 'Keywords']
def _english_stop_words():
    """Return NLTK's English stop words as a frozenset, loading the corpus only once."""
    cached = getattr(_english_stop_words, '_cached', None)
    if cached is None:
        cached = frozenset(stopwords.words('english'))
        _english_stop_words._cached = cached
    return cached


def preprocess_text(text):
    """Normalise a piece of product text into a space-separated string of stems.

    Steps: lowercase, strip HTML tags and ``http...`` URLs, replace every
    non-letter with a space, tokenise with NLTK, drop English stop words and
    Porter-stem what remains.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned text: stemmed tokens joined by single spaces.
    """
    text = text.lower()
    # Replace tags with a space rather than nothing, so that words separated
    # only by markup ("best<br>friends") are not glued into one token.
    text = re.sub('<.*?>', ' ', text)
    text = re.sub(r'http\S+', '', text)
    text = re.sub('[^a-zA-Z]', ' ', text)
    words = nltk.word_tokenize(text)
    # The stop-word set is cached: this function is applied row by row, and
    # re-reading the corpus on every call is pure overhead.
    stop_words = _english_stop_words()
    return ' '.join(stemmer.stem(word) for word in words if word not in stop_words)

def clean_taxonomy(raw_taxonomy):
    """Flatten a taxonomy string into lowercase labels separated by two spaces.

    Both ``|`` and ``>`` act as delimiters. Every resulting piece is kept, in
    order, including repeats and empty pieces.
    """
    # Treating '>' as just another '|' yields the same flat sequence as
    # splitting on '|' first and on '>' second.
    pieces = raw_taxonomy.replace('>', '|').split('|')
    return "  ".join(piece.lower() for piece in pieces)


def preprocess_data(df):
    """Return a cleaned copy of the product table.

    Missing values in every column are replaced with the string 'Missing',
    ``Name`` is run through ``preprocess_text`` and ``Taxonomy_List`` through
    ``clean_taxonomy``. ``Brand`` and ``Keywords`` are left as they are.

    The input frame is not modified: callers must use the returned frame.
    Mutating the argument in place made the raw table and the cleaned table
    the same object, so re-running the cell cleaned already-cleaned data.

    Args:
        df: Product DataFrame with at least ``Name`` and ``Taxonomy_List``.

    Returns:
        A new DataFrame holding the cleaned data.
    """
    # fillna without inplace=True hands back a new frame, so the assignments
    # below never touch the caller's data.
    cleaned = df.fillna('Missing')
    cleaned['Name'] = cleaned['Name'].apply(preprocess_text)
    cleaned['Taxonomy_List'] = cleaned['Taxonomy_List'].apply(clean_taxonomy)
    return cleaned
    

def generate_vectors(df):
    """Fit a TF-IDF vectorizer and a nearest-neighbour index on the taxonomy text.

    Only ``df.Taxonomy_List`` is vectorised; name, brand and keywords are not
    part of the documents.

    Returns:
        A ``(vectorizer, nn_model)`` tuple: the fitted ``TfidfVectorizer`` and
        the ``NearestNeighbors`` model fitted on its output.
    """
    tfidf = TfidfVectorizer(max_features=10000, stop_words='english')
    neighbours = NearestNeighbors(algorithm='brute', n_neighbors=10)
    # Vectorise the corpus and index the resulting matrix in one go.
    neighbours.fit(tfidf.fit_transform(df.Taxonomy_List.tolist()))
    return tfidf, neighbours

def preprocess_input(text):
    """Normalise a free-text query into a space-separated string of stems.

    Lowercases the text, turns every non-letter into a space, tokenises with
    NLTK, removes English stop words and Porter-stems the remaining tokens.
    """
    letters_only = re.sub('[^a-zA-Z]', ' ', text.lower())
    tokens = nltk.word_tokenize(letters_only)
    english_stops = set(stopwords.words('english'))
    kept = (token for token in tokens if token not in english_stops)
    return ' '.join(map(stemmer.stem, kept))
    
    
def get_product_vector(product_id, id_to_index, vectors):
    """Return the entry of ``vectors`` stored at the position mapped to ``product_id``.

    ``id_to_index`` maps a product id to its position in ``vectors``; an
    unknown id raises whatever error the mapping's lookup raises.
    """
    return vectors[id_to_index[product_id]]

def recommend_products(product_id, vectors, nn_model, main_df, n=10):
    """Return the ``n`` rows of ``main_df`` nearest to a product or a text query.

    The previous body called ``get_product_vector(product_id)`` with two of
    its three required arguments missing, so every call raised ``TypeError``.
    Both ways of using the function are now supported:

    * ``vectors`` is a fitted vectorizer (anything with ``transform``):
      ``product_id`` is treated as a free-text query and embedded with it.
    * ``vectors`` is an indexable collection of row vectors: ``product_id`` is
      looked up in ``main_df['Product_ID']`` and that row's vector is used.
      This assumes ``vectors`` is in the same row order as ``main_df`` -
      TODO confirm against the caller. If an id occurs more than once, its
      last position wins.

    Args:
        product_id: A product id, or a query string when a vectorizer is given.
        vectors: A fitted vectorizer or the matrix of product vectors.
        nn_model: Fitted model exposing ``kneighbors(X, n_neighbors=...)``.
        main_df: Product table whose row positions match the fitted vectors.
        n: Number of neighbours to return.

    Returns:
        A dict mapping each column name of ``main_df`` to a Series holding
        that column's values for the matched rows, nearest first.

    Raises:
        KeyError: If ``product_id`` is not present in ``main_df['Product_ID']``
            (matrix mode only).
    """
    if hasattr(vectors, 'transform'):
        query_vector = vectors.transform([product_id])
    else:
        id_to_index = {pid: pos for pos, pid in enumerate(main_df['Product_ID'])}
        query_vector = get_product_vector(product_id, id_to_index, vectors)
        # A row taken from a dense 2-D array is 1-D; kneighbors expects 2-D input.
        if getattr(query_vector, 'ndim', 2) == 1:
            query_vector = query_vector.reshape(1, -1)
    _, indices = nn_model.kneighbors(query_vector, n_neighbors=n)
    res = main_df.iloc[indices[0]].reset_index(drop=True)
    return dict(res)
    

In [9]:
#from utils import preprocess_data, generate_vectors
# import pandas as pd
import pickle


# Load the raw product table and clean it.
main_df = pd.read_csv("../data/mast_product_level_information.csv")
df = preprocess_data(main_df)
print("data loaded..")

# Fit the TF-IDF vectorizer and the nearest-neighbour model on the cleaned data.
vectorizer, model = generate_vectors(df)
print('vectors generated')

# Persist both fitted objects (model first, then vectorizer) for the search cell.
for artifact, target in (
    (model, 'feature_store/model.pkl'),
    (vectorizer, 'feature_store/vectorizer.pkl'),
):
    with open(target, 'wb') as f:
        pickle.dump(artifact, f)

print('models saved..')

data loaded..
vectors generated
models saved..


In [12]:
# Reload the fitted artifacts saved by the build cell. The files are opened
# with `with` so the handles are closed instead of being left dangling.
# NOTE: pickle.load can execute arbitrary code - only load files this notebook wrote itself.
with open('feature_store/model.pkl', 'rb') as model_file:
    nn_model = pickle.load(model_file)
with open('feature_store/vectorizer.pkl', 'rb') as vectorizer_file:
    vectorizer = pickle.load(vectorizer_file)

# Results are shown from the raw (uncleaned) product table.
main_df = pd.read_csv('../data/mast_product_level_information.csv')
query = input('Search: ')

res = recommend_products(query, vectorizer, nn_model, main_df, n=10)

pd.DataFrame(res)

Search: t shirt


Unnamed: 0,Product_ID,Style,Name,Brand,Lookup_List,Taxonomy_List,Product_URL,Price,Retail_Price,Thumbnail_URL,Image_URL,Keywords,Romantic_Copy_Short,Romantic_Copy_Long,Color,Size,Inventory_Count
0,226574,LS_BESTIE,Best Friends Long Sleeve T-Shirt,Simply Southern,11056396|11056397|11056398|11056399|11056400,T-SHIRTS & SWEATSHIRTS|WOMEN|T-SHIRTS & SWEATS...,/simply-southern/best-friends-long-sleeve-t-sh...,24.0,24.0,/prodimages/71885-DEFAULT-s.jpg,/prodimages/71885-DEFAULT-l.jpg,"SIMPLY SOUTHERN, LS_BESTIE, 226574, Best Frien...",,<p>A shirt for making memories with your best ...,LILAC,"L,M,S,2XL,XL",38
1,226572,LS_DIFFERENT,What Makes You Different Long Sleeve T-Shirt,Simply Southern,11056386|11056387|11056388|11056389|11056390,T-SHIRTS & SWEATSHIRTS|WOMEN|T-SHIRTS & SWEATS...,/simply-southern/what-makes-you-different-long...,24.0,24.0,/prodimages/71883-DEFAULT-s.jpg,/prodimages/71883-DEFAULT-l.jpg,"SIMPLY SOUTHERN, LS_DIFFERENT, 226572, What Ma...",,<p>Everyone is different! It&#39;s the qualiti...,CREME,"M,L,S,XL,2XL",77
2,187984,1001,Appalachian Trail Short Sleeve T-Shirt,The Landmark Project,10892557|10892558|10892559|10892560|10892561|1...,MEN|T-SHIRTS & SWEATSHIRTS|MEN>T-SHIRTS & SWEA...,/the-landmark-project/appalachian-trail-short-...,34.95,34.95,/prodimages/62119-DEFAULT-s.jpg,/prodimages/62119-DEFAULT-l.jpg,"THE LANDMARK PROJECT, APPALACHIAN TRAIL SS 36,...",YF - Copy,"<p>From Springer Mountain to Katahdin, the App...",DUNE,"S,XL,2XL,M",52
3,198206,DEBB_MGS_F22_LS,Mast General Store Double Exposure Black Bear ...,NO SHOW,10932264|10932265|10932266|10932267|10932268|1...,MAST COLLECTION|MEN|T-SHIRTS & SWEATSHIRTS|WOM...,/no-show/mast-general-store-double-exposure-bl...,32.99,32.99,/prodimages/57784-DEFAULT-s.jpg,/prodimages/57784-DEFAULT-l.jpg,"HIGH RANGE, DBL EXP BLK BR MGS MTNS CALLING 60...",Edit by YF,<p>A wearable memento from the Appalachian Mou...,"SEAFOAM,WATERMELON","XL,L,M,S,2XL,3XL",323
4,207104,KT_980_06SP10193,Knoxville Rainbow Sphere Short Sleeve T-Shirt,Bacon And Company,10972273|10972274|10972275|10972276|10972277|1...,T-SHIRTS & SWEATSHIRTS|T-SHIRTS & SWEATSHIRTS>...,/bacon-and-company/knoxville-rainbow-sphere-sh...,24.99,24.99,/prodimages/71225-DEFAULT-s.jpg,/prodimages/71225-DEFAULT-l.jpg,"BACON AND COMPANY, KNOXVILLE RAINBOW SPHERE TO...",LS,"<p>Righteously retro, this short-sleeve t-shir...",HTR_ROYAL,M,2
5,141645,TRI_STAR_LS,Tri Star Long Sleeve T-Shirt,Threds,10700299|10700300|10700301|10700302|10700303|1...,MEN|T-SHIRTS & SWEATSHIRTS|WOMEN|MEN>T-SHIRTS ...,/threds/tri-star-long-sleeve-t-shirt-141645,36.99,36.99,/prodimages/71548-DEFAULT-s.jpg,/prodimages/71548-DEFAULT-l.jpg,"Threds, TRI_STAR_LS, 141645, Tri Star Long Sle...",,"<p>Fall is here, so the&nbsp;Tri Star Long Sle...","RED,ORANGE,GREY","L,XL,M,2XL,S",162
6,226578,LS_FARMXMAS,Merry Christmas Truck Long Sleeve T-Shirt,Simply Southern,11056416|11056417|11056418|11056419|11056420,T-SHIRTS & SWEATSHIRTS|WOMEN|T-SHIRTS & SWEATS...,/simply-southern/merry-christmas-truck-long-sl...,24.0,24.0,/prodimages/71889-DEFAULT-s.jpg,/prodimages/71889-DEFAULT-l.jpg,"SIMPLY SOUTHERN, MERRY CHRISTMAS TRUCK LS, 226...",,<p>Don&#39;t let the chill hold you back from ...,BLACK,"L,S,M,XL,2XL",40
7,46947,003,World Traveler Short Sleeve T-Shirt,Rocket 9,10272994|10272995|10272996|10272997|10272998|1...,T-SHIRTS & SWEATSHIRTS|WOMEN|MEN>T-SHIRTS & SW...,/rocket-9/world-traveler-short-sleeve-t-shirt-...,19.99,19.99,/prodimages/20489-DEFAULT-s.jpg,/prodimages/20489-DEFAULT-l.jpg,"World Traveler on Ice Grey SS, Rocket 9 Tee, M...",<p>World Traveler on Ice Grey SS- LM</p>,"<p>Rooted in the past and reissued for today, ...",ICE_GREY,"M,S,2XL,L,XL",77
8,191783,STAR_ROA,Mast General Store Roanoke Star Short Sleeve T...,Mast General Store,10905683|10905684|10905685|10905686|10905687|1...,MEN|T-SHIRTS & SWEATSHIRTS|WOMEN|MEN>T-SHIRTS ...,/mast-general-store/mast-general-store-roanoke...,23.99,23.99,/prodimages/58873-DEFAULT-s.jpg,/prodimages/58873-DEFAULT-l.jpg,"MAST GENERAL STORE, MGS STAR ROANOKE VA NL3604...",YF,<p>Let your star shine brightly in our&nbsp;Ma...,NATURAL_MAROON,L,1
9,203097,T1241913,Women's Re Form Long Sleeve Flannel Shirt,Toad And Co,10954772|10954773|10954774|10954775|10954776|1...,WOMEN|WOMEN>SHIRTS & TOPS,/toad-and-co/women-s-re-form-long-sleeve-flann...,80.0,80.0,/prodimages/66925-DEFAULT-s.jpg,/prodimages/66925-DEFAULT-l.jpg,"TOAD AND CO, RE FORM FLANNEL SHIRT, 203097, T1...",YF - Copy,<p>A true aficionado and devotee of the great ...,"942_CANOE,257_BARLEY","L,S,M,XL,XS",22


In [14]:
# Peek at the first cleaned taxonomy strings; `df` is the frame returned by preprocess_data in the build cell.
df.Taxonomy_List.head()

0    staff picks  toys  kids  toys  staff picks  to...
1    toys  kids  toys  toys  pretend play & dress-u...
2    outdoor gear  health & safety  outdoor gear  h...
3    outdoor gear  outdoor gear  health & safety  o...
4    outdoor gear  furniture & sleep systems  outdo...
Name: Taxonomy_List, dtype: object