In [18]:
import pandas as pd
import numpy as np
import random
from tqdm import tqdm
import nltk
from nltk.tokenize import RegexpTokenizer
import re
from nltk.corpus import stopwords


import warnings;
warnings.filterwarnings('ignore')
import pickle

In [13]:
# Load the product table from disk and preview rows 1 through 9
df = pd.read_csv(filepath_or_buffer=r'name_des.csv')
df.iloc[1:10]

Unnamed: 0,name,description
1,B00260G4Z2,"For five generations, the Henry family has bee..."
2,B002QTVMOG,"Finally, Sateen Solid superior weave, woven in..."
3,B0002DVBGM,Odyssey's CCD300E classic carpeted CD case hol...
4,B006GJL0DK,"Charmeuse satin pillowcase set, a modern alter..."
5,B005U65MFM,The Foamnasium Step is a wonderful activity an...
6,B00A6H272G,The round framed stainless steel mesh wires on...
7,B001GCU4U2,"Inspired by classic cottage styling, Hillsdale..."
8,B000059TT6,
9,B00659N336,


In [14]:
# Count the missing values in each column
df.isna().sum()

name             0
description    812
dtype: int64

In [15]:
# Drop every row that has a missing value in any column (df is modified in place)
df.dropna(axis=0, how='any', inplace=True)

In [16]:
# Renumber the rows 0..n-1 after the drop. Later cells use df.index values as row
# positions (rows of cosine_similarities, .iloc lookups), so the index must be gap-free.
df = df.reset_index(drop=True)
df.shape

(9188, 2)

In [19]:
stopwords.words("english")

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [None]:
df

# Functions to Clean the Descriptions 

In [20]:
# Function for removing NonAscii characters
def _removeNonAscii(s):
    """Return `s` with every character whose code point is 128 or above dropped."""
    ascii_chars = [ch for ch in s if ord(ch) <= 127]
    return "".join(ascii_chars)

# Function for converting into lower case
def make_lower_case(text):
    """Return a lower-cased copy of `text`."""
    lowered = text.lower()
    return lowered

# Function for removing stop words
def remove_stop_words(text, stops=None):
    """Drop stop words from a whitespace-separated string.

    Parameters
    ----------
    text : str
        Input text. It is split on whitespace and each token is compared
        case-sensitively, so lower-case the text first to match NLTK's
        lower-case list.
    stops : collection of str, optional
        Words to remove. Defaults to NLTK's English stop-word list.

    Returns
    -------
    str
        The surviving tokens joined by single spaces.
    """
    if stops is None:
        # The set used to be rebuilt on every call, i.e. once per row under
        # Series.apply. Build it once and keep it on the function object.
        stops = getattr(remove_stop_words, "_english_stops", None)
        if stops is None:
            stops = frozenset(stopwords.words("english"))
            remove_stop_words._english_stops = stops
    return " ".join(word for word in text.split() if word not in stops)

# Function for removing punctuation
def remove_punctuation(text):
    """Keep only the runs of word characters in `text`, joined by single spaces."""
    word_tokens = RegexpTokenizer(r'\w+').tokenize(text)
    return " ".join(word_tokens)

# Function for removing the html tags
def remove_html(text):
    """Replace every `<...>` span (shortest match, within one line) by a space.

    A space rather than the empty string keeps the words on either side of a
    tag apart ("a<br>b" -> "a b"). The cleaning steps that follow split and
    re-join on whitespace, so the extra spaces do not survive.
    """
    html_pattern = re.compile('<.*?>')
    return html_pattern.sub(' ', text)

# Applying all the functions in description and storing as a cleaned_desc.
# remove_html has to run first: remove_punctuation strips '<' and '>', so when it
# ran earlier the tag pattern could never match and tag names (e.g. "br", "li")
# stayed in the text as ordinary words.
df['cleaned_desc'] = df['description'].apply(remove_html)
df['cleaned_desc'] = df.cleaned_desc.apply(func=_removeNonAscii)
df['cleaned_desc'] = df.cleaned_desc.apply(func=make_lower_case)
df['cleaned_desc'] = df.cleaned_desc.apply(func=remove_stop_words)
df['cleaned_desc'] = df.cleaned_desc.apply(func=remove_punctuation)

In [None]:
df

# TF-IDF 

TF-IDF is used as the weighting factor for the features. A term's weight increases with its frequency in a document, but this is offset by the number of documents in the corpus that contain the term. This emphasizes keywords that are distinctive for a document. 

$$ \text{TF-IDF}(t, d, D) = tf(t, d) \cdot idf(t, D) $$ 

$$ tf(t, d) = \text{frequency of term } t \text{ in document } d $$

$$ idf(t, D) = \log\left(\frac{\lvert D \rvert}{\lvert \lbrace d \in D : t \in d \rbrace \rvert}\right) $$

In [21]:
## Create a TF-IDF matrix of unigrams and bigrams for each product. The 'stop_words' param
## tells the TF-IDF module to ignore common English words like 'the', etc.
from sklearn.feature_extraction.text import TfidfVectorizer

# Lookup table from product ID (the 'name' column) to that product's df.index value
indices = pd.Series(df.index, index = df['name'])
    
# Vectorise the cleaned product descriptions using unigrams and bigrams (ngram_range=(1, 2))
tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df = 1, stop_words='english')
tfidf_matrix = tf.fit_transform(df['cleaned_desc'])

In [22]:
tfidf_matrix

<9188x270120 sparse matrix of type '<class 'numpy.float64'>'
	with 715062 stored elements in Compressed Sparse Row format>

In [23]:
from sklearn.metrics.pairwise import linear_kernel

# Pairwise dot products between all TF-IDF rows. TfidfVectorizer L2-normalises each
# row by default, so these dot products are cosine similarities (note the 1.0 diagonal).
# The result is a dense n_products x n_products array: 9188 x 9188 here, roughly
# 675 MB if stored as float64.
cosine_similarities = linear_kernel(tfidf_matrix, tfidf_matrix)

In [24]:
cosine_similarities.shape

(9188, 9188)

In [25]:
cosine_similarities

array([[1.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 1.        , 0.01173931, ..., 0.        , 0.00302152,
        0.        ],
       [0.        , 0.01173931, 1.        , ..., 0.0081649 , 0.00954115,
        0.        ],
       ...,
       [0.        , 0.        , 0.0081649 , ..., 1.        , 0.        ,
        0.        ],
       [0.        , 0.00302152, 0.00954115, ..., 0.        , 1.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        1.        ]])

In [26]:
indices

name
B001SATMSM       0
B00260G4Z2       1
B002QTVMOG       2
B0002DVBGM       3
B006GJL0DK       4
              ... 
B0009H63NM    9183
B00020M7K4    9184
B00GMCDWHI    9185
B004WI7D92    9186
B00CIGPC2K    9187
Length: 9188, dtype: int64

# Function to get top 5 similar products. 

In [58]:
def top5_similar_products(product_id, cosine_similarities, df, n=5):
    """Return the IDs of the products most similar to `product_id`.

    Parameters
    ----------
    product_id : str
        Value from df['name'] identifying the query product.
    cosine_similarities : array-like, shape (len(df), len(df))
        Pairwise similarity scores whose rows and columns follow the row
        order of `df`.
    df : pandas.DataFrame
        Product table with a 'name' column.
    n : int, default 5
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.Series
        Up to `n` values of df['name'], most similar first. The Series index
        holds the corresponding index labels of `df`.

    Raises
    ------
    KeyError
        If `product_id` does not occur in df['name'].
    """
    # Locate the query row from `df` itself instead of a notebook-global lookup,
    # so the row position always agrees with the frame that was passed in.
    matches = np.flatnonzero(df['name'].to_numpy() == product_id)
    if matches.size == 0:
        raise KeyError(product_id)
    idx = int(matches[0])  # duplicate IDs: use the first occurrence

    scores = np.asarray(cosine_similarities[idx], dtype=float).ravel()
    # Stable descending sort: tied scores stay in ascending row order, which is
    # the order sorted(..., reverse=True) produced.
    ranked = np.argsort(-scores, kind='stable')
    # Remove the query row explicitly. Slicing off position 0 is only correct when
    # the query is the unique top score; a product with an identical description at
    # a smaller row position sorts ahead of it, and the query then recommends itself.
    ranked = ranked[ranked != idx]
    product_indices = ranked[:max(int(n), 0)]

    # Top-n product recommendation
    rec = df['name'].iloc[product_indices]
    return rec
    

# Examples 

In [59]:
print(top5_similar_products('B001SATMSM', cosine_similarities, df))

3945    B004X76DAC
4937    B00BTM51MK
5290    B0039PYSLU
4738    B000KRC5XI
5831    B0013VAEBA
Name: name, dtype: object


In [60]:
print(top5_similar_products('B006GJL0DK', cosine_similarities, df))

5072    B00A7Z9UEK
3499    B00GW615KU
6983    B00BAWDS38
7179    B005TPLP5K
8704    B002O5FXII
Name: name, dtype: object


In [61]:
obj = top5_similar_products('B004WI7D92', cosine_similarities, df)

In [62]:
df.head()

Unnamed: 0,name,description,cleaned_desc
0,B001SATMSM,No Gambling Blue Porcelain Metal Tin Sign,gambling blue porcelain metal tin sign
1,B00260G4Z2,"For five generations, the Henry family has bee...",five generations henry family making tableware...
2,B002QTVMOG,"Finally, Sateen Solid superior weave, woven in...",finally sateen solid superior weave woven best...
3,B0002DVBGM,Odyssey's CCD300E classic carpeted CD case hol...,odyssey s ccd300e classic carpeted cd case hol...
4,B006GJL0DK,"Charmeuse satin pillowcase set, a modern alter...",charmeuse satin pillowcase set modern alternat...


In [63]:
obj

7361    B007KX4DLM
1466    B00DQG9GC8
4663    B004SH7HO8
2903    B004JO49FK
7678    B0063D2M5Y
Name: name, dtype: object

In [67]:
# get recommendations for each product.
# Collect the results first and build the frame once: growing a DataFrame with
# .loc[i] = ... re-allocates on every row, and that scalar assignment also wrote
# the product ID into both columns before 'rec' was overwritten.
product_ids = df['name'].to_numpy()
rec_values = np.empty(len(product_ids), dtype=object)
for i, product_id in enumerate(product_ids):
    # Each cell holds the Series returned by top5_similar_products.
    rec_values[i] = top5_similar_products(product_id, cosine_similarities, df)

out_data = pd.DataFrame({'name': product_ids, 'rec': rec_values}, columns=['name', 'rec'])

out_data.head()

Unnamed: 0,name,rec
0,B001SATMSM,3945 B004X76DAC 4937 B00BTM51MK 5290 ...
1,B00260G4Z2,6311 B001G8Y1BO 4105 B004MYFBXG 2607 ...
2,B002QTVMOG,4997 B003U0CPPK 6710 B003U08VLM 9134 ...
3,B0002DVBGM,7626 B00BIT0T9E 7455 B0012DTA7S 8720 ...
4,B006GJL0DK,5072 B00A7Z9UEK 3499 B00GW615KU 6983 ...


In [69]:
# Serialise the whole recommendation table to one JSON string (keyed by column, then row label).
# NOTE(review): the name `json` would shadow the standard-library module if that is ever
# imported in this notebook, and printing the full string floods the cell output.
json = out_data.to_json()
print(json)

{"name":{"0":"B001SATMSM","1":"B00260G4Z2","2":"B002QTVMOG","3":"B0002DVBGM","4":"B006GJL0DK","5":"B005U65MFM","6":"B00A6H272G","7":"B001GCU4U2","8":"B002XR5H1K","9":"B0072WOIFM","10":"B00004UE88","11":"B004RZDT6G","12":"B002OPRYKI","13":"B00004SY8Z","14":"B00DF20TDI","15":"B00A6EWQK2","16":"B00K1OKB54","17":"B002HRFL8U","18":"B000RHCZ5E","19":"B0073B8KQU","20":"B0049J346K","21":"B00422BB8W","22":"B008C80EUI","23":"B00478VQZY","24":"B00BMZOKPI","25":"B002W4X3X8","26":"B009R90KSM","27":"B004XJ528O","28":"B005AI89V4","29":"B004B8UMN2","30":"B00DQAKTXO","31":"B008I3TJNA","32":"B000H8W92C","33":"B002MUBTAQ","34":"B006K4HOUY","35":"B002L874UI","36":"B007X72UQU","37":"B006KIS5TY","38":"B009L4CX7E","39":"B003NG11DS","40":"B009D4T6J0","41":"B000GX32G0","42":"B000W7GJYW","43":"B000U1YZ78","44":"B009MTAH12","45":"B00AHY2HWI","46":"B0043WA8M6","47":"B004WP5G6M","48":"B005D4VBK6","49":"B002MK6QKO","50":"B004VSHZIC","51":"B000I0YWQK","52":"B00AOY3QEY","53":"B0018I8PL4","54":"B004TQP08S","55":"B0078

Save the results locally. We can iterate over all of our products locally and collect the similar products for each one. Then we can just upload this database to the website. 

Look into group leaders for next recommendation method. 

Speak with Xiao Liu about how we can combine our algorithms. 

Li will create an endpoint; see whether I can POST to it. Use the Strapi API. 