In [1]:
#library

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from nltk.corpus import stopwords
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression

In [2]:
# source dataset

item_1 = pd.read_csv("sample-data.csv", sep=',', error_bad_lines=False, encoding="latin-1")

In [3]:
item_1.head()

Unnamed: 0,id,description
0,1,Active classic boxers - There's a reason why o...
1,2,Active sport boxer briefs - Skinning up Glory ...
2,3,Active sport briefs - These superbreathable no...
3,4,"Alpine guide pants - Skin in, climb ice, switc..."
4,5,"Alpine wind jkt - On high ridges, steep ice an..."


In [4]:
item_1.tail()

Unnamed: 0,id,description
495,496,Cap 2 bottoms - Cut loose from the maddening c...
496,497,Cap 2 crew - This crew takes the edge off fick...
497,498,All-time shell - No need to use that morning T...
498,499,All-wear cargo shorts - All-Wear Cargo Shorts ...
499,500,All-wear shorts - Time to simplify? Our All-We...


In [5]:
#info dataset
item_1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 2 columns):
id             500 non-null int64
description    500 non-null object
dtypes: int64(1), object(1)
memory usage: 7.9+ KB


In [6]:
item_1.describe()

Unnamed: 0,id
count,500.0
mean,250.5
std,144.481833
min,1.0
25%,125.75
50%,250.5
75%,375.25
max,500.0


In [7]:
# type object
item_1.dtypes

id              int64
description    object
dtype: object

In [8]:
#hitung percent missing value

total = item_1.isnull().sum().sort_values(ascending=False)

percent = (item_1.isnull().sum()/item_1.isnull().count()) \
.sort_values(ascending=False)

missing_books = pd.concat([total, percent], axis=1, \
                          keys=['Total', 'Missing Percent'])

missing_books['Missing Percent'] = \
missing_books['Missing Percent'].apply(lambda x: x * 100)

missing_books.loc[missing_books['Missing Percent'] > 10][:10]

Unnamed: 0,Total,Missing Percent


In [9]:
print(f'Panjang Id yang dimiliki yaitu: {len(item_1)}')

Panjang Id yang dimiliki yaitu: 500


In [10]:
item_1.columns = item_1.columns.str.strip().str.lower().str.replace('-', '_')
item_1.head()

Unnamed: 0,id,description
0,1,Active classic boxers - There's a reason why o...
1,2,Active sport boxer briefs - Skinning up Glory ...
2,3,Active sport briefs - These superbreathable no...
3,4,"Alpine guide pants - Skin in, climb ice, switc..."
4,5,"Alpine wind jkt - On high ridges, steep ice an..."


In [12]:
#cek nilai yang null di kolom description

description_null = item_1.description.isnull().sum()
print(f'There are {description_null} empty description in our set')

There are 0 empty description in our set


In [13]:
top_item = item_1.description.value_counts()[:10]
print(f'The 10 description are:\n{top_item}')

The 10 description are:
Cap 1 graphic t-shirt - An elegant union of graceful design and technical ability, our silky Capilene 1 Graphic T-shirt provides dry, cool comfort and sun protection. Made from our lightest-weight Capilene performance baselayer fabric (3.7-oz all-recycled polyester), this breathable, fast-drying shirt features a flattering V-neck with a brushed, folded-over elastic collar that's soft against the skin, raglan sleeves that reduce chafe under packs, a tuckable length and a feminine silhouette. Sublimated graphics won't impede breathability. With Gladiodor natural odor control for the garment and 15-UPF sun protection. Recyclable through the Common Threads Recycling Program.<br><br><b>Details:</b><ul> <li>Silky soft Capilene 1 is the lightest of our technical baselayer fabrics</li> <li>"Fast-drying, highly breathable and sleek, it provides 15-UPF sun protection and can be worn either on its own or under layers in cooler temps"</li> <li>V-neck vents excess heat</li> 

In [15]:
# data preprocessing
prep_item = [BeautifulSoup(text).get_text() for text in item_1['description']]

In [16]:
import string
i=0
for word in prep_item:
    for punctuation in string.punctuation:
        word = word.replace(punctuation,"")
    for number in '1234567890':
        word = word.replace(number,"")
    prep_item[i] = word
    i = i+1

prep_item

['Active classic boxers  Theres a reason why our boxers are a cult favorite  they keep their cool especially in sticky situations The quickdrying lightweight underwear takes up minimal space in a travel pack An exposed brushed waistband offers nexttoskin softness fivepanel construction with a traditional boxer back for a classic fit and a functional fly Made of oz  recycled polyester with moisturewicking performance Inseam size M is   Recyclable through the Common Threads Recycling ProgramDetails Silky Capilene  fabric is ultralight breathable and quicktodry Exposed brushed elastic waistband for comfort panel construction with traditional boxer back Inseam size M is  Fabric oz  allrecycled polyester with Gladiodor natural odor control for the garment Recyclable through the Common Threads Recycling ProgramWeight  g  ozMade in Mexico',
 'Active sport boxer briefs  Skinning up Glory requires enough movement without your boxers deciding to poach their own route The formfitting Active Sport

In [17]:
# case folding
case_folding = [x.lower() for x in prep_item]
case_folding

['active classic boxers  theres a reason why our boxers are a cult favorite  they keep their cool especially in sticky situations the quickdrying lightweight underwear takes up minimal space in a travel pack an exposed brushed waistband offers nexttoskin softness fivepanel construction with a traditional boxer back for a classic fit and a functional fly made of oz  recycled polyester with moisturewicking performance inseam size m is   recyclable through the common threads recycling programdetails silky capilene  fabric is ultralight breathable and quicktodry exposed brushed elastic waistband for comfort panel construction with traditional boxer back inseam size m is  fabric oz  allrecycled polyester with gladiodor natural odor control for the garment recyclable through the common threads recycling programweight  g  ozmade in mexico',
 'active sport boxer briefs  skinning up glory requires enough movement without your boxers deciding to poach their own route the formfitting active sport

In [18]:
# Stemming

from nltk.stem import PorterStemmer
ps = PorterStemmer()
stemmed_item = [[ps.stem(word) for word in sentence.split(" ")] for sentence in case_folding]

In [19]:
stemmed_item

[['activ',
  'classic',
  'boxer',
  '',
  'there',
  'a',
  'reason',
  'whi',
  'our',
  'boxer',
  'are',
  'a',
  'cult',
  'favorit',
  '',
  'they',
  'keep',
  'their',
  'cool',
  'especi',
  'in',
  'sticki',
  'situat',
  'the',
  'quickdri',
  'lightweight',
  'underwear',
  'take',
  'up',
  'minim',
  'space',
  'in',
  'a',
  'travel',
  'pack',
  'an',
  'expos',
  'brush',
  'waistband',
  'offer',
  'nexttoskin',
  'soft',
  'fivepanel',
  'construct',
  'with',
  'a',
  'tradit',
  'boxer',
  'back',
  'for',
  'a',
  'classic',
  'fit',
  'and',
  'a',
  'function',
  'fli',
  'made',
  'of',
  'oz',
  '',
  'recycl',
  'polyest',
  'with',
  'moisturewick',
  'perform',
  'inseam',
  'size',
  'm',
  'is',
  '',
  '',
  'recycl',
  'through',
  'the',
  'common',
  'thread',
  'recycl',
  'programdetail',
  'silki',
  'capilen',
  '',
  'fabric',
  'is',
  'ultralight',
  'breathabl',
  'and',
  'quicktodri',
  'expos',
  'brush',
  'elast',
  'waistband',
  'for',


In [20]:
# lemmatizer

from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()
lemmatized_item = [[wordnet_lemmatizer.lemmatize(word) for word in sentence] for sentence in stemmed_item]

In [21]:
lemmatized_item

[['activ',
  'classic',
  'boxer',
  '',
  'there',
  'a',
  'reason',
  'whi',
  'our',
  'boxer',
  'are',
  'a',
  'cult',
  'favorit',
  '',
  'they',
  'keep',
  'their',
  'cool',
  'especi',
  'in',
  'sticki',
  'situat',
  'the',
  'quickdri',
  'lightweight',
  'underwear',
  'take',
  'up',
  'minim',
  'space',
  'in',
  'a',
  'travel',
  'pack',
  'an',
  'expo',
  'brush',
  'waistband',
  'offer',
  'nexttoskin',
  'soft',
  'fivepanel',
  'construct',
  'with',
  'a',
  'tradit',
  'boxer',
  'back',
  'for',
  'a',
  'classic',
  'fit',
  'and',
  'a',
  'function',
  'fli',
  'made',
  'of',
  'oz',
  '',
  'recycl',
  'polyest',
  'with',
  'moisturewick',
  'perform',
  'inseam',
  'size',
  'm',
  'is',
  '',
  '',
  'recycl',
  'through',
  'the',
  'common',
  'thread',
  'recycl',
  'programdetail',
  'silki',
  'capilen',
  '',
  'fabric',
  'is',
  'ultralight',
  'breathabl',
  'and',
  'quicktodri',
  'expo',
  'brush',
  'elast',
  'waistband',
  'for',
  

In [22]:
# tf-idf

tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 3), min_df=0, stop_words='english')
tfidf_matrix = tf.fit_transform(item_1['description'])

In [23]:
# cosine similarities

cosine_similarities = linear_kernel(tfidf_matrix, tfidf_matrix)

In [24]:
results = {}

In [25]:
for idx, row in item_1.iterrows():
    similar_indices = cosine_similarities[idx].argsort()[:-100:-1]
    similar_items = [(cosine_similarities[idx][i], item_1['id'][i]) for i in similar_indices]

    results[row['id']] = similar_items[1:]

print('done!')


done!


In [28]:
def item(id):
    return item_1.loc[item_1['id'] == id]['description'].tolist()[0].split(' - ')[0]

In [29]:
# Just reads the results out of the dictionary.

def recommend(item_id, num):
    print("Recommending " + str(num) + " products similar to " + item(item_id) + "...")
    print("-------")
    recs = results[item_id][:num]
    for rec in recs:
        print("Recommended: " + item(rec[1]) + " (score:" + str(rec[0]) + ")")

recommend(item_id=400, num=8)

Recommending 8 products similar to Retro grade shorts...
-------
Recommended: Retro grade pants (score:0.41366595688761404)
Recommended: Rock guide shorts (score:0.10210511736245738)
Recommended: Relax fit organic ctn jeans-reg (score:0.10198842229653748)
Recommended: Relax fit organic ctn jeans-shor (score:0.101765720521608)
Recommended: Relax fit organic ctn jeans-long (score:0.1014326948317337)
Recommended: Undercover cami (score:0.09780607202312828)
Recommended: Rock guide shorts (score:0.09646719841010444)
Recommended: Super cali shorts (score:0.0944687642782413)
