In [1]:
import pandas as pd
from scraper import Scraper
import os

# 1 Scraping 

In [2]:
csv_name = '20_tokopedia_products.csv'

if csv_name in os.listdir():
    data = pd.read_csv(csv_name)
else:
    a = Scraper()
    data = a.get_data()
    a.driver.quit()
    
    data = pd.DataFrame(data)
    data.to_csv(csv_name, index=False)

In [3]:
n = 4
print(data.Product[n], data.Description[n])

FLUX SET - Wedding Gift | Birthday Hampers | Kado | Hadiah FLUX SET :
- RTR : Classic Mini Air Diffuser / Humidifier 220ml Capacity (Include USB Cable for Air Humidifier & cotton sticks)
- RTR Aromatherapy Oil : 2 variants @10ml
- RTR : Bear Acrylic Night Lamp
- Corrugated white box 22x22cm
* FREE GREETING CARD


In [4]:
len(data)

20

In [5]:
data['id'] = data.index

# 2 Similarity

In [6]:
import contractions
import re
import nltk
nltk.download('stopwords')

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import normalize, MinMaxScaler, StandardScaler, RobustScaler, Normalizer

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
# Cleaning the texts
def txtprocess(txt):
    # Lower the texts
    txt = str(txt).lower()
    # Remove contractions
    txt = contractions.fix(txt)
    
    # Just pick the alphabet
    txt = re.sub(r'[^a-zA-Z]', ' ', txt)
    # Fix unnecessary space
    txt = re.sub(' +', ' ', txt)
    
    txt = ' '.join(txt.split())
    
    return txt

data.Product = data.Product.map(txtprocess)
data.Description = data.Description.map(txtprocess)

In [8]:
# Cleaning stopwords. Omit the negative maker words (I found this quite effective on this case)
stop_words = set(nltk.corpus.stopwords.words('indonesian'))
stop_words.add('gift')
stop_words.add('hampers')
stop_words.add('hadiah')
stop_words.add('kado')

def remove_stopwords(txt):
    no_stopword_txt = [w for w in txt.split() if not w in stop_words]
    return ' '.join(no_stopword_txt)

data.Product = data.Product.map(remove_stopwords)
data.Description = data.Description.map(remove_stopwords)

In [9]:
data['char_count'] = data['Description'].map(len)

In [10]:
data.Price = data.Price.str[2:].str.replace('.', '').astype(int)

In [77]:
# For product and description
tfidf_product = TfidfVectorizer()
product_vectors = tfidf_product.fit_transform(data.Product).toarray()

tfidf_description = TfidfVectorizer()
description_vectors = tfidf_description.fit_transform(data.Description).toarray()

product_similarity_matrix = cosine_similarity(product_vectors)
description_similarity_matrix = cosine_similarity(description_vectors)

# For prices and c
normalized_prices = data.Price.values.reshape(1, -1)
normalized_char_count = data.char_count.values.reshape(1, -1)

scaler = Normalizer() 
normalized_prices = scaler.fit_transform(normalized_prices)
normalized_char_count = scaler.fit_transform(normalized_char_count)

normalized_prices = cosine_similarity(normalized_prices)
normalized_char_count = cosine_similarity(normalized_char_count)

weight_product = 0.4
weight_description = 0.3
weight_prices = 0.2
weight_char_count = 0.1

combined_similarity_matrix = (weight_product * product_similarity_matrix) + (weight_description * description_similarity_matrix) + (weight_prices * normalized_prices) + (weight_char_count * normalized_char_count)

j = 0
for i in range(len(combined_similarity_matrix)):
    top_3_similar_indices = combined_similarity_matrix[i].argsort()[::-1][1:6]  # Exclude the current product
    top_3_similar_products = data.iloc[top_3_similar_indices]['Product']
    if j!=19 and j!=3 and j!=4 and j!=1 and j!=12 and j!=13:
        j+=1
        continue
    print(j)
    j+=1
    print(f"Top 3 similar products to '{data.iloc[i]['Product']}':")
    print(top_3_similar_products)
    print("\n")

1
Top 3 similar products to 'mug couple wedding':
16    souvenir mug couple pernikahan wedding
14               souvenir pernikahan wedding
0                souvenir pernikahan wedding
19               wedding pernikahan handuk x
4                  flux set wedding birthday
Name: Product, dtype: object


3
Top 3 similar products to 'bayi lahiran bayi newborn baby':
8                            newborn set baby bayi bayi
15              kelahiran bayi baby newborn set newborn
6     bayi baby girl boy lahiran perempuan laki laki...
7            baby boy set parcel bayi laki laki premium
2          zwitsal baby set box paket perlengkapan bayi
Name: Product, dtype: object


4
Top 3 similar products to 'flux set wedding birthday':
0                souvenir pernikahan wedding
14               souvenir pernikahan wedding
1                         mug couple wedding
19               wedding pernikahan handuk x
16    souvenir mug couple pernikahan wedding
Name: Product, dtype: object


12
Top 3

In [51]:
tfidf_vectorizer = TfidfVectorizer()
product_description_vectors = tfidf_vectorizer.fit_transform(data.Product).toarray()

feature_matrix = pd.DataFrame(product_description_vectors)
similarity_matrix = cosine_similarity(feature_matrix)

j = 0
for i in range(len(similarity_matrix)):
    top_3_similar_indices = similarity_matrix[i].argsort()[::-1][1:6]  # Exclude the current product
    top_3_similar_products = data.iloc[top_3_similar_indices]['Product']
    if j!=19 and j!=3 and j!=4 and j!=1:
        j+=1
        continue
    print(j)
    j+=1
    print(f"Top 3 similar products to '{data.iloc[i]['Product']}':")
    print(top_3_similar_products)
    print("\n")

1
Top 3 similar products to 'mug couple wedding':
16    souvenir mug couple pernikahan wedding
0                souvenir pernikahan wedding
14               souvenir pernikahan wedding
19               wedding pernikahan handuk x
4                  flux set wedding birthday
Name: Product, dtype: object


3
Top 3 similar products to 'bayi lahiran bayi newborn baby':
8                            newborn set baby bayi bayi
15              kelahiran bayi baby newborn set newborn
6     bayi baby girl boy lahiran perempuan laki laki...
2          zwitsal baby set box paket perlengkapan bayi
7            baby boy set parcel bayi laki laki premium
Name: Product, dtype: object


4
Top 3 similar products to 'flux set wedding birthday':
0                souvenir pernikahan wedding
14               souvenir pernikahan wedding
19               wedding pernikahan handuk x
1                         mug couple wedding
16    souvenir mug couple pernikahan wedding
Name: Product, dtype: object


19
Top 3

In [537]:
# Vectorize 'Product' and 'Description'
tfidf_vectorizer = TfidfVectorizer()
product_description_vectors = tfidf_vectorizer.fit_transform(data['Product']).toarray()

# Normalize 'Price' and reshape
normalized_prices = data['Price'].values.reshape(1, -1)
normalized_char_count = data['char_count'].values.reshape(1, -1)
scaler = Normalizer() 
normalized_prices = scaler.fit_transform(normalized_prices)
normalized_char_count = scaler.fit_transform(normalized_char_count)
# normalized_prices = normalize(normalized_prices)
# normalized_char_count = normalize(normalized_char_count)

# Combine feature vectors
feature_matrix = pd.DataFrame(product_description_vectors)
# feature_matrix['Normalized_Price'] = normalized_prices[0]
# feature_matrix['Normalized_Char_Count'] = normalized_char_count[0]

# Calculate cosine similarity
similarity_matrix = cosine_similarity(feature_matrix)

# Find the top 3 most similar products for each product
j = 0
for i in range(len(similarity_matrix)):
    top_3_similar_indices = similarity_matrix[i].argsort()[::-1][1:4]  # Exclude the current product
    top_3_similar_products = data.iloc[top_3_similar_indices]['Product']
    if j!=19 and j!=3 and j!=4 and j!=1:
        j+=1
        continue
    print(j)
    j+=1
    print(f"Top 3 similar products to '{data.iloc[i]['Product']}':")
    print(top_3_similar_products)
    print("\n")

1
Top 3 similar products to 'mug couple wedding':
16    souvenir mug couple pernikahan wedding
0                souvenir pernikahan wedding
14               souvenir pernikahan wedding
Name: Product, dtype: object


3
Top 3 similar products to 'bayi lahiran bayi newborn baby':
8                            newborn set baby bayi bayi
15              kelahiran bayi baby newborn set newborn
6     bayi baby girl boy lahiran perempuan laki laki...
Name: Product, dtype: object


4
Top 3 similar products to 'flux set wedding birthday':
0     souvenir pernikahan wedding
14    souvenir pernikahan wedding
19    wedding pernikahan handuk x
Name: Product, dtype: object


19
Top 3 similar products to 'wedding pernikahan handuk x':
14               souvenir pernikahan wedding
0                souvenir pernikahan wedding
16    souvenir mug couple pernikahan wedding
Name: Product, dtype: object




Preprocessing product and description

-strip | / -
-stop words strip
-imbuhan strip
-char count (theory of orang yang males bace kasi prod desc pendek)

1. Price Binning --> rp10k-20k, rp20k-50k, etc
2. Product + Description then cosine similarity
3. Gabung semua di satu kolom, atau product_souvenir, product_kado etc

Metric will be cosine similarity score