In [1]:
import pandas as pd
from scraper import Scraper
import os

In [2]:
import contractions
import re
import nltk
nltk.download('stopwords')

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import Normalizer

import random

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
import openai

# 1 Scraping 

For scraping, we will use selenium for browser automation and beautifulsoup to extract data.

The algorithm starts by scraping each searched item with keyword of gift hampers, there we can get product names, prices and links by using beautifulsoup. 

Then, each link will be opened by selenium, and the description data will be extracted by beautifulsoup.

In [4]:
csv_name = '20_tokopedia_products.csv'

if csv_name in os.listdir():
    data = pd.read_csv(csv_name)
else:
    a = Scraper()
    data = a.get_data()
    a.driver.quit()
    
    data = pd.DataFrame(data)
    data.to_csv(csv_name, index=False)
    
data['id'] = data.index
pure_data = data.copy()

Sample of Product and Description

In [5]:
n = 4
print(pure_data.Product[n], '\n', pure_data.Description[n])

FLUX SET - Wedding Gift | Birthday Hampers | Kado | Hadiah 
 FLUX SET :
- RTR : Classic Mini Air Diffuser / Humidifier 220ml Capacity (Include USB Cable for Air Humidifier & cotton sticks)
- RTR Aromatherapy Oil : 2 variants @10ml
- RTR : Bear Acrylic Night Lamp
- Corrugated white box 22x22cm
* FREE GREETING CARD


In [7]:
pure_data

Unnamed: 0,Product,Price,Description,id
0,souvenir kado hampers pernikahan wedding gift,Rp49.900,ready stock Kado pernikahan premium \nPENTING=...,0
1,hampers mug couple wedding gift,Rp85.000,"jika ingin d berikan kartu ucapan, mohon tuli...",1
2,ZWITSAL Baby Gift Set Box Paket Perlengkapan B...,Rp114.898,Barang Ready Sesuai Foto!\nZWITSAL Essential B...,2
3,HAMPERS BAYI | KADO LAHIRAN BAYI / NEWBORN GIF...,Rp83.000,Kado untuk bayi yang praktis dan estetik\nHamp...,3
4,FLUX SET - Wedding Gift | Birthday Hampers | K...,Rp159.000,FLUX SET :\n- RTR : Classic Mini Air Diffuser ...,4
5,Gift Box Ulang Tahun / Kado Cewek Ultah Murah ...,Rp50.000,untuk order jangan lupa kirim format order (un...,5
6,Hampers Bayi Baby Girl Boy Kado Lahiran Peremp...,Rp110.000,Kado Lahiran Bayi \nDijamin nyaman dan hangat ...,6
7,Hampers Baby Boy Gift Set Parcel Kado Bayi Lak...,Rp88.000,"Ready Stok, Siap Kirim. \nMohon Baca Deskripsi...",7
8,Newborn Hampers Gift Set | Baby Hampers | Hamp...,Rp365.000,NEW PREMIUM BABY GIFT SET\n• Premium Bunny Dol...,8
9,Gift Set Premium Reed Diffuser and Scented Can...,Rp75.000,Gift Set Premium Elegan Luxury by Little Wick ...,9


# 2 Similarity

For similarity, we will use cosine similarity. The process will be:

1. Decide the metric

The metric that we'll be used is cosine similarity value and precision@k. The similarity metric, cosine similarity value will be used to descending sort the value from highest to lowest. The best three will be taken.

After that, precisio@k will also be counted. Precision@k will be based on the synthetic category label. This method is good if there is not so many count unique category label in a dataset.

2. Clean the data

Remove pucntuations, enter, space excess

3. Remove stop words and common words

Remove stop words and common words like gift, hampers, kado

4. Feature engineering

Fix price column to inteeger, generate new feature char_count, by hypothesis of, customer that is like to read short description, will like another product with short description too

5. Fine tune weight

Fine tune to get the best precision@k

In [114]:
pure_data['category'] = ['wedding', 'wedding', 'baby', 'baby', 'wedding', 'general', 'baby', 'baby', 'baby', 'general', 'general', 'general', 'general', 'general', 'wedding', 'baby', 'wedding', 'general', 'general', 'wedding']
data = pure_data.copy()
data

Unnamed: 0,Product,Price,Description,id,category
0,souvenir kado hampers pernikahan wedding gift,Rp49.900,ready stock Kado pernikahan premium \nPENTING=...,0,wedding
1,hampers mug couple wedding gift,Rp85.000,"jika ingin d berikan kartu ucapan, mohon tuli...",1,wedding
2,ZWITSAL Baby Gift Set Box Paket Perlengkapan B...,Rp114.898,Barang Ready Sesuai Foto!\nZWITSAL Essential B...,2,baby
3,HAMPERS BAYI | KADO LAHIRAN BAYI / NEWBORN GIF...,Rp83.000,Kado untuk bayi yang praktis dan estetik\nHamp...,3,baby
4,FLUX SET - Wedding Gift | Birthday Hampers | K...,Rp159.000,FLUX SET :\n- RTR : Classic Mini Air Diffuser ...,4,wedding
5,Gift Box Ulang Tahun / Kado Cewek Ultah Murah ...,Rp50.000,untuk order jangan lupa kirim format order (un...,5,general
6,Hampers Bayi Baby Girl Boy Kado Lahiran Peremp...,Rp110.000,Kado Lahiran Bayi \nDijamin nyaman dan hangat ...,6,baby
7,Hampers Baby Boy Gift Set Parcel Kado Bayi Lak...,Rp88.000,"Ready Stok, Siap Kirim. \nMohon Baca Deskripsi...",7,baby
8,Newborn Hampers Gift Set | Baby Hampers | Hamp...,Rp365.000,NEW PREMIUM BABY GIFT SET\n• Premium Bunny Dol...,8,baby
9,Gift Set Premium Reed Diffuser and Scented Can...,Rp75.000,Gift Set Premium Elegan Luxury by Little Wick ...,9,general


In [115]:
# Cleaning the texts
def txtprocess(txt):
    # Lower the texts
    txt = str(txt).lower()
    # Remove contractions
    txt = contractions.fix(txt)
    # Just pick the alphabet
    txt = re.sub(r'[^a-zA-Z]', ' ', txt)
    # Fix unnecessary space
    txt = re.sub(' +', ' ', txt)
    
    txt = ' '.join(txt.split())
    return txt

data.Product = data.Product.map(txtprocess)
data.Description = data.Description.map(txtprocess)

In [116]:
# Cleaning stopwords
stop_words = set(nltk.corpus.stopwords.words('indonesian'))
stop_words.add('gift')
stop_words.add('hampers')
stop_words.add('hadiah')
stop_words.add('kado')
stop_words.add('x')

def remove_stopwords(txt):
    no_stopword_txt = [w for w in txt.split() if not w in stop_words]
    return ' '.join(no_stopword_txt)

data.Product = data.Product.map(remove_stopwords)
data.Description = data.Description.map(remove_stopwords)

In [117]:
# Feature Engineering
data['char_count'] = data['Description'].map(len)
data.Price = data.Price.str[2:].str.replace('.', '').astype(int)

In [118]:
def get_similarity_matrix(
    weight_product = 0.4,
    weight_description = 0.3,
    weight_prices = 0.2,
    weight_char_count = 0.1
):
    
    # For product and description
    tfidf_product = TfidfVectorizer()
    product_vectors = tfidf_product.fit_transform(data.Product).toarray()

    tfidf_description = TfidfVectorizer()
    description_vectors = tfidf_description.fit_transform(data.Description).toarray()

    product_similarity_matrix = cosine_similarity(product_vectors)
    description_similarity_matrix = cosine_similarity(description_vectors)

    # For prices and char count
    normalized_prices = data.Price.values.reshape(1, -1)
    normalized_char_count = data.char_count.values.reshape(1, -1)

    scaler = Normalizer() 
    normalized_prices = scaler.fit_transform(normalized_prices)
    normalized_char_count = scaler.fit_transform(normalized_char_count)

    normalized_prices = cosine_similarity(normalized_prices)
    normalized_char_count = cosine_similarity(normalized_char_count)

    # Combined Similarity with weights
    combined_similarity_matrix = (weight_product * product_similarity_matrix) + (weight_description * description_similarity_matrix) + (weight_prices * normalized_prices) + (weight_char_count * normalized_char_count)
    
    return combined_similarity_matrix

combined_similarity_matrix = get_similarity_matrix(
    weight_product = 0.4,
    weight_description = 0.3,
    weight_prices = 0.2,
    weight_char_count = 0.1
)

In [119]:
def result(prod_id):
    i = combined_similarity_matrix[prod_id]
    a = i.argsort()[::-1][1:4]
    b = sorted(i)[::-1][1:4]

    recs = []
    for j,k in zip(a,b):
        rec = {}
        rec['id'] = j
        rec['sim_score'] = k
        rec['relevant'] = pure_data.category[j] == pure_data.category[prod_id]
        recs.append(rec)

    return recs

data['result'] = data.id.map(result)

In [120]:
data.result[17]

[{'id': 5, 'sim_score': 0.4275233654445941, 'relevant': True},
 {'id': 18, 'sim_score': 0.3718727421449775, 'relevant': True},
 {'id': 2, 'sim_score': 0.3488254938413472, 'relevant': False}]

### Function that absorb product id and output 3 most similar product id

In [121]:
def three_similar_product(prod_id):
    prods = []
    for i in data.result[prod_id]:
        prods.append(i['id'])
        
    return prods

three_similar_product(0)

[14, 16, 19]

### Show precision

In [123]:
def prec_k():
    prods = []
    for i in range(len(data)):
        for j in data.result[i]:
            prods.append(j['relevant'])
    
    return round(sum(prods)*100/(len(data)*len(data.result[i])), 2)

prec_k()

73.33

### This optimization algorithm is the best when we have a lot of data

### Another good also when we take the similarity rank (from 1,2,3) taken also into account, thus penalize the bad recommendation that occur in first or second rank

In [182]:
max_prec = 0
i = 0
while i<10000:
    def generate_random_weights():
        weight_product = random.uniform(0, 1)
        weight_description = random.uniform(0, 1 - weight_product)
        weight_prices = random.uniform(0, 1 - weight_product - weight_description)
        weight_char_count = 1 - weight_product - weight_description - weight_prices

        weights = {
            'weight_product': weight_product,
            'weight_description': weight_description,
            'weight_prices': weight_prices,
            'weight_char_count': weight_char_count
        }

        return weights

    ran = generate_random_weights()

    combined_similarity_matrix = get_similarity_matrix(
        ran['weight_product'],
        ran['weight_description'],
        ran['weight_prices'],
        ran['weight_char_count'],
    )

    data['result'] = data.id.map(result)

    if prec_k()>max_prec:
        max_prec = prec_k()
        best_weights = ran
    
    i+=1

In [183]:
print('Best precision and weights \n')
max_prec, best_weights

Best precision and weights 



(73.33,
 {'weight_product': 0.7921794639543291,
  'weight_description': 0.16461875771221068,
  'weight_prices': 0.004628135395285146,
  'weight_char_count': 0.03857364293817503})

### Expand all results

In [113]:
def show_all_products():
    for i in range(len(data)):
        print(f"Top 3 similar products to product {i} '{pure_data.Product[i]}':")
        for j in three_similar_product(i):
            print(j, pure_data.Product[j])
        print('\n')

show_all_products()

Top 3 similar products to product 0 'souvenir kado hampers pernikahan wedding gift':
14 souvenir hampers kado pernikahan wedding gift
16 souvenir mug couple hampers kado pernikahan wedding gift
19 Hampers Wedding / Kado Hadiah Gift Pernikahan / Handuk Besar 70x140


Top 3 similar products to product 1 'hampers mug couple wedding gift':
16 souvenir mug couple hampers kado pernikahan wedding gift
14 souvenir hampers kado pernikahan wedding gift
0 souvenir kado hampers pernikahan wedding gift


Top 3 similar products to product 2 'ZWITSAL Baby Gift Set Box Paket Perlengkapan Bayi Hampers Kado Hadiah':
8 Newborn Hampers Gift Set | Baby Hampers | Hampers Bayi | Kado Bayi
3 HAMPERS BAYI | KADO LAHIRAN BAYI / NEWBORN GIFT / HAMPERS BABY
15 Hadiah Kelahiran Bayi | Baby Newborn Gift Set |Hampers Newborn


Top 3 similar products to product 3 'HAMPERS BAYI | KADO LAHIRAN BAYI / NEWBORN GIFT / HAMPERS BABY':
8 Newborn Hampers Gift Set | Baby Hampers | Hampers Bayi | Kado Bayi
15 Hadiah Kelahiran B

# 3 Product Description Enhancement

We will do prompt engineering. We start by watching our result subjectively first. Then we fine tune our prompt. Later we can apply a metric on this to decide which prompt are the best.

In [44]:
key = "YOUR_API_KEY"

In [132]:
# prompt engineering trial and error
# message = 'perbaiki deskripsi produk berikut sehingga menarik bagi pembeli tanpa menambah kalimat \n\n'
# message = 'perbaiki deskripsi produk berikut sehingga menarik bagi pembeli dan mempunyai informasi yang padat \n\n'
message = 'perbaiki deskripsi produk berikut sehingga menarik bagi pembeli namun mempunyai informasi yang padat \n\n'
message = message+pure_data.Description[10]

### Sample of message

In [133]:
print(message)

perbaiki deskripsi produk berikut sehingga menarik bagi pembeli namun mempunyai informasi yang padat 

Mangkuk Jepang Set dengan sumpit bambu / Japanese Bowl Set 
📢📢Perlu Diperhatikan!! 
- Semua Varian sudah termasuk gift box dan sumpit
- untuk set 2 mangkok, motif random diambil dr varian set 4  jenis produk yg dipilih ( tidak bisa pilih motif )
- untuk sakura mix set 2, warna random tidak bisa pilih
➡️ Gratis Kartu Ucapan dalam Bentuk Sticker print, mohon ucapan ditulis langsung di note / catatan produk.
jika tidak ditulis, maka akan dikirim tanpa sticker ucapan 
▶️▶️untuk mangkok set 6, link ada di produk sebelah
Semua mangkuk keramik adalah handmade dan hand painted sehingga titik kecil, titik hitam dll adalah hal yg normal terjadi ketika proses produksi.  Hal ini tidak akan mempengaruhi kualitas mangkok
mangkuk juga sudah didesign anti slip, bagian paling bawah sengaja tidak diglaze
Glaze Artistik Ceramic terbuat dari tanah liat langsung yang harus di tempa dan dipanaskan disuhu 1

In [134]:
openai.api_key = key

response = openai.ChatCompletion.create(
  model="gpt-3.5-turbo",
  messages=[{'role':'user', 'content':f'{message}'}],
  temperature=0,
  max_tokens=1024
)

### Sample of result

In [135]:
print(response['choices'][0]['message']['content'])

Mangkuk Jepang Set dengan Sumpit Bambu / Japanese Bowl Set

🎁🎁 Perhatian! Setiap varian sudah termasuk kotak hadiah dan sumpit.
- Untuk set 2 mangkuk, motif akan dipilih secara acak dari varian set 4 jenis produk yang dipilih (tidak dapat memilih motif).
- Untuk Sakura Mix Set 2, warna akan dipilih secara acak dan tidak dapat dipilih.

➡️ Dapatkan Kartu Ucapan Gratis dalam Bentuk Stiker cetak, harap tuliskan ucapan langsung di catatan produk.
Jika tidak ditulis, maka akan dikirim tanpa stiker ucapan.

▶️▶️ Untuk mangkuk set 6, silakan lihat link di produk sebelah.

Semua mangkuk keramik ini dibuat secara handmade dan dicat tangan sehingga adanya titik kecil, titik hitam, dll adalah hal yang normal dalam proses produksi. Hal ini tidak akan mempengaruhi kualitas mangkuk.
Mangkuk ini juga didesain anti slip, dengan bagian bawah yang sengaja tidak dilapisi glaze.

Keramik Glaze Artistik terbuat dari tanah liat langsung yang harus ditempa dan dipanaskan pada suhu 1300 derajat, sehingga mang

# 4 Metrics for prompt engineering

If i have more time, i will formulize metrics and methodology for improving the prompt engineering. I will try to use metrics of information density and legibility of a text.