In [116]:
import pandas as pd
from pandasql import sqldf
import pyodbc
import yaml

### Functions

In [117]:
def levenshtein_distance(s, t):
    m, n = len(s), len(t)
    
    distance = [[0 for j in range(n+1)] for i in range(m+1)]
    
    for i in range(1, m+1):
        distance[i][0] = i
    for j in range(1, n+1):
        distance[0][j] = j
    
    for j in range(1, n+1):
        for i in range(1, m+1):
            if s[i-1] == t[j-1]:
                cost = 0
            else:
                cost = 1
            distance[i][j] = min(distance[i-1][j] + 1, distance[i][j-1] + 1, distance[i-1][j-1] + cost)
    
    return distance[m][n]


In [118]:
def compare_products(from_db, from_csv, brand):
    name1, name2 = from_db[2], from_csv[1]
    ld = levenshtein_distance("".join(name1.lower().split(brand)),"".join(name2.lower().split(brand)))
    return from_db[0], ld if ld<max(len(name1),len(name2))//3 else None

### Db init

In [119]:
cn = pyodbc.connect('Driver={ODBC Driver 17 for SQL Server};Server=(localdb)\mssqllocaldb;Database=AffordableSkin;Trusted_Connection=yes')


In [120]:
cr = cn.cursor()

### Get configuration settings

In [121]:
with open(r'config.yaml') as file:
    conf = yaml.load(file, Loader=yaml.FullLoader)
    print(conf)

{'include_files': ['coslovemetics_all_products.csv', 'yeppeuda_all_products.csv']}


In [122]:
include_files = conf['include_files']

### Get existing products

In [123]:
all_product_prices_with_ids = []

In [126]:
for file in include_files:
    df = pd.read_csv(file)
    unique_brands = df.brand.unique()
    
    seller = file.split("_")[0]
    sql = f"select max(Id) from Product"
    cr.execute(sql)
    max_id = cr.fetchone()[0] 
    for brand in unique_brands:
        
        sql = "select * from Product where BrandName = ?"
        values = (brand)
        cr.execute(sql,values)
        brand_products = cr.fetchall()

        q = f"""select * from df where brand = '{brand}'"""
        brand_df = sqldf(q, globals())
        for index,row in brand_df.iterrows():
            brand_row = row[4]
            product_id = None
            min_score = 1000
            for product in brand_products:  
                id, score = compare_products(product,row.tolist(),brand)
                if score != None and score<min_score:
                    product_id = id
                    min_score = score
                    
            product_values = [row[0], row[1], 0, row[4]]
            product_prices_values = [row[3], seller, row[2],row[5],row[1]]
            date = row[3]
            if product_id is None:
                
                sql = "insert into Product values (?, ?, ?, ?)"
                
                values = tuple(product_values)
                cr.execute(sql,values)
                cn.commit()
                
                all_product_prices_with_ids.append([max_id+1] + product_prices_values)
                max_id+=1
            else:
                sql = "select * from ProductPrice where Date = ? and ProductId = ?"
                
                values = (date, product_id)
                cr.execute(sql,values)
                added_products = cr.fetchone()
                if added_products is None:
                    all_product_prices_with_ids.append([product_id] + product_prices_values)
                else:
                    sql = "insert into Product values (?, ?, ?, ?)"
                
                    values = tuple(product_values)
                    cr.execute(sql,values)
                    cn.commit()
                    
                    all_product_prices_with_ids.append([max_id+1] + product_prices_values)
                    max_id+=1

In [127]:
all_product_prices_with_ids =list(set(map(tuple,all_product_prices_with_ids)))

In [128]:
for row in all_product_prices_with_ids:
    sql = "insert into ProductPrice values (?, ?, ?, ?, ?,?)"
    values = tuple(row)
    cr.execute(sql,values)
    cn.commit()

In [129]:
cn.commit()
cr.close()
cn.close()

In [130]:
new_df = pd.DataFrame(all_product_prices_with_ids)

In [131]:
new_df.columns=['productId', 'date','seller','price','link','name']

In [132]:
new_df

Unnamed: 0,productId,date,seller,price,link,name
0,15190,2023-04-27,coslovemetics,1136,https://coslovemetics.mk/product/cosrx-bha-bla...,COSRX BHA Blackhead Power Liquid 100ml
1,15312,2023-04-27,coslovemetics,560,https://coslovemetics.mk/product/purito-all-ca...,PURITO All Care Recovery Cica-Aid
2,15627,2023-04-27,coslovemetics,1206,https://coslovemetics.mk/product/iunik-beta-gl...,iUNIK Beta Glucan Power Moisture Serum 50ml
3,15161,2023-04-27,coslovemetics,1832,https://coslovemetics.mk/product/pyunkang-yul-...,Pyunkang Yul Skin Set Low PH Pore Deep Cleansi...
4,15603,2023-04-27,coslovemetics,104,https://coslovemetics.mk/product/benton-aloe-s...,BENTON Aloe Soothing Mask Pack 1 парче
...,...,...,...,...,...,...
881,15448,2023-04-27,coslovemetics,656,https://coslovemetics.mk/product/unleashia-min...,Unleashia MINEST DOUBLE LASH UP MASCARA 01 BLACK
882,15835,2023-04-27,yeppeuda,980,https://yeppeuda.mk/p/275/smooth-cozy-souffle-...,Smooth Cozy Soufflé moisturizing & soothing 50мл
883,15885,2023-04-27,yeppeuda,570,https://yeppeuda.mk/p/385/hot-oil-treatment-fo...,Hot oil treatment for high porosity hair 100ml
884,15150,2023-04-27,coslovemetics,532,https://coslovemetics.mk/product/pyunkang-yul-...,Pyunkang Yul Mist Toner 100ml


In [133]:
new_df.to_csv("total_products_with_ids.csv")