In [63]:
import pandas as pd
from pandasql import sqldf
import pyodbc
import yaml

### Functions

In [64]:
def levenshtein_distance(s, t):
    m, n = len(s), len(t)
    
    distance = [[0 for j in range(n+1)] for i in range(m+1)]
    
    for i in range(1, m+1):
        distance[i][0] = i
    for j in range(1, n+1):
        distance[0][j] = j
    
    for j in range(1, n+1):
        for i in range(1, m+1):
            if s[i-1] == t[j-1]:
                cost = 0
            else:
                cost = 1
            distance[i][j] = min(distance[i-1][j] + 1, distance[i][j-1] + 1, distance[i-1][j-1] + cost)
    
    return distance[m][n]


In [65]:
def compare_products(from_db, from_csv):
    name1, name2 = from_db[2], from_csv[1]
    ld = levenshtein_distance(name1.lower(),name2.lower())
    return from_db[0], ld if ld<max(len(name1),len(name2))//2 else None

### Db init

In [66]:
cn = pyodbc.connect('Driver={ODBC Driver 17 for SQL Server};Server=(localdb)\mssqllocaldb;Database=AffordableSkin;Trusted_Connection=yes')


In [67]:
cr = cn.cursor()

### Get configuration settings

In [68]:
with open(r'config.yaml') as file:
    conf = yaml.load(file, Loader=yaml.FullLoader)
    print(conf)

{'include_files': ['coslovemetics_all_products.csv', 'yeppeuda_all_products.csv']}


In [69]:
include_files = conf['include_files']

### Get existing products

In [70]:
all_product_prices_with_ids = []

In [71]:
for file in include_files:
    df = pd.read_csv(file)
    unique_brands = df.brand.unique()
    
    seller = file.split("_")[0]
    sql = f"select max(Id) from Product"
    cr.execute(sql)
    max_id = cr.fetchone()[0] 
    for brand in unique_brands:
        
        sql = "select * from Product where BrandName = ?"
        values = (brand)
        cr.execute(sql,values)
        brand_products = cr.fetchall()

        q = f"""select * from df where brand = '{brand}'"""
        brand_df = sqldf(q, globals())
        for index,row in brand_df.iterrows():
            brand_row = row[4]
            product_id = None
            min_score = 1000
            for product in brand_products:  
                id, score = compare_products(product,row.tolist())
                if score != None and score<min_score:
                    product_id = id
                    min_score = score
                    
            product_values = [row[0], row[1], 0, row[4]]
            product_prices_values = [row[3], seller, row[2],row[5],row[1]]
            date = row[3]
            if product_id is None:
                
                sql = "insert into Product values (?, ?, ?, ?)"
                
                values = tuple(product_values)
                cr.execute(sql,values)
                cn.commit()
                
                all_product_prices_with_ids.append([max_id+1] + product_prices_values)
                max_id+=1
            else:
                sql = "select * from ProductPrice where Date = ? and ProductId = ?"
                
                values = (date, product_id)
                cr.execute(sql,values)
                added_products = cr.fetchone()
                if added_products is None:
                    all_product_prices_with_ids.append([product_id] + product_prices_values)
                else:
                    sql = "insert into Product values (?, ?, ?, ?)"
                
                    values = tuple(product_values)
                    cr.execute(sql,values)
                    cn.commit()
                    
                    all_product_prices_with_ids.append([max_id+1] + product_prices_values)
                    max_id+=1

In [72]:
all_product_prices_with_ids =list(set(map(tuple,all_product_prices_with_ids)))

In [73]:
for row in all_product_prices_with_ids:
    sql = "insert into ProductPrice values (?, ?, ?, ?, ?,?)"
    values = tuple(row)
    cr.execute(sql,values)
    cn.commit()

In [74]:
cn.commit()
cr.close()
cn.close()

In [75]:
new_df = pd.DataFrame(all_product_prices_with_ids)

In [76]:
new_df.columns=['productId', 'date','seller','price','link','name']

In [77]:
new_df

Unnamed: 0,productId,date,seller,price,link,name
0,13263,2023-04-27,yeppeuda,360,https://yeppeuda.mk/p/395/anti-cellulite-body-...,Anti cellulite body lotion 150ml
1,13057,2023-04-27,coslovemetics,1192,https://coslovemetics.mk/product/benton-cerami...,Benton Ceramide Cream 10.000PPM 80ml
2,13362,2023-04-27,yeppeuda,770,https://yeppeuda.mk/p/480/probiotic-moisturizi...,PROBIOTIC MOISTURIZING - SOOTHING SHOWER GEL 2...
3,13119,2023-04-27,coslovemetics,510,https://coslovemetics.mk/product/innisfree-blu...,innisfree Blueberry Rebalancing 5.5 Cleanser 1...
4,13346,2023-04-27,yeppeuda,690,https://yeppeuda.mk/p/376/mousse-shampoo-for-l...,Mousse shampoo for low porosity hair 200 ml
...,...,...,...,...,...,...
881,12721,2023-04-27,coslovemetics,748,https://coslovemetics.mk/product/missha-all-ar...,Missha All Around Safe Block Cotton Sun SPF50+...
882,13314,2023-04-27,yeppeuda,430,https://yeppeuda.mk/p/205/no-more-pores-2-sali...,No more pores 2% salicylic acid 30ml
883,13102,2023-04-27,coslovemetics,650,https://coslovemetics.mk/product/geekgorgeous-...,Geek&Gorgeous B-Bomb 10% Niacinamide Serum
884,12612,2023-04-27,coslovemetics,952,https://coslovemetics.mk/product/pyunkang-yul-...,PYUNKANG YUL Pyunkang Miniature 4 type Set Ess...


In [78]:
new_df.to_csv("total_products_with_ids.csv")

In [79]:
new_df[new_df.productId==11081].name.unique()

array([], dtype=object)