In [28]:
import pandas as pd
from pandasql import sqldf
import pyodbc
import yaml

### Functions

In [29]:
def levenshtein_distance(s, t):
    """Return the Levenshtein (edit) distance between the sequences ``s`` and ``t``.

    The distance is the minimum number of single-element insertions,
    deletions and substitutions needed to turn ``s`` into ``t``.

    Args:
        s: First sequence (anything supporting ``len`` and indexing, e.g. ``str``).
        t: Second sequence of the same kind.

    Returns:
        The edit distance as a non-negative ``int``.
    """
    rows, cols = len(s) + 1, len(t) + 1

    # table[r][c] is the edit distance between the prefixes s[:r] and t[:c].
    table = [[0] * cols for _ in range(rows)]

    # Turning a prefix into the empty sequence (or back) costs its length.
    for r in range(1, rows):
        table[r][0] = r
    for c in range(1, cols):
        table[0][c] = c

    for c in range(1, cols):
        for r in range(1, rows):
            substitution_cost = 0 if s[r - 1] == t[c - 1] else 1
            deletion = table[r - 1][c] + 1
            insertion = table[r][c - 1] + 1
            substitution = table[r - 1][c - 1] + substitution_cost
            table[r][c] = min(deletion, insertion, substitution)

    return table[rows - 1][cols - 1]


In [30]:
def compare_products(from_db, from_csv):
    """Return the database product id when the two product names are similar enough.

    Args:
        from_db: Indexable product record from the database; index 0 is used
            as the product id and index 2 as the product name.
        from_csv: Indexable product record from a CSV row; index 1 is used as
            the product name.

    Returns:
        ``from_db[0]`` when the names are identical, or when their Levenshtein
        distance is strictly below one third (integer division) of the longer
        name's length; otherwise ``None``.
    """
    name1, name2 = from_db[2], from_csv[1]

    # Identical names always match. Without this guard, names shorter than
    # three characters could never match: the threshold below is 0 for them
    # and the comparison is strict, so even a distance of 0 was rejected.
    if name1 == name2:
        return from_db[0]

    threshold = max(len(name1), len(name2)) // 3
    if levenshtein_distance(name1, name2) < threshold:
        return from_db[0]
    return None

### Database initialization

In [31]:
# Open the connection to the local SQL Server (LocalDB) database using Windows
# authentication. The literal is a raw string because the "\m" in
# "(localdb)\mssqllocaldb" is not a valid escape sequence, which newer Python
# versions warn about in ordinary string literals; the resulting text is unchanged.
cn = pyodbc.connect(r'Driver={ODBC Driver 17 for SQL Server};Server=(localdb)\mssqllocaldb;Database=AffordableSkin;Trusted_Connection=yes')


In [32]:
# Cursor created from the connection `cn`; the queries and inserts in the cells below run through it.
cr = cn.cursor()

### Get configuration settings

In [33]:
# Read the run configuration from config.yaml (resolved against the current
# working directory) and show it so the parsed settings are visible in the notebook.
# NOTE(review): yaml.load with FullLoader is more permissive than yaml.safe_load;
# if config.yaml could ever come from an untrusted source, prefer yaml.safe_load.
with open(r'config.yaml') as config_file:
    conf = yaml.load(config_file, Loader=yaml.FullLoader)
print(conf)

{'include_files': ['coslovemetics_all_products.csv', 'yeppeuda_all_products.csv']}


In [34]:
# CSV files to process, taken from the 'include_files' key of the loaded configuration.
include_files = conf['include_files']

### Match CSV products against existing products and insert new ones

In [35]:
# Accumulates one price record per CSV row, each starting with the product id it belongs to;
# filled by the loop over include_files and later inserted into ProductPrice.
all_product_prices_with_ids = []

In [36]:
# For every configured CSV file, compare each row with the products already
# stored for the same brand. Rows without a close-enough match are inserted as
# new products. Either way, a price record that starts with the product id is
# queued in all_product_prices_with_ids.
for file in include_files:
    df = pd.read_csv(file)
    # Rows without a brand are skipped up front: the brand filter below can
    # never select them anyway.
    unique_brands = df.brand.dropna().unique()

    # The seller is the part of the file name before the first underscore.
    seller = file.split("_")[0]

    for brand in unique_brands:
        sql = "select * from Product where BrandName = ?"
        values = (brand,)
        cr.execute(sql, values)
        brand_products = cr.fetchall()
        # NOTE(review): products inserted further down are not added to
        # brand_products, so later rows of the same file are only compared with
        # products that existed when this brand was queried — confirm intended.

        # pandasql has no bound parameters, so single quotes are doubled to keep
        # a brand containing an apostrophe from breaking the SQL string literal.
        escaped_brand = str(brand).replace("'", "''")
        q = f"""select * from df where brand = '{escaped_brand}'"""
        # Pass the frame explicitly instead of relying on notebook globals.
        brand_df = sqldf(q, {"df": df})

        for _, row in brand_df.iterrows():
            # Plain list of the row's values; indexed by column position below.
            fields = row.tolist()

            # First existing product of this brand whose name is close enough.
            product_id = None
            for product in brand_products:
                product_id = compare_products(product, fields)
                if product_id is not None:
                    break

            product_values = (fields[0], fields[1], 0, fields[4])
            product_prices_values = [fields[3], seller, fields[2], fields[5]]

            if product_id is None:
                # Ask the database for the id it actually assigned instead of
                # guessing max(Id) + 1, which fails on an empty table and is
                # wrong after identity gaps or concurrent inserts.
                # Assumes Product has no enabled triggers (OUTPUT without INTO
                # is rejected in that case) — TODO confirm.
                sql = "insert into Product output inserted.Id values (?, ?, ?, ?)"
                cr.execute(sql, product_values)
                product_id = cr.fetchone()[0]
                cn.commit()

            all_product_prices_with_ids.append([product_id] + product_prices_values)

In [37]:
# Drop duplicate price rows. dict.fromkeys keeps the first occurrence of each
# row in its original order, whereas a set yields an arbitrary order that can
# change between kernel runs, making the insert order non-reproducible.
all_product_prices_with_ids = list(dict.fromkeys(map(tuple, all_product_prices_with_ids)))

In [38]:
# Insert all queued price rows as a single transaction: either every row is
# stored or, if any insert fails, none is (a per-row commit would leave a
# partial load behind that a re-run would then duplicate).
sql = "insert into ProductPrice values (?, ?, ?, ?, ?)"
try:
    for row in all_product_prices_with_ids:
        values = tuple(row)
        cr.execute(sql, values)
    cn.commit()
except Exception:
    # Undo the rows inserted so far, then let the original error surface.
    cn.rollback()
    raise

In [39]:
# Commit anything still pending on the connection, then release the cursor and the connection.
cn.commit()
cr.close()
cn.close()