In [None]:
import pandas as pd

In [None]:
from pathlib import Path

In [None]:
from concurrent.futures import ProcessPoolExecutor

In [None]:
project_dir = Path('.').resolve().parents[0]

In [None]:
data_dir = project_dir/'data'/'raw'

In [None]:
def read_csv(path):
    try:
        df = pd.read_csv(path, sep = '\t')
        df['parse_date'] = path.name.rstrip('.tsv')
        return df
    except:
        return None

In [None]:
with ProcessPoolExecutor() as pool:
    dfs = pool.map(read_csv, data_dir.iterdir())

In [None]:
df = pd.concat(dfs)

In [None]:
df.head(1)

Unnamed: 0,provider,provider_id,name,price,original_price,hidden_price,URL,parse_date
0,bol,9200000093063167,Philips Sonicare ProtectiveClean 4500 HX6830/4...,79.99,,79.99,https://bol.com/nl/nl/p/philips-sonicare-prote...,2021-10-06 09:31AM


In [None]:
def clean_numeric(s, to_type = float):
    if isinstance(s, str):
        return to_type(s.strip().strip('-'))
    else:
        return s

In [None]:
df['price'] = df['price'].apply(clean_numeric)

In [None]:
df.sort_values(by = 'price', inplace = True)

In [None]:
df.drop_duplicates(subset = ['provider', 'provider_id'], keep = 'first', inplace = True)

In [None]:
df

Unnamed: 0,provider,provider_id,name,price,original_price,hidden_price,URL,parse_date
104,bol,9200000071329283,Dettol Handzeep Zachte Mousse - Navulling Magi...,5.52,,5.52,https://bol.com/nl/nl/p/dettol-handzeep-zachte...,2021-10-06 12:03PM
97,bol,9200000071329283,Dettol Handzeep Zachte Mousse - Navulling Magi...,5.52,,5.52,https://bol.com/nl/nl/p/dettol-handzeep-zachte...,2021-10-11 09:48PM
377,bcc,000000000000294775,Roccat gaming muismat Kanga Mini,6.99,,,https://bcc.nl/gaming/gaming-pc-en-laptop/gami...,2021-10-12 09:40PM
104,bol,9200000071329277,Dettol - Magic Foam - navulling Aloë Vera - 20...,7.48,,,https://bol.com/nl/nl/p/dettol-magic-foam-navu...,2021-10-06 04:02AM
102,bol,9200000071329277,Dettol - Magic Foam - navulling Aloë Vera - 20...,7.48,,7.48,https://bol.com/nl/nl/p/dettol-magic-foam-navu...,2021-10-09 10:23PM
...,...,...,...,...,...,...,...,...
443,bcc,000000000000303173,Samsung Neo QLED 4K TV 65QN95A (2021),2799.00,3199.,,https://bcc.nl/televisie/qled-tv/4k-qled-tv/sa...,2021-10-09 10:23PM
200,mediamarkt,MMNL1646450,"APPLE MacBook Pro 16"" - Zilver i9 16GB 1TB",2949.00,2949.,,https://mediamarkt.nl/nl/product/_apple-macboo...,2021-10-12 08:28AM
53,bol,9300000042507207,AMD Ryzen Threadripper 3960X Game PC / Streami...,8499.00,,,https://bol.com/nl/nl/p/amd-ryzen-threadripper...,2021-10-05 11:02PM
56,bol,9300000042507207,AMD Ryzen Threadripper 3960X Game PC / Streami...,8499.00,,8499.00,https://bol.com/nl/nl/p/amd-ryzen-threadripper...,2021-10-09 05:22PM


In [30]:
df['parse_date'] = pd.to_datetime(df['parse_date'])

In [31]:
df.set_index('parse_date', inplace = True)

In [32]:
df.sort_index(inplace = True)

In [35]:
df.to_csv('combined_data.tsv', sep = '\t')

In [38]:
df[df.provider == 'mediamarkt']

Unnamed: 0_level_0,provider,provider_id,name,price,original_price,hidden_price,URL
parse_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2021-10-09 14:17:00,mediamarkt,MMNL1700318,LENOVO IdeaCentre 5 - i5-10400 8GB 512GB SSD,549.00,549.,,https://mediamarkt.nl/nl/product/_lenovo-ideac...
2021-10-09 14:17:00,mediamarkt,MMNL1698756,MSI MAG META 5 3SI-441MYS,1139.00,1139.,,https://mediamarkt.nl/nl/product/_msi-mag-meta...
2021-10-09 14:17:00,mediamarkt,MMNL1698956,HP OMEN 25L GT12-1410nd,1389.00,1389.,,https://mediamarkt.nl/nl/product/_hp-omen-25l-...
2021-10-09 14:17:00,mediamarkt,MMNL1696650,HP OMEN 25L GT12-1420nd,1499.00,1499.,,https://mediamarkt.nl/nl/product/_hp-omen-25l-...
2021-10-09 14:17:00,mediamarkt,MMNL1670320,"APPLE iMac 21.5"" - i5/8GB/1TB/FHD",999.00,1249.,,https://mediamarkt.nl/nl/product/_apple-imac-2...
...,...,...,...,...,...,...,...
2021-10-09 17:22:00,mediamarkt,MMNL1698952,HP Pavilion TP01-2050nd,649.99,649.99,,https://mediamarkt.nl/nl/product/_hp-pavilion-...
2021-10-09 17:22:00,mediamarkt,MMNL1698756,MSI MAG META 5 3SI-441MYS,1139.00,1139.,,https://mediamarkt.nl/nl/product/_msi-mag-meta...
2021-10-09 17:22:00,mediamarkt,MMNL1701541,SCREENON GamePC V621817 - Ryzen 3 480GB,559.99,559.99,,https://mediamarkt.nl/nl/product/_screenon-gam...
2021-10-09 17:22:00,mediamarkt,MMNL1606590,PHILIPS Sonicare Protective Clean HX6803/63,59.95,59.95,,https://mediamarkt.nl/nl/product/_philips-soni...


In [39]:
df[df.provider_id.isnull()]

Unnamed: 0_level_0,provider,provider_id,name,price,original_price,hidden_price,URL
parse_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
