In [77]:
import os
import numpy as np
import json
import pandas as pd
import itertools
from tqdm import tqdm
import nltk
from nltk.tokenize import word_tokenize
import re
import ast
import matplotlib.pyplot as plt

In [2]:
def create_dataframe(dataset_path):
    """Function used to create a Pandas DataFrame containing specifications page titles

    Reads products specifications from the file system ("dataset_path" variable in the main function) and creates a Pandas DataFrame where each row is a
    specification. The columns are 'source' (e.g. www.sourceA.com), 'spec_number' (e.g. 1) and the 'page title'. Note that this script will consider only
    the page title attribute for simplicity.

    Args:
        dataset_path (str): The path to the dataset

    Returns:
        df (pd.DataFrame): The Pandas DataFrame containing specifications and page titles
    """

    print('>>> Creating dataframe...\n')
    columns_df = ['source', 'spec_number', 'spec_id', 'page_title']

    progressive_id = 0
    progressive_id2row_df = {}
    for source in tqdm(os.listdir(dataset_path)):
        for specification in os.listdir(os.path.join(dataset_path, source)):
            specification_number = specification.replace('.json', '')
            specification_id = '{}//{}'.format(source, specification_number)
            with open(os.path.join(dataset_path, source, specification)) as specification_file:
                specification_data = json.load(specification_file)
                page_title = specification_data.get('<page title>').lower()
                row = (source, specification_number, specification_id, page_title)
                progressive_id2row_df.update({progressive_id: row})
                progressive_id += 1
    df = pd.DataFrame.from_dict(progressive_id2row_df, orient='index', columns=columns_df)
    print('>>> Dataframe created successfully!\n')
    return df

In [3]:
df = create_dataframe('../datasets/unlabeled/2013_camera_specs')

 12%|█▎        | 3/24 [00:00<00:00, 26.68it/s]

>>> Creating dataframe...



100%|██████████| 24/24 [00:03<00:00,  6.99it/s]

>>> Dataframe created successfully!






## Title

In [4]:
df.head()

Unnamed: 0,source,spec_number,spec_id,page_title
0,www.wexphotographic.com,154,www.wexphotographic.com//154,nikon coolpix aw120 digital camera - camouflag...
1,www.wexphotographic.com,553,www.wexphotographic.com//553,canon ixus 150 digital camera - red (9148b007a...
2,www.wexphotographic.com,601,www.wexphotographic.com//601,fuji finepix s1 digital camera (p10nc12730a) -...
3,www.wexphotographic.com,197,www.wexphotographic.com//197,nikon coolpix s5300 digital camera - black (vn...
4,www.wexphotographic.com,178,www.wexphotographic.com//178,fuji finepix s8600 digital camera - red (p10nc...


In [5]:
df = df.drop(columns = ["source", "spec_number"], axis = 1)

In [6]:
from nltk.corpus import stopwords

stopWords = set(stopwords.words('english'))

In [7]:
stopWords = set(['itself', 'down', 'by', 'with', 'doesn', 'wouldn', 'other', 'ours', 'of', 'then', 'where', 'don', 'these', 'nor', 'she', "should've", 'won', 'ma', 'from', 'had', "you're", 'our', 'did', 'them', 'too', 'her', 'that', 'haven', 'after', "you'll", 'hers', 'because', 'yourself', 'against', 'mightn', 'as', 'll', 'whom', 'how', 'couldn', 'further', 'aren', "you'd", 'and', 'needn', "couldn't", 'those', 'to', "doesn't", "weren't", 'both', 'ourselves', 'in', 'which', 'yours', 'under', 'some', 'what', 'during', 'before', "needn't", "shan't", 'here', 'having', 'hasn', 'your', "hasn't", 'between', 'me', "she's", 'into', 'all', 'at', 'shan', 'who', 'o', 'an', 'very', 'can', 'you', 'shouldn', 'such', 'but', 'do', 'out', 'am', "shouldn't", 'above', 'wasn', 'or', 'were', 'own', 'didn', "you've", 'on', 'will', 'my', 'it', 'have', 'once', 'only', 'been', 'themselves', 'his', 'be', "mightn't", 'they', 'not', 'so', 'up', 'any', 'most', 'has', 'myself', 't', 'yourselves', 'isn', "it's", 'y', 'm', 'now', 'until', 're', 'there', 'their', 'mustn', "mustn't", 'again', 'being', 'hadn', 'doing', 'just', 'no', 'if', 've', "wasn't", "won't", 'we', 'below', 'does', 'more', 'this', 'should', "isn't", 'ain', "don't", 'i', "haven't", 'than', "didn't", 'are', 'about', 'off', 'him', 'for', 'few', "wouldn't", 'was', 'weren', 'why', 'he', "that'll", 'd', 'the', 'its', 'a', 'each', 'is', 'while', "aren't", 'when', 'theirs', 'same', 's', 'himself', 'herself', "hadn't", 'through', 'over'])

In [8]:
punctuation = "!#$%&'()*+,-./:;<=>?@[\]^_`{|}~€£¥₹₽"

In [9]:
def replace_punctuation(word):
    return ''.join(c for c in word if c not in punctuation)

In [10]:
df["page_title"] = df["page_title"].apply(lambda x : [i.lower() for i in list(map(lambda y: replace_punctuation(y), word_tokenize(x))) if i and i.lower() not in stopWords])

### Modelwords

In [11]:
pattern = re.compile("(\S*[A-Za-z]\S*[0-9]\S*|\S*[0-9]\S*[A-Za-z]\S*)")

In [12]:
## In the data replace lumix with panasonic

In [13]:
brands = ['360fly', 'acer', 'achiever', 'acorn', 'actionpro', 'activeon', 'aee', 'agfa', 'agfaphoto', 'aiptek', 'akaso', 'alpine', 'alpine', 'amkov', 'andoer', 'annke', 'ansco', 'apeman', 'apex', 'apple', 'archos', 'argus', 'arlo', 'arri', 'axis', 'bell', 'benq', 'blackmagic', 'blackmagic', 'bosch', 'bower', 'brinno', 'brookstone', 'browning', 'cambo', 'campark', 'canon', 'carl', 'casio', 'celestron', 'chinon', 'cisco', 'cobra', 'coleman', 'concord', 'contax', 'contour', 'covert', 'craig', 'crayola', 'creative', 'creative', 'crosstour', 'crumpler', 'datavideo', 'delkin', 'dell', 'digitrex', 'discovery', 'disney', 'dji', 'd-link', 'domke', 'dörr', 'dragon', 'dxg', 'dxo', 'easypix', 'elecom', 'elmo', 'emerson', 'energizer', 'epson', 'fisher-price', 'flip', 'flir', 'foscam', 'fotoman', 'fotopro', 'fuji', 'fujifilm', 'fujinon', 'garmin', 'gateway', 'godox', 'goodmans', 'google', 'gopro', 'grundig', 'hahnel', 'hamilton', 'hasselblad', 'hello', 'herofiber', 'hitachi', 'holga', 'horseman', 'hoya', 'htc', 'huawei', 'ikelite', 'ilford', 'impossible', 'innovage', 'insignia', 'insta360', 'intel', 'intova', 'ion', 'iris', 'jazz', 'jenoptik', 'jjrc', 'jvc', 'kaiser', 'kenko', 'keyence', 'king', 'kitvision', 'kodak', 'konica', 'kyocera', 'leaf', 'lego', 'leica', 'lenovo', 'lexibook', 'linhof', 'liquid', 'little', 'logitech', 'lomography', 'lowepro', 'ltl', 'lytro', 'maginon', 'magnavox', 'mamiya', 'manfrotto', 'marshall', 'marumi', 'mattel', 'maxell', 'meade', 'medion', 'memorex', 'mercury', 'metz', 'microsoft', 'microtek', 'midland', 'minolta', 'minox', 'monster', 'motorola', 'moultrie', 'mustek', 'nabi', 'neewer', 'nest', 'netgear', 'night', 'nikkon', 'nikkor', 'nikon', 'nilox', 'nintendo', 'nippon', 'nokia', 'norcent', 'olympus', 'optech', 'ordro', 'oregon', 'packard', 'palm', 'panasonic', 'parrot', 'pelco', 'pentacon', 'pentax', 'phase', 'philips', 'philips', 'phoenix', 'pioneer', 'playskool', 'polaroid', 'polarpro', 'praktica', 'premier', 'promaster', 'proscan', 'pyle', 'radioshack', 'raymarine', 'raynox', 'rca', 'ricoh', 'ring', 'rode', 'rokinon', 'rollei', 'ryobi', 'sakar', 'samsung', 'sandisk', 'sanyo', 'schneider', 'schneider', 'schneider', 'scosche', 'seasea', 'sealife', 'sharp', 'sharper', 'sigma', 'sinar', 'sipix', 'sjcam', 'sony', 'soocoo', 'stealth', 'superheadz', 'svp', 'swann', 'tamrac', 'tamron', 'technika', 'tenba', 'think', 'thule', 'tokina', 'tomy', 'toshiba', 'transcend', 'traveler', 'trust', 'verbatim', 'vibe', 'victure', 'vistaquest', 'vivitar', 'voigtländer', 'vtech', 'vupoint', 'walimex', 'wyze', 'xiaomi', 'xit', 'xtreme', 'yashica', 'zeiss']

In [14]:
df["page_title"] = df["page_title"].apply(lambda line : list(set(filter(lambda word : bool(pattern.match(word)) or word in brands,line))))

In [15]:
df.head()

Unnamed: 0,spec_id,page_title
0,www.wexphotographic.com//154,"[aw120, nikon, vna593e1]"
1,www.wexphotographic.com//553,"[9148b007aa, canon]"
2,www.wexphotographic.com//601,"[s1, p10nc12730a, fuji]"
3,www.wexphotographic.com//197,"[nikon, s5300, vna540e1]"
4,www.wexphotographic.com//178,"[s8600, p10nc12690a, fuji]"


In [16]:
df["brand"] = [[] for _ in range(len(df))]

In [17]:
# See how many products have more than one brand
for index, row in df.iterrows():
    for brand in row["page_title"]:
        if brand in brands:
            df.at[index, "brand"].append(brand)

In [22]:
def clean_mp_mm_g_oz(value):
    if not isinstance(value, list) and pd.isna(value):
        return np.nan
    regex = r"[0-9]+mm(\n|)"
    regex2 = r"[0-9]+mp(\n|)"
    regex3 = r"[0-9]+oz"
    regex4 = r"[0-9]+g(\n|)$"
    repl = value
    for e in repl:
        if bool(re.match(regex, e)) or bool(re.match(regex2, e)) or bool(re.match(regex3, e)) or bool(re.match(regex4, e)):
            repl.remove(e)
    return repl

In [23]:
df["page_title"] = df["page_title"].apply(lambda row : clean_mp_mm_g_oz(row))

In [24]:
df.head()

Unnamed: 0,spec_id,page_title,brand
0,www.wexphotographic.com//154,"[aw120, nikon, vna593e1]",[nikon]
1,www.wexphotographic.com//553,"[9148b007aa, canon]",[canon]
2,www.wexphotographic.com//601,"[s1, p10nc12730a, fuji]",[fuji]
3,www.wexphotographic.com//197,"[nikon, s5300, vna540e1]",[nikon]
4,www.wexphotographic.com//178,"[s8600, p10nc12690a, fuji]",[fuji]


## Load cleaned datasets

In [25]:
import os
import glob

os.chdir("../datasets/unlabeled/cleaned")
extension = 'csv'
all_filenames = [i for i in glob.glob('*.{}'.format(extension))]
df_cleaned = pd.concat([pd.read_csv(f) for f in all_filenames ])
#export to csv

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  import sys


In [26]:
df_cleaned = df_cleaned.reset_index(drop = True)

In [27]:
df_cleaned.drop(columns=["page_title"], inplace=True)

In [28]:
df_cleaned.head()

Unnamed: 0,brand,dimensions,dots,manufacturer,megapixels,screen_size,short_descr,spec_id,type,weight
0,canon,,,,,,,www.canon-europe.com//115,,
1,canon,,,,,,,www.canon-europe.com//154,,
2,canon,,,,,,,www.canon-europe.com//103,,
3,canon,,,,,,,www.canon-europe.com//20,,
4,canon,,,,,,,www.canon-europe.com//98,,


## Merge clean with title

In [29]:
df = df.merge(df_cleaned, on="spec_id")

In [30]:
df.head()

Unnamed: 0,spec_id,page_title,brand_x,brand_y,dimensions,dots,manufacturer,megapixels,screen_size,short_descr,type,weight
0,www.wexphotographic.com//154,"[aw120, nikon, vna593e1]",[nikon],,,,,16.0,3.0,,,
1,www.wexphotographic.com//553,"[9148b007aa, canon]",[canon],,,,,16.0,2.7,,,
2,www.wexphotographic.com//601,"[s1, p10nc12730a, fuji]",[fuji],,,,,16.4,3.0,,,
3,www.wexphotographic.com//197,"[nikon, s5300, vna540e1]",[nikon],,,,,16.0,3.0,,,
4,www.wexphotographic.com//178,"[s8600, p10nc12690a, fuji]",[fuji],,,,,16.0,3.0,,,


In [31]:
df.rename(columns={"brand_x" : "brand_from_title", "brand_y" : "brand_descr"}, inplace=True)

In [32]:
df.head()

Unnamed: 0,spec_id,page_title,brand_from_title,brand_descr,dimensions,dots,manufacturer,megapixels,screen_size,short_descr,type,weight
0,www.wexphotographic.com//154,"[aw120, nikon, vna593e1]",[nikon],,,,,16.0,3.0,,,
1,www.wexphotographic.com//553,"[9148b007aa, canon]",[canon],,,,,16.0,2.7,,,
2,www.wexphotographic.com//601,"[s1, p10nc12730a, fuji]",[fuji],,,,,16.4,3.0,,,
3,www.wexphotographic.com//197,"[nikon, s5300, vna540e1]",[nikon],,,,,16.0,3.0,,,
4,www.wexphotographic.com//178,"[s8600, p10nc12690a, fuji]",[fuji],,,,,16.0,3.0,,,


In [70]:
def clean_short_descr(line):
    pattern = re.compile("(\S*[A-Za-z]\S*[0-9]\S*|\S*[0-9]\S*[A-Za-z]\S*)")
    brands = ['360fly', 'acer', 'achiever', 'acorn', 'actionpro', 'activeon', 'aee', 'agfa', 'agfaphoto', 'aiptek', 'akaso', 'alpine', 'alpine', 'amkov', 'andoer', 'annke', 'ansco', 'apeman', 'apex', 'apple', 'archos', 'argus', 'arlo', 'arri', 'axis', 'bell', 'benq', 'blackmagic', 'blackmagic', 'bosch', 'bower', 'brinno', 'brookstone', 'browning', 'cambo', 'campark', 'canon', 'carl', 'casio', 'celestron', 'chinon', 'cisco', 'cobra', 'coleman', 'concord', 'contax', 'contour', 'covert', 'craig', 'crayola', 'creative', 'creative', 'crosstour', 'crumpler', 'datavideo', 'delkin', 'dell', 'digitrex', 'discovery', 'disney', 'dji', 'd-link', 'domke', 'dörr', 'dragon', 'dxg', 'dxo', 'easypix', 'elecom', 'elmo', 'emerson', 'energizer', 'epson', 'fisher-price', 'flip', 'flir', 'foscam', 'fotoman', 'fotopro', 'fuji', 'fujifilm', 'fujinon', 'garmin', 'gateway', 'godox', 'goodmans', 'google', 'gopro', 'grundig', 'hahnel', 'hamilton', 'hasselblad', 'hello', 'herofiber', 'hitachi', 'holga', 'horseman', 'hoya', 'htc', 'huawei', 'ikelite', 'ilford', 'impossible', 'innovage', 'insignia', 'insta360', 'intel', 'intova', 'ion', 'iris', 'jazz', 'jenoptik', 'jjrc', 'jvc', 'kaiser', 'kenko', 'keyence', 'king', 'kitvision', 'kodak', 'konica', 'kyocera', 'leaf', 'lego', 'leica', 'lenovo', 'lexibook', 'linhof', 'liquid', 'little', 'logitech', 'lomography', 'lowepro', 'ltl', 'lytro', 'maginon', 'magnavox', 'mamiya', 'manfrotto', 'marshall', 'marumi', 'mattel', 'maxell', 'meade', 'medion', 'memorex', 'mercury', 'metz', 'microsoft', 'microtek', 'midland', 'minolta', 'minox', 'monster', 'motorola', 'moultrie', 'mustek', 'nabi', 'neewer', 'nest', 'netgear', 'night', 'nikkon', 'nikkor', 'nikon', 'nilox', 'nintendo', 'nippon', 'nokia', 'norcent', 'olympus', 'optech', 'ordro', 'oregon', 'packard', 'palm', 'panasonic', 'parrot', 'pelco', 'pentacon', 'pentax', 'phase', 'philips', 'philips', 'phoenix', 'pioneer', 'playskool', 'polaroid', 'polarpro', 'praktica', 'premier', 'promaster', 'proscan', 'pyle', 'radioshack', 'raymarine', 'raynox', 'rca', 'ricoh', 'ring', 'rode', 'rokinon', 'rollei', 'ryobi', 'sakar', 'samsung', 'sandisk', 'sanyo', 'schneider', 'schneider', 'schneider', 'scosche', 'seasea', 'sealife', 'sharp', 'sharper', 'sigma', 'sinar', 'sipix', 'sjcam', 'sony', 'soocoo', 'stealth', 'superheadz', 'svp', 'swann', 'tamrac', 'tamron', 'technika', 'tenba', 'think', 'thule', 'tokina', 'tomy', 'toshiba', 'transcend', 'traveler', 'trust', 'verbatim', 'vibe', 'victure', 'vistaquest', 'vivitar', 'voigtländer', 'vtech', 'vupoint', 'walimex', 'wyze', 'xiaomi', 'xit', 'xtreme', 'yashica', 'zeiss']
    if not isinstance(line, list) and pd.isna(line):
        return np.nan
    else:
        line = ast.literal_eval(line)
        return list(set(filter(lambda word : bool(pattern.match(word)) or word in brands,line)))

In [72]:
df["short_descr"] = df["short_descr"].apply(clean_short_descr)

In [75]:
df["short_descr"] = df["short_descr"].apply(lambda row : clean_mp_mm_g_oz(row))

## Create set of attributes

In [97]:
def merge_attributes(row):
    row_set = set()
    for column in row[1:]:
        if not isinstance(column, list):
            if not pd.isna(column):
                row_set.update([column])
        else:
            row_set.update(set(column))
            
    row["attributes"] = row_set
    return row

In [91]:
df = df.apply(merge_attributes, axis = 1)

In [92]:
needed = ["spec_id", "attributes"]

In [93]:
df = df[needed]

In [94]:
df.head()

Unnamed: 0,spec_id,attributes
0,www.wexphotographic.com//154,"{3.0, vna593e1, aw120, 16.0, nikon}"
1,www.wexphotographic.com//553,"{16.0, 2.7, 9148b007aa, canon}"
2,www.wexphotographic.com//601,"{3.0, fuji, 16.4, s1, p10nc12730a}"
3,www.wexphotographic.com//197,"{3.0, s5300, 16.0, nikon, vna540e1}"
4,www.wexphotographic.com//178,"{s8600, 3.0, fuji, 16.0, p10nc12690a}"


In [95]:
import itertools

In [105]:
def create_combs(row):
    combs = list(itertools.combinations(row["attributes"], 3))
    for i, c_name in enumerate(combs):
        row[str(i)] = c_name
    return row

In [107]:
df = df.apply(create_combs, axis =1)

In [112]:
df.to_csv("/Users/gfotiadis/programming/sigmod/datasets/created/combs.csv")

In [124]:
df.head()

Unnamed: 0,0,1,10,100,1000,1001,1002,1003,1004,1005,...,992,993,994,995,996,997,998,999,attributes,spec_id
0,"(3.0, vna593e1, 16.0)",,,,,,,,,,...,,,,,,,,,"(3.0, vna593e1, 16.0)","(3.0, vna593e1, aw120)"
1,"(16.0, 2.7, 9148b007aa)","(16.0, 2.7, canon)",,,,,,,,,...,,,,,,,,,"{16.0, 2.7, 9148b007aa, canon}",www.wexphotographic.com//553
2,"(3.0, fuji, 16.4)","(3.0, fuji, s1)",,,,,,,,,...,,,,,,,,,"{3.0, fuji, 16.4, s1, p10nc12730a}",www.wexphotographic.com//601
3,"(3.0, s5300, 16.0)","(3.0, s5300, nikon)",,,,,,,,,...,,,,,,,,,"{3.0, s5300, 16.0, nikon, vna540e1}",www.wexphotographic.com//197
4,"(s8600, 3.0, fuji)","(s8600, 3.0, 16.0)",,,,,,,,,...,,,,,,,,,"{s8600, 3.0, fuji, 16.0, p10nc12690a}",www.wexphotographic.com//178


In [125]:
attr_to_spec = {}

In [126]:
def create_dict(row):
    for value in row[:-2]:
        if value in attr_to_spec:
            attr_to_spec[value].append(row[-1])
        else:
            attr_to_spec[value] = []

In [127]:
df.apply(create_dict, axis = 1)

0        None
1        None
2        None
3        None
4        None
5        None
6        None
7        None
8        None
9        None
10       None
11       None
12       None
13       None
14       None
15       None
16       None
17       None
18       None
19       None
20       None
21       None
22       None
23       None
24       None
25       None
26       None
27       None
28       None
29       None
         ... 
29756    None
29757    None
29758    None
29759    None
29760    None
29761    None
29762    None
29763    None
29764    None
29765    None
29766    None
29767    None
29768    None
29769    None
29770    None
29771    None
29772    None
29773    None
29774    None
29775    None
29776    None
29777    None
29778    None
29779    None
29780    None
29781    None
29782    None
29783    None
29784    None
29785    None
Length: 29786, dtype: object

In [134]:
attr_to_spec[np.nan] = []

In [135]:
attr_to_spec.values()

dict_values([[], [], [], ['www.ukdigitalcameras.co.uk//48', 'www.ukdigitalcameras.co.uk//262', 'www.shopmania.in//1281', 'www.shopmania.in//1352', 'www.shopmania.in//1190', 'www.shopmania.in//1460', 'www.shopmania.in//1178', 'www.shopmania.in//1158'], [], [], ['www.ebay.com//43579', 'www.ebay.com//56904'], [], [], ['www.pcconnection.com//12115', 'www.ebay.com//56510', 'www.ebay.com//57345', 'www.ebay.com//56913', 'www.ebay.com//43863', 'www.ebay.com//45447'], [], [], ['www.henrys.com//1'], [], [], [], ['www.wexphotographic.com//210', 'www.price-hunt.com//9613'], ['www.wexphotographic.com//210', 'www.price-hunt.com//9613'], [], ['www.wexphotographic.com//210', 'www.wexphotographic.com//181', 'www.wexphotographic.com//559', 'www.wexphotographic.com//187', 'www.wexphotographic.com//190', 'www.wexphotographic.com//185', 'www.wexphotographic.com//214', 'www.wexphotographic.com//192', 'www.wexphotographic.com//160', 'www.wexphotographic.com//208', 'www.price-hunt.com//749', 'www.price-hunt.c

In [136]:
to_be_csv = []

In [141]:
for matches in attr_to_spec.values():
    if len(matches) > 2:
        for comb in list(itertools.combinations(matches, 2)):
            to_be_csv.append(comb)
    elif len(matches) == 2:
        to_be_csv.append(matches)

In [143]:
len(to_be_csv)

793831

In [144]:
to_be_csv[:25]

[('www.ukdigitalcameras.co.uk//48', 'www.ukdigitalcameras.co.uk//262'),
 ('www.ukdigitalcameras.co.uk//48', 'www.shopmania.in//1281'),
 ('www.ukdigitalcameras.co.uk//48', 'www.shopmania.in//1352'),
 ('www.ukdigitalcameras.co.uk//48', 'www.shopmania.in//1190'),
 ('www.ukdigitalcameras.co.uk//48', 'www.shopmania.in//1460'),
 ('www.ukdigitalcameras.co.uk//48', 'www.shopmania.in//1178'),
 ('www.ukdigitalcameras.co.uk//48', 'www.shopmania.in//1158'),
 ('www.ukdigitalcameras.co.uk//262', 'www.shopmania.in//1281'),
 ('www.ukdigitalcameras.co.uk//262', 'www.shopmania.in//1352'),
 ('www.ukdigitalcameras.co.uk//262', 'www.shopmania.in//1190'),
 ('www.ukdigitalcameras.co.uk//262', 'www.shopmania.in//1460'),
 ('www.ukdigitalcameras.co.uk//262', 'www.shopmania.in//1178'),
 ('www.ukdigitalcameras.co.uk//262', 'www.shopmania.in//1158'),
 ('www.shopmania.in//1281', 'www.shopmania.in//1352'),
 ('www.shopmania.in//1281', 'www.shopmania.in//1190'),
 ('www.shopmania.in//1281', 'www.shopmania.in//1460'),
 

In [146]:
for a1, a2 in to_be_csv[:10]:
    print(a1)

www.ukdigitalcameras.co.uk//48
www.ukdigitalcameras.co.uk//48
www.ukdigitalcameras.co.uk//48
www.ukdigitalcameras.co.uk//48
www.ukdigitalcameras.co.uk//48
www.ukdigitalcameras.co.uk//48
www.ukdigitalcameras.co.uk//48
www.ukdigitalcameras.co.uk//262
www.ukdigitalcameras.co.uk//262
www.ukdigitalcameras.co.uk//262


In [149]:
with open("/Users/gfotiadis/programming/sigmod/datasets/created/dictionary_mdw_comb3.csv", "w") as f:
    f.write("left_spec_id,right_spec_id\n")
    for e1, e2 in to_be_csv:
        f.write(str(e1 + ","+ e2 + "\n"))