In [1]:
# Remove any previously extracted copies so a re-run starts from a clean tree.
!rm -rf model_discovery
!rm -rf data
import gdown
# Download the source-code archive from Google Drive. No explicit output name is
# given (output=None); the recorded run saved it as model_discovery.zip.
gdown.download('https://drive.google.com/uc?id=1exw9vOYcb0fPVseleifK0m1VxbBCsBM_', output=None, quiet=False)
# Download the data archive the same way; the recorded run saved it as data.zip.
gdown.download('https://drive.google.com/uc?id=1ozrvNU128WOGWDVhhDKtZENzBMuO-w4i', output=None, quiet=False)

# Quietly extract both archives into the working directory -- presumably
# recreating the data/ and model_discovery/ directories removed above.
!unzip -qq data.zip
!unzip -qq model_discovery.zip

Downloading...
From: https://drive.google.com/uc?id=1exw9vOYcb0fPVseleifK0m1VxbBCsBM_
To: /content/model_discovery.zip
100%|██████████| 11.9k/11.9k [00:00<00:00, 9.47MB/s]
Downloading...
From: https://drive.google.com/uc?id=1ozrvNU128WOGWDVhhDKtZENzBMuO-w4i
To: /content/data.zip
22.5MB [00:00, 142MB/s] 


In [5]:
import itertools
import numpy as np
import os
import pandas as pd
from collections import Counter
from importlib import reload
from model_discovery import utils, l2lsh
from tqdm.notebook import tqdm
import nltk
# Fetch the NLTK stopwords corpus -- presumably needed by the tokenizer in
# model_discovery.utils; TODO confirm.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

## Deepmatcher Part


In [3]:
# Directory holding the Deepmatcher CSV files read by the loop below.
DATA_DIR = 'data/deepmatcher'

# Dataset name prefixes; the loop appends '-train.csv' / '-valid.csv' to these.
datasets = [
    'abt_buy_exp',
    'dblp_acm_exp_data',
    'dblp_scholar_exp_data',
    'dirty_itunes_amazon_exp_data',
    'walmart_amazon_exp_data',
]

In [6]:
# Every ordered (source, target) pair of distinct Deepmatcher datasets.
comb = list(itertools.permutations(datasets, 2))

# Parallel result columns: one entry is appended to each list per pair.
src_list = []
tar_list = []
jsd_list = []
ada_list = []

# NOTE(review): in the recorded output the Adaptivity value is identical for
# every target of a given source (e.g. 20.0 for all abt_buy_exp rows) --
# confirm that utils.adaptivity_word really depends on the target inputs.
for src, tar in tqdm(comb, leave=False):
    # The source side is read from its '-train.csv' file and the target side
    # from its '-valid.csv' file.
    d1_path = os.path.join(DATA_DIR, src + '-train.csv')
    d2_path = os.path.join(DATA_DIR, tar + '-valid.csv')

    # Drop the 'id' and 'label' columns without mutating a frame in place, so
    # only the remaining columns feed the metrics.
    d1 = pd.read_csv(d1_path).drop(columns=['id', 'label'])
    d2 = pd.read_csv(d2_path).drop(columns=['id', 'label'])

    # Flatten each table and tokenize it (source first, then target).
    word1 = utils.word_tokenize(utils.flatten_df(d1))
    word2 = utils.word_tokenize(utils.flatten_df(d2))

    src_list.append(src)
    tar_list.append(tar)
    jsd_list.append(utils.jsd_for_word(word1, word2))
    ada_list.append(utils.adaptivity_word(d1, d2, word1, word2, threshold=0.6))


HBox(children=(FloatProgress(value=0.0, max=20.0), HTML(value='')))

In [7]:
# Assemble the per-pair Deepmatcher metrics into one table (one row per pair).
result_df = pd.DataFrame({
    'Source 1': src_list,
    'Source 2': tar_list,
    'JSD': jsd_list,
    'Adaptivity': ada_list,
})

In [8]:
# Display the Deepmatcher results table (Source 1, Source 2, JSD, Adaptivity).
result_df

Unnamed: 0,Source 1,Source 2,JSD,Adaptivity
0,abt_buy_exp,dblp_acm_exp_data,0.584039,20.0
1,abt_buy_exp,dblp_scholar_exp_data,0.573541,20.0
2,abt_buy_exp,dirty_itunes_amazon_exp_data,0.465671,20.0
3,abt_buy_exp,walmart_amazon_exp_data,0.269325,20.0
4,dblp_acm_exp_data,abt_buy_exp,0.601097,25.0
5,dblp_acm_exp_data,dblp_scholar_exp_data,0.071087,25.0
6,dblp_acm_exp_data,dirty_itunes_amazon_exp_data,0.564749,25.0
7,dblp_acm_exp_data,walmart_amazon_exp_data,0.624247,25.0
8,dblp_scholar_exp_data,abt_buy_exp,0.572959,58.0
9,dblp_scholar_exp_data,dblp_acm_exp_data,0.050957,58.0


## Magellan Part

In [9]:
# Point the data root at the Magellan CSVs. This rebinds the same DATA_DIR name
# that the Deepmatcher loop reads, so re-running that loop after this cell
# would look for its files under data/magellan.
DATA_DIR = 'data/magellan'

In [10]:
# Magellan dataset names; each one is read from '<name>.csv' under DATA_DIR.
datasets = [
    'Anime', 'Bikes',
    'Books1', 'Books2', 'Books3',
    'Restaurants4',
    'Movies1', 'Movies2', 'Movies3', 'Movies4', 'Movies5',
]

# Every ordered (source, target) pair of distinct datasets.
comb = list(itertools.permutations(datasets, 2))

# Parallel result columns: one entry is appended to each list per pair.
src_list, tar_list = [], []
jsd_list, ada_list, l2d_list = [], [], []

# NOTE(review): the recorded run printed a warning pointing at
# `prob1 = prob1 / np.sum(prob1)`, a line that is not in this notebook --
# presumably inside model_discovery.utils; check for a zero-sum distribution.
for src, tar in tqdm(comb):
    # Load the source and target tables.
    d1 = pd.read_csv(os.path.join(DATA_DIR, src + '.csv'))
    d2 = pd.read_csv(os.path.join(DATA_DIR, tar + '.csv'))

    # Keep everything except the 'id' and 'label' columns.
    d1 = d1.drop(columns=['id', 'label'])
    d2 = d2.drop(columns=['id', 'label'])

    # Flatten and tokenize the source first, then the target.
    word1 = utils.word_tokenize(utils.flatten_df(d1))
    word2 = utils.word_tokenize(utils.flatten_df(d2))

    # Record the pair, then its metrics in a fixed order: JSD, adaptivity, L2D.
    src_list.append(src)
    tar_list.append(tar)
    jsd_list.append(utils.jsd_for_word(word1, word2))
    ada_list.append(utils.adaptivity_word(d1, d2, word1, word2, threshold=0.7, partition_size=90))
    l2d_list.append(utils.l2d_btw_domains(word1, word2))

HBox(children=(FloatProgress(value=0.0, max=110.0), HTML(value='')))

  prob1 = prob1 / np.sum(prob1)





In [11]:
# Assemble the per-pair Magellan metrics into one table (one row per pair).
result = pd.DataFrame({
    'Source 1': src_list,
    'Source 2': tar_list,
    'JSD': jsd_list,
    'Adaptivity': ada_list,
    'L2D': l2d_list,
})

In [12]:
# Display the Magellan results table (Source 1, Source 2, JSD, Adaptivity, L2D).
result

Unnamed: 0,Source 1,Source 2,JSD,Adaptivity,L2D
0,Anime,Bikes,0.670891,5.0,0.258389
1,Anime,Books1,0.654520,5.0,0.232731
2,Anime,Books2,0.623253,5.0,0.341144
3,Anime,Books3,0.597067,5.0,0.234402
4,Anime,Restaurants4,0.674455,5.0,0.282821
...,...,...,...,...,...
105,Movies5,Restaurants4,0.628563,4.0,0.226536
106,Movies5,Movies1,0.280102,4.0,0.081562
107,Movies5,Movies2,0.333406,4.0,0.087357
108,Movies5,Movies3,0.364668,4.0,0.111434
