# Load data

In [1]:
import pandas as pd

file = '../data/yelp_sample.csv'

# NOTE(review): `delta` is not referenced by any cell visible in this notebook;
# presumably it is meant as the similarity threshold of the joins below — TODO confirm.
delta = 0.7

# The CSV is read without a header row and capped at the first 1000 rows.
# The two-name column assignment below requires the file to parse as one column.
df = pd.read_csv(file, header=None, nrows=1000)
# Turn the positional row index into a regular column so it can act as record id.
df = df.reset_index(drop=False)
df.columns = ['id', 'text']
# Each record is a ';'-separated string; split it into its tokens.
df['text'] = df['text'].apply(lambda x: x.split(';'))
# Remove duplicate tokens while keeping first-occurrence order. dict.fromkeys is
# deterministic, whereas list(set(...)) orders strings by their per-process
# randomised hash, so the displayed lists changed between kernel restarts.
df['text'] = df['text'].apply(lambda x: list(dict.fromkeys(x)))
df

Unnamed: 0,id,text
0,0,"[bar, american, gardens, traditional, nightlif..."
1,1,"[delis, soup, cafes, salad, vegetarian, sandwi..."
2,2,"[antiques, used, vintage, consignment, fashion..."
3,3,"[beauty, salons, hair, spas]"
4,4,"[life, gyms, instruction, training, fitness, a..."
...,...,...
995,995,"[services, southern, event, caterers, barbeque..."
996,996,"[repair, automotive, parts, car, supplies, aut..."
997,997,"[beauty, health, massage, spas, medical, therapy]"
998,998,"[entertainment, bars, nightlife, adult]"


# Self Join with Threshold

## Jaccard Self Join

In [2]:
from pyTokenJoin.jaccard.jaccard_delta import JaccardTokenJoin
from pyTokenJoin.edit.edit_delta import EditTokenJoin

In [3]:
# Jaccard self join of `df` with both optional filters enabled.
# No threshold argument is passed here, so the library default applies
# (presumably 0.7, judging by the lowest scores displayed — TODO confirm).
output_df_1 = JaccardTokenJoin().tokenjoin_self(
    df,
    id='id',        # column holding the record identifier
    join='text',    # column holding the token lists to join on
    posFilter=True,
    jointFilter=True,
)
output_df_1

Finished reading file. Lines read: 0. Lines skipped due to errors: 0. Num of sets: 1000. Elements per set: 6.293. Tokens per Element: 5.95741299856984
Progress 900/1,000
Time elapsed: Init: 0.01, Cand Gen: 0.04, Cand Ref: 0.02, Cand Ver: 0.05
Candidates Generated: 11,628, Refined: 4,577, Verified: 1,619, Survived: 1,615


Unnamed: 0,l_id,r_id,score,l_text,r_text
0,5,376,1.000000,"[restaurants, thai]","[restaurants, thai]"
1,5,700,1.000000,"[restaurants, thai]","[restaurants, thai]"
2,47,138,1.000000,"[restaurants, indian]","[restaurants, indian]"
3,47,277,1.000000,"[restaurants, indian]","[restaurants, indian]"
4,47,308,1.000000,"[restaurants, indian]","[restaurants, indian]"
...,...,...,...,...,...
1610,866,949,0.785714,"[beauty, salons, service, nail, hair, skin, ey...","[makeup, service, salons, beauty, nail, hair, ..."
1611,987,635,0.785714,"[beauty, yoga, life, health, massage, instruct...","[beauty, accessories, yoga, life, health, fash..."
1612,993,70,0.714286,"[beauty, service, hair, skin, massage, laser, ...","[beauty, service, salons, nail, hair, skin, ma..."
1613,451,707,0.923077,"[beauty, hair, skin, health, laser, doctors, s...","[beauty, hair, skin, health, laser, doctors, s..."


## Edit Self Join

In [4]:
# Edit-similarity self join of `df`, called with the same arguments as the
# Jaccard self join so that the two result frames can be compared pair by pair.
output_df_2 = EditTokenJoin().tokenjoin_self(
    df,
    id='id',        # column holding the record identifier
    join='text',    # column holding the token lists to join on
    posFilter=True,
    jointFilter=True,
)
output_df_2

Finished reading file. Lines read: 0. Lines skipped due to errors: 0. Num of sets: 1000. Elements per set: 6.293. Tokens per Element: 5.95741299856984
Progress 900/1,000
Time elapsed: Init: 0.01, Cand Gen: 0.10, Cand Ref: 0.07, Cand Ver: 0.50
Candidates Generated: 51,163, Refined: 14,726, Verified: 6,097, Survived: 2,056


Unnamed: 0,l_id,r_id,score,l_text,r_text
0,5,376,1.000000,"[restaurants, thai]","[restaurants, thai]"
1,5,700,1.000000,"[restaurants, thai]","[restaurants, thai]"
2,47,138,1.000000,"[restaurants, indian]","[restaurants, indian]"
3,47,277,1.000000,"[restaurants, indian]","[restaurants, indian]"
4,47,308,1.000000,"[restaurants, indian]","[restaurants, indian]"
...,...,...,...,...,...
2051,993,70,0.756098,"[beauty, service, hair, skin, massage, laser, ...","[beauty, service, salons, nail, hair, skin, ma..."
2052,451,707,0.923077,"[beauty, hair, skin, health, laser, doctors, s...","[beauty, hair, skin, health, laser, doctors, s..."
2053,451,473,0.714286,"[beauty, hair, skin, health, laser, doctors, s...","[beauty, hair, skin, health, laser, spas, cont..."
2054,70,949,0.820225,"[beauty, service, salons, nail, hair, skin, ma...","[makeup, service, salons, beauty, nail, hair, ..."


## Compare Jaccard and Edit Scores

In [5]:
# Build one frame with both scores per (l_id, r_id) pair.
# The edit-join result is the base; the Jaccard score is attached by aligning on
# the (l_id, r_id) index, so pairs absent from the Jaccard result get NaN.
output_df = (
    output_df_2
    .assign(edit_score=output_df_2['score'])
    .set_index(['l_id', 'r_id'])
    .assign(jaccard_score=output_df_1.set_index(['l_id', 'r_id'])['score'])
    .reset_index(drop=False)
    .drop(columns='score')   # superseded by the explicit edit_score column
)
output_df

Unnamed: 0,l_id,r_id,l_text,r_text,edit_score,jaccard_score
0,5,376,"[restaurants, thai]","[restaurants, thai]",1.000000,1.000000
1,5,700,"[restaurants, thai]","[restaurants, thai]",1.000000,1.000000
2,47,138,"[restaurants, indian]","[restaurants, indian]",1.000000,1.000000
3,47,277,"[restaurants, indian]","[restaurants, indian]",1.000000,1.000000
4,47,308,"[restaurants, indian]","[restaurants, indian]",1.000000,1.000000
...,...,...,...,...,...,...
2051,993,70,"[beauty, service, hair, skin, massage, laser, ...","[beauty, service, salons, nail, hair, skin, ma...",0.756098,0.714286
2052,451,707,"[beauty, hair, skin, health, laser, doctors, s...","[beauty, hair, skin, health, laser, doctors, s...",0.923077,0.923077
2053,451,473,"[beauty, hair, skin, health, laser, doctors, s...","[beauty, hair, skin, health, laser, spas, cont...",0.714286,
2054,70,949,"[beauty, service, salons, nail, hair, skin, ma...","[makeup, service, salons, beauty, nail, hair, ...",0.820225,0.800000


In [6]:
# Pairs found by the edit join whose (l_id, r_id) key had no Jaccard score.
output_df[output_df['jaccard_score'].isnull()].head()

Unnamed: 0,l_id,r_id,l_text,r_text,edit_score,jaccard_score
54,158,578,"[restaurants, mexican]","[restaurants, african]",0.714286,
90,310,578,"[restaurants, mexican]","[restaurants, african]",0.714286,
126,578,639,"[restaurants, african]","[restaurants, mexican]",0.714286,
127,578,646,"[restaurants, african]","[restaurants, mexican]",0.714286,
128,578,658,"[restaurants, african]","[restaurants, mexican]",0.714286,


In [7]:
# Pairs present in both results whose two scores differ and whose Jaccard score
# is below 1.0. Rows with a missing Jaccard score fail the `< 1.0` test and are
# therefore excluded.
output_df[
    output_df['jaccard_score'].ne(output_df['edit_score'])
    & output_df['jaccard_score'].lt(1.0)
].head()

Unnamed: 0,l_id,r_id,l_text,r_text,edit_score,jaccard_score
516,145,274,"[doctors, dermatologists, health, medical]","[doctors, health, medical, otologists]",0.818182,0.791045
517,145,411,"[doctors, dermatologists, health, medical]","[doctors, health, ophthalmologists, medical]",0.8,0.708738
690,274,411,"[doctors, health, medical, otologists]","[doctors, health, ophthalmologists, medical]",0.846154,0.756098
1247,115,922,"[real, estate, services, home, apartments]","[real, agents, estate, services, home]",0.791045,0.714286
1271,153,922,"[real, estate, services, home, apartments]","[real, agents, estate, services, home]",0.791045,0.714286


# Foreign Join with Threshold

In [8]:
# Two inputs for the foreign join, both drawn from `df` with the same seed.
# The sample sizes are clamped to len(df): DataFrame.sample raises ValueError
# when asked for more rows than exist (sampling is without replacement), which
# would happen if the source file held fewer rows than the hard-coded sizes.
# With at most 1000 rows in `df`, df2 contains every record, so each df1 record
# also occurs in df2 and matches itself in the joins below.
df2 = df.sample(min(1000, len(df)), random_state=1924).reset_index(drop=True)
df1 = df.sample(min(100, len(df)), random_state=1924).reset_index(drop=True)

## Jaccard Foreign Join

In [9]:
# Jaccard join between the small sample (df1) and the large one (df2), both
# filters enabled. Note that this rebinds `output_df_1`, which previously held
# the self-join result.
output_df_1 = JaccardTokenJoin().tokenjoin_foreign(
    df1, df2,          # presumably left frame, then right frame — TODO confirm
    'id', 'id',        # id column of each frame
    'text', 'text',    # token-list column of each frame
    posFilter=True,
    jointFilter=True,
)
output_df_1

Finished reading file. Lines read: 0. Lines skipped due to errors: 0. Num of sets: 1000. Elements per set: 6.293. Tokens per Element: 5.95741299856984
Finished reading file. Lines read: 0. Lines skipped due to errors: 0. Num of sets: 100. Elements per set: 6.28. Tokens per Element: 5.843949044585988
Progress 0/100
Time elapsed: Init: 0.01, Cand Gen: 0.02, Cand Ref: 0.01, Cand Ver: 0.01
Candidates Generated: 8,442, Refined: 1,284, Verified: 438, Survived: 435


Unnamed: 0,l_id,r_id,score,l_text,r_text
0,751,751,1.0,"[restaurants, vietnamese]","[restaurants, vietnamese]"
1,727,727,1.0,"[burgers, restaurants]","[burgers, restaurants]"
2,51,688,1.0,"[restaurants, italian]","[restaurants, italian]"
3,51,463,1.0,"[restaurants, italian]","[restaurants, italian]"
4,51,51,1.0,"[restaurants, italian]","[restaurants, italian]"
...,...,...,...,...,...
430,846,846,1.0,"[wine, new, american, traditional, nightlife, ...","[wine, new, american, traditional, nightlife, ..."
431,867,867,1.0,"[new, breakfast, american, cream, burgers, tra...","[new, breakfast, american, cream, burgers, tra..."
432,176,176,1.0,"[design, accessories, services, home, jewelry,...","[design, accessories, services, home, jewelry,..."
433,531,531,1.0,"[parks, party, local, trampoline, courses, ser...","[parks, party, local, trampoline, courses, ser..."


## Edit Foreign Join

In [10]:
# Edit-similarity join between df1 and df2, called with the same arguments as
# the Jaccard foreign join. Note that this rebinds `output_df_2`, which
# previously held the self-join result.
output_df_2 = EditTokenJoin().tokenjoin_foreign(
    df1, df2,          # presumably left frame, then right frame — TODO confirm
    'id', 'id',        # id column of each frame
    'text', 'text',    # token-list column of each frame
    posFilter=True,
    jointFilter=True,
)
output_df_2

Finished reading file. Lines read: 0. Lines skipped due to errors: 0. Num of sets: 1000. Elements per set: 6.293. Tokens per Element: 5.95741299856984
Finished reading file. Lines read: 0. Lines skipped due to errors: 0. Num of sets: 100. Elements per set: 6.28. Tokens per Element: 5.843949044585988
Progress 0/100
Time elapsed: Init: 0.02, Cand Gen: 0.05, Cand Ref: 0.04, Cand Ver: 0.22
Candidates Generated: 25,939, Refined: 5,813, Verified: 1,944, Survived: 587


Unnamed: 0,l_id,r_id,score,l_text,r_text
0,751,751,1.0,"[restaurants, vietnamese]","[restaurants, vietnamese]"
1,727,727,1.0,"[burgers, restaurants]","[burgers, restaurants]"
2,51,688,1.0,"[restaurants, italian]","[restaurants, italian]"
3,51,463,1.0,"[restaurants, italian]","[restaurants, italian]"
4,51,51,1.0,"[restaurants, italian]","[restaurants, italian]"
...,...,...,...,...,...
582,846,846,1.0,"[wine, new, american, traditional, nightlife, ...","[wine, new, american, traditional, nightlife, ..."
583,867,867,1.0,"[new, breakfast, american, cream, burgers, tra...","[new, breakfast, american, cream, burgers, tra..."
584,176,176,1.0,"[design, accessories, services, home, jewelry,...","[design, accessories, services, home, jewelry,..."
585,531,531,1.0,"[parks, party, local, trampoline, courses, ser...","[parks, party, local, trampoline, courses, ser..."


# Top-k Foreign Jaccard Join

In [11]:
from pyTokenJoin.jaccard.jaccard_topk import JaccardTokenJoin

In [12]:
# Inputs for the top-k join, drawn from `df` with the same sizes and seed as in
# the threshold-based foreign-join section.
# The sample sizes are clamped to len(df): DataFrame.sample raises ValueError
# when asked for more rows than exist (sampling is without replacement).
df2 = df.sample(min(1000, len(df)), random_state=1924).reset_index(drop=True)
df1 = df.sample(min(100, len(df)), random_state=1924).reset_index(drop=True)

In [13]:
# `JaccardTokenJoin` is the class imported from `jaccard_topk` earlier in this
# section; that import rebinds the name first imported from `jaccard_delta`, so
# this cell relies on the cells having been run in order.
# Neither filter flags nor an explicit k are passed, so library defaults apply.
output_df_1 = JaccardTokenJoin().tokenjoin_foreign(
    df1, df2,          # presumably left frame, then right frame — TODO confirm
    'id', 'id',        # id column of each frame
    'text', 'text',    # token-list column of each frame
)
output_df_1

Finished reading file. Lines read: 0. Lines skipped due to errors: 0. Num of sets: 1000. Elements per set: 6.293. Tokens per Element: 5.95741299856984
Finished reading file. Lines read: 0. Lines skipped due to errors: 0. Num of sets: 100. Elements per set: 6.28. Tokens per Element: 5.843949044585988
Progress 0/100 	: δ: 1e-09
Time elapsed: Init: 0.01, Cand Gen: 0.07, Cand Ref: 0.18, Cand Ver: 0.95
Candidates Generated: 34,185, Refined: 8,210, Verified: 5,002, Survived: 4,699
Final δ is 0.500


Unnamed: 0,score,l_id,r_id,l_text,r_text
750,0.909091,836,303,"[orthodontists, general, health, surgeons, per...","[general, health, surgeons, periodontists, med..."
587,0.875000,884,749,"[endodontists, general, health, medical, denti...","[dentists, endodontists, orthodontists, genera..."
722,0.875000,311,939,"[repair, automotive, parts, car, supplies, aut...","[used, repair, automotive, car, parts, supplie..."
788,0.875000,338,615,"[beauty, salons, nail, hair, skin, spas, care]","[beauty, salons, nail, hair, skin, spas, care,..."
799,0.857143,338,172,"[beauty, salons, nail, hair, skin, spas, care]","[beauty, salons, nail, skin, spas, care]"
...,...,...,...,...,...
184,0.500000,65,218,"[tex-mex, mexican, coffee, restaurants, food, ...","[burgers, fast, coffee, restaurants, food, tea]"
183,0.500000,420,232,"[new, american, italian, pizza, sandwiches, re...","[restaurants, pizza, italian]"
182,0.500000,705,138,"[restaurants, indian, himalayan/nepalese, chin...","[restaurants, indian]"
181,0.500000,705,974,"[restaurants, indian, himalayan/nepalese, chin...","[restaurants, chinese]"
