# Load data

In [1]:
import pandas as pd

# Path of the sample file, relative to the notebook's working directory.
file = '../data/yelp_sample.csv'

# NOTE(review): presumably the similarity threshold for the joins; no join call
# visible in this notebook receives it, so confirm whether it should be
# forwarded or merely mirrors the library default.
delta = 0.7

# The file has no header row and yields a single column; only the first
# 1000 lines are read.
df = pd.read_csv(file, header=None, nrows=1000)

# Promote the positional row index to a regular column so that it can be used
# as the record identifier.
df = df.reset_index(drop=False)
df.columns = ['id', 'text']

# Split every ';'-separated string into tokens and drop repeated tokens.
# dict.fromkeys keeps the first occurrence of each token in its original
# position, so the resulting lists are identical on every run. list(set(...))
# would order string tokens by their per-process randomized hash, which makes
# the displayed lists change after each kernel restart.
df['text'] = df['text'].apply(lambda raw: list(dict.fromkeys(raw.split(';'))))
df

Unnamed: 0,id,text
0,0,"[restaurants, traditional, bar, gardens, night..."
1,1,"[restaurants, delis, sandwiches, vegetarian, s..."
2,2,"[home, antiques, fashion, used, consignment, v..."
3,3,"[salons, spas, beauty, hair]"
4,4,"[interval, life, instruction, active, training..."
...,...,...
995,995,"[restaurants, barbeque, caterers, southern, pl..."
996,996,"[supplies, repair, dealers, parts, automotive,..."
997,997,"[health, beauty, therapy, medical, massage, spas]"
998,998,"[bars, entertainment, nightlife, adult]"


# Self Join with Threshold

## Jaccard Self Join

In [2]:
from pytokenjoin.jaccard.join_delta import JaccardTokenJoin
from pytokenjoin.edit.join_delta import EditTokenJoin

In [3]:
# Self join of `df` with the Jaccard joiner: records are identified by the
# 'id' column and compared on the 'text' column, with both `posFilter` and
# `jointFilter` enabled.
output_df_1 = JaccardTokenJoin().tokenjoin_self(
    df,
    id='id',
    join='text',
    posFilter=True,
    jointFilter=True,
)
output_df_1

Finished reading file. Lines read: 0. Lines skipped due to errors: 0. Num of sets: 1000. Elements per set: 6.293. Tokens per Element: 5.95741299856984
Progress 900/1,000
Time elapsed: Init: 0.01, Cand Gen: 0.04, Cand Ref: 0.02, Cand Ver: 0.07
Candidates Generated: 11,631, Refined: 4,579, Verified: 1,619, Survived: 1,615


Unnamed: 0,l_id,r_id,score,l_text,r_text
0,5,376,1.000000,"[restaurants, thai]","[restaurants, thai]"
1,5,700,1.000000,"[restaurants, thai]","[restaurants, thai]"
2,47,138,1.000000,"[indian, restaurants]","[indian, restaurants]"
3,47,277,1.000000,"[indian, restaurants]","[restaurants, indian]"
4,47,308,1.000000,"[indian, restaurants]","[restaurants, indian]"
...,...,...,...,...,...
1610,866,949,0.785714,"[salons, nail, service, beauty, care, waxing, ...","[makeup, service, nail, salons, care, beauty, ..."
1611,987,635,0.785714,"[health, beauty, spas, instruction, therapy, l...","[health, beauty, spas, instruction, therapy, l..."
1612,993,70,0.714286,"[service, beauty, care, laser, eyelash, massag...","[service, nail, salons, care, beauty, waxing, ..."
1613,451,707,0.923077,"[doctors, health, beauty, care, cosmetic, lase...","[doctors, plastic, health, beauty, care, cosme..."


## Edit Self Join

In [4]:
# Same self join as the Jaccard one, but run through the Edit joiner:
# 'id' identifies the records, 'text' is the compared column, and both
# `posFilter` and `jointFilter` are enabled.
output_df_2 = EditTokenJoin().tokenjoin_self(
    df,
    id='id',
    join='text',
    posFilter=True,
    jointFilter=True,
)
output_df_2

Finished reading file. Lines read: 0. Lines skipped due to errors: 0. Num of sets: 1000. Elements per set: 6.293. Tokens per Element: 5.95741299856984
Progress 900/1,000
Time elapsed: Init: 0.01, Cand Gen: 0.10, Cand Ref: 0.07, Cand Ver: 0.53
Candidates Generated: 51,162, Refined: 14,726, Verified: 6,097, Survived: 2,056


Unnamed: 0,l_id,r_id,score,l_text,r_text
0,5,376,1.000000,"[restaurants, thai]","[restaurants, thai]"
1,5,700,1.000000,"[restaurants, thai]","[restaurants, thai]"
2,47,138,1.000000,"[indian, restaurants]","[indian, restaurants]"
3,47,277,1.000000,"[indian, restaurants]","[restaurants, indian]"
4,47,308,1.000000,"[indian, restaurants]","[restaurants, indian]"
...,...,...,...,...,...
2051,993,70,0.756098,"[service, beauty, care, laser, eyelash, massag...","[service, nail, salons, care, beauty, waxing, ..."
2052,451,707,0.923077,"[doctors, health, beauty, care, cosmetic, lase...","[doctors, plastic, health, beauty, care, cosme..."
2053,451,473,0.714286,"[doctors, health, beauty, care, cosmetic, lase...","[health, contouring, care, beauty, medical, la..."
2054,70,949,0.820225,"[service, nail, salons, care, beauty, waxing, ...","[makeup, service, nail, salons, care, beauty, ..."


## Compare

In [5]:
# Put the two self-join results side by side, one row per pair returned by
# the Edit join. The Jaccard scores are attached by aligning on the
# (l_id, r_id) index, so pairs that the Jaccard join did not return end up
# with NaN in `jaccard_score`.
output_df = (
    output_df_2
    .assign(edit_score=lambda pairs: pairs['score'])
    .set_index(['l_id', 'r_id'])
    .assign(jaccard_score=output_df_1.set_index(['l_id', 'r_id'])['score'])
    .reset_index()
    .drop(columns='score')
)
output_df

Unnamed: 0,l_id,r_id,l_text,r_text,edit_score,jaccard_score
0,5,376,"[restaurants, thai]","[restaurants, thai]",1.000000,1.000000
1,5,700,"[restaurants, thai]","[restaurants, thai]",1.000000,1.000000
2,47,138,"[indian, restaurants]","[indian, restaurants]",1.000000,1.000000
3,47,277,"[indian, restaurants]","[restaurants, indian]",1.000000,1.000000
4,47,308,"[indian, restaurants]","[restaurants, indian]",1.000000,1.000000
...,...,...,...,...,...,...
2051,993,70,"[service, beauty, care, laser, eyelash, massag...","[service, nail, salons, care, beauty, waxing, ...",0.756098,0.714286
2052,451,707,"[doctors, health, beauty, care, cosmetic, lase...","[doctors, plastic, health, beauty, care, cosme...",0.923077,0.923077
2053,451,473,"[doctors, health, beauty, care, cosmetic, lase...","[health, contouring, care, beauty, medical, la...",0.714286,
2054,70,949,"[service, nail, salons, care, beauty, waxing, ...","[makeup, service, nail, salons, care, beauty, ...",0.820225,0.800000


In [6]:
# First rows of the comparison table that have no Jaccard score attached.
output_df[output_df['jaccard_score'].isna()].head()

Unnamed: 0,l_id,r_id,l_text,r_text,edit_score,jaccard_score
54,158,578,"[restaurants, mexican]","[african, restaurants]",0.714286,
90,310,578,"[restaurants, mexican]","[african, restaurants]",0.714286,
126,578,639,"[african, restaurants]","[restaurants, mexican]",0.714286,
127,578,646,"[african, restaurants]","[restaurants, mexican]",0.714286,
128,578,658,"[african, restaurants]","[restaurants, mexican]",0.714286,


In [7]:
# First rows whose Jaccard score is below 1.0 and differs from the Edit score.
# Rows with a missing Jaccard score are excluded as well, because a NaN
# compared with `< 1.0` evaluates to False.
output_df[
    output_df['jaccard_score'].ne(output_df['edit_score'])
    & output_df['jaccard_score'].lt(1.0)
].head()

Unnamed: 0,l_id,r_id,l_text,r_text,edit_score,jaccard_score
516,145,274,"[doctors, health, dermatologists, medical]","[doctors, health, medical, otologists]",0.818182,0.791045
517,145,411,"[doctors, health, dermatologists, medical]","[doctors, health, medical, ophthalmologists]",0.8,0.708738
690,274,411,"[doctors, health, medical, otologists]","[doctors, health, medical, ophthalmologists]",0.846154,0.756098
1247,115,922,"[home, real, estate, apartments, services]","[home, real, agents, estate, services]",0.791045,0.714286
1271,153,922,"[home, real, estate, apartments, services]","[home, real, agents, estate, services]",0.791045,0.714286


# Foreign Join with Threshold

In [8]:
# Build the two inputs of the foreign join from `df`: a 100-row sample (df1)
# and a 1000-row sample (df2), both drawn with the same fixed seed and
# renumbered from 0.
df1, df2 = (
    df.sample(n=size, random_state=1924).reset_index(drop=True)
    for size in (100, 1000)
)

## Jaccard Foreign Join

In [9]:
# Foreign join of df1 against df2 with the Jaccard joiner.
# NOTE(review): the four positional strings are presumably the id column of
# each frame followed by the join column of each frame — confirm against the
# library signature.
output_df_1 = JaccardTokenJoin().tokenjoin_foreign(
    df1, df2,
    'id', 'id',
    'text', 'text',
    posFilter=True,
    jointFilter=True,
)
output_df_1

Finished reading file. Lines read: 0. Lines skipped due to errors: 0. Num of sets: 1000. Elements per set: 6.293. Tokens per Element: 5.95741299856984
Finished reading file. Lines read: 0. Lines skipped due to errors: 0. Num of sets: 100. Elements per set: 6.28. Tokens per Element: 5.843949044585988
Progress 0/100
Time elapsed: Init: 0.01, Cand Gen: 0.02, Cand Ref: 0.01, Cand Ver: 0.02
Candidates Generated: 8,158, Refined: 1,276, Verified: 437, Survived: 435


Unnamed: 0,l_id,r_id,score,l_text,r_text
0,751,751,1.0,"[restaurants, vietnamese]","[restaurants, vietnamese]"
1,727,727,1.0,"[restaurants, burgers]","[restaurants, burgers]"
2,51,688,1.0,"[restaurants, italian]","[restaurants, italian]"
3,51,463,1.0,"[restaurants, italian]","[restaurants, italian]"
4,51,51,1.0,"[restaurants, italian]","[restaurants, italian]"
...,...,...,...,...,...
430,846,846,1.0,"[restaurants, traditional, american, new, nigh...","[restaurants, traditional, american, new, nigh..."
431,867,867,1.0,"[restaurants, traditional, breakfast, yogurt, ...","[restaurants, traditional, breakfast, yogurt, ..."
432,176,176,1.0,"[decor, jewelry, home, design, gifts, fashion,...","[decor, jewelry, home, design, gifts, fashion,..."
433,531,531,1.0,"[event, supplies, care, party, life, child, lo...","[event, supplies, care, party, life, child, lo..."


## Edit Foreign Join

In [10]:
# Foreign join of df1 against df2 with the Edit joiner.
# NOTE(review): the four positional strings are presumably the id column of
# each frame followed by the join column of each frame — confirm against the
# library signature.
output_df_2 = EditTokenJoin().tokenjoin_foreign(
    df1, df2,
    'id', 'id',
    'text', 'text',
    posFilter=True,
    jointFilter=True,
)
output_df_2

Finished reading file. Lines read: 0. Lines skipped due to errors: 0. Num of sets: 1000. Elements per set: 6.293. Tokens per Element: 5.95741299856984
Finished reading file. Lines read: 0. Lines skipped due to errors: 0. Num of sets: 100. Elements per set: 6.28. Tokens per Element: 5.843949044585988
Progress 0/100
Time elapsed: Init: 0.02, Cand Gen: 0.04, Cand Ref: 0.03, Cand Ver: 0.22
Candidates Generated: 25,939, Refined: 5,747, Verified: 1,885, Survived: 587


Unnamed: 0,l_id,r_id,score,l_text,r_text
0,751,751,1.0,"[restaurants, vietnamese]","[restaurants, vietnamese]"
1,727,727,1.0,"[restaurants, burgers]","[restaurants, burgers]"
2,51,688,1.0,"[restaurants, italian]","[restaurants, italian]"
3,51,463,1.0,"[restaurants, italian]","[restaurants, italian]"
4,51,51,1.0,"[restaurants, italian]","[restaurants, italian]"
...,...,...,...,...,...
582,846,846,1.0,"[restaurants, traditional, american, new, nigh...","[restaurants, traditional, american, new, nigh..."
583,867,867,1.0,"[restaurants, traditional, breakfast, yogurt, ...","[restaurants, traditional, breakfast, yogurt, ..."
584,176,176,1.0,"[decor, jewelry, home, design, gifts, fashion,...","[decor, jewelry, home, design, gifts, fashion,..."
585,531,531,1.0,"[event, supplies, care, party, life, child, lo...","[event, supplies, care, party, life, child, lo..."


# Top-k Foreign Jaccard Join

In [12]:
# NOTE: this rebinds the name `JaccardTokenJoin`, which an earlier cell
# imported from `pytokenjoin.jaccard.join_delta`. After this cell has run,
# re-executing any earlier cell that uses `JaccardTokenJoin` will pick up the
# `jaccard_topk` class instead, so run the notebook strictly top to bottom.
from pytokenjoin.jaccard.jaccard_topk import JaccardTokenJoin

In [13]:
# Draw the join inputs again: 100 rows for df1 and 1000 rows for df2, using
# the fixed seed 1924 for both draws and a fresh 0-based index on each frame.
df1 = df.sample(n=100, random_state=1924).reset_index(drop=True)
df2 = df.sample(n=1000, random_state=1924).reset_index(drop=True)

In [14]:
# Foreign join of df1 against df2 using whichever `JaccardTokenJoin` was
# imported most recently (the `jaccard_topk` one when cells run in order).
# No filter flags or other options are passed here, so the call relies on the
# library's defaults.
output_df_1 = JaccardTokenJoin().tokenjoin_foreign(
    df1, df2,
    'id', 'id',
    'text', 'text',
)
output_df_1

Finished reading file. Lines read: 0. Lines skipped due to errors: 0. Num of sets: 1000. Elements per set: 6.293. Tokens per Element: 5.95741299856984
Finished reading file. Lines read: 0. Lines skipped due to errors: 0. Num of sets: 100. Elements per set: 6.28. Tokens per Element: 5.843949044585988
Progress 0/100 	: δ: 1e-09
Time elapsed: Init: 0.01, Cand Gen: 0.07, Cand Ref: 0.17, Cand Ver: 1.04
Candidates Generated: 34,018, Refined: 8,170, Verified: 4,973, Survived: 4,669
Final δ is 0.500


Unnamed: 0,score,l_id,r_id,l_text,r_text
766,0.909091,836,303,"[health, general, orthodontists, cosmetic, den...","[health, general, cosmetic, dentistry, medical..."
519,0.875000,338,615,"[salons, nail, beauty, care, skin, hair, spas]","[salons, nail, beauty, care, removal, skin, ha..."
561,0.875000,884,749,"[health, general, endodontists, cosmetic, medi...","[health, general, orthodontists, endodontists,..."
692,0.875000,311,939,"[supplies, repair, dealers, parts, automotive,...","[supplies, repair, dealers, parts, used, autom..."
999,0.857143,844,906,"[health, general, cosmetic, dentistry, dentist...","[health, general, cosmetic, dentistry, dentist..."
...,...,...,...,...,...
106,0.500000,369,532,"[restaurants, american, nightlife, new, bars, ...","[restaurants, traditional, nightlife, bars, sp..."
107,0.500000,793,603,"[coffee, tea, food]","[restaurants, mexican, coffee, bakeries, tea, ..."
108,0.500000,409,592,"[canadian, restaurants, cocktail, nightlife, s...","[canadian, restaurants, nightlife, plates, new..."
109,0.500000,893,949,"[salons, nail, beauty, care, massage, skin, spas]","[makeup, service, nail, salons, care, beauty, ..."
