In [1]:
import pandas as pd
from jaccard_utils import JaccardTokenJoin
from edit_utils import EditTokenJoin

# Load data

In [2]:
file = 'yelp_sample.csv'

delta = 0.7

df = pd.read_csv(file, header=None, nrows=1000)
df = df.reset_index(drop=False)
df.columns = ['id', 'text']
df.text = df.text.apply(lambda x: x.split(';'))
df.text = df.text.apply(lambda x: list(set(x)))
df

Unnamed: 0,id,text
0,0,"[beer, breweries, american, gardens, food, bar..."
1,1,"[delis, soup, vegetarian, salad, sandwiches, r..."
2,2,"[garden, furniture, consignment, stores, antiq..."
3,3,"[salons, beauty, spas, hair]"
4,4,"[life, active, instruction, gyms, training, in..."
...,...,...
995,995,"[planning, event, restaurants, southern, barbe..."
996,996,"[automotive, repair, auto, parts, supplies, de..."
997,997,"[health, therapy, medical, beauty, massage, spas]"
998,998,"[bars, entertainment, adult, nightlife]"


# Self Join

## Jaccard Self Join

In [3]:
output_df_1 = JaccardTokenJoin().tokenjoin_self(df, id='id', join='text', posFilter=True, jointFilter=True)
output_df_1

Finished reading file. Lines read: 0. Lines skipped due to errors: 0. Num of sets: 1000. Elements per set: 6.293. Tokens per Element: 5.95741299856984
Progress 900/1,000 
Time elapsed: Init: 0.02, Cand Gen: 0.05, Cand Ref: 0.04, Cand Ver: 0.13
Candidates Generated: 11,628, Refined: 4,576, Verified: 1,619, Survived: 1,615


Unnamed: 0,l_id,r_id,score,l_text,r_text
0,5,376,1.000000,"[thai, restaurants]","[thai, restaurants]"
1,5,700,1.000000,"[thai, restaurants]","[thai, restaurants]"
2,47,138,1.000000,"[restaurants, indian]","[restaurants, indian]"
3,47,277,1.000000,"[restaurants, indian]","[indian, restaurants]"
4,47,308,1.000000,"[restaurants, indian]","[indian, restaurants]"
...,...,...,...,...,...
1610,866,949,0.785714,"[skin, beauty, eyelash, salons, spas, care, re...","[artists, skin, makeup, beauty, eyelash, massa..."
1611,987,635,0.785714,"[health, therapy, active, life, medical, beaut...","[health, therapy, active, life, medical, beaut..."
1612,993,70,0.714286,"[skin, beauty, eyelash, day, massage, removal,...","[skin, beauty, eyelash, day, massage, salons, ..."
1613,451,707,0.923077,"[health, skin, medical, beauty, surgeons, spas...","[health, skin, medical, beauty, plastic, surge..."


## Edit Self Join

In [4]:
output_df_2 = EditTokenJoin().tokenjoin_self(df, id='id', join='text', posFilter=True, jointFilter=True)
output_df_2

Finished reading file. Lines read: 0. Lines skipped due to errors: 0. Num of sets: 1000. Elements per set: 6.293. Tokens per Element: 5.95741299856984
Progress 900/1,000 
Time elapsed: Init: 0.02, Cand Gen: 0.13, Cand Ref: 0.15, Cand Ver: 0.73
Candidates Generated: 51,166, Refined: 14,726, Verified: 6,097, Survived: 2,056


Unnamed: 0,l_id,r_id,score,l_text,r_text
0,5,376,1.000000,"[thai, restaurants]","[thai, restaurants]"
1,5,700,1.000000,"[thai, restaurants]","[thai, restaurants]"
2,47,138,1.000000,"[restaurants, indian]","[restaurants, indian]"
3,47,277,1.000000,"[restaurants, indian]","[indian, restaurants]"
4,47,308,1.000000,"[restaurants, indian]","[indian, restaurants]"
...,...,...,...,...,...
2051,993,70,0.756098,"[skin, beauty, eyelash, day, massage, removal,...","[skin, beauty, eyelash, day, massage, salons, ..."
2052,451,707,0.923077,"[health, skin, medical, beauty, surgeons, spas...","[health, skin, medical, beauty, plastic, surge..."
2053,451,473,0.714286,"[health, skin, medical, beauty, surgeons, spas...","[health, contouring, skin, medical, beauty, bo..."
2054,70,949,0.820225,"[skin, beauty, eyelash, day, massage, salons, ...","[artists, skin, makeup, beauty, eyelash, massa..."


## Compare

In [5]:
output_df = output_df_2.copy()
output_df['edit_score'] = output_df['score']
output_df = output_df.set_index(['l_id', 'r_id'])
output_df['jaccard_score'] = output_df_1.set_index(['l_id', 'r_id'])['score']
output_df = output_df.reset_index(drop=False)
output_df = output_df.drop('score', axis=1)
output_df

Unnamed: 0,l_id,r_id,l_text,r_text,edit_score,jaccard_score
0,5,376,"[thai, restaurants]","[thai, restaurants]",1.000000,1.000000
1,5,700,"[thai, restaurants]","[thai, restaurants]",1.000000,1.000000
2,47,138,"[restaurants, indian]","[restaurants, indian]",1.000000,1.000000
3,47,277,"[restaurants, indian]","[indian, restaurants]",1.000000,1.000000
4,47,308,"[restaurants, indian]","[indian, restaurants]",1.000000,1.000000
...,...,...,...,...,...,...
2051,993,70,"[skin, beauty, eyelash, day, massage, removal,...","[skin, beauty, eyelash, day, massage, salons, ...",0.756098,0.714286
2052,451,707,"[health, skin, medical, beauty, surgeons, spas...","[health, skin, medical, beauty, plastic, surge...",0.923077,0.923077
2053,451,473,"[health, skin, medical, beauty, surgeons, spas...","[health, contouring, skin, medical, beauty, bo...",0.714286,
2054,70,949,"[skin, beauty, eyelash, day, massage, salons, ...","[artists, skin, makeup, beauty, eyelash, massa...",0.820225,0.800000


In [6]:
output_df.loc[output_df.jaccard_score.isna()].head()

Unnamed: 0,l_id,r_id,l_text,r_text,edit_score,jaccard_score
54,158,578,"[mexican, restaurants]","[african, restaurants]",0.714286,
90,310,578,"[mexican, restaurants]","[african, restaurants]",0.714286,
126,578,639,"[african, restaurants]","[mexican, restaurants]",0.714286,
127,578,646,"[african, restaurants]","[mexican, restaurants]",0.714286,
128,578,658,"[african, restaurants]","[mexican, restaurants]",0.714286,


In [7]:
output_df.loc[(output_df.jaccard_score != output_df.edit_score) & (output_df.jaccard_score < 1.0)].head()

Unnamed: 0,l_id,r_id,l_text,r_text,edit_score,jaccard_score
516,145,274,"[health, dermatologists, doctors, medical]","[health, otologists, doctors, medical]",0.818182,0.791045
517,145,411,"[health, dermatologists, doctors, medical]","[health, ophthalmologists, doctors, medical]",0.8,0.708738
690,274,411,"[health, otologists, doctors, medical]","[health, ophthalmologists, doctors, medical]",0.846154,0.756098
1247,115,922,"[apartments, real, estate, home, services]","[real, estate, agents, home, services]",0.791045,0.714286
1271,153,922,"[apartments, real, estate, home, services]","[real, estate, agents, home, services]",0.791045,0.714286


# Foreign Join

In [8]:
df2 = df.sample(1000, random_state=1924).reset_index(drop=True)
df1 = df2.sample(100, random_state=1924).reset_index(drop=True)

## Jaccard Foreign Join

In [9]:
output_df_1 = JaccardTokenJoin().tokenjoin_foreign(df1, df2, 'id', 'id', 'text', 'text',
                              posFilter=True, jointFilter=True)
output_df_1

Finished reading file. Lines read: 0. Lines skipped due to errors: 0. Num of sets: 1000. Elements per set: 6.293. Tokens per Element: 5.95741299856984
Finished reading file. Lines read: 0. Lines skipped due to errors: 0. Num of sets: 100. Elements per set: 6.52. Tokens per Element: 5.711656441717792
Progress 0/100 
Time elapsed: Init: 0.01, Cand Gen: 0.02, Cand Ref: 0.02, Cand Ver: 0.03
Candidates Generated: 7,699, Refined: 1,002, Verified: 416, Survived: 414


Unnamed: 0,l_id,r_id,score,l_text,r_text
0,578,578,1.000000,"[african, restaurants]","[african, restaurants]"
1,105,105,1.000000,"[restaurants, chinese]","[restaurants, chinese]"
2,105,75,1.000000,"[restaurants, chinese]","[restaurants, chinese]"
3,105,681,1.000000,"[restaurants, chinese]","[restaurants, chinese]"
4,105,333,1.000000,"[restaurants, chinese]","[restaurants, chinese]"
...,...,...,...,...,...
409,949,866,0.785714,"[artists, skin, makeup, beauty, eyelash, massa...","[skin, beauty, eyelash, salons, spas, care, re..."
410,840,840,1.000000,"[goods, sporting, private, active, life, tutor...","[goods, sporting, private, active, life, tutor..."
411,877,877,1.000000,"[health, physical, therapy, life, active, medi...","[health, physical, therapy, life, active, medi..."
412,916,916,1.000000,"[preschools, government, fitness, active, life...","[preschools, government, fitness, active, life..."


## Edit Foreign Join

In [10]:
output_df_2 = EditTokenJoin().tokenjoin_foreign(df1, df2, 'id', 'id', 'text', 'text',
                              posFilter=True, jointFilter=True)
output_df_2

Finished reading file. Lines read: 0. Lines skipped due to errors: 0. Num of sets: 1000. Elements per set: 6.293. Tokens per Element: 5.95741299856984
Finished reading file. Lines read: 0. Lines skipped due to errors: 0. Num of sets: 100. Elements per set: 6.52. Tokens per Element: 5.711656441717792
Progress 0/100 
Time elapsed: Init: 0.15, Cand Gen: 0.05, Cand Ref: 0.07, Cand Ver: 0.16
Candidates Generated: 25,651, Refined: 5,710, Verified: 1,756, Survived: 480


Unnamed: 0,l_id,r_id,score,l_text,r_text
0,578,578,1.000000,"[african, restaurants]","[african, restaurants]"
1,578,842,0.714286,"[african, restaurants]","[mexican, restaurants]"
2,578,755,0.714286,"[african, restaurants]","[mexican, restaurants]"
3,578,680,0.714286,"[african, restaurants]","[mexican, restaurants]"
4,578,765,0.714286,"[african, restaurants]","[mexican, restaurants]"
...,...,...,...,...,...
475,949,866,0.785714,"[artists, skin, makeup, beauty, eyelash, massa...","[skin, beauty, eyelash, salons, spas, care, re..."
476,840,840,1.000000,"[goods, sporting, private, active, life, tutor...","[goods, sporting, private, active, life, tutor..."
477,877,877,1.000000,"[health, physical, therapy, life, active, medi...","[health, physical, therapy, life, active, medi..."
478,916,916,1.000000,"[preschools, government, fitness, active, life...","[preschools, government, fitness, active, life..."
