In [1]:
import json
import pandas as pd
import jaccard_utils
import edit_utils

# Load Data

In [2]:
lines = []
with open('predicted_tags.json') as f:
    for line in f:
        lines.append(json.loads(line))
        
df = pd.DataFrame(lines)
df

Unnamed: 0,id,original_tags,predicted_tags
0,d6ebea9d-f084-4362-8703-964e962fa074,[transparency_data],"[annually requirement local government, govern..."
1,deaf7a04-ce65-431e-b712-4b293db1f519,"[em_radiation, ηλεκτρομαγνητικηa_ακτινοβολιaα,...","[mobile telephony digital television, plant sh..."
2,5c8fcc0a-c12c-4c5a-ac05-c7f5fe62a149,"[cyta, mtn, velister, ηλεκτρομαγνητικηa_ακτινο...","[communications list includes mobile, stations..."
3,1310b1f2-2c73-46dd-aadc-7c74ceaecb97,[transparency_data],"[replacement publishing information annually, ..."
4,f462bb0b-7786-46ae-bc8b-d2be716b5c48,"[dual_use, export_control, harmonized_system, ...","[trade period 2016 2020, countries trade strat..."
...,...,...,...
4017,429c0429-00e6-4883-ae77-d49850445e32,"[bleeperbike, community, dublin, mobility, rec...","[500 bikes operation dublin, attachments also ..."
4018,fadca6c7-ba89-414f-a925-0c2268c3e146,"[aca, acas, architectural, areas, city, conser...","[architectural conservation galway city, serif..."
4019,b3f77181-1b84-4d6b-a7e4-a01368db3b14,"[areas, city, control, development, dgitowns, ...","[planners areas galway city, serif font size 1..."
4020,4c11e8a8-9a03-4b4d-82dd-749876bd8e10,"[altitude, altitudine, berg, hohe, montagna, m...","[names heights swissnames3d 2013, tourism data..."


# Find similar datasets

## Predicted Tags

In [3]:
output_df = edit_utils.tokenjoin_self(df, id='id', join='predicted_tags', delta=0.7, posFilter=True, jointFilter=True)
output_df

Finished reading file. Lines read: 0. Lines skipped due to errors: 0. Num of sets: 4022. Elements per set: 5.0. Tokens per Element: 25.642665340626554
Progress 4,000/4,022 
Time elapsed: Init: 0.31, Cand Gen: 5.61, Cand Ref: 24.17, Cand Ver: 21.31
Candidates Generated: 4,322,515, Refined: 3,763,197, Verified: 27,736, Survived: 21,556


Unnamed: 0,l_id,r_id,score,l_predicted_tags,r_predicted_tags
0,7,11,1.000000,[swissboundaries3d municipal boundaries consti...,[swissboundaries3d municipal boundaries consti...
1,71,75,1.000000,"[dataset building lines motorways, safety livi...","[dataset building lines motorways, safety livi..."
2,71,77,1.000000,"[dataset building lines motorways, safety livi...","[dataset building lines motorways, safety livi..."
3,75,77,1.000000,"[dataset building lines motorways, safety livi...","[dataset building lines motorways, safety livi..."
4,109,110,1.000000,"[emergency call call three, 144 telephone help...","[emergency call call three, 144 telephone help..."
...,...,...,...,...,...
21551,3981,3984,1.000000,"[council corporation elections county, divisio...","[council corporation elections county, divisio..."
21552,3981,3987,1.000000,"[council corporation elections county, divisio...","[council corporation elections county, divisio..."
21553,3984,3987,1.000000,"[council corporation elections county, divisio...","[council corporation elections county, divisio..."
21554,4013,4016,1.000000,"[2006 edition statistical yearbook, switzerlan...","[2006 edition statistical yearbook, switzerlan..."


## Original Tags

In [4]:
output_df = edit_utils.tokenjoin_self(df, id='id', join='original_tags', delta=0.7, posFilter=True, jointFilter=True)
output_df

Finished reading file. Lines read: 0. Lines skipped due to errors: 0. Num of sets: 4022. Elements per set: 7.561163600198906. Tokens per Element: 14.478543947913584
Progress 4,000/4,022 
Time elapsed: Init: 0.16, Cand Gen: 3.46, Cand Ref: 0.98, Cand Ver: 435.12
Candidates Generated: 772,464, Refined: 556,197, Verified: 517,966, Survived: 427,451


Unnamed: 0,l_id,r_id,score,l_original_tags,r_original_tags
0,0,3,1.000000,[transparency_data],[transparency_data]
1,0,1427,0.894737,[transparency_data],[transparency-data]
2,3,1427,0.894737,[transparency_data],[transparency-data]
3,14,16,1.000000,[geodatasamverkan],[geodatasamverkan]
4,25,29,0.800000,[ogd18_bfe],[ogd28_bfe]
...,...,...,...,...,...
427446,3306,3310,0.780657,"[acque-geografia, bgdi-bundesgeodaten-infrastr...","[acque-geografia, aufbewahrungs-und-archivieru..."
427447,373,374,0.838384,"[alesund_and_giske, alesund_og_giske, avsetnin...","[avsetning, barents_sea, barentshavet, bart_fj..."
427448,373,586,0.756151,"[alesund_and_giske, alesund_og_giske, avsetnin...","[avsetning, bart_fjell, botom_current, botom_t..."
427449,374,586,0.820180,"[avsetning, barents_sea, barentshavet, bart_fj...","[avsetning, bart_fjell, botom_current, botom_t..."


# Assess Predicted Tags

In [12]:
scores = {}
for ind, row in df.iterrows():
    if ind % 100 == 0:
        print("Progress {:,}/{:,} \r".format(ind, df.shape[0]), end='')
    scores[ind] = edit_utils.verification(row['original_tags'], row['predicted_tags'])
scores = pd.Series(scores)
scores.describe()

Progress 4,000/4,022 

count    4022.000000
mean        0.075664
std         0.042509
min         0.000000
25%         0.052674
50%         0.082422
75%         0.104370
max         0.202528
dtype: float64

# Tag Linking

In [37]:
def prepare_tags(tags):
    tags = set([item for sublist in tags for item in sublist])
    tags = [[tag] for tag in tags]
    tags = pd.DataFrame(tags)
    tags = tags.reset_index(drop=False)
    tags.columns = ['id', 'text']
    return tags

In [38]:
original_tags = prepare_tags(df['original_tags'])
original_tags

Unnamed: 0,id,text
0,0,veloweg
1,1,air-temperature
2,2,pesca
3,3,-ordnance
4,4,odonati
...,...,...
6519,6519,natural-gas
6520,6520,svalbard
6521,6521,ovins
6522,6522,developpement-urbain


In [39]:
predicted_tags = prepare_tags(df['predicted_tags'])
predicted_tags

Unnamed: 0,id,text
0,0,derived generalised elevation model
1,1,co2 methane ch4 nitrous
2,2,industries also included cement
3,3,norwegian water resources energy
4,4,fit human
...,...,...
13297,13297,fishing areas carinthia territorial
13298,13298,heaters heat pumps
13299,13299,400 data
13300,13300,hong kong double export


In [40]:
output_df = edit_utils.tokenjoin_foreign(original_tags, predicted_tags, 'id', 'id', 'text', 'text',
                              posFilter=True, jointFilter=True, delta=0.9)
output_df

Finished reading file. Lines read: 0. Lines skipped due to errors: 0. Num of sets: 13302. Elements per set: 26.589685761539616. Tokens per Element: 1.0


KeyError: '-$$@0'