Corpus deduplication via binning

In [3]:
import pandas as pd
import py_stringmatching as sm
import networkx as nx

Load the data

In [87]:
# Notebook configuration: where the corpus CSV lives and which column holds the
# strings to deduplicate.
corpus_folder = '../data/corpi/'
corpus_file = 'crime_corpus.csv'
corpus_header = 'crime'
# Prefix prepended to the cluster number to form the "<header>_set_id" values
# (e.g. 'crm0'). The final grouping cell reads this name, so it has to be
# defined here for the notebook to run on a fresh kernel.
clustering_prefix = 'crm'

In [58]:
# Read the corpus CSV and trim leading/trailing whitespace from the text column
# so that padded variants of a string compare as equal later on.
corpus = (
    pd.read_csv(f'{corpus_folder}{corpus_file}')
    .assign(**{corpus_header: lambda frame: frame[corpus_header].str.strip()})
)

Initialize the string similarity metrics (Jaro and Jaro-Winkler)

In [46]:
# Jaro similarity scorer from py_stringmatching; get_jaro_sim_score uses this
# notebook-level object for every pair.
jaro = sm.Jaro()
# Quick sanity check of the scorer on a short pair of strings.
print(jaro.get_sim_score('the', 'theft'))

0.8666666746139526


In [47]:
# Jaro-Winkler similarity scorer from py_stringmatching; get_jw_sim_score uses
# this notebook-level object for every pair.
jw = sm.JaroWinkler()
# Quick sanity check on two strings that share a common prefix.
print(jw.get_sim_score('bay ho alcott', 'bay ho palisades'))

0.8509615659713745


Build the Cartesian product of the corpus with itself for pairwise comparison

In [59]:
# Pair every corpus row with every corpus row (self-pairs included). The default
# merge suffixes turn the text column into "<header>_x" and "<header>_y".
corpus_cross = pd.merge(corpus, corpus, how='cross')
corpus_cross

Unnamed: 0,crime_x,crime_y
0,reckless driving,reckless driving
1,reckless driving,stolen vehicle log
2,reckless driving,ambulance call overdose
3,reckless driving,abandoned refrigerator
4,reckless driving,calling for help
...,...,...
508364,photokilling,beguilt
508365,photokilling,ernest klein
508366,photokilling,karl brugmann
508367,photokilling,libertarianism


In [60]:
def get_jaro_sim_score(x):
    """Return the Jaro similarity of the two corpus strings held in one pair row.

    Parameters
    ----------
    x : row-like object (for example a ``pandas.Series``) that can be indexed
        with ``corpus_header + '_x'`` and ``corpus_header + '_y'``.

    Returns
    -------
    The value produced by ``jaro.get_sim_score`` for that pair.
    """
    left, right = (x[corpus_header + suffix] for suffix in ('_x', '_y'))
    return jaro.get_sim_score(left, right)

In [61]:
def get_jw_sim_score(x):
    """Return the Jaro-Winkler similarity of the two corpus strings in one pair row.

    Parameters
    ----------
    x : row-like object (for example a ``pandas.Series``) that can be indexed
        with ``corpus_header + '_x'`` and ``corpus_header + '_y'``.

    Returns
    -------
    The value produced by ``jw.get_sim_score`` for that pair.
    """
    first_text = x[f'{corpus_header}_x']
    second_text = x[f'{corpus_header}_y']
    return jw.get_sim_score(first_text, second_text)

In [62]:
# Score every pair with both metrics.
# DataFrame.apply(axis=1) constructs a Series for each row, which dominates the
# runtime on a cross product of this size. Converting the two text columns to
# plain dict records once and scoring those yields the same values, in the same
# row order, with far less overhead.
pair_records = corpus_cross[[corpus_header + '_x', corpus_header + '_y']].to_dict('records')
corpus_cross['jaro_score'] = [get_jaro_sim_score(pair) for pair in pair_records]
corpus_cross['jw_score'] = [get_jw_sim_score(pair) for pair in pair_records]

In [63]:
# Display the pair table, which now carries the jaro_score and jw_score columns.
corpus_cross

Unnamed: 0,crime_x,crime_y,jaro_score,jw_score
0,reckless driving,reckless driving,1.000000,1.000000
1,reckless driving,stolen vehicle log,0.564815,0.564815
2,reckless driving,ambulance call overdose,0.449275,0.449275
3,reckless driving,abandoned refrigerator,0.546086,0.546086
4,reckless driving,calling for help,0.475000,0.475000
...,...,...,...,...
508364,photokilling,beguilt,0.448413,0.448413
508365,photokilling,ernest klein,0.544444,0.544444
508366,photokilling,karl brugmann,0.463675,0.463675
508367,photokilling,libertarianism,0.456349,0.456349


In [64]:
# Keep the pairs that at least one metric scores at 0.9 or above and use them as
# the edges of a graph; every connected component is one cluster of
# near-duplicate strings. Self-pairs score 1.0, which presumably is what keeps
# strings without any near-duplicate in the graph as single-member clusters.
truth = corpus_cross.loc[corpus_cross[['jaro_score', 'jw_score']].ge(0.9).any(axis=1)]
G = nx.from_pandas_edgelist(truth, source=f'{corpus_header}_x', target=f'{corpus_header}_y')
clustered_groups = [component for component in nx.connected_components(G)]

In [81]:
# Preview the clusters as ';'-joined labels.
# Each cluster is a set of strings, and set iteration order depends on string
# hashing, which can differ between kernel sessions. Sorting the members keeps
# the labels identical from run to run.
grouping = pd.DataFrame({corpus_header: clustered_groups})
grouping[corpus_header] = grouping[corpus_header].apply(sorted)
grouping[corpus_header].apply(';'.join)

0        reckless driving;reckless driving all units
1               stolen veh lojack;stolen vehicle log
2                            ambulance call overdose
3                             abandoned refrigerator
4      calling for help;calling for help panic alarm
                           ...                      
513                                         butchery
514                                     ernest klein
515                                    karl brugmann
516                                   libertarianism
517                                     photokilling
Name: crime, Length: 518, dtype: object

In [104]:
# Build the final lookup table with one row per (cluster, member string):
#   <header>_set_id : clustering_prefix followed by the cluster's position
#   <header>_set    : all members of the cluster joined with ';'
#   <header>        : one member of the cluster
# Expects `clustering_prefix` to have been set by an earlier cell.
#
# Clusters are sets of strings, whose iteration order depends on string hashing
# and can differ between kernel sessions. Members are sorted so that both the
# joined label and the order of the exploded rows are reproducible.
cluster_members = [sorted(group) for group in clustered_groups]
grouping = pd.DataFrame({
    corpus_header + '_set' + "_id": [
        clustering_prefix + str(position) for position in range(len(cluster_members))
    ],
    corpus_header + '_set': [';'.join(members) for members in cluster_members],
    corpus_header: cluster_members,
})
# One row per member; the cluster's index label is repeated for each member.
grouping = grouping.explode(column = corpus_header)
grouping

Unnamed: 0,crime_set_id,crime_set,crime
0,crm0,reckless driving;reckless driving all units,reckless driving
0,crm0,reckless driving;reckless driving all units,reckless driving all units
1,crm1,stolen veh lojack;stolen vehicle log,stolen veh lojack
1,crm1,stolen veh lojack;stolen vehicle log,stolen vehicle log
2,crm2,ambulance call overdose,ambulance call overdose
...,...,...,...
513,crm513,butchery,butchery
514,crm514,ernest klein,ernest klein
515,crm515,karl brugmann,karl brugmann
516,crm516,libertarianism,libertarianism
