In [1]:
import pandas as pd
import numpy as np
import re
from simhash import Simhash, SimhashIndex
from itertools import combinations_with_replacement
from collections import defaultdict
from scipy.sparse import coo_matrix
from scipy.sparse.csgraph import connected_components
from datasets import load_dataset
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Prepare the Data

In [2]:
# Load only the first 1,000 ads.  `nrows` stops parsing after 1,000 rows instead
# of reading the whole CSV and discarding everything past row 1,000.
data_ads = pd.read_csv('kaggle_text_classified_ads.csv', nrows=1000).reset_index()
# keep the row number ('index', produced by reset_index) and the ad text ('value')
data_ads = data_ads[['index', 'value']]

In [3]:
def get_features(s, width=3):
    """
    Returns the list of overlapping character substrings of a given width.

    The text is lower-cased and every run of non-word characters (whitespace,
    punctuation) is removed first, so substrings span word boundaries.
    Example: 'how are' -> ['how', 'owa', 'war', 'are']

    Args:
        s: text to featurize.
        width: length of each substring; defaults to 3.

    Returns:
        A list of substrings.  Text shorter than `width` (including the empty
        string) yields a single-element list holding the whole normalized text.

    Raises:
        ValueError: if `width` is less than 1.
    """
    if width < 1:
        raise ValueError(f"width must be >= 1, got {width}")
    s = s.lower()
    s = re.sub(r'[^\w]+', '', s)
    # max(..., 1) guarantees one (possibly short or empty) feature for short inputs
    return [s[i:i + width] for i in range(max(len(s) - width + 1, 1))]

In [4]:
# show the integer fingerprint Simhash derives from the example sentence's features
example_hash = Simhash(get_features('How are you? I am fine. Thanks.'))
example_hash.value

5570291454580194887

In [5]:
# hash distance between two sentences that differ by a couple of words
hash_we_are = Simhash(get_features('How are you? We are fine. Thanks.'))
hash_i_am = Simhash(get_features('How are you? I am fine. Thanks.'))
hash_we_are.distance(hash_i_am)

15

# Hash the Documents

In [6]:
# map each row label to its ad text
data_ads_dict = data_ads['value'].to_dict()

# fingerprint every ad, keyed by its stringified row label
data_ads_objs = []
for row_label, ad_text in data_ads_dict.items():
    data_ads_objs.append((str(row_label), Simhash(get_features(ad_text))))

# build the lookup index used for near-duplicate queries
distance_threshold = 3  # hashes further apart than this are not considered similar
index = SimhashIndex(data_ads_objs, k=distance_threshold)

# sanity check: exactly one fingerprint per ad
assert len(data_ads_objs) == len(data_ads)

In [7]:
# query the index with the first ad's fingerprint and print the matching ad texts
first_ad_hash = data_ads_objs[0][1]
similar_docs = index.get_near_dups(first_ad_hash)
similar_docs_text = []
for doc_id in similar_docs:
    # ids come back as strings; the text dict is keyed by integer row label
    similar_docs_text.append(data_ads_dict[int(doc_id)])
print(similar_docs_text)

['Overview \r\n \r\nWhy AFFIRMA Rehabilitation? Our innovative occupational therapy clinical programs will challenge new and experienced Occupational Therapists. We have state:of:the:art ACP equipment, modality based programs, well equipped gyms and much more. Our Rehab Directors are experienced, organized, manage schedules well and lead our therapy teams in the ethical care of our residents. If you ve always wanted to work for a company that supports your occupational therapy career development or can help you maintain a work/life balance, you owe it to yourself to find out more about AFFIRMA Rehabilitation.\r\n \r\nAFFIRMA Rehabilitation is changing the way you look at geriatric rehabilitation. With our Homeward Bound Programs, AFFIRMA Rehabilitation facilities are now returning over 75 of their residents to home or community level living. Our interdisciplinary team of Physical, Occupational and Speech Therapists use a collaborative approach to finding the right plan of care of each 

In [8]:
# number of distinct bucket keys held by the index
len(index.bucket)

1710

# Assign Document Clusters based on Hash Similarity

In [9]:
# Group documents by querying the index once per not-yet-seen document.
# (An earlier variant that read the groups straight from index.bucket was
# dropped in favour of this query-based loop.)
clusters = {}
seen = set()
cluster = 0
for doc_id, doc_hash in data_ads_objs:
    if doc_id in seen:
        continue  # already returned as a near-duplicate of an earlier document
    near_dups = set(index.get_near_dups(doc_hash))
    clusters[cluster] = near_dups
    seen |= near_dups
    cluster += 1

Big bucket found. key:d9dd:0, len:271


`clusters` looks like it should contain the cluster assignment for each document, but some documents appear in multiple clusters.  To de-duplicate, I will form an adjacency matrix and extract connected components.

In [11]:
# collect every pair of ids (self-pairs included) that share a near-duplicate set;
# these pairs are the edges of the similarity graph
edges = set()
for members in clusters.values():
    edges.update(combinations_with_replacement(members, 2))

# square adjacency matrix with one row/column per document
n_docs = len(data_ads)
matrix_shape = (n_docs, n_docs)
rows, cols = zip(*edges)
edge_weights = np.ones(len(edges))
sparse_mat = coo_matrix((edge_weights, (rows, cols)), shape=matrix_shape)

# label each document with its connected component and store it on the frame
nbr_clusters, cluster = connected_components(sparse_mat)
data_ads['cluster'] = cluster

In [12]:
# sanity check: edge count, document count, matrix shape, row-index count
edge_count, doc_count = len(edges), len(data_ads)
print(edge_count, doc_count, matrix_shape, len(rows))

43618 1000 (1000, 1000) 43618


In [13]:
# show every ad that was assigned to cluster 3
data_ads.loc[data_ads['cluster'] == 3]

Unnamed: 0,index,value,cluster
3,3,Be your own boss working under a Strong Brand ...,3
4,4,Be your own boss working under a Strong Brand ...,3
5,5,Be your own boss working under a Strong Brand ...,3
6,6,Be your own boss working under a Strong Brand ...,3
7,7,Be your own boss working under a Strong Brand ...,3
...,...,...,...
114,114,Be your own boss working under a Strong Brand ...,3
115,115,Be your own boss working under a Strong Brand ...,3
116,116,Be your own boss working under a Strong Brand ...,3
117,117,Be your own boss working under a Strong Brand ...,3


# Try Adding New Docs

New docs can be added to the index without having to rebuild it from scratch.  

In [14]:
# rows 1000-1009 of the same CSV play the role of newly arrived documents
data_ads_new = pd.read_csv('kaggle_text_classified_ads.csv').reset_index().iloc[1000:1010]
data_ads_new = data_ads_new[['index', 'value']]

# row label -> ad text
data_ads_new_dict = data_ads_new['value'].to_dict()

# fingerprint the new documents, keyed by stringified row label
data_ads_new_objs = []
for row_label, ad_text in data_ads_new_dict.items():
    data_ads_new_objs.append((str(row_label), Simhash(get_features(ad_text))))
assert len(data_ads_new_objs) == len(data_ads_new)

# insert each new (id, fingerprint) pair into the existing index
for new_id, new_hash in data_ads_new_objs:
    index.add(new_id, new_hash)

In [15]:
# Query the updated index once per not-yet-seen NEW document and record the
# near-duplicate set it returns.
# (An earlier variant that read the groups straight from index.bucket was
# dropped in favour of this query-based loop.)
clusters = {}
seen = set()
cluster = 0
for doc_id, doc_hash in data_ads_new_objs:
    if doc_id in seen:
        continue  # already returned as a near-duplicate of an earlier new document
    near_dups = set(index.get_near_dups(doc_hash))
    clusters[cluster] = near_dups
    seen |= near_dups
    cluster += 1

Big bucket found. key:d9dd:0, len:272
Big bucket found. key:1556:1, len:255
Big bucket found. key:2b86:2, len:255
Big bucket found. key:c6c5:3, len:255


In [16]:
# Create the edge list of the near-duplicate graph.  Each near-duplicate set is
# wired as a star (every member linked to the set's smallest id) rather than as
# all pairs: the connected components are the same, but the number of edges
# grows linearly instead of quadratically with the size of the set.
edges = set()
for members in clusters.values():
    member_ids = sorted(int(member) for member in members)  # ids are stored as strings
    edges.update((member_ids[0], member_id) for member_id in member_ids)

# create an adjacency matrix from the edge list
n_docs = len(data_ads) + len(data_ads_new)
matrix_shape = (n_docs, n_docs)  # will be a square adjacency matrix
rows, cols = zip(*edges)
sparse_mat = coo_matrix((np.ones(len(edges)), (rows, cols)), shape=matrix_shape)

# get the cluster for each document; the new documents occupy the last rows
# NOTE(review): the graph here is built only from the new documents' queries, so
# these labels are presumably not comparable with data_ads['cluster'] -- confirm
# before joining the two.
nbr_clusters, cluster = connected_components(sparse_mat)
data_ads_new['cluster'] = cluster[-len(data_ads_new):]

In [17]:
# first five new documents with their assigned cluster
data_ads_new.head(5)

Unnamed: 0,index,value,cluster
1000,1000,Job Description:/h3:\r\n\r\nExamples of Import...,723
1001,1001,"Branch Location: San Jose, CA \r\n\r\n Carpet ...",694
1002,1002,Registered Nurses Only \r\n Seasonal Nationwid...,577
1003,1003,"Branch Location: San Jose, CA \r\n\r\n Carpet ...",694
1004,1004,Calling All Production Workers\r\n\r\nJob Desc...,721


# Evaluate on Public Text Similarity Datasets

Datasets used:
1. Google PAWS for paraphrased sentence pairs, https://huggingface.co/datasets/paws
2. Kaggle Text Classified Ads, https://www.kaggle.com/overflow012/playing-with-ads

The text classified ads have duplicates but they are not labeled, so I will do a pairwise comparison of a selection of rows to identify exact, verbatim duplicates.  This will test Simhash's ability to find perfect matches.

In [18]:
# PAWS, "labeled_final" configuration, training split
paws = load_dataset("paws", name="labeled_final", split="train")
print(paws)

Reusing dataset paws (/Users/nicholaslincoln/.cache/huggingface/datasets/paws/labeled_final/1.1.0/8d567c6472623f42bd2cc635cad06932d0f0cd2f897db56013c1180f4317d338)


Dataset({
    features: ['id', 'sentence1', 'sentence2', 'label'],
    num_rows: 49401
})


In [19]:
# Stack sentence1 on top of sentence2 so both sides of every pair are hashed in
# one index; split_idx marks where the second half starts so they can be split
# back out later.
split_idx = len(paws)
paws_all_sents = paws['sentence1'] + paws['sentence2']
paws_dict = dict(enumerate(paws_all_sents))

# fingerprint every sentence, keyed by its stringified position
paws_objs = []
for sent_idx, sentence in paws_dict.items():
    paws_objs.append((str(sent_idx), Simhash(get_features(sentence))))

# build the lookup index used for near-duplicate queries
distance_threshold = 3  # integer; hashes further apart than this are not considered similar
index = SimhashIndex(paws_objs, k=distance_threshold)

In [20]:
# query the index once per not-yet-seen sentence and record its near-duplicate set
clusters = {}
seen = set()
cluster = 0
for sent_id, sent_hash in paws_objs:
    if sent_id in seen:
        continue  # already returned as a near-duplicate of an earlier sentence
    near_dups = set(index.get_near_dups(sent_hash))
    clusters[cluster] = near_dups
    seen |= near_dups
    cluster += 1

In [21]:
# Create the edge list of the near-duplicate graph.  Each near-duplicate set is
# wired as a star (every member linked to the set's smallest id) rather than as
# all pairs: the connected components are the same, but the number of edges
# grows linearly instead of quadratically with the size of the set.
edges = set()
for members in clusters.values():
    member_ids = sorted(int(member) for member in members)  # ids are stored as strings
    edges.update((member_ids[0], member_id) for member_id in member_ids)

# create an adjacency matrix from the edge list
matrix_shape = (len(paws_dict), len(paws_dict))  # will be a square adjacency matrix
rows, cols = zip(*edges)
sparse_mat = coo_matrix((np.ones(len(edges)), (rows, cols)), shape=matrix_shape)

# get the cluster (connected-component label) for each sentence
nbr_clusters, cluster = connected_components(sparse_mat)

In [22]:
# Rebuild the pairwise layout: the first split_idx labels belong to sentence1,
# the remainder to sentence2.
paws_eval_columns = {
    'sent1': paws['sentence1'],
    'sent2': paws['sentence2'],
    'label': paws['label'],
    'cluster1': cluster[:split_idx],
    'cluster2': cluster[split_idx:],
}
eval_df = pd.DataFrame(paws_eval_columns)

# a pair is predicted similar (1) when both sentences share a cluster, else 0
eval_df['simhash_predicted_similar'] = (eval_df['cluster1'] == eval_df['cluster2']).astype(int)

In [23]:
# compare the gold paraphrase labels with the cluster-based predictions
y_true = eval_df.label
y_pred = eval_df.simhash_predicted_similar
print(
    "Simhash Evaluation on Google PAWS:\n",
    f"accuracy: {accuracy_score(y_true, y_pred):2f}\n",
    f"precision: {precision_score(y_true, y_pred):2f}\n",
    f"recall: {recall_score(y_true, y_pred):2f}\n",
    f"f1: {f1_score(y_true, y_pred):2f}"
)

Simhash Evaluation on Google PAWS:
 accuracy: 0.566041
 precision: 0.565714
 recall: 0.077099
 f1: 0.135704


In [24]:
# fraction of pairs that simhash predicted to be similar
eval_df['simhash_predicted_similar'].mean()

0.060221453007024146

So it misses most of the texts that are similar (recall is only about 0.08), and of the pairs it does flag only about 57% are true paraphrases.  I found that playing with the distance threshold, k, had a big impact, but since it must be an integer the choice is coarse: setting it to 2 was too low (it missed everything), and at 3 only about 6% of the pairs are predicted similar.

In [25]:
# first 25 evaluated pairs with their cluster labels and prediction
eval_df.iloc[:25]

Unnamed: 0,sent1,sent2,label,cluster1,cluster2,simhash_predicted_similar
0,"In Paris , in October 1560 , he secretly met t...","In October 1560 , he secretly met with the Eng...",0,0,7214,0
1,The NBA season of 1975 -- 76 was the 30th seas...,The 1975 -- 76 season of the National Basketba...,1,1,16583,0
2,"There are also specific discussions , public p...","There are also public discussions , profile sp...",0,2,8158,0
3,When comparable rates of flow can be maintaine...,The results are high when comparable flow rate...,1,3,3533,0
4,It is the seat of Zerendi District in Akmola R...,It is the seat of the district of Zerendi in A...,1,4,17586,0
5,William Henry Henry Harman was born on 17 Febr...,"William Henry Harman was born in Waynesboro , ...",1,5,11176,0
6,Bullion Express - concept is being introduced ...,2011-DGSE Bullion Express concept is introduce...,0,6,2792,0
7,With a discrete amount of probabilities Formul...,Given a discrete set of probabilities formula ...,1,7,27765,0
8,The Soviet Union maintained an embassy in Oslo...,The Soviet Union maintained an embassy in Mosc...,0,8,8,1
9,Vocabulary even went to Brazil through leaving...,Vocabulary even went to Brazil by leaving Maca...,0,9,16941,0


### Eval on Perfect Matches

In [26]:
# work on the first 50 ads only so the all-pairs table stays small
data_ads_sub = data_ads.iloc[:50]
data_ads_docs = data_ads_sub.value.tolist()

# label every ordered pair of ads: 1 when the two texts are verbatim equal, else 0
ordered_pairs = ((doc_1, doc_2) for doc_1 in data_ads_docs for doc_2 in data_ads_docs)
comparison_dict = {
    pair_idx: (doc_1, doc_2, 1 if doc_1 == doc_2 else 0)
    for pair_idx, (doc_1, doc_2) in enumerate(ordered_pairs)
}
comparison_dict_index = len(comparison_dict)  # next free key, i.e. the number of pairs

In [27]:
# one row per labelled pair, indexed by the pair's key in comparison_dict
pair_columns = ["doc1", "doc2", "label"]
eval_df = pd.DataFrame.from_dict(comparison_dict, orient="index", columns=pair_columns)
eval_df.head()

Unnamed: 0,doc1,doc2,label
0,Overview \r\n \r\nWhy AFFIRMA Rehabilitation? ...,Overview \r\n \r\nWhy AFFIRMA Rehabilitation? ...,1
1,Overview \r\n \r\nWhy AFFIRMA Rehabilitation? ...,Overview \r\n \r\nWhy AFFIRMA Rehabilitation? ...,0
2,Overview \r\n \r\nWhy AFFIRMA Rehabilitation? ...,Overview:\r\n\r\nUnder general supervision by ...,0
3,Overview \r\n \r\nWhy AFFIRMA Rehabilitation? ...,Be your own boss working under a Strong Brand ...,0
4,Overview \r\n \r\nWhy AFFIRMA Rehabilitation? ...,Be your own boss working under a Strong Brand ...,0


In [28]:
# Stack doc1 on top of doc2 so both sides of every pair are hashed in one index;
# split_idx marks where the second half starts so they can be split back out.
split_idx = len(eval_df)
eval_df_all_sents = eval_df['doc1'].tolist() + eval_df['doc2'].tolist()
eval_df_dict = dict(enumerate(eval_df_all_sents))

# fingerprint every document, keyed by its stringified position
eval_df_objs = []
for doc_idx, doc_text in eval_df_dict.items():
    eval_df_objs.append((str(doc_idx), Simhash(get_features(doc_text))))

# build the lookup index used for near-duplicate queries
distance_threshold = 3  # integer; hashes further apart than this are not considered similar
index = SimhashIndex(eval_df_objs, k=distance_threshold)

In [29]:
# query the index once per not-yet-seen document and record its near-duplicate set
clusters = {}
seen = set()
cluster = 0
for doc_id, doc_hash in eval_df_objs:
    if doc_id in seen:
        continue  # already returned as a near-duplicate of an earlier document
    near_dups = set(index.get_near_dups(doc_hash))
    clusters[cluster] = near_dups
    seen |= near_dups
    cluster += 1

Big bucket found. key:71d7:0, len:4600
Big bucket found. key:e904:1, len:4600
Big bucket found. key:6d11:2, len:4600
Big bucket found. key:ef67:3, len:4600


In [30]:
# Create the edge list of the near-duplicate graph.  Each near-duplicate set is
# wired as a star (every member linked to the set's smallest id) rather than as
# all pairs: the connected components are the same, but the number of edges
# grows linearly instead of quadratically with the size of the set.  That
# matters here because every ad is repeated many times in the pair table, so
# the near-duplicate sets are large.
edges = set()
for members in clusters.values():
    member_ids = sorted(int(member) for member in members)  # ids are stored as strings
    edges.update((member_ids[0], member_id) for member_id in member_ids)

# create an adjacency matrix from the edge list
matrix_shape = (len(eval_df_dict), len(eval_df_dict))  # will be a square adjacency matrix
rows, cols = zip(*edges)
sparse_mat = coo_matrix((np.ones(len(edges)), (rows, cols)), shape=matrix_shape)

# get the cluster (connected-component label) for each document
nbr_clusters, cluster = connected_components(sparse_mat)

In [31]:
# the first split_idx labels belong to doc1, the remainder to doc2
doc1_clusters, doc2_clusters = cluster[:split_idx], cluster[split_idx:]
eval_df['cluster1'] = doc1_clusters
eval_df['cluster2'] = doc2_clusters

# a pair is predicted similar (1) when both documents share a cluster, else 0
eval_df['simhash_predicted_similar'] = (eval_df['cluster1'] == eval_df['cluster2']).astype(int)

In [32]:
# Report the metrics for the verbatim-duplicate evaluation on the classified ads.
# The title previously said "Google PAWS", which mislabelled these results.
# Note: `:2f` is a minimum field width of 2 with the default six decimals.
print(
    "Simhash Evaluation on Kaggle Text Classified Ads (verbatim duplicates):\n",
    f"accuracy: {accuracy_score(eval_df.label, eval_df.simhash_predicted_similar):2f}\n",
    f"precision: {precision_score(eval_df.label, eval_df.simhash_predicted_similar):2f}\n",
    f"recall: {recall_score(eval_df.label, eval_df.simhash_predicted_similar):2f}\n",
    f"f1: {f1_score(eval_df.label, eval_df.simhash_predicted_similar):2f}"
)

Simhash Evaluation on Google PAWS:
 accuracy: 1.000000
 precision: 1.000000
 recall: 1.000000
 f1: 1.000000


In [33]:
# first 25 evaluated pairs with their cluster labels and prediction
eval_df.iloc[:25]

Unnamed: 0,doc1,doc2,label,cluster1,cluster2,simhash_predicted_similar
0,Overview \r\n \r\nWhy AFFIRMA Rehabilitation? ...,Overview \r\n \r\nWhy AFFIRMA Rehabilitation? ...,1,0,0,1
1,Overview \r\n \r\nWhy AFFIRMA Rehabilitation? ...,Overview \r\n \r\nWhy AFFIRMA Rehabilitation? ...,0,0,1,0
2,Overview \r\n \r\nWhy AFFIRMA Rehabilitation? ...,Overview:\r\n\r\nUnder general supervision by ...,0,0,2,0
3,Overview \r\n \r\nWhy AFFIRMA Rehabilitation? ...,Be your own boss working under a Strong Brand ...,0,0,3,0
4,Overview \r\n \r\nWhy AFFIRMA Rehabilitation? ...,Be your own boss working under a Strong Brand ...,0,0,3,0
5,Overview \r\n \r\nWhy AFFIRMA Rehabilitation? ...,Be your own boss working under a Strong Brand ...,0,0,3,0
6,Overview \r\n \r\nWhy AFFIRMA Rehabilitation? ...,Be your own boss working under a Strong Brand ...,0,0,3,0
7,Overview \r\n \r\nWhy AFFIRMA Rehabilitation? ...,Be your own boss working under a Strong Brand ...,0,0,3,0
8,Overview \r\n \r\nWhy AFFIRMA Rehabilitation? ...,Be your own boss working under a Strong Brand ...,0,0,3,0
9,Overview \r\n \r\nWhy AFFIRMA Rehabilitation? ...,Be your own boss working under a Strong Brand ...,0,0,3,0


So simhash finds identical docs perfectly. There were also no false positives, even for a distance threshold, k, of 3.  This suggests that it works better with longer texts.

# Applying to Blackwing Training Data

In [45]:
blackwing_data = pd.read_csv('blackwing_3m_9k.csv')
# keep only rows whose disposition is SELECT or IGNORE
blackwing_data = blackwing_data[blackwing_data['disposition'].isin(['SELECT', 'IGNORE'])]

# Key the texts by row POSITION (0..n-1), not by the original index label.
# After the filter above the labels can have gaps, while the adjacency matrix is
# sized len(blackwing_data) and the component labels are assigned back to the
# frame positionally; label-based ids could therefore fall outside the matrix
# or attach a cluster to the wrong row.
blackwing_data_dict = blackwing_data.text.reset_index(drop=True).to_dict()

# hash the documents
blackwing_objs = [(str(k), Simhash(get_features(v))) for k, v in blackwing_data_dict.items()]

# create an index for efficient searching
distance_threshold = 6  # hashes > this value will not be considered similar, value must be an integer
index = SimhashIndex(blackwing_objs, k=distance_threshold)

In [46]:
# Query the index once per not-yet-seen document, record its near-duplicate set,
# and build the graph's edge list in the same pass.
#
# Each near-duplicate set is wired as a star (the queried document linked to
# every document it returned) rather than as all pairs: the connected components
# are the same, but the number of edges grows linearly instead of quadratically
# with the size of the set.
clusters = {}
cluster = 0
seen = set()
edges = set()
for doc_id, doc_hash in blackwing_objs:
    if doc_id in seen:
        continue  # already returned as a near-duplicate of an earlier document
    near_dups = set(index.get_near_dups(doc_hash))
    clusters[cluster] = near_dups
    seen.update(near_dups)
    cluster += 1

    hub = int(doc_id)  # ids are stored as strings; the matrix needs integers
    edges.add((hub, hub))
    edges.update((hub, int(member)) for member in near_dups)

Big bucket found. key:111:0, len:445
Big bucket found. key:191:2, len:248
Big bucket found. key:ea:3, len:331
Big bucket found. key:1f1:2, len:360
Big bucket found. key:9a:4, len:303
Big bucket found. key:8:5, len:910
Big bucket found. key:3bd:6, len:256
Big bucket found. key:15a:0, len:221
Big bucket found. key:9e:4, len:588
Big bucket found. key:28:5, len:548
Big bucket found. key:d1:2, len:349
Big bucket found. key:9c:4, len:1534
Big bucket found. key:28:5, len:548
Big bucket found. key:d3:2, len:256
Big bucket found. key:ab:3, len:504
Big bucket found. key:128:5, len:235
Big bucket found. key:11d:0, len:672
Big bucket found. key:1ea:1, len:719
Big bucket found. key:153:2, len:787
Big bucket found. key:cb:3, len:800
Big bucket found. key:9c:4, len:1534
Big bucket found. key:19:5, len:642
Big bucket found. key:1b7:6, len:834
Big bucket found. key:157:0, len:314
Big bucket found. key:151:2, len:890
Big bucket found. key:ab:3, len:504
Big bucket found. key:9e:4, len:588
Big bucket foun

Big bucket found. key:113:0, len:339
Big bucket found. key:98:4, len:313
Big bucket found. key:a:5, len:393
Big bucket found. key:157:0, len:314
Big bucket found. key:b:3, len:445
Big bucket found. key:19b:0, len:437
Big bucket found. key:ea:3, len:331
Big bucket found. key:111:0, len:445
Big bucket found. key:111:0, len:445
Big bucket found. key:1d1:2, len:627
Big bucket found. key:a:5, len:393
Big bucket found. key:6a:5, len:225
Big bucket found. key:3ad:6, len:258
Big bucket found. key:68:5, len:405
Big bucket found. key:1bb:6, len:286
Big bucket found. key:15a:0, len:221
Big bucket found. key:155:2, len:324
Big bucket found. key:de:4, len:346
Big bucket found. key:a:5, len:393
Big bucket found. key:153:2, len:787
Big bucket found. key:ea:3, len:331
Big bucket found. key:141:2, len:247
Big bucket found. key:ce:4, len:303
Big bucket found. key:68:5, len:405
Big bucket found. key:16a:1, len:527
Big bucket found. key:151:2, len:890
Big bucket found. key:2a:3, len:289
Big bucket found. 

Big bucket found. key:af:3, len:296
Big bucket found. key:153:2, len:787
Big bucket found. key:b:3, len:445
Big bucket found. key:9e:4, len:588
Big bucket found. key:68:5, len:405
Big bucket found. key:155:2, len:324
Big bucket found. key:1af:6, len:493
Big bucket found. key:11d:0, len:672
Big bucket found. key:15b:0, len:306
Big bucket found. key:68:5, len:405
Big bucket found. key:11b:0, len:363
Big bucket found. key:1f5:2, len:238
Big bucket found. key:151:2, len:890
Big bucket found. key:98:4, len:313
Big bucket found. key:48:5, len:289
Big bucket found. key:113:0, len:339
Big bucket found. key:161:2, len:552
Big bucket found. key:aa:3, len:619
Big bucket found. key:9e:4, len:588
Big bucket found. key:68:5, len:405
Big bucket found. key:157:0, len:314
Big bucket found. key:1d1:2, len:627
Big bucket found. key:af:3, len:296
Big bucket found. key:68:5, len:405
Big bucket found. key:1d1:2, len:627
Big bucket found. key:cc:4, len:319
Big bucket found. key:8:5, len:910
Big bucket found.

Big bucket found. key:1a9:3, len:201
Big bucket found. key:de:4, len:346
Big bucket found. key:2a:5, len:709
Big bucket found. key:3af:6, len:207
Big bucket found. key:15b:0, len:306
Big bucket found. key:aa:3, len:619
Big bucket found. key:28:5, len:548
Big bucket found. key:b:3, len:445
Big bucket found. key:2b:3, len:436
Big bucket found. key:17d:2, len:203
Big bucket found. key:b:3, len:445
Big bucket found. key:16a:1, len:527
Big bucket found. key:151:2, len:890
Big bucket found. key:aa:3, len:619
Big bucket found. key:2a:5, len:709
Big bucket found. key:161:2, len:552
Big bucket found. key:8:5, len:910
Big bucket found. key:48:5, len:289
Big bucket found. key:113:0, len:339
Big bucket found. key:113:0, len:339
Big bucket found. key:171:2, len:550
Big bucket found. key:2b:3, len:436
Big bucket found. key:9a:4, len:303
Big bucket found. key:1af:6, len:493
Big bucket found. key:117:0, len:220
Big bucket found. key:ea:3, len:331
Big bucket found. key:15b:0, len:306
Big bucket found. 

In [47]:
# square adjacency matrix with one row/column per document
n_docs = len(blackwing_data)
matrix_shape = (n_docs, n_docs)
rows, cols = zip(*edges)
edge_weights = np.ones(len(edges))
sparse_mat = coo_matrix((edge_weights, (rows, cols)), shape=matrix_shape)

# label each document with its connected component
nbr_clusters, cluster = connected_components(sparse_mat)

In [48]:
# attach the component labels, then expose the current index as an 'index' column
blackwing_data['cluster'] = cluster
blackwing_data = blackwing_data.reset_index(drop=False)

# write the documents grouped by cluster (ties ordered by 'index')
blackwing_sorted = blackwing_data.sort_values(by=['cluster', 'index']).reset_index(drop=True)
blackwing_sorted.to_csv('blackwing_sorted_by_sim.csv', index=False)

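Here is evidence that syndicated content is captured (e.g. cluster 31 below, where two outlets carry the same booster-effectiveness story), although some clusters (e.g. cluster 18) group articles that look unrelated, so the results still need manual review.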

In [49]:
# clusters whose documents come from more than one source
sources_per_cluster = blackwing_data.groupby(['cluster'])['source_name'].nunique()
multi_source_clusters = sources_per_cluster[sources_per_cluster > 1].index
in_multi_source_cluster = blackwing_data.cluster.isin(multi_source_clusters)

# show the first 25 distinct rows from those clusters, ordered by cluster and source
(
    blackwing_data
    .loc[in_multi_source_cluster, ['source_name', 'headline', 'text', 'cluster']]
    .drop_duplicates()
    .sort_values(['cluster', 'source_name'])
    .head(25)
)

Unnamed: 0,source_name,headline,text,cluster
1385,RT,"US academics say ‘professionalism’ is racist, ...",At this point in history it seems most of the ...,18
2411,South China Morning Post (HKG),Asia’s obsession with the supernatural haunts ...,A politician?s jibe about Kalimantan being fil...,18
2229,Anchorage (AK) Daily News,Boosters wane in effectiveness after 4 months ...,Booster shots of the Pfizer-BioNTech and Moder...,31
2484,Guam Daily Post,"Booster effectiveness wanes after 4 months, bu...",Booster shots of the Pfizer-BioNTech and Moder...,31
6136,WWLP-TV Springfield (MA),"Pols promote funding for clean water, air efforts",BOSTON Mass. (SHNS)--The infrastructure law pa...,33
2277,worldnewsera.com,Fix The Planet Newsletter: When Net Zero Means...,Science Fix the Planet newsletter: When net ze...,33
2369,Business Standard (IND),M-cap of nine top companies erodes by over Rs ...,Reliance Industries Limited the most valued fi...,39
2787,Business Today (IND),Nine of top-19 most valued firms lose over Rs ...,The only gainer from the top-10 pack was Relia...,39
2644,Hindu Business Line (IND),Mcap of nine of top-10 cos erodes by over Rs 1...,Nine of the 10 most valued companies together ...,39
2621,Livemint (IND),Mcap of nine of top-10 companies erodes by ove...,Bharti Airtel's market capitalisation declined...,39


In [50]:
# clusters whose documents come from more than one source
sources_per_cluster = blackwing_data.groupby(['cluster'])['source_name'].nunique()
multi_source_clusters = sources_per_cluster[sources_per_cluster > 1].index
in_multi_source_cluster = blackwing_data.cluster.isin(multi_source_clusters)

# write every distinct row from those clusters, ordered by cluster and source
syndicated_rows = (
    blackwing_data
    .loc[in_multi_source_cluster, ['source_name', 'headline', 'text', 'cluster']]
    .drop_duplicates()
    .sort_values(['cluster', 'source_name'])
)
syndicated_rows.to_csv("syndicated_content_from_simhash.csv", index=False)