In [31]:
import json

from os.path import join
import numpy as np
from collections import Counter, defaultdict
import pandas as pd
from s2and.data import PDData
from s2and.consts import PROJECT_ROOT_PATH
from pys2.pys2 import _evaluate_redshift_query, _load_dataframe_to_redshift


import matplotlib.pyplot as plt
import seaborn as sns
sns.set(context='talk')

In [32]:
# Load the v2 paper records from disk. The result is a dict keyed by paper id;
# later cells read each record's 'block' and 'title' fields.
papers_v2_path = '/net/nfs2.s2-research/shaurya/projects/s2pac/final_formatting/data/papers.v2.json'
with open(papers_v2_path, 'r') as papers_file:
    papers = json.load(papers_file)


In [29]:
# One-time construction of the bucket lookup, kept commented out for provenance.
# It expanded every production bucket in the CSV into a paper-id -> bucket-key map
# (registering both the str and the int form of each id), restricted that map to
# the ids present in `papers`, and cached the result as prod_buckets.v2.json.
# NOTE(review): the builder below calls eval() on the `prod_bucket` column; if it
# is ever revived, ast.literal_eval is the safer way to parse that column.
#
# buckets = pd.read_csv('/net/nfs2.s2-research/shaurya/projects/s2pac/clustering_data_gathering/prod_buckets.800M.v4.csv')
# buckets_map = {}
# for row in buckets.itertuples():
#     for paper_id in eval(row.prod_bucket):
#         buckets_map[str(paper_id)] = row.key_35_w_colon
#         buckets_map[int(paper_id)] = row.key_35_w_colon
#
# buckets_map_sub = {}
# for paper in papers.keys():
#     buckets_map_sub[paper] = buckets_map[paper]
#
# # save buckets_map_sub so we don't have to do this again
# with open('/net/nfs2.s2-research/shaurya/projects/s2pac/clustering_data_gathering/prod_buckets.v2.json', 'w') as f:
#     json.dump(buckets_map_sub, f)

# Normal path: read the cached paper-id -> bucket-key map produced by the code above.
buckets_cache_path = '/net/nfs2.s2-research/shaurya/projects/s2pac/clustering_data_gathering/prod_buckets.v2.json'
with open(buckets_cache_path, 'r') as buckets_file:
    buckets_map = json.load(buckets_file)

In [48]:
# step 0: many papers are in the wrong block. delete them
# A paper counts as misplaced when its own 'block' field disagrees with the
# bucket key recorded for it in buckets_map (looked up by the str form of its id).
papers_to_delete = [
    pid
    for pid, record in papers.items()
    if record['block'] != buckets_map[str(pid)]
]
# Report the size before deletion next to the number of papers about to go.
print(len(papers), len(papers_to_delete))

# Deletion happens in a second pass so the dict is never mutated while iterated.
for pid in papers_to_delete:
    del papers[pid]

2911289 15520


In [49]:
# step 1: every cluster may only contain papers that (a) survived step 0 and
# (b) live in the cluster's own block. Members failing either test are dropped
# from that cluster; clusters left with no members are deleted outright.
clusters_v2_path = '/net/nfs2.s2-research/shaurya/projects/s2pac/final_formatting/data/clusters.v2.json'
with open(clusters_v2_path, 'r') as clusters_file:
    clusters = json.load(clusters_file)

times_trimmed = 0  # clusters that lost at least one member
cluster_ids_to_delete = set()  # clusters that lost every member
for cluster_id, cluster in clusters.items():
    original_members = cluster['paper_ids']
    # The block name is the part of the cluster id before the first underscore.
    cluster_block = cluster_id.split('_')[0]

    kept_members = []
    for member in original_members:
        member_key = str(member)  # `papers` is keyed by the string form of the id
        if member_key in papers and papers[member_key]['block'] == cluster_block:
            kept_members.append(member)

    if kept_members:
        cluster['paper_ids'] = kept_members
    else:
        cluster_ids_to_delete.add(cluster_id)

    if len(kept_members) < len(original_members):
        times_trimmed += 1
print(times_trimmed, len(cluster_ids_to_delete))

# Remove the emptied clusters only after the scan, so the dict keeps its size during iteration.
for empty_cluster_id in cluster_ids_to_delete:
    del clusters[empty_cluster_id]

23729 20


In [50]:
# step 2: a bunch of papers are in clusters AND in orphans. we need to remove them from the orphans
#
# Paper ids inside clusters may be stored as int or str, so the index below is
# keyed by str(paper_id). Without that normalisation a paper stored as an int in
# one cluster and as a str in another would look like two different papers and
# the duplicate would never be detected.
papers_to_clusters = defaultdict(set)
for cluster_id, cluster in clusters.items():
    for paper_id in cluster['paper_ids']:
        papers_to_clusters[str(paper_id)].add(cluster_id)

total_removed = 0  # number of (paper, orphan cluster) memberships dropped
for paper_key, cluster_ids in papers_to_clusters.items():
    if len(cluster_ids) <= 1:
        continue
    # Blocks (cluster-id prefix before the first '_') in which this paper already
    # belongs to a real, non-orphan cluster.
    nonorphan_blocks = {
        c.split('_')[0] for c in cluster_ids if not c.endswith('_orphans')
    }
    # If the paper is in an orphan cluster and there is a non-orphan cluster in
    # the same block, the orphan membership is redundant.
    to_remove = [
        c for c in cluster_ids
        if c.endswith('_orphans') and c.split('_')[0] in nonorphan_blocks
    ]
    total_removed += len(to_remove)
    for cluster_id in to_remove:
        members = clusters[cluster_id]['paper_ids']
        # Compare on the string form so the id is matched whatever type it was
        # stored as; this replaces the old bare `except:` that guessed the type
        # and swallowed every exception. Slice assignment keeps the same list object.
        members[:] = [m for m in members if str(m) != paper_key]
total_removed

0

In [52]:
# step 3a: a bunch of papers are in multiple clusters in the SAME block. probably easiest to just remove these blocks entirely
# Rebuild the paper -> owning-clusters index from the current cluster contents.
papers_to_clusters = defaultdict(set)
for cluster_id, cluster in clusters.items():
    for member in cluster['paper_ids']:
        papers_to_clusters[member].add(cluster_id)

# Ids owned by more than one cluster; both the int and the str spelling of each
# id are recorded in the set.
multi_cluster_ids = [p for p, owners in papers_to_clusters.items() if len(owners) > 1]
papers_in_multiple_clusters = {int(p) for p in multi_cluster_ids} | {str(p) for p in multi_cluster_ids}

# Every block that contains at least one such paper is slated for removal.
blocks_to_remove = {papers[str(p)]['block'] for p in papers_in_multiple_clusters}
len(blocks_to_remove)

200

In [53]:
# step 3b: find blocks containing papers with uninformative titles and mark them for removal
from s2and.text import normalize_text

# Titles that, after normalize_text, are considered too generic to keep.
bad_paper_names = {
    'comment',
    'comments',
    'discussion',
    'book',
    'response',
    'letter',
    'reply',
    'reply to',
    're',
    'withdrawn',
    'note',
    'notes',
    'note on',
    'notes on',
    "author correction",
    "book review",
    "book reviews",
    "case report",
    "characterization",
    "cheminform abstract",
    "commentary",
    "copyright",
    "correction to",
    "correction",
    "editorial board",
    "editorial",
    "erratum",
    "feature article",
    "in this issue",
    "introduction",
    "libguides",
    "pii",
    "republished",
    "response to",
    "retracted article",
    "retracted",
    "review",
    "withdrawal"
}

paper_ids_that_are_bad = set()
# Start from a COPY of the step-3a result. Binding the same set object (as the
# code previously did) made the loop below silently grow `blocks_to_remove` too,
# so re-running or inspecting step 3a's result afterwards gave polluted data.
blocks_that_are_bad = set(blocks_to_remove)
for paper_id, record in papers.items():
    if normalize_text(record['title']) in bad_paper_names:
        paper_ids_that_are_bad.add(paper_id)
        blocks_that_are_bad.add(record['block'])

# (number of bad-title papers, total number of blocks marked bad so far)
print(len(paper_ids_that_are_bad), len(blocks_that_are_bad))

0 200


In [54]:
# some blocks now have no non-orphan clusters in them, so we can just drop them
# Collect the block prefix (text before the first '_') of every cluster whose id
# does not end in '_orphans'; blocks absent from this set are filtered out later.
non_orphan_blocks = set()
for cluster_key in clusters:
    if cluster_key.endswith('_orphans'):
        continue
    non_orphan_blocks.add(cluster_key.split('_')[0])

In [55]:
# Keep a paper only if its block was not marked bad and still has at least one
# non-orphan cluster.
papers_sub = {}
for pid, record in papers.items():
    paper_block = record['block']
    if paper_block in non_orphan_blocks and paper_block not in blocks_that_are_bad:
        papers_sub[pid] = record

# Keep a cluster only if every one of its members survived the paper filter.
clusters_sub = {
    cid: cluster
    for cid, cluster in clusters.items()
    if all(str(member) in papers_sub for member in cluster['paper_ids'])
}

# Sanity check: the surviving clusters must cover exactly the surviving papers.
papers_in_clusters = {
    str(member)
    for cluster in clusters_sub.values()
    for member in cluster['paper_ids']
}

assert papers_in_clusters == set(papers_sub.keys())
len(clusters_sub), len(papers_sub)

(256483, 2891909)

In [57]:
# Put every surviving paper id into a one-column DataFrame and hand it to the
# project's Redshift loader under a temporary table name, so the next cell can
# join against it.
# NOTE(review): exact semantics of create_table / write_privileges are defined
# by pys2 and not visible here.
sourced_paper_ids = pd.DataFrame({'sourced_paper_id': list(papers_sub)})

_load_dataframe_to_redshift(
    sourced_paper_ids,
    "public.temp_clustering_cluster_source_id",
    write_privileges=True,
    create_table=True,
)

In [58]:
# Fetch source / source_id / pdf_hash for every id in the temporary table and
# merge the non-null values into the matching papers_sub records (in place).
query = """select ps.sourced_paper_id, ps.source, ps.source_id, pdf_hash from content.paper_sources ps join public.temp_clustering_cluster_source_id t on ps.sourced_paper_id = t.sourced_paper_id"""
df_source_ids = _evaluate_redshift_query(query)
print(len(df_source_ids))
for row in df_source_ids.itertuples():
    paper = papers_sub[str(row.sourced_paper_id)]
    # The source stored with the paper must agree with what the database reports.
    assert paper['source'] == row.source, (
        f"source mismatch for paper {row.sourced_paper_id}: "
        f"{paper['source']!r} != {row.source!r}"
    )
    # SQL NULLs can surface in a DataFrame as None, NaN or pd.NA depending on the
    # column dtype. `is not None` only catches the first, which would let NaN be
    # stored and later written out as invalid JSON. pd.notna rejects all three.
    if pd.notna(row.source_id):
        paper['source_id'] = row.source_id
    if pd.notna(row.pdf_hash):
        paper['pdf_hash'] = row.pdf_hash

2766227


In [59]:
# Tally how many paper records carry each field name, then list the fields
# from most to least common.
fields = Counter(
    field_name
    for record in papers_sub.values()
    for field_name in record
)

fields.most_common()

[('authors', 2891909),
 ('abstract', 2891909),
 ('references', 2891909),
 ('paper_id', 2891909),
 ('source', 2891909),
 ('title', 2891909),
 ('block', 2891909),
 ('corpus_paper_id', 2891909),
 ('year', 2784230),
 ('source_id', 2766227),
 ('journal_name', 2122651),
 ('doi', 1802724),
 ('venue', 1191272),
 ('fieldsofstudy', 1066676),
 ('pdf_hash', 995240),
 ('publicationdate', 977640),
 ('publicationtypes', 394136),
 ('pmid', 332049),
 ('openaccesslocation', 195667),
 ('publisher', 185063),
 ('updatedate', 1801),
 ('bibliography', 80)]

In [60]:
# Persist the filtered papers and clusters as the v3 snapshot (papers first,
# then clusters).
for out_path, payload in (
    ('/net/nfs2.s2-research/shaurya/projects/s2pac/final_formatting/data/papers.v3.json', papers_sub),
    ('/net/nfs2.s2-research/shaurya/projects/s2pac/final_formatting/data/clusters.v3.json', clusters_sub),
):
    with open(out_path, 'w') as out_file:
        json.dump(payload, out_file)

In [14]:
# Second stage: downsample the blocks that have 100% accuracy.
# Start by reloading the v3 snapshot; note that this rebinds `papers` and
# `clusters` to the v3 contents.
papers_v3_path = '/net/nfs2.s2-research/shaurya/projects/s2pac/final_formatting/data/papers.v3.json'
clusters_v3_path = '/net/nfs2.s2-research/shaurya/projects/s2pac/final_formatting/data/clusters.v3.json'

with open(papers_v3_path, 'r') as papers_file:
    papers = json.load(papers_file)

with open(clusters_v3_path, 'r') as clusters_file:
    clusters = json.load(clusters_file)

In [16]:
# now let's drop a subset of the "easy" blocks
#
# Selection order (later steps override earlier ones):
#   1. a random half of the blocks whose accuracy is exactly 1.0 ("easy")
#   2. plus every block whose accuracy is not 1.0
#   3. plus every block with keep_count > 0
#   4. minus every block with remove_count > 0 (this wins over step 3 when a
#      block appears in both sets)
#
# The random draw uses a fixed seed so that Restart & Run All reproduces the
# same subset. The draw was previously unseeded, so an already-saved v3_hard
# file will not match this selection.
DOWNSAMPLE_SEED = 0
rng = np.random.RandomState(DOWNSAMPLE_SEED)

df = pd.read_csv(join(PROJECT_ROOT_PATH, "data", "block_removal_candidates.csv"))
blocks_to_keep = set(df['block'][df.keep_count > 0].values)
blocks_to_drop = set(df['block'][df.remove_count > 0].values)
easy_blocks = df['block'][df.accuracy == 1.0].values
# Rows with a missing accuracy land here too, since NaN != 1.0 evaluates to True.
not_easy_blocks = df['block'][df.accuracy != 1.0].values
print(len(blocks_to_keep.intersection(blocks_to_drop)), len(blocks_to_keep), len(blocks_to_drop), len(easy_blocks), len(not_easy_blocks))
# choose a random half of the easy blocks and keep all the hard blocks
keep_blocks = set(rng.choice(easy_blocks, size=len(easy_blocks)//2, replace=False))
print(len(keep_blocks))
keep_blocks.update(not_easy_blocks)
print(len(keep_blocks))
keep_blocks.update(blocks_to_keep)
print(len(keep_blocks))
keep_blocks -= blocks_to_drop
print(len(keep_blocks))

58 9978 1410 67217 13098
33608
46706
51023
49897


In [17]:
# Keep only the papers whose block was selected in keep_blocks.
papers_sub = {}
for pid, record in papers.items():
    if record['block'] in keep_blocks:
        papers_sub[pid] = record

# Keep only the clusters whose members are all present in papers_sub.
clusters_sub = {
    cid: cluster
    for cid, cluster in clusters.items()
    if all(str(member) in papers_sub for member in cluster['paper_ids'])
}

# Sanity check: the surviving clusters must cover exactly the surviving papers.
papers_in_clusters = {
    str(member)
    for cluster in clusters_sub.values()
    for member in cluster['paper_ids']
}

assert papers_in_clusters == set(papers_sub.keys())

In [18]:
# Persist the downsampled papers and clusters as the v3_hard snapshot (papers
# first, then clusters).
for out_path, payload in (
    ('/net/nfs2.s2-research/shaurya/projects/s2pac/final_formatting/data/papers.v3_hard.json', papers_sub),
    ('/net/nfs2.s2-research/shaurya/projects/s2pac/final_formatting/data/clusters.v3_hard.json', clusters_sub),
):
    with open(out_path, 'w') as out_file:
        json.dump(payload, out_file)