In [1]:
import os, pickle

In [2]:
# Root directory for the pickle files this notebook reads and writes directly.
CACHE_PATH = '/scratch/datasets/mog29/unarXive'

# Load the cached IDF table that is later handed to compute_n_gram_meme_score_terms.
# NOTE(review): presumably maps each meme (n-gram) to its inverse document
# frequency — confirm against whatever produced meme_idf.pkl.
# pickle.load can execute arbitrary code, so only load cache files you trust.
idf_path = os.path.join(CACHE_PATH, 'meme_idf.pkl')
with open(idf_path, mode='rb') as idf_file:
    meme_to_idf = pickle.load(idf_file)

In [3]:
from combine_meme_files import get_combined_n_grams, get_combined_metadata

In [4]:
# Build the paper metadata collection with the project helper imported from
# combine_meme_files. The result is passed to compute_overall_frequencies and
# compute_n_gram_meme_score_terms further down.
# NOTE(review): the name suggests a dict keyed by paper id — confirm against
# get_combined_metadata.
paper_to_metadata = get_combined_metadata()

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 32/32 [00:15<00:00,  2.02it/s]


In [5]:
# Build the meme collection with the project helper imported from
# combine_meme_files; it is consumed by compute_n_gram_meme_score_terms below.
# Slow step: the recorded progress bar shows 32 iterations taking about
# 20 minutes in total.
# NOTE(review): presumably maps each n-gram to the papers that contain it —
# confirm against get_combined_n_grams.
meme_to_articles = get_combined_n_grams()

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 32/32 [19:51<00:00, 37.22s/it]


In [6]:
from compute_meme_scores import compute_overall_frequencies, compute_n_gram_meme_score_terms, save_annual_meme_scores

In [7]:
# Years covered by the analysis: 1991 through 2022 inclusive.
years = list(range(1991, 2023))

# One independent counter dict per year, zero-initialised here and handed to
# compute_overall_frequencies in a later cell.
year_to_frequencies = {
    year: dict(num_papers=0, weighted_num_papers=0)
    for year in years
}

# Starts empty and is handed to compute_n_gram_meme_score_terms in a later
# cell; the scoring cell then reads it as meme -> year -> score components.
meme_to_score_components = dict()

In [8]:
# Tally the per-year paper totals from the metadata.
# The return value is not captured, so this call is presumably relied on to
# fill year_to_frequencies in place — confirm against compute_meme_scores.
# NOTE(review): if it does accumulate in place, re-running this cell without
# first re-running the initialisation cell would double count.
compute_overall_frequencies(year_to_frequencies, paper_to_metadata)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1881346/1881346 [00:01<00:00, 990635.28it/s]


In [10]:
# Compute the per-meme, per-year score components.
# The return value is not captured, so this call is presumably relied on to
# fill meme_to_score_components in place — confirm against compute_meme_scores.
# Slow step: the recorded progress bar shows 88,908,424 iterations taking about
# 14 minutes.
compute_n_gram_meme_score_terms(meme_to_score_components, meme_to_articles, paper_to_metadata, meme_to_idf)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 88908424/88908424 [14:21<00:00, 103191.33it/s]


In [12]:
# Checkpoint the score components to the cache directory so the expensive
# computation above does not have to be repeated to recover them.
meme_score_terms_path = os.path.join(CACHE_PATH, 'meme_score_terms.pkl')
with open(meme_score_terms_path, mode='wb') as terms_file:
    pickle.dump(meme_to_score_components, terms_file)

In [14]:
from tqdm import tqdm

In [15]:
# Smoothing constant added to the citation counts in the sticking and sparking
# factors below.
# NOTE(review): this looks like the delta = 3 used in the meme score of
# Kuhn, Perc & Helbing (2014) — confirm against the intended definition.
MEME_SCORE_DELTA = 3


def _compute_meme_score(meme_count, total_count, stick_hits, stick_base,
                        spark_hits, spark_base, delta=MEME_SCORE_DELTA):
    """Combine one year's counts for one meme into a single meme score.

    The score is ``frequency * sticking / sparking`` where::

        frequency = meme_count / total_count
        sticking  = stick_hits / (delta + stick_base)
        sparking  = (delta + spark_hits) / (delta + spark_base)

    The same formula serves the unweighted and the weighted variant; only the
    counts passed in differ.

    Args:
        meme_count: the meme's 'frequency' (or 'weighted_frequency') entry.
        total_count: the year's 'num_papers' (or 'weighted_num_papers') total.
        stick_hits: the 'in_paper_in_citations' entry (or weighted variant).
        stick_base: the 'in_citations' entry (or weighted variant).
        spark_hits: the 'in_paper_not_in_citations' entry (or weighted variant).
        spark_base: the 'not_in_citations' entry (or weighted variant).
        delta: smoothing constant added to the citation counts.

    Returns:
        The meme score as a number.

    Raises:
        ZeroDivisionError: if ``total_count`` is 0 or a smoothed denominator
            evaluates to 0.
    """
    frequency_score = meme_count / total_count
    sticking_score = stick_hits / (delta + stick_base)
    sparking_score = (delta + spark_hits) / (delta + spark_base)
    # Keep this evaluation order, (frequency * sticking) / sparking, so the
    # floating-point results stay reproducible.
    return frequency_score * sticking_score / sparking_score


# meme -> year -> {"meme_score": ..., "weighted_meme_score": ...}
meme_to_year_scores = {}
for meme, meme_year_dict in tqdm(meme_to_score_components.items()):
    year_scores = {}
    for curr_year, year_info in meme_year_dict.items():
        year_totals = year_to_frequencies[curr_year]
        year_scores[curr_year] = {
            "meme_score": _compute_meme_score(
                year_info['frequency'],
                year_totals['num_papers'],
                year_info['in_paper_in_citations'],
                year_info['in_citations'],
                year_info['in_paper_not_in_citations'],
                year_info['not_in_citations'],
            ),
            "weighted_meme_score": _compute_meme_score(
                year_info['weighted_frequency'],
                year_totals['weighted_num_papers'],
                year_info['weighted_in_paper_in_citations'],
                year_info['weighted_in_citations'],
                year_info['weighted_in_paper_not_in_citations'],
                year_info['weighted_not_in_citations'],
            ),
        }
    meme_to_year_scores[meme] = year_scores

# Persist the final scores next to the other cached artefacts.
meme_score_path = os.path.join(CACHE_PATH, 'meme_scores.pkl')
with open(meme_score_path, 'wb') as f:
    pickle.dump(meme_to_year_scores, f)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 14494081/14494081 [03:37<00:00, 66664.29it/s]


In [16]:
# Initial qualitative analysis of the results
def get_sorted_meme_scores_for_year(meme_scores, year):
    """Rank every meme that has scores for ``year``.

    Args:
        meme_scores: mapping of meme -> {year -> {'meme_score': ...,
            'weighted_meme_score': ...}}.
        year: the year to rank; memes without an entry for it are skipped.

    Returns:
        A pair ``(by_meme_score, by_weighted_meme_score)``. Each element is a
        list of ``(meme, score)`` tuples ordered from highest to lowest score;
        memes with equal scores keep the iteration order of ``meme_scores``.
    """
    def score_of(pair):
        # Sort key: the score half of a (meme, score) tuple.
        return pair[1]

    unweighted_ranking, weighted_ranking = [], []
    for meme, scores_by_year in tqdm(meme_scores.items()):
        if year in scores_by_year:
            entry = scores_by_year[year]
            unweighted_ranking.append((meme, entry['meme_score']))
            weighted_ranking.append((meme, entry['weighted_meme_score']))

    # list.sort is stable even with reverse=True, matching sorted().
    unweighted_ranking.sort(key=score_of, reverse=True)
    weighted_ranking.sort(key=score_of, reverse=True)

    return unweighted_ranking, weighted_ranking

In [56]:
# Rank all memes by their 2022 scores.
# Despite the singular names, both results are lists of (meme, score) tuples
# sorted from highest to lowest score: `meme_score` by the unweighted score,
# `weighted_meme_score` by the weighted one.
meme_score, weighted_meme_score = get_sorted_meme_scores_for_year(meme_to_year_scores, 2022)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 14494081/14494081 [00:15<00:00, 908364.39it/s]


In [57]:
# Print the 100 top-ranked memes by unweighted meme score, one
# "<meme> <score>" pair per line.
for meme, score in meme_score[:100]:
    print(meme, score)

trained 0.1773641118589446
learn 0.16598443555810838
neural networks 0.16518028844999447
stateoftheart 0.1548679215546893
architecture 0.14387493651586314
neural network 0.1331927030825037
machine learning 0.1240704648977611
language 0.12256257281467631
deep learning 0.11123663655511944
the training 0.11087881975606667
train 0.11015153909430844
we propose a 0.10666087363209233
human 0.10580723548133623
a novel 0.10031859363454211
to learn 0.09895728689589807
inference 0.09865282298354792
our method 0.09800538070376905
performance of 0.09290599681119366
learned 0.09284015150305906
the input 0.09219680758046873
improve 0.09159292031201619
recognition 0.09157715283152487
evaluation 0.09125910053534302
attention 0.0910063138371464
text 0.08827277072991527
labels 0.08629191359653055
quality 0.08235825204215229
processing 0.08053670415372756
training data 0.0801298211831231
prior 0.07987070658310638
uses 0.07971238852510326
the network 0.07935667688887404
challenging 0.07900048660714747
visu

In [58]:
# Print the 100 top-ranked memes by weighted meme score, one
# "<meme> <score>" pair per line, for comparison with the unweighted ranking.
for meme, score in weighted_meme_score[:100]:
    print(meme, score)

trained 0.1810522361894
learn 0.16803750218735414
neural networks 0.1672987904513681
stateoftheart 0.15752213537776938
architecture 0.14665979525444198
neural network 0.1348350275379179
machine learning 0.1254334265737546
language 0.1231812482385631
deep learning 0.11348443138850736
the training 0.11295072270276416
train 0.11244076260788748
we propose a 0.10867881388305103
human 0.10670393011344359
a novel 0.10229004628842434
to learn 0.10022245873726071
our method 0.09971765489410662
inference 0.09905207832842905
performance of 0.09420467081678638
improve 0.09360692336798379
learned 0.09353560487798242
the input 0.09347454301353736
attention 0.09336471492914196
evaluation 0.09246951694732855
recognition 0.09102087819100287
text 0.0890931628935262
labels 0.08741792491515536
quality 0.08368288468380995
training data 0.08102298354052631
challenging 0.08032964937065402
processing 0.08024994448734463
prior 0.08021128642837637
uses 0.08021076255500915
the network 0.07980687218860455
visual 