## Count single-word frequencies in soc and econ article titles, and the co-occurrence (cooc) of words with each other

- This notebook creates `wos-soc-alltitles-singlecounts.pkl`, `wos-econ-alltitles-singlecounts.pkl`, `wos-econ-limitedtitles-cooc`, `wos-soc-limitedtitles-cooc`

The counter object `cc` returned by `createCounts` can be called to access the counts. The following parameters are recognized:
- cc(t='') - counts of a specific term
- cc(fy=) - counts from a specific year
- cc(fj=) - counts from a specific journal
- cc.summarize() - summary of the dimensions of those matrices
- cc.items('t') - gives the terms

### Load data

In [1]:
import sys
sys.path.append("./../helper") # to find scripts
from load_data import wos_soc, wos_econ   # loads data
from common_imports import *
from helpers import *

Loading articles...
73513 econ articles loaded
190778 soc articles loaded


### Count function

In [1]:
def createCounts(dat, top_terms=False, cooc=False):
    """
    Count title terms for every record in ``dat`` and return the filled counter.

    Parameters
    ----------
    dat : iterable of dict-like records
        Each record is read through ``rec['title']``, ``rec['year']`` and
        ``rec['journal']`` (e.g. ``wos_econ`` or ``wos_soc``).
    top_terms : collection of str, or False
        If truthy, only title tokens contained in ``top_terms`` are counted.
        If falsy (the default), every token is counted.
    cooc : bool
        If True, the co-occurrence combinations ``('t1', 't2')`` and
        ``('fy', 't1', 't2')`` are counted in addition to the single-term ones.

    Returns
    -------
    The object created by ``counter()`` after every record has been passed to
    ``count``. ``prune_zeros()`` is called on it only when ``cooc`` is False.
    """

    cc = counter()

    # Membership tests against a list scan the whole list for every token.
    # Convert once, outside the record loop, to a hashed collection. Strings and
    # sets are left untouched so their original `in` semantics are preserved.
    if top_terms and not isinstance(top_terms, (str, set, frozenset)):
        top_terms = frozenset(top_terms)

    # Combinations counted in every run.
    combos = [
        ('fy', 't'),  # number of times a term occurs per year
        ('fj', 't'),  # number of times a term occurs per journal
        ('fy',),      # total number term occurrences per year
        ('t',),       # total number of terms
        ('fj',),      # total number of terms per journal
    ]
    if cooc:
        # Pairwise term co-occurrence, overall and per year.
        # ('fj', 't1', 't2') is deliberately not counted.
        combos += [
            ('t1', 't2'),
            ('fy', 't1', 't2'),
        ]

    for i, rec in enumerate(dat):

        # i = running record index
        # rec = article in dict form

        # Progress report every 10,000 records.
        if i % 10000 == 0:
            print(f'Record {i:,}')
            cc.summarize()

        # NOTE: split(" ") (explicit separator) keeps empty strings for
        # consecutive spaces, so '' can show up as a term.
        tups = rec['title'].lower().split(" ")
        if top_terms:
            tups = [t for t in tups if t in top_terms]

        count(
            cc,
            values={
                'fy': tryInt(rec['year']),     # publication year of the record
                'fj': tryStr(rec['journal']),  # journal of the record
            },
            terms=tups,
            # Hand over a fresh list per call, as the original code did.
            combinations=list(combos),
        )

    if not cooc:
        cc.prune_zeros()

    return cc

### Count single terms

In [3]:
# Build the single-term counters for both corpora (all title words, no co-occurrences).
cc_soc_single = createCounts(dat=wos_soc)
cc_econ_single = createCounts(dat=wos_econ)

# Persist each counter under its own file stem.
for single_counts, out_name in (
    (cc_soc_single, 'wos-soc-alltitles-singlecounts'),
    (cc_econ_single, 'wos-econ-alltitles-singlecounts'),
):
    single_counts.save_counts(out_name)

Blank counter with no name
Record 0
[]
Record 10,000
[(('fy', 't'), (10, 31590)), (('fj', 't'), (571, 31590)), (('fy',), (10,)), (('t',), (31590,)), (('fj',), (571,))]
Record 20,000
[(('fy', 't'), (22, 49358)), (('fj', 't'), (891, 49358)), (('fy',), (22,)), (('t',), (49358,)), (('fj',), (891,))]
Record 30,000
[(('fy', 't'), (33, 61697)), (('fj', 't'), (1391, 61697)), (('fy',), (33,)), (('t',), (61697,)), (('fj',), (1391,))]
Record 40,000
[(('fy', 't'), (33, 61697)), (('fj', 't'), (1738, 61697)), (('fy',), (33,)), (('t',), (61697,)), (('fj',), (1738,))]
Record 50,000
[(('fy', 't'), (41, 77121)), (('fj', 't'), (2172, 77121)), (('fy',), (41,)), (('t',), (77121,)), (('fj',), (2172,))]
Record 60,000
[(('fy', 't'), (41, 77121)), (('fj', 't'), (2715, 77121)), (('fy',), (41,)), (('t',), (77121,)), (('fj',), (2715,))]
Record 70,000
[(('fy', 't'), (41, 77121)), (('fj', 't'), (2715, 77121)), (('fy',), (41,)), (('t',), (77121,)), (('fj',), (2715,))]
Record 80,000
[(('fy', 't'), (41, 77121)), (('fj

In [4]:
# Peek at the most frequent title words of each field.
topN = 20

# np.argsort sorts ascending, so the last topN positions are the ids with the
# largest entries in counts[('t',)].
soc_totals = cc_soc_single.counts[('t',)]
top_ids_soc = np.argsort(soc_totals)[-topN:]
# Terms come out in the iteration order of ids['t'], not ranked by count.
top_terms_soc = [
    term
    for term, term_id in cc_soc_single.ids['t'].items()
    if term_id in top_ids_soc
]

econ_totals = cc_econ_single.counts[('t',)]
top_ids_econ = np.argsort(econ_totals)[-topN:]
top_terms_econ = [
    term
    for term, term_id in cc_econ_single.ids['t'].items()
    if term_id in top_ids_econ
]

soc_line = ", ".join(top_terms_soc)
econ_line = ", ".join(top_terms_econ)
print("SOC: ", soc_line)
print("ECON: ", econ_line)

SOC:  women, family, social, gender, cultural, policy, american, theory, political, class, politics, health, public, community, life, education, work, state, case, sociol
ECON:  united, economic, growth, theory, price, models, effects, model, market, economics, capital, evidence, labor, industry, demand, trade, policy, states, social, income


### Count co-occurrences

- To count cooc, we first need to reduce the number of words; otherwise the matrices will blow up
- Then we count cooc among the limited set of words

In [5]:
def topTerms(dat, topN):
    """
    Return the ``topN`` terms of ``dat`` with the highest counts.

    Parameters
    ----------
    dat : counter-like object
        Must provide ``dat.items('t')`` (all terms) and be callable as
        ``dat(t=term)`` to obtain the count of a single term.
    topN : int
        Number of terms to return. Values <= 0 yield an empty list.

    Returns
    -------
    list
        At most ``topN`` terms, ordered from lowest to highest count
        (the most frequent term is last).
    """

    # Guard: a slice of [-0:] would return the *whole* list, not an empty one.
    if topN <= 0:
        return []

    # Materialise once: terms are used both to build the lookup and to sort.
    terms = list(dat.items('t'))
    term_counts = {t: dat(t=t) for t in terms}  # term -> count

    # Ascending sort by count; the tail holds the most frequent terms.
    ranked = sorted(terms, key=term_counts.__getitem__)
    return ranked[-topN:]

# Vocabulary size kept per field for the co-occurrence step.
n_top_terms = 1000
top_soc = topTerms(dat=cc_soc_single, topN=n_top_terms)
top_econ = topTerms(dat=cc_econ_single, topN=n_top_terms)

In [6]:
# Count co-occurrences, restricted to each field's own top-term vocabulary.
cc_soc_cooc = createCounts(wos_soc, top_soc, True)
cc_econ_cooc = createCounts(wos_econ, top_econ, True)

# Persist each co-occurrence counter under its own file stem.
for cooc_counts, out_name in (
    (cc_soc_cooc, 'wos-soc-limitedtitles-cooc'),
    (cc_econ_cooc, 'wos-econ-limitedtitles-cooc'),
):
    cooc_counts.save_counts(out_name)

Blank counter with no name
Record 0
[]
Record 10,000
[(('fy', 't'), (10, 1113)), (('fj', 't'), (571, 1113)), (('fy',), (10,)), (('t',), (1113,)), (('fj',), (571,)), (('t1', 't2'), (1113, 1113)), (('fy', 't1', 't2'), (10, 1113, 1113))]
Record 20,000
[(('fy', 't'), (22, 1113)), (('fj', 't'), (891, 1113)), (('fy',), (22,)), (('t',), (1113,)), (('fj',), (891,)), (('t1', 't2'), (1113, 1113)), (('fy', 't1', 't2'), (22, 1113, 1113))]
Record 30,000
[(('fy', 't'), (33, 1113)), (('fj', 't'), (1391, 1113)), (('fy',), (33,)), (('t',), (1113,)), (('fj',), (1391,)), (('t1', 't2'), (1113, 1113)), (('fy', 't1', 't2'), (33, 1113, 1113))]
Record 40,000
[(('fy', 't'), (33, 1113)), (('fj', 't'), (1738, 1113)), (('fy',), (33,)), (('t',), (1113,)), (('fj',), (1738,)), (('t1', 't2'), (1113, 1113)), (('fy', 't1', 't2'), (33, 1113, 1113))]
Record 50,000
[(('fy', 't'), (41, 1113)), (('fj', 't'), (2172, 1113)), (('fy',), (41,)), (('t',), (1113,)), (('fj',), (2172,)), (('t1', 't2'), (1113, 1113)), (('fy', 't1', '