## Count the frequency of single words in soc and econ articles as well as cooc of words with each other

- This notebook creates `wos-soc-alltitles-singlecounts.pkl`, `wos-econ-alltitles-single-counts.pkl`, `wos-econ-limitedtitles-cooc`, `wos-soc-limitedtitles-cooc`

The function cc() can be used to access the counts. The following parameters are recognized:
- cc(t='') - counts of a specific term
- cc(fy=) - counts from a specific year
- cc(fj=) - counts from a specific journal
- cc.summarize() - summary of the dimensions of those matrices
- cc.items('t') - gives the terms

### Load data

In [1]:
import sys
sys.path.append("./../helper") # to find scripts
#from load_data import wos_soc, wos_econ   # loads data
from load_constellate import wos_soc, wos_econ   # loads data
from common_imports import *
from helpers import *

  exec(code_obj, self.user_global_ns, self.user_ns)


### Count function

In [2]:
def createCounts(dat, top_terms=False, cooc=False):
       
    """ 
    This function takes in wos_econ or wos_soc and returns the counts. 
    If top_terms is not False, only terms that are in the list top_terms will be counted.
    If cooc is True, co-occurrences are also counted. 
    """
    
    cc = counter()

    for i, rec in enumerate(dat):
        
        # i = counter
        # rec = article in dict form

        if i % 10000 == 0:
            print(f'Record {i:,}')
            cc.summarize()

        tups = rec['title'].lower().split(" ")
        if(top_terms): tups = [t for t in tups if t in top_terms]
        
        if(cooc):
            count(
                cc,
                values = {
                    'fy': tryInt(rec['year']),    # number of articles per year
                    'fj': tryStr(rec['journal']), # number of articles per journal
                },
                terms = tups,
                combinations = [
                    ('fy','t'),  # number of times a term occurs per year
                    ('fj','t'),  # number of times a term occurs per journal
                    ('fy',),     # total number term occurrences per year
                    ('t',),      # total number of terms
                    ('fj',),     # total number of terms per journal
                    ('t1','t2'),
                    #('fj','t1','t2'),
                    ('fy','t1','t2')
                ]
            )
        else:
            count(
                cc,
                values = {
                    'fy': tryInt(rec['year']),    
                    'fj': tryStr(rec['journal']), 
                },
                terms = tups,
                combinations = [
                    ('fy','t'), 
                    ('fj','t'),  
                    ('fy',),     
                    ('t',),      
                    ('fj',)      
                    #('t1','t2'),
                    #('fj','t1','t2'),
                    #('fy','t1','t2')
                ]
            )

    if not cooc: cc.prune_zeros()
        
    return(cc)

### Count single terms

In [5]:
cc_soc_single = createCounts(wos_soc)
cc_econ_single = createCounts(wos_econ)

cc_soc_single.prune_counts()
cc_econ_single.prune_counts()

cc_soc_single.save_counts('wos-soc-alltitles-singlecounts')
cc_econ_single.save_counts('wos-econ-alltitles-singlecounts')

Blank counter with no name
Record 0
[]
Record 10,000
[(('fy', 't'), (121, 10352)), (('fj', 't'), (571, 10352)), (('fy',), (121,)), (('t',), (10352,)), (('fj',), (571,))]
Record 20,000
[(('fy', 't'), (121, 16175)), (('fj', 't'), (891, 16175)), (('fy',), (121,)), (('t',), (16175,)), (('fj',), (891,))]
Record 30,000
[(('fy', 't'), (121, 20218)), (('fj', 't'), (1113, 20218)), (('fy',), (121,)), (('t',), (20218,)), (('fj',), (1113,))]
Record 40,000
[(('fy', 't'), (121, 20218)), (('fj', 't'), (1391, 20218)), (('fy',), (121,)), (('t',), (20218,)), (('fj',), (1391,))]
Record 50,000
[(('fy', 't'), (121, 25272)), (('fj', 't'), (1391, 25272)), (('fy',), (121,)), (('t',), (25272,)), (('fj',), (1391,))]
Record 60,000
[(('fy', 't'), (121, 25272)), (('fj', 't'), (1738, 25272)), (('fy',), (121,)), (('t',), (25272,)), (('fj',), (1738,))]
Record 70,000
[(('fy', 't'), (151, 25272)), (('fj', 't'), (1738, 25272)), (('fy',), (151,)), (('t',), (25272,)), (('fj',), (1738,))]
Record 80,000
[(('fy', 't'), (151,

AttributeError: 'counter' object has no attribute 'prune_counts'

In [6]:
# Let's look at the top 20 words

topN = 20

top_ids_soc = np.argsort( cc_soc_single.counts[('t',)] )[-topN:]
top_terms_soc = [k for k,i in cc_soc_single.ids['t'].items() if i in top_ids_soc] 

top_ids_econ = np.argsort( cc_econ_single.counts[('t',)] )[-topN:]
top_terms_econ = [k for k,i in cc_econ_single.ids['t'].items() if i in top_ids_econ] 

print("SOC: ", ", ".join( top_terms_soc ))
print("ECON: ", ", ".join( top_terms_econ ))

SOC:  review, article, public, work, family, gender, children, class, effects, social, women, american, life, matter, study, economic, health, education, research, span
ECON:  health, income, economy, analysis, case, policy, economic, development, industry, matter, social, evidence, trade, public, review, article, growth, market, labor, international


### Count co-occurences

- To count cooc, we first needs to reduce the number of words as otherwise, the matrices will blow up
- Then we count cooc among the limited set of words

In [7]:
def topTerms(dat, topN):
    
    # Get topN terms  
    terms = dat.items('t') # all terms
    cdict = {t: dat(t=t) for t in terms} # term/count dictionary
    
    top_terms = sorted(terms, key=lambda x:cdict[x])[-topN: ] # sort dicitonary by counts and return terms
    
    return(top_terms)

top_soc = topTerms(cc_soc_single, 1000) # get top X words
top_econ = topTerms(cc_econ_single, 1000) 

In [8]:
# this one takes just the intersection derived in the other notebook
top = open('intersection_socecon.txt', 'r').read().split("\n")
top = top[:-1]

In [9]:
top_soc = top_econ = top

In [10]:
cc_soc_cooc = createCounts(dat=wos_soc, top_terms=top_soc, cooc=True)
cc_econ_cooc = createCounts(dat=wos_econ, top_terms=top_econ, cooc=True)

cc_soc_cooc.save_counts('wos-soc-limitedtitles-cooc')
cc_econ_cooc.save_counts('wos-econ-limitedtitles-cooc')

Blank counter with no name
Record 0
[]
Record 10,000
[(('fy', 't'), (121, 713)), (('fj', 't'), (571, 713)), (('fy',), (121,)), (('t',), (713,)), (('fj',), (571,)), (('t1', 't2'), (713, 713)), (('fy', 't1', 't2'), (121, 713, 713))]
Record 20,000
[(('fy', 't'), (121, 713)), (('fj', 't'), (891, 713)), (('fy',), (121,)), (('t',), (713,)), (('fj',), (891,)), (('t1', 't2'), (713, 713)), (('fy', 't1', 't2'), (121, 713, 713))]
Record 30,000
[(('fy', 't'), (121, 713)), (('fj', 't'), (1113, 713)), (('fy',), (121,)), (('t',), (713,)), (('fj',), (1113,)), (('t1', 't2'), (713, 713)), (('fy', 't1', 't2'), (121, 713, 713))]
Record 40,000
[(('fy', 't'), (121, 713)), (('fj', 't'), (1391, 713)), (('fy',), (121,)), (('t',), (713,)), (('fj',), (1391,)), (('t1', 't2'), (713, 713)), (('fy', 't1', 't2'), (121, 713, 713))]
Record 50,000
[(('fy', 't'), (121, 713)), (('fj', 't'), (1391, 713)), (('fy',), (121,)), (('t',), (713,)), (('fj',), (1391,)), (('t1', 't2'), (713, 713)), (('fy', 't1', 't2'), (121, 713, 71