In [1]:
from knowknow import *

In [2]:
from knowknow.datasources import jstor

In [8]:
jstor.jstor_counter?

[1;31mInit signature:[0m
[0mjstor[0m[1;33m.[0m[0mjstor_counter[0m[1;33m([0m[1;33m
[0m    [0mjstor_zip_base[0m[1;33m,[0m[1;33m
[0m    [0moutput_database[0m[1;33m=[0m[1;34m'default-database-name'[0m[1;33m,[0m[1;33m
[0m    [0mname_blacklist[0m[1;33m=[0m[1;33m[[0m[1;33m][0m[1;33m,[0m[1;33m
[0m    [0mRUN_EVERYTHING[0m[1;33m=[0m[1;32mFalse[0m[1;33m,[0m[1;33m
[0m    [0mcomplex_parsing[0m[1;33m=[0m[1;32mFalse[0m[1;33m,[0m[1;33m
[0m    [0mgroups[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0mgroup_reps[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0mcitations_filter[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0mjournals_filter[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0mdebug[0m[1;33m=[0m[1;32mFalse[0m[1;33m,[0m[1;33m
[0m    [0mCONSOLIDATE_ITERS[0m[1;33m=[0m[1;36m0[0m[1;33m,[0m[1;33m
[0m    [0mterm_whitelist[0m[1;33m=[0m[0mset[0m[1;33m([0m[1;33m

In [12]:
class sparse_counter( jstor.jstor_counter ):
    """First-pass counter: tallies every 1-word and 2-word term of a document.

    For each document it registers the year, the journal, the
    (journal, year) pair, and each *distinct* term found in the text.
    """

    def account_for(self, doc):
        """Register the counts for a single document.

        Reads ``doc['year']``, ``doc['journal']``, ``doc['doi']`` and
        ``doc['content']``. Every count goes through ``self.cnt`` with the
        document's DOI passed as the document identifier.
        """
        doi = doc['doi']
        year = doc['year']
        journal = doc['journal']

        # document-level counts: year, journal, journal-year
        self.cnt(year, 'fy', doi)
        self.cnt(journal, 'fj', doi)
        self.cnt((journal, year), 'fj.fy', doi)

        # Normalise the text: lowercase (still debating lowercasing...), drop
        # everything that is not a letter or whitespace, collapse whitespace.
        # Raw strings keep `\s` from being parsed as a (deprecated) string escape.
        # Hyphens are removed here, so a '-' in a term can only be the 2-tuple
        # joiner added below.
        text = doc['content'].lower()
        text = re.sub(r"[^a-zA-Z\s]+", "", text)
        text = re.sub(r"\s+", " ", text)
        words = text.strip().split()

        # strip stopwords
        words = [w for w in words if w not in self.stopwords]

        # distinct two-word *ordered* tuples ("first-second"), plus distinct single words
        terms = {"-".join(pair) for pair in zip(words, words[1:])}
        terms.update(words)

        # Optional whitelist. Membership is tested against a set so the filter
        # stays linear in the number of terms even when a list was supplied.
        whitelist = self.term_whitelist
        if len(whitelist):
            if not isinstance(whitelist, (set, frozenset)):
                whitelist = set(whitelist)
            terms = [t for t in terms if t in whitelist]

        # just term count, in case we are using the `basic` mode
        for t1 in terms:
            # term
            self.cnt((t1,), 't', doi)

            # NOTE(review): year and journal are registered again once per
            # term, on top of the document-level calls above. Kept as in the
            # original; confirm whether this repetition is intended.
            self.cnt(year, 'fy', doi)
            self.cnt(journal, 'fj', doi)

In [13]:
# Build the first-pass (sparse) counter over the JSTOR archive.
# NOTE(review): `jstor_zip_base` is a machine-specific absolute path; it must be
# edited before this cell can run elsewhere.
# NOTE(review): `SKIP_N` evaluates to 49 here, while the notes further down
# describe this pass as using 1/10 of the documents and the second pass uses
# SKIP_N=9 — confirm which sampling rate is intended.
c = sparse_counter(
    jstor_zip_base = "G:/My Drive/2020 ORGANISATION/1. PROJECTS/qualitative analysis of literature/110 CITATION ANALYSIS/000 data/sociology jstor",
    #jstor_zip_base = "Z:/google_drive/1. PROJECTS/qualitative analysis of literature/110 CITATION ANALYSIS/000 data/sociology jstor",
    output_database = 'sociology-jstor-sparse-simple',
    RUN_EVERYTHING=False, SKIP_N = 10*5 - 1
)

In [14]:
c.count()

Will print updated statistics every 20 documents.
Iterating over  248380 documents
Document 0 ... 0 journals... 0 cited works... 0 authors... 0 terms used... 0 'social' terms
Document 20 ... 11 journals... 0 cited works... 0 authors... 44882 terms used... 13 'social' terms
Document 40 ... 20 journals... 0 cited works... 0 authors... 107982 terms used... 29 'social' terms
Document 60 ... 24 journals... 0 cited works... 0 authors... 171168 terms used... 43 'social' terms
Document 80 ... 27 journals... 0 cited works... 0 authors... 225654 terms used... 55 'social' terms
Document 100 ... 28 journals... 0 cited works... 0 authors... 273704 terms used... 69 'social' terms
Document 120 ... 31 journals... 0 cited works... 0 authors... 330656 terms used... 83 'social' terms
Document 140 ... 33 journals... 0 cited works... 0 authors... 373597 terms used... 91 'social' terms
Document 160 ... 34 journals... 0 cited works... 0 authors... 429571 terms used... 108 'social' terms
Document 180 ... 34 j

In [17]:
c.output_database

'default-database-name'

In [18]:
len(c.doc['t'])

2740481

In [19]:
# The counter reported 'default-database-name' in the cell above even though
# `output_database` was passed to the constructor, so the target database name
# is set explicitly before saving.
# NOTE(review): presumably the constructor argument was not applied — confirm in jstor_counter.
c.output_database = 'sociology-jstor-sparse-simple'
c.save_counters()

Data file not found. Looking for entry in Harvard Dataverse...
No entry found in Harvard dataverse.


Create new folder with name `sociology-jstor-sparse-simple`?:  yes


loading variable sociology-jstor-sparse-simple/_attributes from disk
loading variable sociology-jstor-sparse-simple/groups from disk


# first filter based on number of times each term was mentioned

We want this to be as wide a net as we can stand for the subsequent filtering steps.
I cannot keep track of yearly counts of all terms, or even simple term counts (at least, not without dynamic consolidation), because of RAM limitations.
Remember that I'm keeping track of all words, but also all two-word tuples!
So this first stage is only using 1/10 of the documents (see above, SKIP_N).

<!--At this point, subsetting the data now that we have a terms list will not help our memory issue.-->
The term-term co-occurrence network uses $N^2$ integers, which with 1M terms is $(10^6)^2 = 10^{12}$ entries, i.e. on the order of 1TB of memory.
If we can limit to ~30,000 terms, approximately the working vocabulary of an adult, this would fit in 1GB of memory.
We can filter by term *dynamics* instead of just relying on overall counts.
We can also limit the 1-tuples and 2-tuples independently, reserving 15,000 terms for each.
2-tuples currently make up the majority of our counts, but are not inherently more important than 1-tuples.

In [2]:
from knowknow import *

In [3]:
c = Dataset('sociology-jstor-sparse-simple')

loading variable sociology-jstor-sparse-simple/_attributes from disk
loading variable sociology-jstor-sparse-simple/groups from disk


In [9]:
cc = c.by('t').docs
list(cc.items())[:10]

[(t(t='social'), 954),
 (t(t='globalization-market'), 2),
 (t(t='beginning-transformation'), 1),
 (t(t='ethnic-problems'), 1),
 (t(t='andrzej-rychard'), 1),
 (t(t='based-strength'), 2),
 (t(t='greatest-contribution'), 4),
 (t(t='partially-explained'), 6),
 (t(t='exactly-differences'), 1),
 (t(t='year'), 591)]

In [7]:
# Number of top terms kept *per class*: the selection cell below keeps the
# `target_N` 1-tuples and the `target_N` 2-tuples with the highest document counts.
# NOTE(review): the notes above discuss ~15,000 terms per class; 500 is what is used here.
target_N = 500

In [13]:
# Total number of documents = sum of the per-year document counts.
ndocs = sum(c.by('fy').docs.values())

# Sanity check: no term may appear in more documents than exist.
assert all(doc_count <= ndocs for doc_count in cc.values())

# Disabled alternative (kept for reference): instead of taking the top N, discard
# terms by document-frequency thresholds —
#   cutoff_bot   = 0.001  (present in fewer than 1/1000 of the documents)
#   cutoff_count = 2      (present in fewer than 2 documents)
#   cutoff_top   = 0.500  (present in more than 50% of the documents)
# keeping terms with max(cutoff_count, cutoff_bot*ndocs) <= count <= cutoff_top*ndocs.

# Rank each class by document count (ascending) and keep the last `target_N`.
# A '-' in the term string marks a two-word tuple.
tups1 = sorted(
    (term_key for term_key in cc if '-' not in term_key[0]),
    key=lambda term_key: cc[term_key],
)[-target_N:]

tups2 = sorted(
    (term_key for term_key in cc if '-' in term_key[0]),
    key=lambda term_key: cc[term_key],
)[-target_N:]

terms_to_keep = tups1 + tups2

loading variable sociology-jstor-sparse-simple/doc ___ fy from disk


In [14]:
# Unwrap each `t(t=...)` key to its bare term string.
# Built from tups1 + tups2 rather than from `terms_to_keep` itself, so that
# re-running this cell is harmless: unwrapping an already-unwrapped list would
# silently reduce every term to its first character.
terms_to_keep = [term_key[0] for term_key in tups1 + tups2]

In [29]:
from random import sample
print(sample(terms_to_keep, 50))

['indicate', 'one', 'despite', 'recent', 'london-routledge', 'descriptive-statistics', 'ed', 'sociological-theory', 'social-system', 'social-scientists', 'order', 'policy', 'processes', 'article', 'per-cent', 'source', 'sociology-social', 'compared', 'decision-making', 'great-deal', 'high-levels', 'seems', 'cultural', 'j-c', 'seen', 'percent-percent', 'persons', 'among', 'may-seem', 'factor-analysis', 'basis', 'york-mcgrawhill', 'provide', 'two-groups', 'government', 'behavior', 'changing', 'reported-table', 'another-way', 'c-j', 'social-mobility', 'oaks-ca', 'yale-university', 'much-higher', 'provides', 'must', 'literature', 'sample-size', 'institute-social', 'higher-education']


In [31]:
print(f"Reducing the number of terms from {len(cc):0,} to {len(terms_to_keep):0,}.")
#print(f"To those terms present in a minimum of {ndoc_bot} docs, and maximum of {cutoff_top*ndocs} docs.")

Reducing the number of terms from 2,740,481 to 1,000.


In [32]:
import pickle
with open('terms.pickle', 'wb') as outf:
    pickle.dump(terms_to_keep,outf)

In [None]:
# A bare `raise` with no active exception raises RuntimeError.
# NOTE(review): presumably a deliberate stop so "Run All" does not continue into
# the second-pass cells below — confirm, and consider splitting the notebook instead.
raise

In [None]:
# store terms_to_keep

# recount with fuller detail

In [1]:
from knowknow import *

In [2]:
from knowknow.datasources import jstor

In [3]:
class full_counter( jstor.jstor_counter ):
    """Second-pass counter: term counts plus windowed term-term co-occurrence.

    Terms are the document's words and adjacent word pairs, kept in reading
    order. When ``RUN_EVERYTHING`` is set, each term is additionally counted
    by year, by journal, and against the next ``COOCCURRENCE_WINDOW`` terms.
    """

    # How many following terms (after whitelist filtering) each term is paired with.
    COOCCURRENCE_WINDOW = 5

    def account_for(self, doc):
        """Register the counts for a single document.

        Reads ``doc['year']``, ``doc['journal']``, ``doc['doi']`` and
        ``doc['content']``. Every count goes through ``self.cnt`` with the
        document's DOI passed as the document identifier.
        """
        doi = doc['doi']
        year = doc['year']
        journal = doc['journal']

        # document-level counts: year, journal, journal-year
        self.cnt(year, 'fy', doi)
        self.cnt(journal, 'fj', doi)
        self.cnt((journal, year), 'fj.fy', doi)

        # Normalise the text: lowercase (still debating lowercasing...), drop
        # everything that is not a letter or whitespace, collapse whitespace.
        # Raw strings keep `\s` from being parsed as a (deprecated) string escape.
        text = doc['content'].lower()
        text = re.sub(r"[^a-zA-Z\s]+", "", text)
        text = re.sub(r"\s+", " ", text)
        words = text.strip().split()

        # strip stopwords
        words = [w for w in words if w not in self.stopwords]

        # two-word *ordered* tuples, one per adjacent pair of words
        pairs = ["-".join(pair) for pair in zip(words, words[1:])]

        # Keep everything in reading order by interleaving:
        #   word0, word0-word1, word1, word1-word2, ..., wordN
        terms = []
        for position, word in enumerate(words):
            terms.append(word)
            if position < len(pairs):
                terms.append(pairs[position])

        # Optional whitelist. Membership is tested against a set so the filter
        # stays linear in the number of terms even when a list was supplied.
        whitelist = self.term_whitelist
        if len(whitelist):
            if not isinstance(whitelist, (set, frozenset)):
                whitelist = set(whitelist)
            terms = [t for t in terms if t in whitelist]

        window = self.COOCCURRENCE_WINDOW

        # just term count, in case we are using the `basic` mode
        for i1, t1 in enumerate(terms):
            # term
            self.cnt((t1,), 't', doi)

            if not self.RUN_EVERYTHING:
                continue

            # term year
            self.cnt((year, t1), 'fy.t', doi)

            # term journal
            self.cnt((journal, t1), 'fj.t', doi)

            # (term, author) counting into 'fa.t' over doc['authors'] is
            # intentionally disabled here.

            # Pair t1 with the terms that follow it inside the window. Pairs
            # are recorded in order of appearance, so (a, b) and (b, a) are
            # distinct keys.
            for t2 in terms[i1 + 1:i1 + 1 + window]:
                self.cnt((t1, t2), 't1.t2', doi)

In [4]:
import pickle

with open('terms.pickle', 'rb') as inf:
    terms_to_keep = pickle.load(inf)

In [5]:
len(terms_to_keep)

1000

In [7]:
# Second pass: recount with RUN_EVERYTHING=True, restricted to the terms
# loaded from terms.pickle.
# NOTE(review): `terms_to_keep` is a list here; presumably jstor_counter accepts
# any container supporting `len` and `in` for `term_whitelist` — confirm.
# NOTE(review): `jstor_zip_base` is a machine-specific absolute path.
c2 = full_counter(
    jstor_zip_base = "G:/My Drive/2020 ORGANISATION/1. PROJECTS/qualitative analysis of literature/110 CITATION ANALYSIS/000 data/sociology jstor",
    output_database = 'sociology-jstor-all',
    RUN_EVERYTHING=True, complex_parsing=False,
    term_whitelist=terms_to_keep,
    SKIP_N=9
)

In [8]:
c2.count()

Will print updated statistics every 100 documents.
Iterating over  248380 documents
Document 0 ... 0 journals... 0 cited works... 0 authors... 0 terms used... 0 'social' terms
Document 100 ... 29 journals... 0 cited works... 0 authors... 1000 terms used... 74 'social' terms
Document 200 ... 38 journals... 0 cited works... 0 authors... 1000 terms used... 150 'social' terms
Document 300 ... 41 journals... 0 cited works... 0 authors... 1000 terms used... 218 'social' terms
Document 400 ... 42 journals... 0 cited works... 0 authors... 1000 terms used... 286 'social' terms
Document 500 ... 44 journals... 0 cited works... 0 authors... 1000 terms used... 357 'social' terms
Document 600 ... 44 journals... 0 cited works... 0 authors... 1000 terms used... 431 'social' terms
Document 700 ... 44 journals... 0 cited works... 0 authors... 1000 terms used... 502 'social' terms
Document 800 ... 44 journals... 0 cited works... 0 authors... 1000 terms used... 570 'social' terms
Document 900 ... 45 journ

In [9]:
len(c2.doc['t1.t2'])

745670

In [10]:
len(c2.doc['t'])

1000

In [11]:
sample(list(c2.doc['t1.t2']), 10)

[('next', 'simply'),
 ('years-ago', 'resources'),
 ('social-forces', 'negative-effect'),
 ('socioeconomic-status', 'course'),
 ('history', 'may-lead'),
 ('variable', 'effect'),
 ('following', 'appear'),
 ('independent-variables', 'remain'),
 ('across', 'per-capita'),
 ('relationships', 'asked')]

In [12]:
c2.save_counters()

loading variable sociology-jstor-all/_attributes from disk
loading variable sociology-jstor-all/groups from disk


# older and slower sentence-based co-occurrence method

In [None]:
class full_counter( jstor.jstor_counter ):
    """Sentence-based co-occurrence counter (older, slower variant).

    The document is split into sentences and every pair of distinct terms
    sharing a sentence is counted as co-occurring.

    NOTE(review): this class has the same name as the window-based
    ``full_counter`` defined earlier in this notebook, so running this cell
    replaces that definition.
    """

    def account_for(self, doc):
        """Register the counts for a single document.

        Reads ``doc['year']``, ``doc['journal']``, ``doc['doi']`` and
        ``doc['content']``. Every count goes through ``self.cnt`` with the
        document's DOI passed as the document identifier.
        """
        doi = doc['doi']
        year = doc['year']
        journal = doc['journal']

        # document-level counts: year, journal, journal-year
        self.cnt(year, 'fy', doi)
        self.cnt(journal, 'fj', doi)
        self.cnt((journal, year), 'fj.fy', doi)

        # Resolve the optional whitelist once per document. Membership is
        # tested against a set so filtering stays linear in the number of
        # terms even when a list was supplied.
        whitelist = self.term_whitelist
        use_whitelist = bool(len(whitelist))
        if use_whitelist and not isinstance(whitelist, (set, frozenset)):
            whitelist = set(whitelist)

        # NOTE(review): `nlp` is not defined in the visible notebook; it is
        # presumably a spaCy-style pipeline whose result exposes `.sents` — confirm.
        for sent in nlp(doc['content']).sents:
            # Normalise the sentence: lowercase (still debating lowercasing...),
            # drop everything that is not a letter or whitespace, collapse whitespace.
            # Raw strings keep `\s` from being parsed as a (deprecated) string escape.
            text = str(sent).lower()
            text = re.sub(r"[^a-zA-Z\s]+", "", text)
            text = re.sub(r"\s+", " ", text)
            words = text.strip().split()

            # strip stopwords
            words = [w for w in words if w not in self.stopwords]

            # distinct two-word *ordered* tuples, plus distinct single words
            terms = {"-".join(pair) for pair in zip(words, words[1:])}
            terms.update(words)

            if use_whitelist:
                terms = [t for t in terms if t in whitelist]

            # just term count, in case we are using the `basic` mode
            for t1 in terms:
                # term
                self.cnt((t1,), 't', doi)

                if not self.RUN_EVERYTHING:
                    continue

                # term year
                self.cnt((year, t1), 'fy.t', doi)

                # term journal
                self.cnt((journal, t1), 'fj.t', doi)

                # (term, author) counting into 'fa.t' over doc['authors'] is
                # intentionally disabled here.

                # Term-term counts only once a whitelist has narrowed the
                # vocabulary; really don't want to do this too early.
                if use_whitelist:
                    # NOTE(review): every ordered pair is counted, including
                    # (t1, t1) and overlapping terms; a check skipping terms
                    # that intersect each other was considered but left disabled.
                    for t2 in terms:
                        self.cnt((t1, t2), 't1.t2', doi)