In [1]:
import sys; sys.path.append(_dh[0].split("knowknow")[0])
from knowknow import *

In [2]:
# NOTE(review): presumably displays the "top1" documentation entry inline — confirm against knowknow.showdocs
showdocs("top1")

# Zooming in on the top 1%

I would like to look at the most successful cited authors, cited works, and cited terms. Unfortunately, this isn't so simple. There has been a dramatic increase in the supply of citations over the last 100 years, so the group with the most total citations would be skewed towards the citation preferences of recent papers. To account for this bias, I choose among items cited by articles published in each decade-long window: 1940-1950, 1941-1951, 1942-1952, all the way to 1980-1990 (both endpoint years are included in each window). Within each window I determine which items were in the top-cited 1%. The union of all these top 1%s, across all these windows, comprises the 1% I will study in this paper.

# User Parameters

Pick the database `database_name` and the type of count atom `ctype` you want to analyze (e.g. `"ta"` for cited author, `"c"` for cited work, etc.). `top_percentile` is the fraction of items to keep from each window (`0.01` keeps the top 1%).

Note that `"t"` is only available for `jstor` databases.

In [3]:
database_name = 'sociology-wos'  # database whose counts are loaded below
ctype = 'ta'                     # count atom to rank (see the list above)
top_percentile = 0.01            # a fraction, not a percent: 0.01 keeps the top 1%

In [4]:
# Parameters
# NOTE(review): looks like a tool-injected parameters cell (e.g. papermill) — confirm.
# These assignments run after the defaults in the previous cell and override
# database_name and ctype; top_percentile is not reassigned here.
database_name = "sociology-jstor"
ctype = "t"


# Load data

In [5]:
# Per-item summary records, keyed by item name (used below to build the final table).
cysum = load_variable(
    "%s.%s.ysum" % (database_name, ctype)
)

# Citation counts crossed by (ctype, citing publication year "fy").
cits = get_cnt(
    "%s.doc" % database_name,
    [comb(ctype, 'fy')],
)

Loaded keys: dict_keys(['fy.t'])
Available keys: ['a', 'a.c', 'a.fj.fy', 'c', 'c.c', 'c.fa', 'c.fj', 'c.fy', 'c.t', 'fa', 'fa.fj.fy', 'fj', 'fj.fy', 'fj.t', 'fy', 'fy.t', 't']


# loop through all the decades!

In [6]:
# Union of every item that reaches the top `top_percentile` in at least one window.
all_tops = set()

print("%s total entries" % len(cysum))

# The (ctype, fy) count table is the same for every window, so look it up once.
counts_by_item_year = cits[comb(ctype,'fy')]

# Windows slide from 1940-1950 to 1980-1990, in 1-year increments.
# NOTE: both endpoints are inclusive, so each window covers 11 publication years.
WINDOW_LENGTH = 10
for RANGE_START in range(1940, 1980+1):
    RANGE_END = RANGE_START + WINDOW_LENGTH

    # Total citations each item received from articles published inside the window.
    count_in_range = defaultdict(int)
    for cross, count in counts_by_item_year.items():
        if RANGE_START <= cross.fy <= RANGE_END:
            count_in_range[ getattr(cross, ctype) ] += count

    # A quantile of an empty sample is undefined; skip windows without citations.
    if not count_in_range:
        print("no citations in %s,%s; skipping" % (RANGE_START, RANGE_END))
        continue

    # Threshold count an item must reach to be in the top `top_percentile` of this window.
    q99 = np.quantile(list(count_in_range.values()), 1-top_percentile)
    top1 = {k for k, total in count_in_range.items() if total >= q99}
    all_tops.update(top1)

    print("%s /%s in the top %0.1f%% in %s,%s (%s total accumulated)" % (
        len(top1),
        len(count_in_range),
        top_percentile*100,
        RANGE_START, RANGE_END,
        len(all_tops)
    ))


# Only items that have a summary record in cysum can make it into the table;
# report the rest instead of dropping them silently.
missing_from_cysum = all_tops.difference(cysum)
if missing_from_cysum:
    print("%s top items have no entry in cysum and are omitted" % len(missing_from_cysum))

# One row per top item; missing values (None) are normalised to NaN.
alldf = pd.DataFrame.from_records([
    c
    for name, c in cysum.items()
    if name in all_tops
]).fillna(value=np.nan)

print(alldf.shape)

3961 total entries
1 /92 in the top 1.0% in 1940,1950 (1 total accumulated)
1 /97 in the top 1.0% in 1941,1951 (1 total accumulated)
2 /107 in the top 1.0% in 1942,1952 (2 total accumulated)
3 /112 in the top 1.0% in 1943,1953 (4 total accumulated)
2 /121 in the top 1.0% in 1944,1954 (4 total accumulated)
2 /127 in the top 1.0% in 1945,1955 (4 total accumulated)


2 /142 in the top 1.0% in 1946,1956 (4 total accumulated)
2 /186 in the top 1.0% in 1947,1957 (4 total accumulated)
3 /203 in the top 1.0% in 1948,1958 (4 total accumulated)
3 /225 in the top 1.0% in 1949,1959 (5 total accumulated)
5 /265 in the top 1.0% in 1950,1960 (6 total accumulated)
3 /296 in the top 1.0% in 1951,1961 (6 total accumulated)
5 /340 in the top 1.0% in 1952,1962 (7 total accumulated)


4 /377 in the top 1.0% in 1953,1963 (7 total accumulated)
5 /403 in the top 1.0% in 1954,1964 (7 total accumulated)
6 /427 in the top 1.0% in 1955,1965 (8 total accumulated)
5 /463 in the top 1.0% in 1956,1966 (8 total accumulated)
6 /532 in the top 1.0% in 1957,1967 (9 total accumulated)
8 /722 in the top 1.0% in 1958,1968 (10 total accumulated)


12 /1188 in the top 1.0% in 1959,1969 (13 total accumulated)
18 /1722 in the top 1.0% in 1960,1970 (19 total accumulated)
24 /2112 in the top 1.0% in 1961,1971 (25 total accumulated)
25 /2386 in the top 1.0% in 1962,1972 (28 total accumulated)
27 /2618 in the top 1.0% in 1963,1973 (31 total accumulated)
28 /2797 in the top 1.0% in 1964,1974 (35 total accumulated)


30 /2976 in the top 1.0% in 1965,1975 (37 total accumulated)
32 /3133 in the top 1.0% in 1966,1976 (39 total accumulated)
33 /3289 in the top 1.0% in 1967,1977 (41 total accumulated)
35 /3412 in the top 1.0% in 1968,1978 (44 total accumulated)


36 /3513 in the top 1.0% in 1969,1979 (48 total accumulated)
37 /3605 in the top 1.0% in 1970,1980 (49 total accumulated)
38 /3679 in the top 1.0% in 1971,1981 (50 total accumulated)
38 /3719 in the top 1.0% in 1972,1982 (52 total accumulated)
39 /3791 in the top 1.0% in 1973,1983 (55 total accumulated)


39 /3837 in the top 1.0% in 1974,1984 (59 total accumulated)
39 /3869 in the top 1.0% in 1975,1985 (60 total accumulated)
41 /3920 in the top 1.0% in 1976,1986 (63 total accumulated)


40 /3944 in the top 1.0% in 1977,1987 (64 total accumulated)
40 /3961 in the top 1.0% in 1978,1988 (65 total accumulated)
40 /3999 in the top 1.0% in 1979,1989 (66 total accumulated)
41 /4026 in the top 1.0% in 1980,1990 (67 total accumulated)
(60, 13)


In [7]:
# Display the (rows, columns) shape of the combined table of top items.
alldf.shape

(60, 13)

In [8]:
# Save the table of top items under the key "<database_name>.<ctype>.top1".
# (The notebook previously repeated this identical save in a second cell.)
save_variable("%s.%s.top1" % (database_name,ctype), alldf)