In [1]:
import sys; sys.path.append(_dh[0].split("knowknow")[0])
from knowknow import *

In [2]:
# Calls the knowknow helper `showdocs` with the key "top1".
# NOTE(review): presumably this renders the documentation section for this
# notebook; the helper comes from the star-import above and is not visible here — confirm.
showdocs("top1")

# Zooming in on the top 1%

I would like to look at the most successful cited authors, cited works, and cited terms. Unfortunately, this isn't so simple. There has been a dramatic increase in the supply of citations over the last 100 years, so the group with the most total citations would be skewed towards the citation preferences of recent papers. In order to account for this bias,
I choose among items cited by articles published in each decade-long window: 1940-1950, 1941-1951, 1942-1952, and so on through 1980-1990 (both endpoint years are included). In each of these windows I determine which items were in the top-cited 1%. The set of all these top 1%s, from all these decade spans, comprises the 1% I will study in this paper.



# User Parameters

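Just pick the database `database_name` and the type of count atom you want to analyze (e.g. `"ta"` for cited author, `"c"` for cited work, etc.). `top_percentile` is the fraction of items treated as "top" within each window (`0.01` means the top 1%).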

Note that `"t"` is only available for `jstor` databases.

In [3]:
# Default user parameters for this notebook.
# database_name: prefix of the stored variables/counts that the data-loading cell reads.
database_name = 'sociology-wos'
# ctype: the count atom to rank; it is used both as part of the count key and
# as the attribute name read from each count record.
ctype = 'ta'
# top_percentile: fraction of items kept per window (the cutoff is the
# 1 - top_percentile quantile of the per-window counts), so 0.01 = top 1%.
top_percentile = 0.01

In [3]:
# Parameters
database_name = "sociology-wos"
ctype = "fa"


# Load data

In [4]:
# Load the two inputs for the ranking below.
# cysum: mapping from item name to a per-item summary record; the records of the
#        selected items become the rows of the final table.
cysum = load_variable("%s.%s.ysum" % (database_name,ctype))
# cits: count tables for this database, restricted to the single key
#       comb(ctype,'fy'); each entry pairs a record (with `.fy` and the ctype
#       attribute) with a citation count.
# NOTE(review): `fy` is presumably the publication year of the citing article — confirm.
cits = get_cnt("%s.doc" % database_name, [comb(ctype,'fy')])

Loaded keys: dict_keys(['fa.fy'])
Available keys: ['a', 'c', 'c.c', 'c.fj', 'c.fy', 'c.fy.j', 'fa', 'fa.c', 'fa.fj', 'fa.fj.fy', 'fa.fy', 'fj', 'fj.fy', 'fj.ta', 'fj.ty', 'fy', 'fy.ta', 'fy.ty', 'ta', 'ty', 'ty.ty']


In [5]:
# Sanity check: True if any key of cysum contains a "-" character.
# NOTE(review): why a hyphen would be a problem is not stated in this notebook
# (possibly a marker of malformed names) — confirm the intent.
any("-" in x for x in cysum)

False

# Loop through all the decades

In [7]:
# Collect every item that reaches the top `top_percentile` of citation counts
# in at least one sliding window, then build a table of their summary records.
all_tops = set()

print("%s total entries" % len(cysum))

# The (item, year) -> count table does not change between windows, so look it
# up once instead of on every iteration.
counts_by_item_year = cits[comb(ctype,'fy')]

# Windows slide from 1940-1950 to 1980-1990 in 1-year increments.
# Both endpoints are inclusive, so each window covers 11 distinct `fy` years.
for RANGE_START, RANGE_END in zip(
    range(1940, 1980+1),
    range(1950, 1990+1),
):

    # Total count per item, restricted to records whose `fy` is in the window.
    count_in_range = defaultdict(int)
    for cross, count in counts_by_item_year.items():
        if RANGE_START <= cross.fy <= RANGE_END:
            count_in_range[getattr(cross, ctype)] += count

    counts = list(count_in_range.values())
    if not counts:
        # No records at all in this window: np.quantile would fail on an
        # empty array, so skip it.
        print("Skipping %s" % RANGE_START)
        continue

    # Cutoff is the (1 - top_percentile) quantile of the per-item totals.
    # `>=` keeps every item tied at the cutoff, so a window can contribute
    # slightly more than top_percentile of its items.
    q99 = np.quantile(np.array(counts), 1-top_percentile)
    top1 = {k for k, v in count_in_range.items() if v >= q99}
    all_tops.update(top1)

    print("%s /%s in the top %0.1f%% in %s,%s (%s total accumulated)" % (
        len(top1),
        len(count_in_range),
        top_percentile*100,
        RANGE_START, RANGE_END,
        len(all_tops)
    ))


# One row per selected item, taken from its summary record in cysum.
alldf = pd.DataFrame.from_records([
    c
    for name, c in cysum.items()
    if name in all_tops
])

# Normalise missing values (e.g. None) to NaN; assignment instead of
# inplace=True so re-running the cell never mutates a frame shown earlier.
alldf = alldf.fillna(value=np.nan)

print(alldf.shape)

5852 total entries
12 /587 in the top 1.0% in 1940,1950 (12 total accumulated)
13 /657 in the top 1.0% in 1941,1951 (15 total accumulated)
12 /727 in the top 1.0% in 1942,1952 (17 total accumulated)
13 /794 in the top 1.0% in 1943,1953 (20 total accumulated)
14 /856 in the top 1.0% in 1944,1954 (24 total accumulated)
16 /930 in the top 1.0% in 1945,1955 (28 total accumulated)
10 /996 in the top 1.0% in 1946,1956 (28 total accumulated)
18 /1085 in the top 1.0% in 1947,1957 (32 total accumulated)
18 /1183 in the top 1.0% in 1948,1958 (34 total accumulated)
21 /1290 in the top 1.0% in 1949,1959 (38 total accumulated)
14 /1399 in the top 1.0% in 1950,1960 (39 total accumulated)
17 /1521 in the top 1.0% in 1951,1961 (40 total accumulated)
20 /1626 in the top 1.0% in 1952,1962 (46 total accumulated)
19 /1733 in the top 1.0% in 1953,1963 (51 total accumulated)
22 /1856 in the top 1.0% in 1954,1964 (53 total accumulated)
33 /1954 in the top 1.0% in 1955,1965 (63 total accumulated)
37 /2218 in 

In [8]:
# Dimensions (rows, columns) of the table of selected items.
alldf.shape

(229, 20)

In [9]:
# Preview the rows with the largest "total" value (head() shows the first five).
alldf.sort_values("total", ascending=False).head()

Unnamed: 0,total,maxcount,first,maxcounty,death_4,death_8,maxpropy,death_9,death_max,death_7,death_last,totalprop,last,death_1,maxprop,death_3,death_5,death_2,death_6,name
4,98,5,1965,1992,,1993.0,1966,1993.0,1993.0,1993.0,,0.085647,2020,,0.006696,,1997.0,,1993.0,"smith, d"
8,86,5,1973,1979,,1981.0,1979,1981.0,1980.0,,,0.059211,2018,,0.00497,,,,,"hagan, j"
2,72,4,1979,2000,,,2020,,2001.0,,,0.045352,2020,,0.002915,,,,,"massey, d"
29,69,7,1972,2004,,,1973,,,,,0.042169,2020,,0.004392,,,,,"johnson, m"
82,67,4,1966,2001,,,1972,,,,,0.05346,2018,,0.004717,,,,,"johnson, d"


In [10]:
# Hand the table to save_variable under the name "<database_name>.<ctype>.top1".
# NOTE(review): presumably this persists it for other notebooks to load — confirm
# against the helper's definition.
save_variable("%s.%s.top1" % (database_name,ctype), alldf)

In [11]:
# NOTE(review): this call is identical to the one in the preceding cell (same
# name, same object), so it appears to be a redundant duplicate cell — consider
# deleting it.
save_variable("%s.%s.top1" % (database_name,ctype), alldf)