In [1]:
import sys; sys.path.append(_dh[0].split("knowknow")[0])
from knowknow import *

In [2]:
showdocs("top1")

# Zooming in on the top 1%

I would like to look at the most successful cited authors, cited works, and cited terms. Unfortunately, this isn't so simple. There has been a dramatic increase in the supply of citations over the last 100 years, so the group with the most total citations would be skewed towards the citation preferences of recent papers. In order to account for this bias,
I choose among items cited by articles published in each decade 1940-1950, 1941-1951, 1942-1952, all the way to 1980-1990. In each of these decades I determine which were the top-cited 1%. The set of all these top 1%s, from all these decade spans, comprise the 1% I will study in this paper.



# User Parameters

Just pick the database `database_name` and the type of count atom you want to analyze (e.g. `"ta"` for cited author, `"c"` for cited work, etc.)

Note that `"t"` is only available for `jstor` databases

In [3]:
database_name = 'sociology-wos'
ctype = 'ta'
top_percentile = 0.01

In [4]:
# Parameters
database_name = "sociology-wos"
ctype = "ffa"


# Load data

In [5]:
cysum = load_variable("%s.%s.ysum" % (database_name,ctype))
cits = get_cnt("%s.doc" % database_name, [comb(ctype,'fy')])

Loaded keys: dict_keys(['ffa.fy'])
Available keys: ['a', 'c', 'c.c', 'c.fj', 'c.fy', 'c.fy.j', 'fa', 'fa.c', 'fa.fj', 'fa.fj.fy', 'fa.fy', 'ffa', 'ffa.c', 'ffa.fj', 'ffa.fy', 'fj', 'fj.fy', 'fj.ta', 'fj.ty', 'fy', 'fy.ta', 'fy.ty', 'ta', 'ty', 'ty.ty']


In [6]:
any("-" in x for x in cysum)

False

# loop through all the decades!

In [7]:
all_tops = set()

print("%s total entries" % len(cysum))


# ranges loop from 1940-1950 to 1980-1990, in 1-year increments
for RANGE_START, RANGE_END in zip( 
    range(1940,1980+1,1),
    range(1950,1990+1,1),
):
    
    # create a copy of cysum
    cysum_copy = {k:dict(v) for k,v in cysum.items()}

    count_in_range = defaultdict(int)
    for cross, count in cits[comb(ctype,'fy')].items():
        if RANGE_END >= cross.fy >= RANGE_START:
            count_in_range[ getattr(cross, ctype) ] += count
            
    counts = list(count_in_range.values())
    if not len(counts):
        print("Skipping %s" % RANGE_START)
        continue
        
    q99 = np.quantile(np.array( counts ), 1-top_percentile)
    top1 = {k for k in count_in_range if count_in_range[k]>=q99}
    all_tops.update(top1)
    
    print("%s /%s in the top %0.1f%% in %s,%s (%s total accumulated)" % (
        len(top1),
        len(count_in_range),
        top_percentile*100,
        RANGE_START, RANGE_END,
        len(all_tops)
    ))

    
alldf = pd.DataFrame.from_records([
    c
    for name, c in cysum.items()
    if name in all_tops
])

alldf.fillna(value=np.nan, inplace=True)

print(alldf.shape)

2911 total entries
6 /492 in the top 1.0% in 1940,1950 (6 total accumulated)
7 /547 in the top 1.0% in 1941,1951 (8 total accumulated)
8 /602 in the top 1.0% in 1942,1952 (10 total accumulated)
7 /649 in the top 1.0% in 1943,1953 (10 total accumulated)
16 /694 in the top 1.0% in 1944,1954 (19 total accumulated)
18 /753 in the top 1.0% in 1945,1955 (21 total accumulated)


18 /799 in the top 1.0% in 1946,1956 (24 total accumulated)
17 /871 in the top 1.0% in 1947,1957 (26 total accumulated)
18 /936 in the top 1.0% in 1948,1958 (28 total accumulated)
19 /1008 in the top 1.0% in 1949,1959 (32 total accumulated)
21 /1099 in the top 1.0% in 1950,1960 (36 total accumulated)
12 /1187 in the top 1.0% in 1951,1961 (38 total accumulated)


18 /1261 in the top 1.0% in 1952,1962 (40 total accumulated)
17 /1337 in the top 1.0% in 1953,1963 (42 total accumulated)
16 /1428 in the top 1.0% in 1954,1964 (42 total accumulated)
16 /1510 in the top 1.0% in 1955,1965 (45 total accumulated)
24 /1674 in the top 1.0% in 1956,1966 (54 total accumulated)
23 /1792 in the top 1.0% in 1957,1967 (58 total accumulated)


25 /1865 in the top 1.0% in 1958,1968 (60 total accumulated)
21 /2095 in the top 1.0% in 1959,1969 (62 total accumulated)
35 /2315 in the top 1.0% in 1960,1970 (73 total accumulated)
42 /2575 in the top 1.0% in 1961,1971 (82 total accumulated)
45 /2782 in the top 1.0% in 1962,1972 (88 total accumulated)
43 /3015 in the top 1.0% in 1963,1973 (93 total accumulated)
43 /3308 in the top 1.0% in 1964,1974 (98 total accumulated)
46 /3579 in the top 1.0% in 1965,1975 (105 total accumulated)
46 /3895 in the top 1.0% in 1966,1976 (113 total accumulated)


53 /4161 in the top 1.0% in 1967,1977 (125 total accumulated)
66 /4525 in the top 1.0% in 1968,1978 (141 total accumulated)
49 /4892 in the top 1.0% in 1969,1979 (146 total accumulated)
93 /5217 in the top 1.0% in 1970,1980 (173 total accumulated)
55 /5461 in the top 1.0% in 1971,1981 (174 total accumulated)
60 /5748 in the top 1.0% in 1972,1982 (180 total accumulated)
64 /6019 in the top 1.0% in 1973,1983 (192 total accumulated)


70 /6298 in the top 1.0% in 1974,1984 (199 total accumulated)
69 /6520 in the top 1.0% in 1975,1985 (209 total accumulated)
79 /6742 in the top 1.0% in 1976,1986 (222 total accumulated)
88 /7014 in the top 1.0% in 1977,1987 (242 total accumulated)
90 /7277 in the top 1.0% in 1978,1988 (255 total accumulated)


99 /7491 in the top 1.0% in 1979,1989 (272 total accumulated)
98 /7709 in the top 1.0% in 1980,1990 (284 total accumulated)
(185, 42)


In [8]:
alldf.shape

(185, 42)

In [9]:
alldf.sort_values("total", ascending=False).head()

Unnamed: 0,maxcount,name,rebirth_5_7,rebirth_5_3,rebirth_5_2,death_2,first,rebirth_2_5,rebirth_3_5,rebirth_1_20,...,maxpropy,rebirth_0_20,rebirth_5_1,rebirth_5_6,rebirth_5_8,rebirth_3_20,death_5,rebirth_0_5,total,rebirth_0_10
10,4,"smith, d",,2007.0,,,1965,,,,...,1966,,,,,,1997.0,,61,
41,4,"hagan, j",,,,,1973,,,,...,1977,,,,,,,,59,
32,4,"portes, a",,1984.0,,,1968,,,,...,1972,,,,,,1973.0,,46,
61,5,"burt, r",,,,2005.0,1973,,,,...,1980,,,,,,1992.0,,46,
63,3,"turner, r",,2003.0,,1993.0,1947,,,,...,1949,,,,,,1989.0,,45,


In [10]:
save_variable("%s.%s.top1" % (database_name,ctype), alldf)