# Analysis of basic scholar profiles

In [2]:
import csv
import numpy as np
import sys
from pathlib import Path
import pickle
import pandas as pd
import json

In [None]:
# Load the scraped scholar records; later cells index each element by string
# keys such as 'name' and 'co_authors', so the elements are dict-like.
# NOTE(review): allow_pickle=True unpickles arbitrary Python objects and can
# execute code while loading — only point this at a trusted .npy file.
scholar_all = np.load(
    '../gs_scholars_all_upd.npy', allow_pickle=True
)


In [None]:
# Tabular view of the raw scholar records.
# NOTE: the name `df` is rebound further down in this file to a summary frame.
df = pd.DataFrame.from_records(scholar_all)

In [None]:
!pip install efficiency

In [None]:
import pandas as pd
class Name2Gender:
    """Map a person's name to a gender label using two lookup tables.

    Lookup order:
      1. exact match of the lower-cased full name against the name lists
         (``acl-male.txt`` / ``acl-female.txt``);
      2. otherwise, a match of the lower-cased name with its last
         space-separated token removed against ``firstname_gender.csv``.

    Labels come from the data files ('M' / 'F' for the full-name lists);
    ``'-'`` is returned when neither table knows the name.
    """

    # Directory that holds firstname_gender.csv, acl-male.txt and acl-female.txt.
    DEFAULT_GENDER_DIR = "/content/drive/MyDrive/AI_Scholar_gender/AI_scholar_data/intermediate_data_for_preprocessing/gender_files"

    def __init__(self, gender_dir=DEFAULT_GENDER_DIR):
        """Load both lookup tables from ``gender_dir``.

        Args:
            gender_dir: directory containing the three gender files. Defaults
                to the original Google Drive location, so ``Name2Gender()``
                behaves as before.
        """
        # Normalise a trailing slash so joined paths stay tidy.
        self.gender_dir = str(gender_dir).rstrip('/')
        self.full_name2gender = self._load_full_name_gender()
        self.first_name2gender = self._load_first_name_gender()

    def _path(self, filename):
        """Return the path of ``filename`` inside the gender-files directory."""
        return f"{self.gender_dir}/{filename}"

    def _load_first_name_gender(self):
        """Return a dict built from the 'name' and 'gender' CSV columns."""
        # 100711 lines
        table = pd.read_csv(self._path("firstname_gender.csv"))
        # On duplicate names the last row wins, as with a dict comprehension.
        return dict(zip(table['name'], table['gender']))

    def _load_full_name_gender(self):
        """Return a dict of lower-cased "first last" names to 'M' / 'F'."""
        # Third-party helper (installed via `pip install efficiency`);
        # imported lazily so the class can be defined without it.
        from efficiency.log import fread

        def _file2full_name(file):
            # presumably returns the file's stripped, non-empty lines —
            # TODO confirm against the `efficiency` package.
            lines = fread(file, delete_empty=True, if_strip=True)
            # Turn "Last, First" into "first last"; lines without ", " are
            # only lower-cased.
            return [' '.join(line.split(', ', 1)[::-1]).lower() for line in lines]

        # Order matters: a name present in both files ends up as 'F'.
        gender2file = {
            'M': self._path("acl-male.txt"),
            'F': self._path("acl-female.txt"),
        }
        full_name2gender = {}
        for gender, file in gender2file.items():
            full_name2gender.update(dict.fromkeys(_file2full_name(file), gender))
        return full_name2gender

    def lookup_gender(self, full_name):
        """Return the gender label for ``full_name``, or ``'-'`` if unknown.

        Non-string input (e.g. ``None`` or a NaN from pandas) yields ``'-'``
        instead of raising.
        """
        if not isinstance(full_name, str):
            return '-'
        key = full_name.lower()
        gender = self.full_name2gender.get(key, '-')
        if gender == '-':
            # Everything before the last space is treated as the first name.
            # NOTE(review): for "A B C" this looks up "a b", not "a" —
            # confirm this matches the keys in firstname_gender.csv.
            first_name = key.rsplit(' ', 1)[0]
            gender = self.first_name2gender.get(first_name, '-')
        return gender

# Build the name->gender lookup tables once (the constructor reads the gender
# files from disk); later cells reuse this instance.
gender_table = Name2Gender()

In [None]:
# For every scholar looked up as female, record the share of female
# co-authors among those co-authors whose gender could be resolved.
res = []
for scholar in scholar_all:
    if gender_table.lookup_gender(scholar['name']) != 'F':
        continue
    coauthors = scholar['co_authors']
    if not coauthors:
        continue
    # Element [1] of each co-author entry is the value passed to the name lookup.
    coauthor_genders = [gender_table.lookup_gender(entry[1]) for entry in coauthors]
    male = coauthor_genders.count('M')
    female = coauthor_genders.count('F')
    # Scholars with no resolvable co-author gender are left out entirely.
    if female + male != 0:
        res.append(female / (female + male))

In [None]:
# Summary statistics of the female co-author shares collected in `res`.
# (Elements of `scholar_all` are dict-like records and have no `.describe()`;
# the statistics shown below are ratios in [0, 1], i.e. the `res` values.)
pd.Series(res, name=0).describe()


count    4203.000000
mean        0.234781
std         0.244564
min         0.000000
25%         0.000000
50%         0.200000
75%         0.333333
max         1.000000
Name: 0, dtype: float64

In [None]:
# Display the first raw scholar record to inspect its fields.
scholar_all[0]


In [28]:
# Keep only the records whose first citation_table entry is at least 100
# (that entry is used as the total citation count elsewhere in this file).
scholars = list(filter(
    lambda scholar: int(scholar["citation_table"][0]) >= 100,
    scholar_all,
))


## Profile statistics

In [42]:
# Descriptive statistics of the profile metrics for scholars whose "gender"
# field is 'F'. Column order: citations, h-index, paper count, academic age,
# academic lifespan.
rows = []
for each in scholars:
    if each["gender"] != 'F':
        continue
    citation_table = each["citation_table"]
    rows.append([
        int(citation_table[0]),   # stored under "cit"
        int(citation_table[1]),   # stored under "h"
        each['paper_num'],
        each["academic_age"],
        each["academic_lifespan"],
    ])

scholar_df = pd.DataFrame(rows, columns=["cit", "h", "paper", "age", "span"])
# Print one describe() summary per column, in column order.
for column in scholar_df.columns:
    print(scholar_df[column].describe())

count      7015.000000
mean       1762.111190
std        6246.008524
min         100.000000
25%         197.000000
50%         414.000000
75%        1164.500000
max      209549.000000
Name: cit, dtype: float64
count    7015.000000
mean       13.248040
std        12.670904
min         1.000000
25%         6.000000
50%         9.000000
75%        15.000000
max       211.000000
Name: h, dtype: float64
count    7015.000000
mean       60.199857
std       103.959251
min         1.000000
25%        16.000000
50%        29.000000
75%        64.000000
max      2125.000000
Name: paper, dtype: float64
count    7015.000000
mean       16.329579
std         9.742991
min         2.000000
25%        10.000000
50%        14.000000
75%        20.000000
max        73.000000
Name: age, dtype: float64
count    7015.000000
mean       14.871561
std         9.878902
min         1.000000
25%         8.000000
50%        12.000000
75%        19.000000
max        72.000000
Name: span, dtype: float64


## Newbies and Elders Analysis

In [None]:
# Split scholars by whether their cumulative-citation history has a "2011"
# entry:
#   * elders  - have one; record that value and the citations gained since;
#   * newbies - do not; record their latest cumulative total.
# Scholars without a usable history are skipped.
rows = []
elders = []
newbies_ = []
for each in scholars:
    # if each["gender"] == 'F':
    cit_by_year = each["cit_sum_before_year"]
    prev_citation = None
    elder_citation = None
    newbies = None
    if '2011' in cit_by_year:
        prev_citation = cit_by_year["2011"]
        # The last entry is taken as the most recent total; this relies on the
        # mapping being stored in chronological order — TODO confirm.
        elder_citation = list(cit_by_year.values())[-1] - prev_citation
        elders.append(each)
    else:
        try:
            newbies = list(cit_by_year.values())[-1]
        except (IndexError, AttributeError):
            # Empty history (IndexError) or a non-mapping value
            # (AttributeError): nothing to record for this scholar.
            continue
        newbies_.append(each)
    rows.append([prev_citation, elder_citation, newbies])
newbies_elders_df = pd.DataFrame(rows, columns=["prev_citation", "elder", "newbies"])

In [None]:
# Per-elder productivity metrics: papers per academic-age year, citations per
# academic-age year, and how many history entries it takes for the cumulative
# citation count to exceed 100.
elder_df = pd.DataFrame(elders)
rows = []
for _, elder in elder_df.iterrows():
    # if elder['gender'] == 'F':
    papers_per_year = elder["paper_num"] / elder["academic_age"]
    citations_per_year = int(elder["citation_table"][0]) / elder["academic_age"]
    cumulative_counts = list(elder["cit_sum_before_year"].values())
    # 1-based position of the first entry above 100; one past the end when
    # the count never exceeds 100.
    year2hundred = next(
        (position
         for position, total in enumerate(cumulative_counts, start=1)
         if total > 100),
        len(cumulative_counts) + 1,
    )
    rows.append([papers_per_year, citations_per_year, year2hundred])
temp = pd.DataFrame(rows, columns=["num_per_age", "cit_per_age", "year2hundred"])

In [None]:
# For every scholar looked up as female whose each["cites"]["years"] contains
# "2011", find the paper with the highest value in field 3 and record that
# value together with field 5.
# presumably field 3 is the paper's citation count and field 5 its year —
# verify against the scraper's paper schema.
rows = []
for each in scholar_all:
    if "2011" not in each["cites"]["years"] or gender_table.lookup_gender(each["name"]) != 'F':
        continue
    max_cit = 0
    cit_year = 0
    for paper in each["papers"]:
        try:
            cit_num = int(paper[3])
        except (LookupError, TypeError, ValueError):
            # Missing or non-numeric field 3: ignore this paper.
            continue
        if cit_num <= max_cit:
            continue
        try:
            year = int(paper[5])
        except (LookupError, TypeError, ValueError):
            # Field 5 unusable: keep the previous best paper.
            continue
        max_cit, cit_year = cit_num, year
    rows.append([max_cit, cit_year])
max_cit_df = pd.DataFrame(rows, columns=["max_cit", "cit_year"])

In [None]:
# Distribution of the per-scholar maximum recorded in max_cit_df["max_cit"].
max_cit_df["max_cit"].describe()

count     3606.000000
mean       553.632557
std       1828.994814
min         11.000000
25%         76.000000
50%        160.500000
75%        407.000000
max      53236.000000
Name: max_cit, dtype: float64

In [None]:
# Most frequent value in max_cit_df["cit_year"].
max_cit_df["cit_year"].mode()

0    2011
dtype: int64

In [None]:
# Distribution of paper_num / academic_age over the rows of `temp`.
temp["num_per_age"].describe()

count    40177.000000
mean         3.964080
std          4.322088
min          0.035088
25%          1.478261
50%          2.692308
75%          4.900000
max         95.333333
Name: num_per_age, dtype: float64

In [None]:
# Distribution of the "elder" column: last cumulative citation value minus
# the "2011" value, for scholars that have a "2011" entry.
newbies_elders_df["elder"].describe()

count      3603.000000
mean       2120.100472
std        6901.535664
min           0.000000
25%         261.000000
50%         639.000000
75%        1670.500000
max      203008.000000
Name: elder, dtype: float64

In [None]:
# Collect each profile metric into its own list, restricted to scholars whose
# "gender" field is 'F'. All five lists share the same order.
_female_scholars = [each for each in scholars if each["gender"] == 'F']
citation = [int(each["citation_table"][0]) for each in _female_scholars]
h_index = [int(each["citation_table"][1]) for each in _female_scholars]
paper_num = [each["paper_num"] for each in _female_scholars]
academic_age = [each["academic_age"] for each in _female_scholars]
academic_lifespan = [each["academic_lifespan"] for each in _female_scholars]

In [None]:
# Assemble the five metric lists into a single frame, one column per metric.
# NOTE: this rebinds `df`, replacing the frame built from the raw records.
df = pd.DataFrame({
    "citation": citation,
    "h_index": h_index,
    "paper_num": paper_num,
    "academic_age": academic_age,
    "academic_lifespan": academic_lifespan,
})

In [None]:
# Distribution of the "academic_age" column of `df`.
df["academic_age"].describe()

count    7036.000000
mean       16.127061
std         9.136308
min         2.000000
25%        10.000000
50%        14.000000
75%        20.000000
max        53.000000
Name: academic_age, dtype: float64