In [None]:
!pip install efficiency

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import numpy as np
from bs4 import BeautifulSoup
import json
import pandas as pd
import re

In [None]:
# Colab only: mount Google Drive so the /content/drive/MyDrive/... data paths used below resolve.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Load the scraped Google Scholar profiles (presumably an object array of per-scholar dicts,
# which is why allow_pickle=True is needed — TODO confirm).
# NOTE(review): allow_pickle=True unpickles arbitrary objects; only load files from a trusted source.
scholars = np.load("/content/drive/MyDrive/AI_Scholar_gender/AI_scholar_data/result_data/scholar_gs/gs_scholars.npy", allow_pickle=True)

In [None]:
# Research areas used to bucket scholars by the interest labels on their profile.
domains = [
    'natural language processing',
    'computer vision',
    'reinforcement learning',
    'machine learning',
    'artificial intelligence',
]

In [None]:
def str2clean(i):
    """Normalise text for fuzzy matching.

    Lower-cases ``i`` and removes every run of characters that is not an
    ASCII letter (spaces, digits, punctuation, markup symbols), so that
    e.g. ``"Natural Language-Processing"`` becomes
    ``"naturallanguageprocessing"``.
    """
    lowered = i.lower()
    return re.sub("[^a-zA-Z]+", "", lowered)

In [None]:
# Bucket scholars by domain: a scholar belongs to a domain when the cleaned
# domain name occurs inside the cleaned text of their last extra_info entry
# (the interest-label HTML). A scholar may land in several buckets.
domain2scholars = {}
for domain in domains:
  wanted = str2clean(domain)
  matching_scholars = []
  for scholar in scholars:
    if wanted in str2clean(scholar['extra_info'][-1]):
      matching_scholars.append(scholar)
  domain2scholars[domain] = matching_scholars

In [None]:
# Inspect one scholar's last extra_info entry — the raw HTML that the domain filter is matched against.
print(scholars[1]['extra_info'][-1])

<a href="/citations?view_op=search_authors&amp;hl=en&amp;mauthors=label:natural_language_processing" class="gsc_prf_inta gs_ibl">natural language processing</a><a href="/citations?view_op=search_authors&amp;hl=en&amp;mauthors=label:data_management" class="gsc_prf_inta gs_ibl">data management</a><a href="/citations?view_op=search_authors&amp;hl=en&amp;mauthors=label:information_retrieval" class="gsc_prf_inta gs_ibl">information retrieval</a>


In [None]:
# Build a gs_id -> author-name table from the per-paper records.
# For each record the "Authors" field is used when present, otherwise
# "Inventors"; records with neither field are dropped.
# NOTE(review): allow_pickle=True unpickles arbitrary objects; only load trusted files.
fullname_data = np.load("/content/drive/MyDrive/AI_Scholar_gender/AI_scholar_data/intermediate_data_for_preprocessing/previous_version/paper_83k_info_with_fullnames.npy", allow_pickle=True)
new = []
for record in fullname_data:
    for name_field in ("Authors", "Inventors"):
        if name_field in record:
            new.append({'gs_id': record['gs_id'], 'author_name': record[name_field]})
            break
fullname_df = pd.DataFrame.from_dict(new, orient='columns')

In [None]:
# Preview the first rows of the gs_id / author_name table.
fullname_df.head()

Unnamed: 0,gs_id,author_name
0,10251033406930934735,"Peter Clark, Phil Harrison, Niranjan Balasubra..."
1,13633406419057296907,"Jojanneke M Jukes, Clemens A Van Blitterswijk,..."
2,16750122778147439803,"Riqiang Gao, Lingfeng Li, Yucheng Tang, Sanja ..."
3,1372166043251834298,"Jiang-Chun Chen, Chun-Jen Lee, Shuo-Pin Hsu, J..."
4,18345820347968721525,"Erdem Akagunduz, Adrian G Bors, Karla K Evans"


In [None]:
# Cohort under study. Despite the "ml_" prefix this is the
# 'artificial intelligence' bucket; other buckets (e.g.
# domain2scholars['reinforcement learning']) could be appended to widen it.
ml_scholars = domain2scholars['artificial intelligence']

# Per-scholar paper counts, plus one flat list of every paper of every scholar
# (a paper shared by several scholars appears once per scholar).
ml_papers = []
ml_papers_repetitive = []
for scholar in ml_scholars:
  ml_papers.append(len(scholar['papers']))
  ml_papers_repetitive.extend(scholar['papers'])

# De-duplicate by the id after "&cites=" in the second-to-last field of a paper
# record; when ids collide, the paper seen last wins.
id2ml_paper = {}
for paper in ml_papers_repetitive:
  cites_id = paper[-2].split('&cites=')[-1]
  id2ml_paper[cites_id] = paper

In [None]:
# Share of women among the cohort's scholars whose gender could be resolved.
# Scholars with any other lookup result (e.g. '-') are left out of the ratio.
# NOTE: `gender_table` is created by the Name2Gender cell further down the
# notebook; that cell must have been run before this one.
female = 0
male = 0
for each in ml_scholars:
  gender = gender_table.lookup_gender(each["name"])  # look up once per scholar
  if gender == 'F':
    female += 1
  elif gender == 'M':
    male += 1

# Guard against an empty cohort / no resolved genders instead of dividing by zero.
resolved = female + male
print(female / resolved if resolved else float('nan'))

0.17273828379763348


In [None]:
# Show a single example paper record (prints nothing when the dict is empty).
for sample_paper in id2ml_paper.values():
  print(sample_paper)
  break

['https://scholar.google.com/citations?view_op=view_citation&hl=en&user=IIrX5SMAAAAJ&citation_for_view=IIrX5SMAAAAJ:9ZlFYXVOiuMC', 'XSB as an efficient deductive database engine', ['K Sagonas, T Swift, DS Warren', 'ACM SIGMOD Record 23 (2), 442-453'], '548', 'https://scholar.google.com/scholar?oi=bibs&hl=en&cites=5410481471975561459', '1994']


In [None]:
from tqdm import tqdm

# Index fullname_df once by gs_id instead of scanning the whole frame for every
# paper id (the per-id scan was estimated at 7+ hours for ~1.7M ids).
# setdefault keeps the FIRST row of each gs_id, matching the previous
# "take row [0] of the filtered frame" behaviour.
first_row_by_gs_id = {}
if 'gs_id' in fullname_df.columns:
  gs_col = fullname_df.columns.get_loc('gs_id')
  for row in fullname_df.values.tolist():
    first_row_by_gs_id.setdefault(row[gs_col], row)

# Keep, in id2ml_paper order, the name row of every paper id that has one.
# Empty ids (papers without a "cited by" link) and ids missing from
# fullname_df are skipped explicitly rather than through a bare `except`.
domain_rows = []
for each in tqdm(id2ml_paper):
  if each and each in first_row_by_gs_id:
    domain_rows.append(first_row_by_gs_id[each])

domain_df = pd.DataFrame(domain_rows, columns=['gs_id', 'author_name'])

  0%|          | 1851/1680513 [00:32<7:40:43, 60.72it/s]

In [None]:
class Name2Gender:
    """Gender lookup backed by a full-name list and a first-name table.

    Attributes:
        full_name2gender: lower-cased "first last" name -> 'M' / 'F', built
            from the ACL male/female name lists.
        first_name2gender: value of the CSV's ``name`` column -> value of its
            ``gender`` column.
    """

    def __init__(self):
        self.full_name2gender = self._load_full_name_gender()
        self.first_name2gender = self._load_first_name_gender()

    def _load_first_name_gender(self):
        """Read the first-name CSV and return a ``name -> gender`` dict.

        When a name occurs on several rows, the last row wins.
        """
        # 100711 lines
        df = pd.read_csv("/content/drive/MyDrive/AI_Scholar_gender/AI_scholar_data/intermediate_data_for_preprocessing/gender_files/firstname_gender.csv")
        first_name_n_gender = df.to_dict(orient='records')
        first_name2gender = {dic['name']: dic['gender'] for dic in first_name_n_gender}
        return first_name2gender

    def _load_full_name_gender(self):
        """Read the ACL name lists and return a ``full name -> 'M'/'F'`` dict.

        A name present in both files ends up with the gender of the file read
        last ('F').
        """
        def _file2full_name(file):
            # Imported lazily so the class can be defined without the
            # third-party `efficiency` package installed.
            from efficiency.log import fread
            # presumably returns the file's lines, stripped and without blank
            # lines — TODO confirm against efficiency.log.fread
            full_names = fread(file, delete_empty=True, if_strip=True)
            # full_names = U.read_file(file)
            # "Last, First" -> "first last" (lower-cased); lines without
            # ", " are only lower-cased.
            full_names = [' '.join(i.split(', ', 1)[::-1]).lower() for i in full_names]
            return full_names

        gender2file = {'M': '/content/drive/MyDrive/AI_Scholar_gender/AI_scholar_data/intermediate_data_for_preprocessing/gender_files/acl-male.txt',
                       'F': '/content/drive/MyDrive/AI_Scholar_gender/AI_scholar_data/intermediate_data_for_preprocessing/gender_files/acl-female.txt',
                       }
        full_name2gender = {}
        for gender, file in gender2file.items():
            full_names = _file2full_name(file)
            full_name2gender.update({i: gender for i in full_names})
        return full_name2gender

    def lookup_gender(self, full_name):
        """Return the gender recorded for ``full_name``, or ``'-'`` if unknown.

        Lookup order (all on the lower-cased name):
          1. the whole name in the full-name table;
          2. everything before the last space in the first-name table
             (the original behaviour, e.g. "mary ann" for "Mary Ann Lee");
          3. the first token alone in the first-name table, so that names
             with middle names or initials ("John A Smith" -> "john")
             still resolve.
        """
        lowered = full_name.lower()
        gender = self.full_name2gender.get(lowered, '-')
        if gender != '-':
            return gender

        given_names = lowered.rsplit(' ', 1)[0]
        gender = self.first_name2gender.get(given_names, '-')
        if gender == '-':
            first_token = lowered.split(' ', 1)[0]
            if first_token != given_names:
                gender = self.first_name2gender.get(first_token, '-')
        return gender
# Build the shared lookup table once; constructing it reads the gender files from Drive.
gender_table = Name2Gender()

In [None]:
# Dry run on the first scholar only: turn the per-year citation counts into
# running totals (citations received up to and including each year), then show
# the same mapping with the 2022 entry removed.
for scholar in scholars:
  per_year = scholar["cites"]
  raw_counts = dict(zip(per_year['years'], per_year['cites']))
  year2cit_this_year = {int(year): int(count) for year, count in raw_counts.items()}
  cit_sum_before_year = {}
  for given_year in year2cit_this_year:
    running_total = 0
    for year, count in year2cit_this_year.items():
      if year <= given_year:
        running_total += count
    cit_sum_before_year[given_year] = running_total
  print(cit_sum_before_year)
  cit_sum_before_year.pop(2022, None)
  print(cit_sum_before_year)
  break

{1994: 39, 1995: 86, 1996: 187, 1997: 287, 1998: 383, 1999: 500, 2000: 615, 2001: 727, 2002: 833, 2003: 951, 2004: 1070, 2005: 1203, 2006: 1357, 2007: 1543, 2008: 1719, 2009: 1866, 2010: 2005, 2011: 2129, 2012: 2323, 2013: 2487, 2014: 2615, 2015: 2758, 2016: 2935, 2017: 3051, 2018: 3167, 2019: 3261, 2020: 3335, 2021: 3411, 2022: 3435}
{1994: 39, 1995: 86, 1996: 187, 1997: 287, 1998: 383, 1999: 500, 2000: 615, 2001: 727, 2002: 833, 2003: 951, 2004: 1070, 2005: 1203, 2006: 1357, 2007: 1543, 2008: 1719, 2009: 1866, 2010: 2005, 2011: 2129, 2012: 2323, 2013: 2487, 2014: 2615, 2015: 2758, 2016: 2935, 2017: 3051, 2018: 3167, 2019: 3261, 2020: 3335, 2021: 3411}


In [None]:
# Export one JSON line of derived features per scholar.
# NOTE: relies on `gender_table` (Name2Gender cell) and on `scholars` still
# being the raw profile array loaded from gs_scholars.npy.

# An organization string containing any of these words is flagged as academic.
college_keyword = ["faculty", "professor", "college", "university", "academy", "institute", "phd", "department"]

# Year excluded from citation-based features (presumably the still-incomplete
# year at scrape time — TODO confirm) and used as "now" for academic_age.
CURRENT_YEAR = 2022
# Paper years before this are ignored when deriving the career span.
EARLIEST_VALID_YEAR = 1950
# Sentinel meaning "no usable paper year seen yet".
NO_YEAR = 9999


def _h_index(citations):
  """Return the largest k such that at least k papers have >= k citations (0 if none)."""
  h = 0
  for rank, count in enumerate(sorted(citations, reverse=True), start=1):
    if count < rank:
      break
    h = rank
  return h


def _cumulative_citations(citation_by_year, drop_year):
  """Map each year to the citations received up to and including it, without `drop_year`."""
  year2cit_this_year = dict(zip(citation_by_year['years'], citation_by_year['cites']))
  year2cit_this_year = {int(k): int(v) for k, v in year2cit_this_year.items()}
  cit_sum_before_year = {given_year: sum(v for k, v in year2cit_this_year.items() if k <= given_year)
                         for given_year in year2cit_this_year}
  cit_sum_before_year.pop(drop_year, None)
  return cit_sum_before_year


with open("/content/drive/MyDrive/AI_Scholar_gender/AI_scholar_data/result_data/scholar_gs/gs_scholar_new_features.jsonl", 'w') as f:
  for each in scholars:
    gs_sid = each["url"].split("user=")[1].split("&")[0]

    # extra_info[0] is the organization HTML: its text is the display name,
    # and an optional "org=<code>" inside it carries the organization code.
    org_html = each["extra_info"][0]
    organization = BeautifulSoup(org_html, 'html.parser').text
    org_parts = org_html.split("org=")
    organization_code = org_parts[1].split('"')[0] if len(org_parts) > 1 else None

    organization_lower = organization.lower()
    academic = any(keyword in organization_lower for keyword in college_keyword)
    gender = gender_table.lookup_gender(each["name"])

    # First/last publication year and the citation counts used for the h-index.
    # In a paper record, p[-1] is the year and p[3] the citation count.
    mi = NO_YEAR
    ma = 0
    citations = []
    for p in each["papers"]:
      if p[-1] == '':
        continue
      year = int(p[-1])
      if year < EARLIEST_VALID_YEAR:
        continue
      mi = min(mi, year)
      ma = max(ma, year)
      # Papers from CURRENT_YEAR do not count towards the h-index.
      if p[3] != '' and p[-1] != str(CURRENT_YEAR):
        citations.append(int(p[3]))
    ma = min(ma, CURRENT_YEAR)
    if mi != NO_YEAR:
      academic_lifespan = ma - mi + 1
      academic_age = CURRENT_YEAR - mi + 1
    else:
      academic_lifespan = None
      academic_age = None

    # When no h-index can be derived from the papers (computed value 0), fall
    # back to the third entry of the profile's citations_table, as before.
    h_idx = _h_index(citations) or int(each["citations_table"][2])

    # Total citations minus those of CURRENT_YEAR.
    # NOTE(review): assumes the last entry of cites["cites"] belongs to
    # CURRENT_YEAR whenever that year is listed — confirm against the scraper.
    current_citation = int(each["citations_table"][0])
    if str(CURRENT_YEAR) in each["cites"]["years"]:
      current_citation -= int(each["cites"]["cites"][-1])

    cit_sum_before_year = _cumulative_citations(each["cites"], CURRENT_YEAR)
    domain_labels = re.findall(r'=label\:(.*?)"', each["extra_info"][2])
    citation_table = [str(current_citation), str(h_idx)]
    current_scholar = {'gs_sid': gs_sid,
                       'organization': organization,
                       'organization_code': organization_code,
                       'academic': academic,
                       'gender': gender,
                       'academic_lifespan': academic_lifespan,
                       'academic_age': academic_age,
                       'cit_sum_before_year': cit_sum_before_year,
                       'citation_table': citation_table,
                       'domain_labels': domain_labels,
                       'paper_num': len(each["papers"]),  # TODO: delete papers in 2022 / add coauthors
                       }
    f.write(json.dumps(current_scholar) + '\n')

  ' Beautiful Soup.' % markup)
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup


In [None]:
# Reload the exported per-scholar feature records: one JSON object per
# non-blank line. This rebinds `scholars` from the raw profiles to the
# feature dicts.
# NOTE(review): this is a relative path, while the export cell writes the file
# under /content/drive/MyDrive/... — confirm the intended copy is being read.
with open("gs_scholar_new_features.jsonl") as f:
  scholars = list(map(json.loads, filter(str.strip, f)))

In [None]:
# Number of scholar feature records loaded.
len(scholars)

78066

Create new cluster data for time-series clustering

In [None]:
# Read the existing cluster records: one JSON object per non-blank line.
with open("cluster_data.jsonl") as f:
  cluster_data = list(map(json.loads, filter(str.strip, f)))

In [None]:
# Membership table of the scholar ids present in the cluster data; the id is
# the "user=" query parameter of each record's profile URL. Only the keys
# matter — every value is 1.
cluster = {record["url"].split("user=")[1].split("&")[0]: 1 for record in cluster_data}
# NOTE(review): this prints every id; consider len(cluster) or a small sample instead.
print(cluster)

{'IIrX5SMAAAAJ': 1, '9gVmuMAAAAAJ': 1, 'EmTFUdQAAAAJ': 1, 'zBPIt3QAAAAJ': 1, 'vM9772wAAAAJ': 1, 'LaKNyhQAAAAJ': 1, 'x8ju9EEAAAAJ': 1, '6RxMYNEAAAAJ': 1, 'sd20LLwAAAAJ': 1, 'WR1ImCMAAAAJ': 1, '3mYqjKwAAAAJ': 1, 'SC2xBNwAAAAJ': 1, 'fyEqU5oAAAAJ': 1, 'sm7-kIwAAAAJ': 1, '5vp_6WQAAAAJ': 1, 'b-LJkLQAAAAJ': 1, 'a5WsBc0AAAAJ': 1, 'NaVob24AAAAJ': 1, 'TvhkG-kAAAAJ': 1, 'CXlp-fcAAAAJ': 1, 'HBpQRqwAAAAJ': 1, 'pkHCpdYAAAAJ': 1, 'mkFFpuAAAAAJ': 1, '9gMgFPQAAAAJ': 1, 'TAZbfksAAAAJ': 1, 'sdfKs_sAAAAJ': 1, 'k9g68LwAAAAJ': 1, 'Im4IeIEAAAAJ': 1, 'BeTUvHIAAAAJ': 1, '_0zTg1UAAAAJ': 1, '3goZUhYAAAAJ': 1, 'BiunNCQAAAAJ': 1, 'phgBJXYAAAAJ': 1, 'HPio5bcAAAAJ': 1, 'FQjTCfwAAAAJ': 1, 'bndKId8AAAAJ': 1, 'GD6iyTsAAAAJ': 1, 'dZQOltMAAAAJ': 1, '4kOdChQAAAAJ': 1, 'Cr-HGNAAAAAJ': 1, 'g_G_SNAAAAAJ': 1, 'dKpTY7IAAAAJ': 1, 'K3Z9UiAAAAAJ': 1, 'ugGdl58AAAAJ': 1, '_NEbOj4AAAAJ': 1, 'yHGvAkYAAAAJ': 1, 'w-70g6gAAAAJ': 1, 'eeyGXWoAAAAJ': 1, 'l9y5N3QAAAAJ': 1, 'o_DIR1sAAAAJ': 1, '0ABQsM4AAAAJ': 1, 'RHhqpWAAAAAJ': 1, 'ROpnY-gAAA

In [None]:
# Write the feature records of the scholars that belong to the cluster set,
# one JSON object per line.
with open("new_cluster_data.jsonl", 'w') as output:
  output.writelines(
      json.dumps(record) + '\n'
      for record in scholars
      if record["gs_sid"] in cluster
  )

In [None]:
# Inspect the first reloaded feature record.
print(scholars[0])

{'gs_sid': 'IIrX5SMAAAAJ', 'organization': 'N/A', 'organization_code': None, 'academic': False, 'gender': 'F', 'academic_lifespan': 28, 'academic_age': 31, 'cit_sum_before_year': {'1994': 39, '1995': 86, '1996': 187, '1997': 287, '1998': 383, '1999': 500, '2000': 615, '2001': 727, '2002': 833, '2003': 951, '2004': 1070, '2005': 1203, '2006': 1357, '2007': 1543, '2008': 1719, '2009': 1866, '2010': 2005, '2011': 2129, '2012': 2323, '2013': 2487, '2014': 2615, '2015': 2758, '2016': 2935, '2017': 3051, '2018': 3167, '2019': 3261, '2020': 3335, '2021': 3411, '2022': 3435}, 'domain_labels': ['logic_programming', 'machine_learning', 'natural_language_processing'], 'paper_num': 151}


In [None]:
# Compare the academic share among women with the academic share overall.
# `academic` is the boolean flag written by the feature export.
female = 0
female_academic = 0
academic = 0
for each in scholars:
  is_academic = bool(each["academic"])
  if is_academic:
    academic += 1
  if each["gender"] == 'F':
    female += 1
    if is_academic:
      female_academic += 1

# Guard both ratios so an empty or all-unknown dataset prints nan instead of
# raising ZeroDivisionError.
total = len(scholars)
print(female_academic / female if female else float('nan'))
print(academic / total if total else float('nan'))

0.617876424715057
0.6025105175886972
