In [2]:
import dimcli
import numpy as np
import pandas as pd

In [7]:
# Explicit Dimensions login, left disabled.
# NOTE(review): presumably credentials come from a local dimcli configuration
# instead -- confirm before running on a new machine.
# dimcli.login(
#     key="", endpoint="https://app.dimensions.ai"
# )
# DSL client that get_data() uses to run its Dimensions queries.
dsl = dimcli.Dsl()

# Switch read by get_data(): True re-queries Dimensions and writes CSV exports,
# False loads previously exported CSV files from disk.
get_new_data = False



In [9]:
def get_split(ser: pd.Series, chunk_size: int = 400):
    """Split ``ser`` into consecutive chunks of at most ``chunk_size`` items.

    The previous implementation passed ``ser.shape[0] / 400`` to
    ``np.array_split``; the truncated quotient is 0 for fewer than 400 rows
    (which raises ``ValueError``) and under-counts the sections otherwise, so
    chunks could be larger than 400 items.

    Args:
        ser: Series to split (e.g. a column of PMIDs).
        chunk_size: Maximum number of items per chunk; must be positive.

    Returns:
        list[pd.Series]: Chunks in original order; an empty list when ``ser``
        is empty.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    # Positional slicing keeps every chunk a Series and never exceeds chunk_size.
    return [
        ser.iloc[start : start + chunk_size]
        for start in range(0, len(ser), chunk_size)
    ]


def get_data():
    """Return the six analysis frames for the compare and control groups.

    Behaviour depends on the module-level ``get_new_data`` flag:

    * truthy -- read the control PMIDs from ``control-pmid.csv``, query
      Dimensions through the module-level ``dsl`` client (control publications
      in PMID batches produced by ``get_split``, plus one full-text search for
      the MIMIC/eICU phrases), and export every resulting frame to CSV in the
      working directory;
    * falsy -- load previously exported CSVs from the ``MIMIC Files`` folder.

    Returns:
        tuple: ``(df_pubs_compare, df_auths_compare, df_affils_compare,
        df_pubs_control, df_auths_control, df_affils_control)``.
    """
    if get_new_data:
        control = pd.read_csv("control-pmid.csv")
        # PubMed query recorded by the original author (presumably the source
        # of control-pmid.csv -- TODO confirm):
        # https://pubmed.ncbi.nlm.nih.gov/?term=%28%28%22machine+learning%22%5BMajr%5D+OR+%28%22machine%22%5Bti%5D+AND+%22learning%22%5Bti%5D%29+OR+%22machine+learning%22%5Bti%5D+OR+%22AI%22%5Bti%5D+OR+%22Artificial+Intelligence%22%5Bti%5D+OR+%22artificially+intelligent%22%5Bti%5D+OR+%22Artificial+Intelligence%22%5BMeSH%5D+OR+%22Algorithms%22%5BMeSH%5D+OR+%22algorithm*%22%5Bti%5D+OR+%22deep+learning%22%5Bti%5D+OR+%22computer+vision%22%5Bti%5D+OR+%22natural+language+processing%22%5Bti%5D+OR+%22neural+network*%22%5Bti%5D+OR+%22neural+networks%2C+computer%22%5BMeSH%5D+OR+%22intelligent+machine*%22%5Bti%5D%29+AND+%28exp+%22Intensive+Care+Units%22%2F+OR+exp+%22Critical+Care%22%2F+OR+%28ICU+OR+IC+OR+%28%28intensive+OR+critical%29+ADJ3+%28care+OR+therapy+OR+unit*+OR+patient*+OR+department%5C*%29%29%29%29%29&filter=years.2010-2023

        # Collect each batch and concatenate once after the loop; growing a
        # DataFrame with pd.concat on every iteration recopies all rows
        # fetched so far each time.
        pubs_batches = []
        auths_batches = []
        affils_batches = []

        for pmid_list in get_split(control.PMID):
            query = fr"""search publications where pmid in {list(pmid_list)} and year in [2010:2022]
            return publications[basics+abstract+authors_count+doi+concepts_scores+times_cited+mesh_terms
            +journal+editors+field_citation_ratio+funder_countries+funders+open_access
            +relative_citation_ratio+publisher+pmid+supporting_grant_ids+research_org_cities+pmid
            +research_org_countries+research_org_country_names+research_org_names+research_org_state_codes
            +research_org_state_names+research_orgs+researchers+category_bra+category_for+category_hra+category_hrcs_hc+
            category_hrcs_rac+category_icrp_cso+category_icrp_ct+category_rcdc+category_sdg+category_uoa]"""
            data = dsl.query_iterative(query)
            pubs_batches.append(data.as_dataframe())
            auths_batches.append(data.as_dataframe_authors())
            affils_batches.append(data.as_dataframe_authors_affiliations())

        # pd.concat rejects an empty list, so fall back to an empty frame
        # (the value the accumulators used to start from).
        df_pubs_control = pd.concat(pubs_batches) if pubs_batches else pd.DataFrame()
        df_auths_control = (
            pd.concat(auths_batches) if auths_batches else pd.DataFrame()
        )
        df_affils_control = (
            pd.concat(affils_batches) if affils_batches else pd.DataFrame()
        )

        df_pubs_control.to_csv("control_group_pubs.csv")
        df_auths_control.to_csv("control_group_auths.csv")
        df_affils_control.to_csv("control_group_affils.csv")

        compare_query = fr"""search publications where year in [2010:2022] for "\"datamed org display item php\" OR \"physionet mimic ii database\" 
        OR \"mimic iii medical information mart for intensive care\" OR
        \"10.13026 s6n6 xd98\" OR \"mimic iv\" OR \"physionet org content mimiciv\" OR
        \"eicu crd\" OR \"mimic cxr\"" return publications[basics+abstract+authors_count+doi+concepts_scores+times_cited
            +mesh_terms+journal+editors+field_citation_ratio+funder_countries+funders+open_access
            +relative_citation_ratio+publisher+pmid+supporting_grant_ids+research_org_cities+pmid
            +research_org_countries+research_org_country_names+research_org_names+research_org_state_codes
            +research_org_state_names+research_orgs+researchers+category_bra+category_for+category_hra+category_hrcs_hc+
            category_hrcs_rac+category_icrp_cso+category_icrp_ct+category_rcdc+category_sdg+category_uoa]"""
        compare = dsl.query_iterative(compare_query)

        df_pubs_compare = compare.as_dataframe()
        df_auths_compare = compare.as_dataframe_authors()
        df_affils_compare = compare.as_dataframe_authors_affiliations()

        df_pubs_compare.to_csv("mimic_pubs.csv")
        df_auths_compare.to_csv("mimic_auths.csv")
        df_affils_compare.to_csv("mimic_affils.csv")

    else:
        # NOTE(review): the export branch above writes to the working directory
        # under different names than the files read here (e.g. "mimic_pubs.csv"
        # vs "MIMIC Files/mimic_pubs (1).csv"), so a fresh export is not picked
        # up by this branch without moving/renaming the files.
        df_pubs_compare = pd.read_csv("MIMIC Files/mimic_pubs (1).csv")
        df_auths_compare = pd.read_csv("MIMIC Files/mimic_auths.csv")
        df_affils_compare = pd.read_csv("MIMIC Files/mimic_affils.csv")

        df_pubs_control = pd.read_csv("MIMIC Files/control_group_pubs.csv")
        df_auths_control = pd.read_csv("MIMIC Files/control_group_auths.csv")
        df_affils_control = pd.read_csv("MIMIC Files/control_group_affils (1).csv")

    return (
        df_pubs_compare,
        df_auths_compare,
        df_affils_compare,
        df_pubs_control,
        df_auths_control,
        df_affils_control,
    )

In [11]:
# Load (or re-download, depending on get_new_data) the publication, author and
# affiliation frames for both groups; the order matches get_data()'s return tuple.
(
    df_pubs_compare,
    df_auths_compare,
    df_affils_compare,
    df_pubs_control,
    df_auths_control,
    df_affils_control,
) = get_data()

In [5]:
def get_missingness(df_pubs, df_auths, df_affils):
    """Print how many publications have no author rows / no affiliation rows.

    A publication counts as missing authors (affiliations) when its ``id`` does
    not occur in ``df_auths.pub_id`` (``df_affils.pub_id``). The "Missing
    Publications" line is always 0 because the publications frame is the
    reference set.

    Counts are computed by summing boolean masks, so a group with nothing
    missing prints 0 instead of raising ``KeyError`` as the former
    ``value_counts()[False]`` lookup did.

    Args:
        df_pubs: Publications frame with an ``id`` column.
        df_auths: Author rows with a ``pub_id`` column.
        df_affils: Author-affiliation rows with a ``pub_id`` column.

    Returns:
        None. The report is written to stdout.
    """
    has_authors = df_pubs["id"].isin(df_auths["pub_id"])
    has_affiliations = df_pubs["id"].isin(df_affils["pub_id"])

    missing_authors = int((~has_authors).sum())
    missing_affiliations = int((~has_affiliations).sum())

    print("Papers w/ Missing Publications:   0")
    print(f"Papers w/ Missing Authors:        {missing_authors}")
    print(f"Papers w/ Missing Affiliations:   {missing_affiliations}")

    return None


# Report missing author/affiliation coverage for each group, separated by a
# blank line. get_missingness() prints its own lines and returns None.
print("Compare")
get_missingness(df_pubs_compare, df_auths_compare, df_affils_compare)
print()
print("Control")
get_missingness(df_pubs_control, df_auths_control, df_affils_control)

Compare
Papers w/ Missing Publications:   0
Papers w/ Missing Authors:        110
Papers w/ Missing Affiliations:   635

Control
Papers w/ Missing Publications:   0
Papers w/ Missing Authors:        2
Papers w/ Missing Affiliations:   35


In [21]:
# First-name -> gender table (a genderize export, judging by the file name),
# reduced to one row per first name and indexed by that name.
df_gender_dictionary = (
    pd.read_csv("_name_genderize_output (1).csv")
    .drop_duplicates("ga_first_name")
    .set_index("ga_first_name")
)

# Country classification workbook; CountryClass_2 is the same table keyed by
# its "Code" column, while CountryClass keeps the original layout.
CountryClass = pd.read_excel("CountryClassWordBank (1).xlsx")
CountryClass_2 = CountryClass.set_index("Code")

# Filled by get_country_income() with every country code it cannot resolve.
missing_codes = []

# Plain-dict lookups used by the per-row helper functions.
gender_dict = df_gender_dictionary["ga_gender"].to_dict()
country_dict = CountryClass_2["Income group"].to_dict()

# Numeric encodings: female is 1, male is 0, unknown is missing.
gender_map = {"male": 0, "female": 1, "unknown": np.nan}
# Ordinal encoding of the income groups, from 1 (low) to 4 (high).
income_map = {
    "Low income": 1,
    "Lower middle income": 2,
    "Upper middle income": 3,
    "High income": 4,
}


def get_gender(name):
    """Look up the gender label for the first space-separated token of ``name``.

    Args:
        name: A first-name string (may contain several given names), or a
            missing value.

    Returns:
        The entry of the module-level ``gender_dict`` for the first token of
        the stripped name, or ``np.nan`` when ``name`` is missing or the token
        is not in the dictionary.
    """
    if pd.isna(name):
        return np.nan
    first_token = name.strip().split(sep=" ")[0]
    return gender_dict.get(first_token, np.nan)


def get_gender_ints(gender):
    """Translate a gender label into its numeric code via ``gender_map``.

    Returns:
        The value stored in the module-level ``gender_map`` for ``gender``;
        ``np.nan`` for any label that is not a key of that mapping.
    """
    return gender_map.get(gender, np.nan)


def get_country_income(country_code):
    """Map a two-letter country code to its income group.

    The code is resolved with ``pycountry.countries.get(alpha_2=...)`` and the
    resulting ``alpha_3`` code is looked up in the module-level
    ``country_dict``.

    NOTE(review): no ``import pycountry`` is visible in this notebook; the
    function raises ``NameError`` unless that import exists elsewhere.

    Args:
        country_code: Country code passed as ``alpha_2``, or a missing value.

    Returns:
        The ``country_dict`` entry for the country, or ``np.nan`` when the
        code is missing, unknown to pycountry, or absent from
        ``country_dict``.

    Side effects:
        Appends every non-missing code that could not be resolved to the
        module-level ``missing_codes`` list.
    """
    if pd.isna(country_code):
        return np.nan

    country = pycountry.countries.get(alpha_2=country_code)
    if country is not None and country.alpha_3 in country_dict:
        return country_dict[country.alpha_3]

    # Either pycountry does not know the code or the table has no row for it.
    missing_codes.append(country_code)
    return np.nan


def map_country_income(income_group):
    """Translate an income-group label into its numeric code via ``income_map``.

    Returns:
        The value stored in the module-level ``income_map`` for
        ``income_group``; ``np.nan`` for any label that is not a key of it.
    """
    return income_map.get(income_group, np.nan)


def get_pub_year(pub_id, df_pubs: pd.DataFrame):
    """Return the ``year`` value of the row labelled ``pub_id`` in ``df_pubs``.

    The lookup is by index label (``.at``), so ``df_pubs`` must be indexed by
    publication id; ``pipeline`` passes a copy that has ``id`` set as its index.
    """
    return df_pubs.at[pub_id, "year"]


def categorize_author_counts(num_authors):
    """Bucket an author count into the labels "1", "2", "3-5" or "6+".

    Returns:
        The bucket label as a string, or ``np.nan`` when the count is missing,
        zero, or does not fall into any bucket (e.g. negative or fractional
        values between buckets).
    """
    if pd.isna(num_authors) or num_authors == 0:
        return np.nan

    # Buckets are mutually exclusive, so they can be tested largest first.
    if num_authors >= 6:
        return "6+"
    if 3 <= num_authors <= 5:
        return "3-5"
    if num_authors == 2:
        return "2"
    if num_authors == 1:
        return "1"
    return np.nan


def pipeline(list_of_dataframes: list):
    """Add year, gender and income features to one group's three frames.

    Args:
        list_of_dataframes: ``[df_pubs, df_auths, df_affils]`` for one group.
            All three frames are modified in place (new columns are assigned
            on the objects passed in) and also returned.

    Columns added:
        * ``df_auths`` / ``df_affils``: ``year`` (looked up from ``df_pubs`` by
          ``pub_id``), title-cased ``first_name``, ``gender`` (via
          ``get_gender``) and ``gender_ints`` (via ``get_gender_ints``).
        * ``df_pubs``: ``author_count_categories``; per-publication
          ``first_names``, ``genders``, ``num_female``, ``num_male``,
          ``num_na``, ``pct_female_real``, ``female_first``, ``female_last``;
          and "pessimistic" / "optimistic" variants in which authors of
          unknown gender are treated as male / female respectively.
        * ``df_affils``: ``income_class`` (via ``get_country_income``) and
          ``income_class_num`` (via ``map_country_income``).

    Returns:
        list: ``[df_pubs, df_auths, df_affils]``.

    NOTE(review): the per-publication dictionaries are assigned to ``df_pubs``
    positionally through ``dict.values()``. That relies on ``df_pubs.id`` being
    unique and on every ``pub_id`` in ``df_auths`` being present in
    ``df_pubs``; otherwise the dictionaries have a different number of entries
    than ``df_pubs`` has rows.
    """
    df_pubs, df_auths, df_affils = list_of_dataframes
    # Copy indexed by publication id, used only for the id -> year lookup.
    df_pubs_copy = df_pubs.copy()
    df_pubs_copy.set_index("id", inplace=True)

    # Attach the publication year to every author / affiliation row.
    df_auths["year"] = df_auths["pub_id"].apply(
        lambda pid: get_pub_year(pid, df_pubs_copy)
    )
    df_affils["year"] = df_affils["pub_id"].apply(
        lambda pid: get_pub_year(pid, df_pubs_copy)
    )

    # Title-case the names before the lookup in get_gender.
    # presumably the keys of gender_dict are title-cased -- TODO confirm
    df_auths["first_name"] = df_auths["first_name"].str.title()
    df_affils["first_name"] = df_affils["first_name"].str.title()

    df_auths["gender"] = df_auths["first_name"].map(get_gender)
    df_affils["gender"] = df_affils["first_name"].map(get_gender)

    df_auths["gender_ints"] = df_auths["gender"].map(get_gender_ints)
    df_affils["gender_ints"] = df_affils["gender"].map(get_gender_ints)

    df_pubs["author_count_categories"] = df_pubs["authors_count"].apply(
        categorize_author_counts
    )

    # One entry per publication id, pre-filled with NaN so that publications
    # without any author rows keep NaN in every derived column.
    lfirstnames = dict.fromkeys(df_pubs.id, np.nan)
    lgenders = dict.fromkeys(df_pubs.id, np.nan)
    lpctfemale = dict.fromkeys(df_pubs.id, np.nan)
    lffirst = dict.fromkeys(df_pubs.id, np.nan)
    lflast = dict.fromkeys(df_pubs.id, np.nan)
    lnfemale = dict.fromkeys(df_pubs.id, np.nan)
    lnmale = dict.fromkeys(df_pubs.id, np.nan)
    lnnan = dict.fromkeys(df_pubs.id, np.nan)

    for pub_id, df_info in df_auths.groupby("pub_id"):
        # The whole per-publication Series are stored as cell values.
        lfirstnames[pub_id] = df_info["first_name"]
        lgenders[pub_id] = df_info["gender_ints"]

        genders = df_info["gender_ints"]

        # assumes the author rows of a publication appear in byline order, so
        # the first/last row is the first/last author -- TODO confirm
        lffirst[pub_id] = genders.iloc[0]
        lflast[pub_id] = genders.iloc[-1]

        num_female = (genders == 1).sum()
        num_male = (genders == 0).sum()
        num_missing = genders.isna().sum()
        lnfemale[pub_id] = num_female  
        lnmale[pub_id] = num_male 
        lnnan[pub_id] = num_missing  
        # Share of women among authors with a known gender only.
        total = num_male + num_female
        if total == 0:
            lpctfemale[pub_id] = np.nan
        else:
            lpctfemale[pub_id] = num_female / total

    # Positional assignment: relies on dict insertion order matching the row
    # order of df_pubs (see the NOTE in the docstring).
    df_pubs["first_names"] = lfirstnames.values()
    df_pubs["genders"] = lgenders.values()
    df_pubs["num_female"] = lnfemale.values()
    df_pubs["num_male"] = lnmale.values()
    df_pubs["num_na"] = lnnan.values()
    df_pubs["pct_female_real"] = lpctfemale.values()
    df_pubs["female_first"] = lffirst.values()
    df_pubs["female_last"] = lflast.values()

    cols = ["num_female", "num_male", "num_na"]
    # From here on `total` is a per-publication Series that includes authors of
    # unknown gender (unlike the scalar of the same name inside the loop).
    total = df_pubs[cols[0]] + df_pubs[cols[1]] + df_pubs[cols[2]]
    # Pessimistic: unknown authors count as not female; optimistic: as female.
    df_pubs["pct_female_pessimistic"] = df_pubs[cols[0]] / total
    df_pubs["pct_female_optimistic"] = (df_pubs[cols[0]] + df_pubs[cols[2]]) / total
    df_pubs["female_first_pessimistic"] = df_pubs["female_first"].apply(
        lambda x: 0 if pd.isna(x) else x
    )
    df_pubs["female_first_optimistic"] = df_pubs["female_first"].apply(
        lambda x: 1 if pd.isna(x) else x
    )
    df_pubs["female_last_pessimistic"] = df_pubs["female_last"].apply(
        lambda x: 0 if pd.isna(x) else x
    )
    df_pubs["female_last_optimistic"] = df_pubs["female_last"].apply(
        lambda x: 1 if pd.isna(x) else x
    )

    # Income group of each affiliation's country and its numeric encoding.
    df_affils["income_class"] = df_affils["aff_country_code"].apply(get_country_income)
    df_affils["income_class_num"] = df_affils["income_class"].map(map_country_income)

    return [df_pubs, df_auths, df_affils]

FileNotFoundError: [Errno 2] No such file or directory: '_name_genderize_output (1).csv'

In [20]:
# Enrich both groups with the year / gender / income features.
df_pubs_compare, df_auths_compare, df_affils_compare = pipeline(
    [df_pubs_compare, df_auths_compare, df_affils_compare]
)

df_pubs_control, df_auths_control, df_affils_control = pipeline(
    [df_pubs_control, df_auths_control, df_affils_control]
)

# Make the groups disjoint: drop from the control group every publication
# (and its author / affiliation rows) that also appears in the compare group.
df_pubs_control = df_pubs_control[~df_pubs_control.id.isin(df_pubs_compare.id)]
df_auths_control = df_auths_control[~df_auths_control.pub_id.isin(df_pubs_compare.id)]
df_affils_control = df_affils_control[
    ~df_affils_control.pub_id.isin(df_pubs_compare.id)
]

NameError: name 'pipeline' is not defined

In [None]:
# df_pubs_control.to_csv("control_group_pubs.csv")
# df_auths_control.to_csv("control_group_auths.csv")
# df_affils_control.to_csv("control_group_affils.csv")
# df_pubs_compare.to_csv("mimic_pubs.csv")
# df_auths_compare.to_csv("mimic_auths.csv")
# df_affils_compare.to_csv("mimic_affils.csv")

In [22]:
# Number of publications per group and each group's share of the combined total.
num_compare = len(df_pubs_compare)
num_control = len(df_pubs_control)
num_total = num_compare + num_control

print(f"Compare group papers: {num_compare}. Pct: {num_compare / num_total}")
print(f"Control group papers: {num_control}. Pct: {num_control / num_total}")
print(f"Total:                {num_total}")

NameError: name 'df_pubs_compare' is not defined

In [18]:
# Number of author rows (one per author per paper) in each group and each
# group's share of the combined total.
num_compare_auths = len(df_auths_compare)
num_control_auths = len(df_auths_control)
num_total_auths = num_compare_auths + num_control_auths

print(
    f"Compare group authors: {num_compare_auths}. Pct: {num_compare_auths / num_total_auths}"
)
print(
    f"Control group authors: {num_control_auths}. Pct: {num_control_auths / num_total_auths}"
)
print(f"Total:                 {num_total_auths}")

NameError: name 'df_auths_compare' is not defined

In [23]:
# Distinct researcher ids per group. nunique() ignores missing ids, so author
# rows without a researcher_id are not counted.
num_unique_auths_compare = df_auths_compare.researcher_id.nunique()
num_unique_auths_control = df_auths_control.researcher_id.nunique()
# A researcher who appears in both groups is counted once per group here.
num_total_unique = num_unique_auths_compare + num_unique_auths_control

# Shares are taken over num_total_unique (the total printed below). The
# previous version divided by num_total_auths, the author-row total from a
# different cell, so the two percentages did not add up to 1.
print(
    f"Compare group authors: {num_unique_auths_compare}. Pct: {num_unique_auths_compare / num_total_unique}"
)
print(
    f"Control group authors: {num_unique_auths_control}. Pct: {num_unique_auths_control / num_total_unique}"
)
print(f"Total:                {num_total_unique}")

NameError: name 'df_auths_compare' is not defined

In [22]:
def process(df_pubs, df_auths, df_affils):
    """Keep only publications whose authors all have a known gender.

    A publication is kept when its ``num_na`` equals 0; rows whose ``num_na``
    is missing do not satisfy that comparison and are dropped as well. Author
    and affiliation rows are restricted to the kept publication ids.

    Returns:
        tuple: ``(pubs, auths, affils)`` filtered frames.
    """
    fully_gendered = df_pubs["num_na"] == 0
    pubs = df_pubs.loc[fully_gendered]

    kept_ids = pubs["id"]
    auths = df_auths.loc[df_auths["pub_id"].isin(kept_ids)]
    affils = df_affils.loc[df_affils["pub_id"].isin(kept_ids)]
    return pubs, auths, affils


def print_col(col):
    """Print ``describe()`` of column ``col`` for both publication frames.

    Reads the module-level ``df_pubs_compare`` and ``df_pubs_control`` frames
    and writes the column name, then a labelled summary for each group, each
    section followed by a blank line.
    """
    print(col, end="\n\n")
    print("Compare:")
    print(df_pubs_compare[col].describe(), end="\n\n")
    print("Control:")
    print(df_pubs_control[col].describe(), end="\n\n")


# Restrict both groups to publications with no unknown-gender authors. The
# frames are rebound to the filtered versions, so every later cell works on
# the reduced data.
df_pubs_compare, df_auths_compare, df_affils_compare = process(
    df_pubs_compare, df_auths_compare, df_affils_compare
)

df_pubs_control, df_auths_control, df_affils_control = process(
    df_pubs_control, df_auths_control, df_affils_control
)

# Summary statistics of the three gender indicators for both groups.
print_col("pct_female_real")
print_col("female_first")
print_col("female_last")

pct_female_real

Compare:
count    1426.000000
mean        0.261378
std         0.230498
min         0.000000
25%         0.000000
50%         0.250000
75%         0.400000
max         1.000000
Name: pct_female_real, dtype: float64

Control:
count    1823.000000
mean        0.290255
std         0.252795
min         0.000000
25%         0.000000
50%         0.250000
75%         0.500000
max         1.000000
Name: pct_female_real, dtype: float64

female_first

Compare:
count    1426.000000
mean        0.281206
std         0.449746
min         0.000000
25%         0.000000
50%         0.000000
75%         1.000000
max         1.000000
Name: female_first, dtype: float64

Control:
count    1823.000000
mean        0.309380
std         0.462365
min         0.000000
25%         0.000000
50%         0.000000
75%         1.000000
max         1.000000
Name: female_first, dtype: float64

female_last

Compare:
count    1426.000000
mean        0.235624
std         0.424537
min         0.000000
25%  

In [23]:
def get_female_any(df_auths):
    """Print how many publications have at least one female author.

    Author rows are grouped by ``pub_id`` and a publication counts as having a
    female author when any of its ``gender_ints`` values is truthy (female is
    encoded as 1, male as 0).

    The counts are derived from the boolean per-publication result directly,
    so a group in which every (or no) publication has a female author prints
    0 for the absent outcome instead of raising ``KeyError`` as the former
    ``value_counts()[True]`` / ``[False]`` lookups did.

    NOTE: a later cell of this notebook redefines ``get_female_any`` with a
    different contract (it returns the boolean Series instead of printing).

    Args:
        df_auths: Author rows with ``pub_id`` and ``gender_ints`` columns.

    Returns:
        None. The report is written to stdout.
    """
    # Computed once; the original evaluated the same groupby twice.
    has_female = df_auths.groupby("pub_id")["gender_ints"].any()
    total = df_auths["pub_id"].nunique()

    atleast_one = int(has_female.sum())
    none = int((~has_female).sum())

    # Guard the ratios so an empty frame reports nan instead of failing.
    pct_any = atleast_one / total if total else float("nan")
    pct_none = none / total if total else float("nan")

    print(f"Any female author: {atleast_one}  /  {pct_any}")
    print(f"No female authors: {none}   /  {pct_none}")
    print(f"Total:  {total}")


# Share of publications with at least one female author, per group.
# Uses the author frames already restricted by process().
print("Compare group:")
get_female_any(df_auths_compare)
print()
print("Control group:")
get_female_any(df_auths_control)

Compare group:
Any female author: 1024  /  0.7180925666199158
No female authors: 402   /  0.28190743338008417
Total:  1426

Control group:
Any female author: 1330  /  0.7295666483817883
No female authors: 493   /  0.27043335161821175
Total:  1823


In [24]:
def get_female_first(df_pubs):
    """Print the number and share of publications with a female first author.

    ``female_first`` holds 1 for a female and 0 for a male first author;
    missing values are excluded from both counts and from the total.

    Counts come from boolean comparisons rather than ``value_counts()[1]`` /
    ``[0]``, which raised ``KeyError`` whenever one of the two classes did not
    occur in the frame.

    Args:
        df_pubs: Publications frame with a ``female_first`` column.

    Returns:
        None. The report is written to stdout.
    """
    first_author = df_pubs["female_first"]
    female_first = int((first_author == 1).sum())
    male_first = int((first_author == 0).sum())
    total = female_first + male_first

    # Guard the ratios so a frame without known first authors reports nan.
    pct_female = female_first / total if total else float("nan")
    pct_male = male_first / total if total else float("nan")

    print(f"Female first author: {female_first}  /  {pct_female}")
    print(f"Male first author  : {male_first}   /  {pct_male}")
    print(f"Total:  {total}")


# Female vs. male first authorship, per group.
print("Compare group:")
get_female_first(df_pubs_compare)
print()
print("Control group:")
get_female_first(df_pubs_control)

Compare group:
Female first author: 401  /  0.2812061711079944
Male first author  : 1025   /  0.7187938288920056
Total:  1426

Control group:
Female first author: 564  /  0.30938014262205155
Male first author  : 1259   /  0.6906198573779484
Total:  1823


In [25]:
def get_female_last(df_pubs):
    """Print the number and share of publications with a female last author.

    ``female_last`` holds 1 for a female and 0 for a male last author; missing
    values are excluded from both counts and from the total.

    Counts come from boolean comparisons rather than ``value_counts()[1]`` /
    ``[0]``, which raised ``KeyError`` whenever one of the two classes did not
    occur in the frame.

    Args:
        df_pubs: Publications frame with a ``female_last`` column.

    Returns:
        None. The report is written to stdout.
    """
    last_author = df_pubs["female_last"]
    female_last = int((last_author == 1).sum())
    male_last = int((last_author == 0).sum())
    total = female_last + male_last

    # Guard the ratios so a frame without known last authors reports nan.
    pct_female = female_last / total if total else float("nan")
    pct_male = male_last / total if total else float("nan")

    print(f"Female last author: {female_last}  /  {pct_female}")
    print(f"Male last author  : {male_last}   /  {pct_male}")
    print(f"Total:  {total}")


# Female vs. male last authorship, per group.
print("Compare group:")
get_female_last(df_pubs_compare)
print()
print("Control group:")
get_female_last(df_pubs_control)

Compare group:
Female last author: 336  /  0.23562412342215988
Male last author  : 1090   /  0.7643758765778401
Total:  1426

Control group:
Female last author: 395  /  0.21667580910586945
Male last author  : 1428   /  0.7833241908941305
Total:  1823


In [27]:
from scipy.stats import ttest_ind


def get_female_any(df):
    """Return, per ``pub_id``, whether any ``gender_ints`` value is truthy.

    NOTE: this redefinition replaces the earlier printing function of the same
    name in this notebook; from this cell onward the name refers to this
    Series-returning version.

    Returns:
        pd.Series: Boolean Series indexed by ``pub_id``.
    """
    genders_by_pub = df.groupby("pub_id")["gender_ints"]
    return genders_by_pub.any()


def get_without_na(df):
    """Return ``df`` with missing entries removed via ``dropna()``.

    NOTE(review): no call to this helper is visible in this notebook; the
    t-test cell calls ``.dropna()`` directly.
    """
    return df.dropna()


# One-sided independent-samples t-tests on the 0/1 indicators. With
# alternative='greater' the alternative hypothesis is that the compare-group
# mean exceeds the control-group mean.
# Any female author per publication (boolean Series from get_female_any).
print(
    f"t-test: female any {ttest_ind(get_female_any(df_auths_compare), get_female_any(df_auths_control), nan_policy='omit', alternative='greater')}"
)
# Female first author; missing values are dropped before the test.
print(
    f"t-test: female first {ttest_ind(df_pubs_compare.female_first.dropna(), df_pubs_control.female_first.dropna(), alternative='greater')}"
)
# Female last author; missing values are dropped before the test, so
# nan_policy='omit' has nothing left to omit here.
print(
    f"t-test: female last {ttest_ind(df_pubs_compare.female_last.dropna(), df_pubs_control.female_last.dropna(), nan_policy='omit', alternative='greater')}"
)

t-test: female any Ttest_indResult(statistic=-0.7263278762737502, pvalue=0.7661549263237635)
t-test: female first Ttest_indResult(statistic=-1.7443514853693605, pvalue=0.959403780265583)
t-test: female last Ttest_indResult(statistic=1.2834779402615775, pvalue=0.09970813645867084)
