In [None]:
import pandas as pd
import numpy as np

In [None]:
# Gender dictionary: keep one row per first name and index the table by that
# name so individual names can be looked up directly.
df_gender_dictionary = (
    pd.read_csv("")
    .drop_duplicates("First Name Column")
    .set_index("First Name Column")
)


# World Bank income data. `CountryClass` stays as loaded; `CountryClass_2` is
# a separate frame indexed by country code.
CountryClass = pd.read_excel("World Bank Income Data")
CountryClass_2 = CountryClass.set_index("Country Code Column")

In [None]:
# Country codes that get_country() could not resolve to an income group.
missing_codes = []


def get_country(country_code):
    """Return the "Income group" value for a two-letter country code.

    The code is looked up with ``pycountry`` by ``alpha_2``; the resulting
    ``alpha_3`` code is then used as the row label in ``CountryClass_2``.

    Args:
        country_code: Value passed to ``pycountry.countries.get(alpha_2=...)``,
            or a missing value (``None`` / NaN).

    Returns:
        The "Income group" entry of the matching ``CountryClass_2`` row, or
        ``np.nan`` when the input is missing or cannot be resolved.

    Side effects:
        Every non-missing code that cannot be resolved is appended to the
        module-level ``missing_codes`` list (repeats are not de-duplicated).
    """
    if pd.isna(country_code):
        return np.nan

    # NOTE(review): `pycountry` is not imported in the visible part of this
    # file -- confirm it is imported before this function is called.
    country = pycountry.countries.get(alpha_2=country_code)
    if country is None:
        missing_codes.append(country_code)
        return np.nan

    try:
        return CountryClass_2.loc[country.alpha_3]["Income group"]
    except KeyError:
        # Only a failed row/column lookup is treated as "unknown country";
        # any other error is a real problem and should surface.
        missing_codes.append(country_code)
        return np.nan


def get_income_level(x):
    """Convert an income-group label into its numeric level.

    "Low income" maps to 1, "Lower middle income" to 2, "Upper middle income"
    to 3 and "High income" to 4. Any other value yields ``np.nan``.
    """
    level_by_label = {
        "Low income": 1,
        "Lower middle income": 2,
        "Upper middle income": 3,
        "High income": 4,
    }
    return level_by_label.get(x, np.nan)


def pipeline(list_of_dataframes: list):
    """Add year, gender and income columns to the publication tables.

    Args:
        list_of_dataframes: Exactly three DataFrames, in the order
            ``[df_pubs, df_auths, df_affils]``. Columns read here:
            ``df_pubs``: "id", "year", "authors_count";
            ``df_auths``: "pub_id", "first_name";
            ``df_affils``: "pub_id", "first_name", "aff_country_code".

    Returns:
        ``[df_pubs, df_auths, df_affils]`` -- the same three objects that were
        passed in; all new columns are added in place.

    Raises:
        KeyError: If a "pub_id" in ``df_auths`` or ``df_affils`` has no
            matching "id" in ``df_pubs``.
    """
    df_pubs, df_auths, df_affils = list_of_dataframes

    # Year lookup keyed by publication id. set_index returns a new frame, so
    # df_pubs keeps its "id" column.
    year_by_pub = df_pubs.set_index("id")["year"]

    def get_year(pub_id):
        """Return the year of the publication with this id."""
        return year_by_pub.at[pub_id]

    def title_case(name):
        """Title-case a first name; missing values become None."""
        return name.title() if not pd.isna(name) else None

    # If want to filter for US names only, do df[df.aff_country == 'United States']
    def get_gender(name):
        """Look up the gender recorded for the first word of ``name``.

        Applied element-wise because this is much faster than other ways of
        joining against the gender dictionary. Returns NaN for missing names,
        names absent from the dictionary, and entries marked "unknown".
        """
        if pd.isna(name):
            return np.nan
        first_word = name.strip().split(sep=" ")[0]
        try:
            gender = df_gender_dictionary.at[first_word, "ga_gender"]
        except KeyError:
            # Name is not in the dictionary.
            return np.nan
        return gender if gender != "unknown" else np.nan

    def get_gender_int(gender):
        """Encode gender as 1 for "male", 0 for any other non-missing value."""
        if pd.isna(gender):
            return np.nan
        return 1 if gender == "male" else 0

    # Authors and affiliations receive the same derived columns.
    for frame in (df_auths, df_affils):
        frame["year"] = frame["pub_id"].apply(get_year)
        frame["first_name"] = frame["first_name"].apply(title_case)
        frame["gender"] = frame["first_name"].apply(get_gender)
        frame["gender_ints"] = frame["gender"].apply(get_gender_int)

    def author_buckets(x):
        """Bucket an author count into "1", "2", "3-5" or "6+" (else NaN)."""
        if pd.isna(x) or x == 0:
            return np.nan
        if x == 1:
            return "1"
        if x == 2:
            return "2"
        if 3 <= x <= 5:
            return "3-5"
        if x >= 6:
            return "6+"
        return np.nan

    df_pubs["author_count_categories"] = df_pubs["authors_count"].apply(author_buckets)

    # Per-publication aggregates, keyed by pub_id. Publications without any
    # author rows are simply absent and are filled with NaN below.
    first_names = {}
    genders = {}
    pct_female = {}
    female_first = {}
    female_last = {}
    num_female = {}
    num_male = {}
    num_na = {}

    female_flag = {"female": 1, "male": 0}

    for pub_id, df_info in df_auths.groupby("pub_id"):
        gender_ints = df_info["gender_ints"]
        first_names[pub_id] = list(df_info["first_name"])
        genders[pub_id] = list(gender_ints)

        # 1 if the first/last listed author is female, 0 if male, NaN otherwise.
        female_first[pub_id] = female_flag.get(df_info["gender"].iloc[0], np.nan)
        female_last[pub_id] = female_flag.get(df_info["gender"].iloc[-1], np.nan)

        # Count numerically rather than by matching "0.0"/"1.0" in the list's
        # string form: that text only contains those tokens when the column is
        # float, i.e. when at least one author somewhere has a missing gender.
        n_female = int((gender_ints == 0).sum())
        n_male = int((gender_ints == 1).sum())
        num_female[pub_id] = n_female
        num_male[pub_id] = n_male
        num_na[pub_id] = int(gender_ints.isna().sum())

        n_known = n_female + n_male
        pct_female[pub_id] = n_female / n_known if n_known else np.nan

    pub_ids = list(df_pubs["id"])

    def per_publication(values):
        """Return ``values`` as a list aligned with the rows of df_pubs."""
        return [values.get(pub_id, np.nan) for pub_id in pub_ids]

    df_pubs["first_names"] = per_publication(first_names)
    df_pubs["genders"] = per_publication(genders)
    df_pubs["num_female"] = per_publication(num_female)
    df_pubs["num_male"] = per_publication(num_male)
    df_pubs["num_na"] = per_publication(num_na)
    df_pubs["pct_female_real"] = per_publication(pct_female)
    df_pubs["female_first"] = per_publication(female_first)
    df_pubs["female_last"] = per_publication(female_last)

    # Pessimistic: authors of unknown gender count as male.
    # Optimistic: authors of unknown gender count as female.
    total = df_pubs["num_female"] + df_pubs["num_male"] + df_pubs["num_na"]
    df_pubs["pct_female_pessimistic"] = df_pubs["num_female"] / total
    df_pubs["pct_female_optimistic"] = (df_pubs["num_female"] + df_pubs["num_na"]) / total
    df_pubs["female_first_pessimistic"] = df_pubs["female_first"].apply(
        lambda x: 0 if pd.isna(x) else x
    )
    df_pubs["female_first_optimistic"] = df_pubs["female_first"].apply(
        lambda x: 1 if pd.isna(x) else x
    )
    df_pubs["female_last_pessimistic"] = df_pubs["female_last"].apply(
        lambda x: 0 if pd.isna(x) else x
    )
    df_pubs["female_last_optimistic"] = df_pubs["female_last"].apply(
        lambda x: 1 if pd.isna(x) else x
    )

    df_affils["income_class"] = df_affils["aff_country_code"].apply(lambda x: get_country(x))
    df_affils["income_class_num"] = df_affils["income_class"].apply(lambda x: get_income_level(x))

    return [df_pubs, df_auths, df_affils]