# Minority Serving Institutions Present in Affiliation Data

* The fuzzy matching pipeline below was developed to discern the associations between authors and specific Minority Serving Institutions (MSIs).

In [1]:
from fuzzywuzzy import fuzz

In [2]:
import pandas as pd
import numpy as np

In [3]:
# author affiliation data for the compare and control datasets
# (paths are relative to the notebook's working directory; the compare group is
# read from the MIMIC affiliations file, the control group from the control file)
controlData = pd.read_csv("GuruData/control_group_affils(7).csv", encoding='utf-8')
compareData = pd.read_csv("GuruData/mimic_affils(7).csv", encoding='utf-8')

# below are the datasets containing Minority Serving Institutions for the year 2020 and 2022
# (institution names live in 'Institution' for 2020 and 'Institution Name' for 2022)
minorityList2020 = pd.read_csv("GuruData/2020_Minority_Serving_Institutions-1.csv")
minorityList2022 = pd.read_csv("GuruData/2022 CMSI Eligibility Matrix .csv")

(25676, 19)

In [4]:
controlData.head()

Unnamed: 0.1,Unnamed: 0,aff_city,aff_city_id,aff_country,aff_country_code,aff_id,aff_name,aff_raw_affiliation,aff_state,aff_state_code,pub_id,researcher_id,first_name,last_name,year,gender,gender_ints,income_class,income_class_num
0,6,Leuven,2792482.0,Belgium,BE,grid.15762.37,Imec,"Exascience Life Lab, imec, Leuven, Belgium",,,pub.1153863516,ur.015256430001.18,Ellie,D’Hondt,2022,female,1.0,High income,4.0
1,7,Leuven,2792482.0,Belgium,BE,grid.15762.37,Imec,"Exascience Life Lab, imec, Leuven, Belgium",,,pub.1153863516,ur.010363506743.89,Thomas J.,Ashby,2022,male,0.0,High income,4.0
2,8,Leuven,2792482.0,Belgium,BE,grid.15762.37,Imec,"Exascience Life Lab, imec, Leuven, Belgium",,,pub.1153863516,ur.012727306525.02,Imen,Chakroun,2022,female,1.0,High income,4.0
3,9,Leuven,2792482.0,Belgium,BE,,"Independent Consultant, Leuven, Belgium","Independent Consultant, Leuven, Belgium",Flanders,,pub.1153863516,,Thomas,Koninckx,2022,male,0.0,High income,4.0
4,10,Leuven,2792482.0,Belgium,BE,grid.15762.37,Imec,"Exascience Life Lab, imec, Leuven, Belgium",,,pub.1153863516,ur.012704525025.77,Roel,Wuyts,2022,male,0.0,High income,4.0


In [5]:
controlData.shape[0]

25676

In [6]:
compareData.head()

Unnamed: 0.1,Unnamed: 0,aff_city,aff_city_id,aff_country,aff_country_code,aff_id,aff_name,aff_raw_affiliation,aff_state,aff_state_code,pub_id,researcher_id,first_name,last_name,year,gender,gender_ints,income_class,income_class_num
0,0,Guangzhou,1809858.0,China,CN,grid.412601.0,First Affiliated Hospital of Jinan University,"Department of Neurology, The First Affiliated ...",,,pub.1154144039,ur.014116573320.23,Xiaxuan,Huang,2022,male,0.0,Upper middle income,3.0
1,1,Guangzhou,1809858.0,China,CN,grid.412601.0,First Affiliated Hospital of Jinan University,"Department of Neurology, The First Affiliated ...",,,pub.1154144039,ur.016241001520.36,Shiqi,Yuan,2022,female,1.0,Upper middle income,3.0
2,2,Guangzhou,1809858.0,China,CN,grid.412601.0,First Affiliated Hospital of Jinan University,"Department of Neurology, The First Affiliated ...",,,pub.1154144039,ur.013553641416.27,Yitong,Ling,2022,female,1.0,Upper middle income,3.0
3,3,Guangzhou,1809858.0,China,CN,grid.412601.0,First Affiliated Hospital of Jinan University,"Department of Neurology, The First Affiliated ...",,,pub.1154144039,ur.014351222016.81,Shanyuan,Tan,2022,female,1.0,Upper middle income,3.0
4,4,Guangzhou,1809858.0,China,CN,grid.412601.0,First Affiliated Hospital of Jinan University,"Department of Clinical Research, The First Aff...",,,pub.1154144039,ur.014227217615.79,Tao,Huang,2022,male,0.0,Upper middle income,3.0


In [7]:
compareData.shape[0]

13967

In [8]:
minorityList2020.head()

Unnamed: 0,MSI Type,Institution,Unnamed: 2,City,State/Territory,Type
0,ANNH,University of Alaska Fairbanks,,Fairbanks,AK,Pub 4yr
1,ANNH,University of Alaska Southeast,,Juneau,AK,Pub 4yr
2,ANNH & NASNTI,Alaska Christian College,,Soldotna,AK,Pri 2yr
3,NASNTI,Alaska Pacific University,,Anchorage,AK,Pri 4yr
4,TCU,Ilisagvik College,,Barrow,AK,Pub 4yr


In [9]:
# preview of the 2022 institution names with the missing values left out
# NOTE: dropna() returns a new Series and the result is not assigned, so
# minorityList2022 itself still contains the rows whose name is missing
minorityList2022['Institution Name'].dropna()

0              Aaniiih Nakoda College
1              Adams State University
2                  Adelphi University
3             AdventHealth University
4            Alabama A & M University
                    ...              
858        Woodland Community College
859          World Mission University
860          World Mission University
861    Xavier University of Louisiana
862             Yakima Valley College
Name: Institution Name, Length: 863, dtype: object

In [10]:
def isfloat(num):
    """Return True if ``num`` can be converted by ``float()``, otherwise False.

    The notebook uses this to recognise missing cells: a missing value read by
    pandas is the float NaN (-> True), whereas an institution name such as
    "Howard University" is not convertible (-> False).

    Parameters
    ----------
    num : object
        Value to probe.

    Returns
    -------
    bool
        True when ``float(num)`` succeeds, False when it raises.
    """
    try:
        float(num)
    except (TypeError, ValueError):
        # ValueError: non-numeric string; TypeError: None, list, dict, ...
        return False
    return True

In [11]:
# compiling MSI type information for the 2022 list
#
# Designation columns of the 2022 eligibility matrix, in the order they are
# checked. Only the FIRST column marked 'Yes' is recorded for a row, so an
# institution holding several designations still gets a single label.
msiTypeColumns = ['ANNH', 'AANAPISI', 'HSI', 'NASNTI', 'PBI', 'HBCU', 'TCU']

msi = []
# walk the rows that are actually present instead of a hard-coded row count, so
# the list always ends up with exactly one entry per row of minorityList2022
for _, institutionRow in minorityList2022.iterrows():
    # a missing cell is NaN and never equals 'Yes', so no separate NaN check is needed
    msi.append(next((col for col in msiTypeColumns if institutionRow[col] == 'Yes'), 'No MSI Type'))
    

In [12]:
#appending the MSI type information to the 2022 file
minorityList2022['MSI Type'] = msi

In [13]:
# institutions whose names appear verbatim in both the 2020 and the 2022 list
# (set intersection, i.e. exact string matches only)
uniqueInstitutions = set(minorityList2020['Institution']).intersection(set(minorityList2022['Institution Name']))
print(uniqueInstitutions)

{'CUNY Hunter College', 'SUNY Westchester Community College', 'Trinity Washington University', 'Metropolitan State University of Denver', 'Heritage University', 'Laney College', 'Seminole State College of Florida', 'University of Guam', 'Cossatot Community College of the University of Arkansas', 'East Los Angeles College', 'Aurora University', 'Nevada State College', 'Rogers State University', 'Palomar College', 'University of North Carolina at Pembroke', 'Hope International University', 'Pacific Islands University', 'Antelope Valley College', 'Whittier College', 'Century College', 'University of North Texas at Dallas', 'Cisco College', 'Mercy College', 'South Plains College', 'Texas State Technical College', 'Georgia State University', 'Nova Southeastern University', 'Fisk University', 'McMurry University', 'Belhaven University', 'Modesto Junior College', 'Southwestern Christian College', 'Winston-Salem State University', 'Leech Lake Tribal College', 'El Paso Community College', 'Las 

In [14]:
minorityList2020['Institution'].isin(minorityList2022['Institution Name']).value_counts()

True     495
False    279
Name: Institution, dtype: int64

In [15]:
minorityList2022['Institution Name'].isin(minorityList2020['Institution']).value_counts()

True     496
False    369
Name: Institution Name, dtype: int64

In [16]:
minorityList2020.shape[0]

774

In [17]:
minorityList2022.shape[0]

865

### MSI List Similarity Analysis (2020 vs 2022):
* A total of 496 institutions were shared by the two sources (exact matches)
    * Some more institutions are shared but their institution name is formatted differently

### Using fuzzy matching (partial-ratio score above 97) to gauge a more accurate approximation

In [18]:
def matchMSIFuzzy(x, threshold=97):
    """Tell whether institution name ``x`` fuzzily matches any 2022 MSI name.

    Every non-missing entry of ``minorityList2022['Institution Name']`` is
    compared with ``x`` using ``fuzz.partial_ratio``; a score strictly above
    ``threshold`` counts as a match. The previous hard-coded ``range(862)``
    never looked at the row with index 862 even though it holds a valid name.

    Parameters
    ----------
    x : str or float
        Institution name to look up; a value for which ``isfloat`` is True
        (e.g. NaN for a missing name) never matches.
    threshold : int, optional
        Score that must be exceeded. Defaults to 97.

    Returns
    -------
    str
        "Yes" if at least one 2022 name matches, otherwise "No".
    """
    # the check does not depend on the list entry, so do it once up front
    if isfloat(x):
        return "No"
    # dropna(): rows without a name cannot be scored by partial_ratio
    for candidateName in minorityList2022['Institution Name'].dropna():
        if fuzz.partial_ratio(candidateName, x) > threshold:
            return "Yes"
    return "No"

In [19]:
minorityList2020['isIn2022'] = minorityList2020['Institution'].apply(matchMSIFuzzy)

In [20]:
minorityList2020['isIn2022'].value_counts()

Yes    566
No     208
Name: isIn2022, dtype: int64

### MSI List Similarity Analysis w/ Fuzzy Match (2020 vs 2022):
* A total of 566 institutions were shared by the two sources

In [21]:
# dropping all the affiliation data that doesn't originate within the United States or has no country affiliation
# (a missing country is NaN, and NaN != 'United States' evaluates to True, so those rows are selected for removal too)
comparI = compareData[compareData['aff_country'] != 'United States'].index
compareData = compareData.drop(comparI)
controlI = controlData[controlData['aff_country'] != 'United States'].index
controlData = controlData.drop(controlI)
# NOTE: the two dropna() calls below do not modify the frames (their results are
# not assigned); the last one only provides the cell's displayed output
compareData['aff_country'].dropna()
controlData['aff_country'].dropna()

6        United States
38       United States
39       United States
40       United States
41       United States
             ...      
25660    United States
25661    United States
25662    United States
25663    United States
25664    United States
Name: aff_country, Length: 7654, dtype: object

### Fuzzy Matching Affiliation Data for Compare and Control Groups with MSI Lists

In [22]:
# fuzzy matching methods for both 2020 and 2022 lists

def _firstFuzzyMatch(names, x, threshold):
    """Return the first entry of ``names`` whose partial ratio with ``x`` exceeds ``threshold``, else "Not Minority"."""
    # isfloat() is True for NaN, i.e. a missing affiliation name, which cannot match
    if isfloat(x):
        return "Not Minority"
    # iterate over every non-missing name instead of a hard-coded row count
    for candidateName in names.dropna():
        if fuzz.partial_ratio(candidateName, x) > threshold:
            return candidateName
    return "Not Minority"


def fuzzTest(x, threshold=97):
    """Map affiliation name ``x`` to an institution of the 2020 MSI list.

    Parameters
    ----------
    x : str or float
        Affiliation name (NaN when missing).
    threshold : int, optional
        ``fuzz.partial_ratio`` score that must be exceeded. Defaults to 97.

    Returns
    -------
    str
        The first matching ``minorityList2020['Institution']`` entry in list
        order, or "Not Minority" when nothing matches.
    """
    return _firstFuzzyMatch(minorityList2020['Institution'], x, threshold)


def fuzzTest2(x, threshold=97):
    """Map affiliation name ``x`` to an institution of the 2022 MSI list.

    Same contract as ``fuzzTest`` but against
    ``minorityList2022['Institution Name']``. All non-missing names are
    considered; the previous ``range(862)`` skipped the row with index 862.

    NOTE(review): partial_ratio scores the best-matching substring, so a short
    affiliation can match a longer campus-specific name (the results pair
    "University of Washington" with "University of Washington-Tacoma Campus").
    """
    return _firstFuzzyMatch(minorityList2022['Institution Name'], x, threshold)



In [23]:
# run both matchers (2020 list, then 2022 list) over the affiliation name
# column of each dataset and store the matched institution alongside it
for affilFrame in (compareData, controlData):
    affilNames = affilFrame['aff_name']
    affilFrame['isMinority_2020'] = affilNames.apply(fuzzTest)
    affilFrame['isMinority_2022'] = affilNames.apply(fuzzTest2)

In [24]:
#understanding the distributions of the values

In [25]:
compareData['isMinority_2020'].value_counts()

Not Minority                                                3370
University of Washington - Seattle                            70
Cheyney University of Pennsylvania                            59
University of Minnesota - Twin Cities                         41
Arizona State University                                      24
Vanguard University of Southern California                    16
University of California, Irvine                              12
University of Illinois at Chicago                              8
Texas A&M University - Corpus Christi                          8
University of Connecticut - Hartford Campus                    7
Colorado State University - Pueblo                             7
University of Texas at Arlington                               6
University of Arizona (The)                                    6
Winston-Salem State University                                 5
Rutgers, the State University of New Jersey                    5
Florida International Uni

In [26]:
compareMinorityValues2020 = compareData['isMinority_2020'].value_counts().to_frame()

In [27]:
controlData['isMinority_2020'].value_counts()

Not Minority                                                              7031
Cheyney University of Pennsylvania                                         173
Vanguard University of Southern California                                  87
University of Washington - Seattle                                          77
University of Minnesota - Twin Cities                                       40
University of Arizona (The)                                                 39
University of California, Davis                                             34
University of California, Irvine                                            28
University of Texas Health Science Center at San Antonio                    27
University of Connecticut - Hartford Campus                                 14
Rutgers, the State University of New Jersey                                 13
Pennsylvania State University - Penn State Abington                         11
Georgia State University                            

In [28]:
controlMinorityValues2020 = controlData['isMinority_2020'].value_counts().to_frame()

In [29]:
compareData['isMinority_2022'].value_counts()

Not Minority                                                    3377
University of Washington-Tacoma Campus                            70
Cheyney University of Pennsylvania                                59
The University of Texas at Austin                                 44
University of Minnesota-Morris                                    41
Vanguard University of Southern California                        16
Texas A&M University-Central Texas                                 8
University of Connecticut-Hartford Campus                          7
Colorado State University Pueblo                                   7
University of Arizona                                              6
The University of Texas at Arlington                               6
Winston-Salem State University                                     5
University of Houston                                              4
Loma Linda University                                              4
Florida International University  

In [30]:
compareMinorityValues2022 = compareData['isMinority_2022'].value_counts().to_frame()

In [31]:
controlData['isMinority_2022'].value_counts()

Not Minority                                                    7062
Cheyney University of Pennsylvania                               173
Vanguard University of Southern California                        87
University of Washington-Tacoma Campus                            77
University of Minnesota-Morris                                    40
University of Arizona                                             39
The University of Texas at Austin                                 25
Wayne State University                                            23
The University of Texas Health Science Center at San Antonio      18
University of Connecticut-Hartford Campus                         14
Georgia State University                                          10
Texas Tech University                                              9
Virginia Commonwealth University                                   8
Loma Linda University                                              7
California Polytechnic State Unive

In [32]:
controlMinorityValues2022 = controlData['isMinority_2022'].value_counts().to_frame()

### Understanding MSI types 
* The OIC supports colleges and universities designated as minority serving institutions:

    * Asian American and Native American Pacific Islander-Serving Institutions (AANAPISIs)
    * Alaska Native and Native Hawaiian-Serving Institutions (ANNHs)
    * Historically Black Colleges & Universities (HBCUs)
    * Hispanic-Serving Institutions (HSIs)
    * Native American-Serving Non-Tribal Institutions (NASNTIs)
    * Predominantly Black Institutions (PBIs)
    * Tribal Colleges and Universities (TCUs)


In [49]:
# Type
def typeInfo(x):
    """Look up the 'Type' entry of institution ``x`` in the 2020 MSI list.

    ``x`` is either an 'Institution' name from ``minorityList2020`` or the
    sentinel 'Not Minority', which is handed back unchanged. If several rows
    carry the same name, the first one is used.
    """
    if x == 'Not Minority':
        return 'Not Minority'
    nameMatches = (minorityList2020['Institution'] == x).to_numpy()
    firstLabel = minorityList2020.index[nameMatches][0]
    return minorityList2020.loc[firstLabel, 'Type']
def typeInfo2(x):
    """Look up the 'Type' entry of institution ``x`` in the 2022 MSI list.

    ``x`` is either an 'Institution Name' from ``minorityList2022`` or the
    sentinel 'Not Minority', which is handed back unchanged. If several rows
    carry the same name, the first one is used.
    """
    if x == 'Not Minority':
        return 'Not Minority'
    nameMatches = (minorityList2022['Institution Name'] == x).to_numpy()
    firstLabel = minorityList2022.index[nameMatches][0]
    return minorityList2022.loc[firstLabel, 'Type']

In [50]:
# attach the institution 'Type' for the 2020 and 2022 matches of each dataset
for affilFrame in (compareData, controlData):
    affilFrame['Type2020'] = affilFrame['isMinority_2020'].apply(typeInfo)
    affilFrame['Type2022'] = affilFrame['isMinority_2022'].apply(typeInfo2)

In [51]:
# MSI Type
def msiInfo(x):
    """Look up the 'MSI Type' entry of institution ``x`` in the 2020 MSI list.

    ``x`` is either an 'Institution' name from ``minorityList2020`` or the
    sentinel 'Not Minority', which is handed back unchanged. If several rows
    carry the same name, the first one is used.
    """
    if x == 'Not Minority':
        return 'Not Minority'
    nameMatches = (minorityList2020['Institution'] == x).to_numpy()
    firstLabel = minorityList2020.index[nameMatches][0]
    return minorityList2020.loc[firstLabel, 'MSI Type']
def msiInfo2(x):
    """Look up the 'MSI Type' entry of institution ``x`` in the 2022 MSI list.

    ``x`` is either an 'Institution Name' from ``minorityList2022`` or the
    sentinel 'Not Minority', which is handed back unchanged. If several rows
    carry the same name, the first one is used.
    """
    if x == 'Not Minority':
        return 'Not Minority'
    nameMatches = (minorityList2022['Institution Name'] == x).to_numpy()
    firstLabel = minorityList2022.index[nameMatches][0]
    return minorityList2022.loc[firstLabel, 'MSI Type']

In [52]:
# attach the MSI designation for the 2020 and 2022 matches of each dataset
for affilFrame in (compareData, controlData):
    affilFrame['MSI Type2020'] = affilFrame['isMinority_2020'].apply(msiInfo)
    affilFrame['MSI Type2022'] = affilFrame['isMinority_2022'].apply(msiInfo2)

In [53]:
compareData['MSI Type2020'].value_counts()

Not Minority      3370
AANAPISI           125
HSI                 79
HBCU                65
AANAPISI & HSI      30
AANAPISI & PBI       1
Name: MSI Type2020, dtype: int64

In [54]:
controlData['MSI Type2020'].value_counts()

Not Minority      7031
HSI                198
AANAPISI           176
HBCU               175
AANAPISI & HSI      63
AANAPISI & PBI      10
PBI                  1
Name: MSI Type2020, dtype: int64

In [55]:
compareData['MSI Type2022'].value_counts()

Not Minority    3377
AANAPISI         139
HBCU              66
HSI               46
NASNTI            42
Name: MSI Type2022, dtype: int64

In [56]:
controlData['MSI Type2022'].value_counts()

Not Minority    7062
HSI              190
AANAPISI         185
HBCU             176
NASNTI            40
PBI                1
Name: MSI Type2022, dtype: int64

### Cleaning Process
* To confirm the accuracy of the mapping between institutional affiliations and their potential MSI status, we performed manual verification on a subset of the results yielded by the aforementioned fuzzy-matching process. In cases when an author’s institutional affiliation was incorrectly mapped to an MSI, we rectified the mistake manually. Verification was limited to reducing false positives, i.e., we only determined institutional affiliations that were erroneously linked with MSIs.
* This allowed us to understand which MSIs appeared in the datasets.
    * This information was then used in the gender and first- vs. last-author pipelines.

# Gender Distributions of Authors Affiliated w/ MSIs

In [67]:
# NOTE(review): every path in this cell is an absolute path on one machine, so the
# cell cannot run elsewhere as written; the earlier loading cell reads from the
# relative "GuruData/" directory instead.

# per-institution count tables for the control and MIMIC groups
# (presumably spreadsheet exports, given the ".xlsx - Sheet 1" in the file names; they are parsed as CSV)
remControl = pd.read_csv("/Users/gurucharanlingamallu/Downloads/controlUnionType.xlsx - Sheet 1 - controlUnion.csv")
remMIMIC = pd.read_csv("/Users/gurucharanlingamallu/Downloads/mimicUnionType.xlsx - Sheet 1 - mimicUnion.csv")
# mimic20/mimic22 are read from the same file, as are control20/control22;
# a later cell filters each copy on a different column (Type2020 vs Type2022)
mimic20 = pd.read_csv("/Users/gurucharanlingamallu/Documents/MIT Work/minorityMimicCorrectX(d).csv")
mimic22 = pd.read_csv("/Users/gurucharanlingamallu/Documents/MIT Work/minorityMimicCorrectX(d).csv")
control20 = pd.read_csv("/Users/gurucharanlingamallu/Documents/MIT Work/minorityContolCorrectX(d).csv")
control22 = pd.read_csv("/Users/gurucharanlingamallu/Documents/MIT Work/minorityContolCorrectX(d).csv")
# affiliation tables (same file names as the ones loaded from "GuruData/" at the top of the notebook)
mimicAffils = pd.read_csv("/Users/gurucharanlingamallu/Downloads/mimic_affils(7).csv")
controlAffils = pd.read_csv("/Users/gurucharanlingamallu/Downloads/control_group_affils(7).csv")
controlGender = pd.read_csv("/Users/gurucharanlingamallu/Downloads/control_group_affils_v2.csv")

##### The following cells clean the datasets

In [68]:
def filter_rows_by_values(df, col, values):
    """Return the rows of ``df`` whose ``col`` entry is NOT one of ``values``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter; it is not modified.
    col : str
        Name of the column to test.
    values : list-like
        Values that mark a row for exclusion.

    Returns
    -------
    pandas.DataFrame
        The subset of ``df`` with the excluded rows removed.
    """
    # the body previously referenced the undefined name `valuexs`, which
    # raised NameError on every call
    return df[~df[col].isin(values)]

In [69]:
remControl.head()

Unnamed: 0,controlUnion,Unnamed: 1,Unnamed: 2,Unnamed: 3
0,,Institution,isMinority_Union,MSI_Type
1,0.0,Not Minority,7750,No MSI Type
2,1.0,University of Washington - Seattle,66,AANAPISI
3,2.0,University of Minnesota - Twin Cities,48,AANAPISI
4,3.0,The University of Texas at Austin,35,AANAPISI


In [70]:
remMIMIC.head()

Unnamed: 0,mimicUnion,Unnamed: 1,Unnamed: 2,Unnamed: 3
0,,Institution,isMinority_Union,MSI_Type
1,0.0,Not Minority,3395,No MSI Type
2,1.0,University of Washington - Seattle,70,AANAPISI
3,2.0,University of Minnesota - Twin Cities,48,AANAPISI
4,3.0,The University of Texas at Austin,44,AANAPISI


In [71]:
def _promoteEmbeddedHeader(rawCounts):
    """Drop the first column, take the column names from the first row, and discard the first two rows."""
    trimmed = rawCounts.iloc[:, 1:]
    relabelled = trimmed.rename(columns=trimmed.iloc[0])
    # two leading rows are removed: the embedded header row and the row after it
    # (the 'Not Minority' total in the head() previews above)
    return relabelled.iloc[2:, :]


# NOTE: each frame is overwritten with its cleaned version, so running this
# cell a second time would strip another column and two more rows
remMIMIC = _promoteEmbeddedHeader(remMIMIC)
remControl = _promoteEmbeddedHeader(remControl)

#### below is MSI value counts for the control and compare datasets (union of 2020 and 2022) after manual cleaning process

In [72]:
remMIMIC

Unnamed: 0,Institution,isMinority_Union,MSI_Type
2,University of Washington - Seattle,70,AANAPISI
3,University of Minnesota - Twin Cities,48,AANAPISI
4,The University of Texas at Austin,44,AANAPISI
5,Arizona State University,24,HSI
6,"University of California, Irvine",12,AANAPISI & HSI
7,University of Illinois at Chicago,8,AANAPISI & HSI
8,Texas A&M University-Central Texas,8,HSI
9,University of Texas at Arlington,6,HSI
10,University of Arizona (The),6,HSI
11,Winston-Salem State University,5,HBCU


In [73]:
remControl

Unnamed: 0,Institution,isMinority_Union,MSI_Type
2,University of Washington - Seattle,66,AANAPISI
3,University of Minnesota - Twin Cities,48,AANAPISI
4,The University of Texas at Austin,35,AANAPISI
5,"University of California, Irvine",10,AANAPISI & HSI
6,University of Illinois at Chicago,7,AANAPISI & HSI
7,University of Connecticut-Hartford Campus,7,AANAPISI
8,Texas A&M University-Central Texas,7,HSI
9,University of Arizona (The),6,HSI
10,University of Texas at Arlington,6,HSI
11,"Rutgers, the State University of New Jersey",5,AANAPISI & HSI


In [74]:
# keep only the authors (and their corresponding data) that are affiliated with an MSI;
# the frames filtered here are outputs of the previous MSI identification pipeline
notMsiLabel = "Not Minority"
mimic20 = mimic20.loc[mimic20["Type2020"].ne(notMsiLabel)]
mimic22 = mimic22.loc[mimic22["Type2022"].ne(notMsiLabel)]
control20 = control20.loc[control20["Type2020"].ne(notMsiLabel)]
control22 = control22.loc[control22["Type2022"].ne(notMsiLabel)]

In [75]:
mimic20

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,aff_city,aff_city_id,aff_country,aff_country_code,aff_id,aff_name,aff_raw_affiliation,aff_state,...,gender,gender_ints,income_class,income_class_num,isMinority_2020,isMinority_2022,Type2020,Type2022,MSI Type2020,MSI Type2022
4,230,230,Tempe,5317058.0,United States,US,grid.215654.1,Arizona State University,"Fulton Schools of Engineering, Arizona State U...",Arizona,...,male,0.0,High income,4.0,Arizona State University,Not Minority,Pub 4yr,Not Minority,HSI,Not Minority
77,704,704,Davis,5341704.0,United States,US,grid.27860.3b,"University of California, Davis","IFM Lab, Department of Computer Science, Unive...",California,...,male,0.0,High income,4.0,"University of California, Davis",Not Minority,Pub 4yr,Not Minority,AANAPISI,Not Minority
78,707,707,Chicago,4887398.0,United States,US,grid.185648.6,University of Illinois at Chicago,"Computer Science Department, University of Ill...",Illinois,...,male,0.0,High income,4.0,University of Illinois at Chicago,Not Minority,Pub 4yr,Not Minority,AANAPISI & HSI,Not Minority
161,1162,1162,Lubbock,5525577.0,United States,US,grid.264784.b,Texas Tech University,"Department of computer science, Texas Tech Uni...",Texas,...,male,0.0,High income,4.0,Texas Tech University,Texas Tech University,Pub 4yr,Public 4yr,HSI,HSI
204,1611,1611,Irvine,5359777.0,United States,US,grid.266093.8,"University of California, Irvine",University of California Irvine,California,...,male,0.0,High income,4.0,"University of California, Irvine",Not Minority,Pub 4yr,Not Minority,AANAPISI & HSI,Not Minority
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3563,13803,13803,Seattle,5809844.0,United States,US,grid.34477.33,University of Washington,"Department of Epidemiology, University of Wash...",Washington,...,male,0.0,High income,4.0,University of Washington - Seattle,University of Washington-Tacoma Campus,Pub 4yr,Public 4yr,AANAPISI,AANAPISI
3595,13862,13862,Seattle,5809844.0,United States,US,grid.34477.33,University of Washington,"Institute for health metrics and evaluation, U...",Washington,...,female,1.0,High income,4.0,University of Washington - Seattle,University of Washington-Tacoma Campus,Pub 4yr,Public 4yr,AANAPISI,AANAPISI
3596,13864,13864,Seattle,5809844.0,United States,US,grid.34477.33,University of Washington,"Institute for health metrics and evaluation, U...",Washington,...,female,1.0,High income,4.0,University of Washington - Seattle,University of Washington-Tacoma Campus,Pub 4yr,Public 4yr,AANAPISI,AANAPISI
3597,13865,13865,Seattle,5809844.0,United States,US,grid.34477.33,University of Washington,"Institute for health metrics and evaluation, U...",Washington,...,male,0.0,High income,4.0,University of Washington - Seattle,University of Washington-Tacoma Campus,Pub 4yr,Public 4yr,AANAPISI,AANAPISI


In [76]:
mimic22

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,aff_city,aff_city_id,aff_country,aff_country_code,aff_id,aff_name,aff_raw_affiliation,aff_state,...,gender,gender_ints,income_class,income_class_num,isMinority_2020,isMinority_2022,Type2020,Type2022,MSI Type2020,MSI Type2022
100,969,969,Nashville,4644585.0,United States,US,grid.259870.1,Meharry Medical College,Department of Computer Science and Data Scienc...,Tennessee,...,male,0.0,High income,4.0,Not Minority,Meharry Medical College,Not Minority,Private 4yr,Not Minority,HBCU
161,1162,1162,Lubbock,5525577.0,United States,US,grid.264784.b,Texas Tech University,"Department of computer science, Texas Tech Uni...",Texas,...,male,0.0,High income,4.0,Texas Tech University,Texas Tech University,Pub 4yr,Public 4yr,HSI,HSI
259,1817,1817,Winston-Salem,4499612.0,United States,US,grid.268294.3,Winston-Salem State University,"Department of Computer Science, Winston-Salem ...",North Carolina,...,male,0.0,High income,4.0,Winston-Salem State University,Winston-Salem State University,Pub 4yr,Public 4yr,HBCU,HBCU
260,1818,1818,Winston-Salem,4499612.0,United States,US,grid.268294.3,Winston-Salem State University,"Department of Computer Science, Winston-Salem ...",North Carolina,...,,,High income,4.0,Winston-Salem State University,Winston-Salem State University,Pub 4yr,Public 4yr,HBCU,HBCU
261,1819,1819,Winston-Salem,4499612.0,United States,US,grid.268294.3,Winston-Salem State University,"Center for Applied Data Science (CADS), Winsto...",North Carolina,...,,,High income,4.0,Winston-Salem State University,Winston-Salem State University,Pub 4yr,Public 4yr,HBCU,HBCU
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3563,13803,13803,Seattle,5809844.0,United States,US,grid.34477.33,University of Washington,"Department of Epidemiology, University of Wash...",Washington,...,male,0.0,High income,4.0,University of Washington - Seattle,University of Washington-Tacoma Campus,Pub 4yr,Public 4yr,AANAPISI,AANAPISI
3595,13862,13862,Seattle,5809844.0,United States,US,grid.34477.33,University of Washington,"Institute for health metrics and evaluation, U...",Washington,...,female,1.0,High income,4.0,University of Washington - Seattle,University of Washington-Tacoma Campus,Pub 4yr,Public 4yr,AANAPISI,AANAPISI
3596,13864,13864,Seattle,5809844.0,United States,US,grid.34477.33,University of Washington,"Institute for health metrics and evaluation, U...",Washington,...,female,1.0,High income,4.0,University of Washington - Seattle,University of Washington-Tacoma Campus,Pub 4yr,Public 4yr,AANAPISI,AANAPISI
3597,13865,13865,Seattle,5809844.0,United States,US,grid.34477.33,University of Washington,"Institute for health metrics and evaluation, U...",Washington,...,male,0.0,High income,4.0,University of Washington - Seattle,University of Washington-Tacoma Campus,Pub 4yr,Public 4yr,AANAPISI,AANAPISI


In [77]:
control20

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,aff_city,aff_city_id,aff_country,aff_country_code,aff_id,aff_name,aff_raw_affiliation,aff_state,...,gender,gender_ints,income_class,income_class_num,isMinority_2020,isMinority_2022,Type2020,Type2022,MSI Type2020,MSI Type2022
18,158,277,Tucson,5318313.0,United States,US,grid.134563.6,University of Arizona,"Department of Surgery, University of Arizona, ...",Arizona,...,female,1.0,High income,4.0,University of Arizona (The),University of Arizona,Pub 4yr,Public 4yr,HSI,HSI
19,159,278,Tucson,5318313.0,United States,US,grid.134563.6,University of Arizona,"Program in Applied Mathematics, University of ...",Arizona,...,female,1.0,High income,4.0,University of Arizona (The),University of Arizona,Pub 4yr,Public 4yr,HSI,HSI
20,160,279,Tucson,5318313.0,United States,US,grid.134563.6,University of Arizona,"Program in Applied Mathematics, University of ...",Arizona,...,male,0.0,High income,4.0,University of Arizona (The),University of Arizona,Pub 4yr,Public 4yr,HSI,HSI
21,161,280,Tucson,5318313.0,United States,US,grid.134563.6,University of Arizona,"Program in Applied Mathematics, University of ...",Arizona,...,female,1.0,High income,4.0,University of Arizona (The),University of Arizona,Pub 4yr,Public 4yr,HSI,HSI
22,162,281,Tucson,5318313.0,United States,US,grid.134563.6,University of Arizona,"Program in Applied Mathematics, University of ...",Arizona,...,female,1.0,High income,4.0,University of Arizona (The),University of Arizona,Pub 4yr,Public 4yr,HSI,HSI
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7624,25602,3560,Philadelphia,4560349.0,United States,US,grid.25879.31,University of Pennsylvania,Authors' Affiliations: 1Division of Oncology a...,Pennsylvania,...,female,1.0,High income,4.0,Cheyney University of Pennsylvania,Cheyney University of Pennsylvania,Pub 4yr,Public 4yr,HBCU,HBCU
7627,25607,3565,Philadelphia,4560349.0,United States,US,grid.25879.31,University of Pennsylvania,Authors' Affiliations: 1Division of Oncology a...,Pennsylvania,...,male,0.0,High income,4.0,Cheyney University of Pennsylvania,Cheyney University of Pennsylvania,Pub 4yr,Public 4yr,HBCU,HBCU
7630,25612,3570,Philadelphia,4560349.0,United States,US,grid.25879.31,University of Pennsylvania,Authors' Affiliations: 1Division of Oncology a...,Pennsylvania,...,male,0.0,High income,4.0,Cheyney University of Pennsylvania,Cheyney University of Pennsylvania,Pub 4yr,Public 4yr,HBCU,HBCU
7633,25617,3575,Philadelphia,4560349.0,United States,US,grid.25879.31,University of Pennsylvania,Authors' Affiliations: 1Division of Oncology a...,Pennsylvania,...,male,0.0,High income,4.0,Cheyney University of Pennsylvania,Cheyney University of Pennsylvania,Pub 4yr,Public 4yr,HBCU,HBCU


In [78]:
control22

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,aff_city,aff_city_id,aff_country,aff_country_code,aff_id,aff_name,aff_raw_affiliation,aff_state,...,gender,gender_ints,income_class,income_class_num,isMinority_2020,isMinority_2022,Type2020,Type2022,MSI Type2020,MSI Type2022
18,158,277,Tucson,5318313.0,United States,US,grid.134563.6,University of Arizona,"Department of Surgery, University of Arizona, ...",Arizona,...,female,1.0,High income,4.0,University of Arizona (The),University of Arizona,Pub 4yr,Public 4yr,HSI,HSI
19,159,278,Tucson,5318313.0,United States,US,grid.134563.6,University of Arizona,"Program in Applied Mathematics, University of ...",Arizona,...,female,1.0,High income,4.0,University of Arizona (The),University of Arizona,Pub 4yr,Public 4yr,HSI,HSI
20,160,279,Tucson,5318313.0,United States,US,grid.134563.6,University of Arizona,"Program in Applied Mathematics, University of ...",Arizona,...,male,0.0,High income,4.0,University of Arizona (The),University of Arizona,Pub 4yr,Public 4yr,HSI,HSI
21,161,280,Tucson,5318313.0,United States,US,grid.134563.6,University of Arizona,"Program in Applied Mathematics, University of ...",Arizona,...,female,1.0,High income,4.0,University of Arizona (The),University of Arizona,Pub 4yr,Public 4yr,HSI,HSI
22,162,281,Tucson,5318313.0,United States,US,grid.134563.6,University of Arizona,"Program in Applied Mathematics, University of ...",Arizona,...,female,1.0,High income,4.0,University of Arizona (The),University of Arizona,Pub 4yr,Public 4yr,HSI,HSI
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7624,25602,3560,Philadelphia,4560349.0,United States,US,grid.25879.31,University of Pennsylvania,Authors' Affiliations: 1Division of Oncology a...,Pennsylvania,...,female,1.0,High income,4.0,Cheyney University of Pennsylvania,Cheyney University of Pennsylvania,Pub 4yr,Public 4yr,HBCU,HBCU
7627,25607,3565,Philadelphia,4560349.0,United States,US,grid.25879.31,University of Pennsylvania,Authors' Affiliations: 1Division of Oncology a...,Pennsylvania,...,male,0.0,High income,4.0,Cheyney University of Pennsylvania,Cheyney University of Pennsylvania,Pub 4yr,Public 4yr,HBCU,HBCU
7630,25612,3570,Philadelphia,4560349.0,United States,US,grid.25879.31,University of Pennsylvania,Authors' Affiliations: 1Division of Oncology a...,Pennsylvania,...,male,0.0,High income,4.0,Cheyney University of Pennsylvania,Cheyney University of Pennsylvania,Pub 4yr,Public 4yr,HBCU,HBCU
7633,25617,3575,Philadelphia,4560349.0,United States,US,grid.25879.31,University of Pennsylvania,Authors' Affiliations: 1Division of Oncology a...,Pennsylvania,...,male,0.0,High income,4.0,Cheyney University of Pennsylvania,Cheyney University of Pennsylvania,Pub 4yr,Public 4yr,HBCU,HBCU


#### following are the affiliation datasets

In [79]:
mimicAffils

Unnamed: 0.1,Unnamed: 0,aff_city,aff_city_id,aff_country,aff_country_code,aff_id,aff_name,aff_raw_affiliation,aff_state,aff_state_code,pub_id,researcher_id,first_name,last_name,year,gender,gender_ints,income_class,income_class_num
0,0,Guangzhou,1809858.0,China,CN,grid.412601.0,First Affiliated Hospital of Jinan University,"Department of Neurology, The First Affiliated ...",,,pub.1154144039,ur.014116573320.23,Xiaxuan,Huang,2022,male,0.0,Upper middle income,3.0
1,1,Guangzhou,1809858.0,China,CN,grid.412601.0,First Affiliated Hospital of Jinan University,"Department of Neurology, The First Affiliated ...",,,pub.1154144039,ur.016241001520.36,Shiqi,Yuan,2022,female,1.0,Upper middle income,3.0
2,2,Guangzhou,1809858.0,China,CN,grid.412601.0,First Affiliated Hospital of Jinan University,"Department of Neurology, The First Affiliated ...",,,pub.1154144039,ur.013553641416.27,Yitong,Ling,2022,female,1.0,Upper middle income,3.0
3,3,Guangzhou,1809858.0,China,CN,grid.412601.0,First Affiliated Hospital of Jinan University,"Department of Neurology, The First Affiliated ...",,,pub.1154144039,ur.014351222016.81,Shanyuan,Tan,2022,female,1.0,Upper middle income,3.0
4,4,Guangzhou,1809858.0,China,CN,grid.412601.0,First Affiliated Hospital of Jinan University,"Department of Clinical Research, The First Aff...",,,pub.1154144039,ur.014227217615.79,Tao,Huang,2022,male,0.0,Upper middle income,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13962,13962,Borstel,2945988.0,Germany,DE,grid.418187.3,Research Center Borstel - Leibniz Lung Center,"Division of Structural Biochemistry, Departmen...",,,pub.1008209435,ur.012014630262.69,Anthony P.,Moran,2010,male,0.0,High income,4.0
13963,13963,Fort Collins,5577147.0,United States,US,grid.47894.36,Colorado State University,"Department of Microbiology, Immunology and Pat...",Colorado,US-CO,pub.1008209435,ur.012014630262.69,Anthony P.,Moran,2010,male,0.0,High income,4.0
13964,13964,Brisbane,2174003.0,Australia,AU,grid.1022.1,Griffith University,"Institute for Glycomics, Gold Coast Campus, Gr...",Queensland,AU-QLD,pub.1008209435,ur.012014630262.69,Anthony P.,Moran,2010,male,0.0,High income,4.0
13965,13965,Galway,2964180.0,Ireland,IE,grid.6142.1,"National University of Ireland, Galway","School of Natural Sciences, National Universit...",,,pub.1008209435,ur.012014630262.69,Anthony P.,Moran,2010,male,0.0,High income,4.0


In [80]:
controlAffils

Unnamed: 0.1,Unnamed: 0,aff_city,aff_city_id,aff_country,aff_country_code,aff_id,aff_name,aff_raw_affiliation,aff_state,aff_state_code,pub_id,researcher_id,first_name,last_name,year,gender,gender_ints,income_class,income_class_num
0,6,Leuven,2792482.0,Belgium,BE,grid.15762.37,Imec,"Exascience Life Lab, imec, Leuven, Belgium",,,pub.1153863516,ur.015256430001.18,Ellie,D’Hondt,2022,female,1.0,High income,4.0
1,7,Leuven,2792482.0,Belgium,BE,grid.15762.37,Imec,"Exascience Life Lab, imec, Leuven, Belgium",,,pub.1153863516,ur.010363506743.89,Thomas J.,Ashby,2022,male,0.0,High income,4.0
2,8,Leuven,2792482.0,Belgium,BE,grid.15762.37,Imec,"Exascience Life Lab, imec, Leuven, Belgium",,,pub.1153863516,ur.012727306525.02,Imen,Chakroun,2022,female,1.0,High income,4.0
3,9,Leuven,2792482.0,Belgium,BE,,"Independent Consultant, Leuven, Belgium","Independent Consultant, Leuven, Belgium",Flanders,,pub.1153863516,,Thomas,Koninckx,2022,male,0.0,High income,4.0
4,10,Leuven,2792482.0,Belgium,BE,grid.15762.37,Imec,"Exascience Life Lab, imec, Leuven, Belgium",,,pub.1153863516,ur.012704525025.77,Roel,Wuyts,2022,male,0.0,High income,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25671,3629,Amsterdam,2759794.0,Netherlands,NL,grid.5650.6,Academic Medical Center,"Academic Medical Center, Department of Urology...",,,pub.1024393212,ur.011026565017.96,Theo M.,de Reijke,2010,male,0.0,High income,4.0
25672,3630,Amsterdam,2759794.0,Netherlands,NL,grid.5650.6,Academic Medical Center,"Academic Medical Center Amsterdam, Department ...",,,pub.1024393212,ur.01015326152.89,Mike,Visser,2010,male,0.0,High income,4.0
25673,3631,Amsterdam,2759794.0,Netherlands,NL,grid.5650.6,Academic Medical Center,"Academic Medical Center, Department of Urology...",,,pub.1024393212,ur.015161315017.52,Jean J. M. C. H.,de la Rosette,2010,male,0.0,High income,4.0
25674,3632,Enschede,2756071.0,Netherlands,NL,grid.6214.1,University of Twente,"University of Twente, MIRA Institute for Biome...",,,pub.1024393212,ur.01052046002.44,Ton G.,van Leeuwen,2010,male,0.0,High income,4.0


#### the following is used to find which institutions are affiliated MSIs (after cleaning)

In [None]:
# Institution names from the manually cleaned compare (MIMIC) count table, as a plain list.
mimicList = remMIMIC.Institution.values.tolist()

In [82]:
# Institution names from the manually cleaned control count table, as a plain list.
controlList = remControl.Institution.values.tolist()

In [83]:
# Compare rows whose matched 2020 MSI name appears in the cleaned institution list.
mimInsti = mimic20.loc[mimic20['isMinority_2020'].isin(mimicList)]

In [84]:
mimInsti

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,aff_city,aff_city_id,aff_country,aff_country_code,aff_id,aff_name,aff_raw_affiliation,aff_state,...,gender,gender_ints,income_class,income_class_num,isMinority_2020,isMinority_2022,Type2020,Type2022,MSI Type2020,MSI Type2022
4,230,230,Tempe,5317058.0,United States,US,grid.215654.1,Arizona State University,"Fulton Schools of Engineering, Arizona State U...",Arizona,...,male,0.0,High income,4.0,Arizona State University,Not Minority,Pub 4yr,Not Minority,HSI,Not Minority
77,704,704,Davis,5341704.0,United States,US,grid.27860.3b,"University of California, Davis","IFM Lab, Department of Computer Science, Unive...",California,...,male,0.0,High income,4.0,"University of California, Davis",Not Minority,Pub 4yr,Not Minority,AANAPISI,Not Minority
78,707,707,Chicago,4887398.0,United States,US,grid.185648.6,University of Illinois at Chicago,"Computer Science Department, University of Ill...",Illinois,...,male,0.0,High income,4.0,University of Illinois at Chicago,Not Minority,Pub 4yr,Not Minority,AANAPISI & HSI,Not Minority
204,1611,1611,Irvine,5359777.0,United States,US,grid.266093.8,"University of California, Irvine",University of California Irvine,California,...,male,0.0,High income,4.0,"University of California, Irvine",Not Minority,Pub 4yr,Not Minority,AANAPISI & HSI,Not Minority
207,1614,1614,Irvine,5359777.0,United States,US,grid.266093.8,"University of California, Irvine","University of California, Irvine",California,...,male,0.0,High income,4.0,"University of California, Irvine",Not Minority,Pub 4yr,Not Minority,AANAPISI & HSI,Not Minority
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3562,13802,13802,Seattle,5809844.0,United States,US,grid.34477.33,University of Washington,"Department of Epidemiology, University of Wash...",Washington,...,male,0.0,High income,4.0,University of Washington - Seattle,University of Washington-Tacoma Campus,Pub 4yr,Public 4yr,AANAPISI,AANAPISI
3563,13803,13803,Seattle,5809844.0,United States,US,grid.34477.33,University of Washington,"Department of Epidemiology, University of Wash...",Washington,...,male,0.0,High income,4.0,University of Washington - Seattle,University of Washington-Tacoma Campus,Pub 4yr,Public 4yr,AANAPISI,AANAPISI
3595,13862,13862,Seattle,5809844.0,United States,US,grid.34477.33,University of Washington,"Institute for health metrics and evaluation, U...",Washington,...,female,1.0,High income,4.0,University of Washington - Seattle,University of Washington-Tacoma Campus,Pub 4yr,Public 4yr,AANAPISI,AANAPISI
3596,13864,13864,Seattle,5809844.0,United States,US,grid.34477.33,University of Washington,"Institute for health metrics and evaluation, U...",Washington,...,female,1.0,High income,4.0,University of Washington - Seattle,University of Washington-Tacoma Campus,Pub 4yr,Public 4yr,AANAPISI,AANAPISI


In [85]:
# Compare rows whose matched 2022 MSI name appears in the cleaned institution list.
mimInsti2 = mimic22.loc[mimic22['isMinority_2022'].isin(mimicList)]

In [86]:
mimInsti2

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,aff_city,aff_city_id,aff_country,aff_country_code,aff_id,aff_name,aff_raw_affiliation,aff_state,...,gender,gender_ints,income_class,income_class_num,isMinority_2020,isMinority_2022,Type2020,Type2022,MSI Type2020,MSI Type2022
259,1817,1817,Winston-Salem,4499612.0,United States,US,grid.268294.3,Winston-Salem State University,"Department of Computer Science, Winston-Salem ...",North Carolina,...,male,0.0,High income,4.0,Winston-Salem State University,Winston-Salem State University,Pub 4yr,Public 4yr,HBCU,HBCU
260,1818,1818,Winston-Salem,4499612.0,United States,US,grid.268294.3,Winston-Salem State University,"Department of Computer Science, Winston-Salem ...",North Carolina,...,,,High income,4.0,Winston-Salem State University,Winston-Salem State University,Pub 4yr,Public 4yr,HBCU,HBCU
261,1819,1819,Winston-Salem,4499612.0,United States,US,grid.268294.3,Winston-Salem State University,"Center for Applied Data Science (CADS), Winsto...",North Carolina,...,,,High income,4.0,Winston-Salem State University,Winston-Salem State University,Pub 4yr,Public 4yr,HBCU,HBCU
383,2502,2502,Austin,4671654.0,United States,US,grid.89336.37,The University of Texas at Austin,"The University of Texas at Austin, Austin, TX,...",Texas,...,male,0.0,High income,4.0,Not Minority,The University of Texas at Austin,Not Minority,Public 4yr,Not Minority,AANAPISI
384,2503,2503,Austin,4671654.0,United States,US,grid.89336.37,The University of Texas at Austin,"The University of Texas at Austin, Austin, TX,...",Texas,...,male,0.0,High income,4.0,Not Minority,The University of Texas at Austin,Not Minority,Public 4yr,Not Minority,AANAPISI
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3017,12763,12763,Miami,4164138.0,United States,US,grid.65456.34,Florida International University,Department of Electrical and Computer Engineer...,Florida,...,female,1.0,High income,4.0,Florida International University,Florida International University,Pub 4yr,Public 4yr,HSI,HSI
3018,12764,12764,Miami,4164138.0,United States,US,grid.65456.34,Florida International University,"School of Computing and Information Sciences, ...",Florida,...,female,1.0,High income,4.0,Florida International University,Florida International University,Pub 4yr,Public 4yr,HSI,HSI
3019,12765,12765,Miami,4164138.0,United States,US,grid.65456.34,Florida International University,"School of Computing and Information Sciences, ...",Florida,...,female,1.0,High income,4.0,Florida International University,Florida International University,Pub 4yr,Public 4yr,HSI,HSI
3020,12766,12766,Miami,4164138.0,United States,US,grid.65456.34,Florida International University,"School of Computing and Information Sciences, ...",Florida,...,male,0.0,High income,4.0,Florida International University,Florida International University,Pub 4yr,Public 4yr,HSI,HSI


In [87]:
# Control rows whose matched 2020 MSI name appears in the cleaned institution list.
controlInsti = control20.loc[control20['isMinority_2020'].isin(controlList)]

In [88]:
controlInsti

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,aff_city,aff_city_id,aff_country,aff_country_code,aff_id,aff_name,aff_raw_affiliation,aff_state,...,gender,gender_ints,income_class,income_class_num,isMinority_2020,isMinority_2022,Type2020,Type2022,MSI Type2020,MSI Type2022
18,158,277,Tucson,5318313.0,United States,US,grid.134563.6,University of Arizona,"Department of Surgery, University of Arizona, ...",Arizona,...,female,1.0,High income,4.0,University of Arizona (The),University of Arizona,Pub 4yr,Public 4yr,HSI,HSI
19,159,278,Tucson,5318313.0,United States,US,grid.134563.6,University of Arizona,"Program in Applied Mathematics, University of ...",Arizona,...,female,1.0,High income,4.0,University of Arizona (The),University of Arizona,Pub 4yr,Public 4yr,HSI,HSI
20,160,279,Tucson,5318313.0,United States,US,grid.134563.6,University of Arizona,"Program in Applied Mathematics, University of ...",Arizona,...,male,0.0,High income,4.0,University of Arizona (The),University of Arizona,Pub 4yr,Public 4yr,HSI,HSI
21,161,280,Tucson,5318313.0,United States,US,grid.134563.6,University of Arizona,"Program in Applied Mathematics, University of ...",Arizona,...,female,1.0,High income,4.0,University of Arizona (The),University of Arizona,Pub 4yr,Public 4yr,HSI,HSI
22,162,281,Tucson,5318313.0,United States,US,grid.134563.6,University of Arizona,"Program in Applied Mathematics, University of ...",Arizona,...,female,1.0,High income,4.0,University of Arizona (The),University of Arizona,Pub 4yr,Public 4yr,HSI,HSI
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7546,25231,3189,Minneapolis,5037649.0,United States,US,grid.17635.36,University of Minnesota,"The Hormel Institute, University of Minnesota,...",Minnesota,...,male,0.0,High income,4.0,University of Minnesota - Twin Cities,University of Minnesota-Morris,Pub 4yr,Public 4yr,AANAPISI,NASNTI
7547,25232,3190,Minneapolis,5037649.0,United States,US,grid.17635.36,University of Minnesota,"The Hormel Institute, University of Minnesota,...",Minnesota,...,male,0.0,High income,4.0,University of Minnesota - Twin Cities,University of Minnesota-Morris,Pub 4yr,Public 4yr,AANAPISI,NASNTI
7548,25233,3191,Minneapolis,5037649.0,United States,US,grid.17635.36,University of Minnesota,"The Hormel Institute, University of Minnesota,...",Minnesota,...,female,1.0,High income,4.0,University of Minnesota - Twin Cities,University of Minnesota-Morris,Pub 4yr,Public 4yr,AANAPISI,NASNTI
7550,25235,3193,Minneapolis,5037649.0,United States,US,grid.17635.36,University of Minnesota,"The Hormel Institute, University of Minnesota,...",Minnesota,...,female,1.0,High income,4.0,University of Minnesota - Twin Cities,University of Minnesota-Morris,Pub 4yr,Public 4yr,AANAPISI,NASNTI


In [89]:
# Control rows whose matched 2022 MSI name appears in the cleaned institution list.
controlInsti2 = control22.loc[control22['isMinority_2022'].isin(controlList)]

In [90]:
controlInsti2

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,aff_city,aff_city_id,aff_country,aff_country_code,aff_id,aff_name,aff_raw_affiliation,aff_state,...,gender,gender_ints,income_class,income_class_num,isMinority_2020,isMinority_2022,Type2020,Type2022,MSI Type2020,MSI Type2022
694,2726,3224,Loma Linda,5367696.0,United States,US,grid.411390.e,Loma Linda University Medical Center,"Faculty of School of Medicine, Loma Linda Univ...",California,...,male,0.0,High income,4.0,Not Minority,Loma Linda University,Not Minority,Private 4yr,Not Minority,AANAPISI
805,3043,3541,Loma Linda,5367696.0,United States,US,grid.411390.e,Loma Linda University Medical Center,"Department of Anesthesiology, Loma Linda Unive...",California,...,male,0.0,High income,4.0,Not Minority,Loma Linda University,Not Minority,Private 4yr,Not Minority,AANAPISI
832,3155,3653,Orlando,4167147.0,United States,US,grid.170430.1,University of Central Florida,"UCF College of Medicine, 6850 Lake Nona Blvd, ...",Florida,...,male,0.0,High income,4.0,University of Central Florida,University of Central Florida,Pub 4yr,Public 4yr,HSI,HSI
1797,6437,2921,New York,5128581.0,United States,US,grid.264091.8,St. John's University,"From the Department of Pathology (F.S.A.), Yal...",New York,...,male,0.0,High income,4.0,Not Minority,St. John's University-New York,Not Minority,Private 4yr,Not Minority,AANAPISI
2319,8411,902,Austin,4671654.0,United States,US,grid.89336.37,The University of Texas at Austin,"Department of Biomedical Engineering, The Univ...",Texas,...,,,High income,4.0,Not Minority,The University of Texas at Austin,Not Minority,Public 4yr,Not Minority,AANAPISI
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7525,25026,2984,Storrs,4843786.0,United States,US,grid.63054.34,University of Connecticut,"Department of Computer Science & Engineering, ...",Connecticut,...,male,0.0,High income,4.0,University of Connecticut - Hartford Campus,University of Connecticut-Hartford Campus,Pub 4yr,Public 4yr,AANAPISI,AANAPISI
7526,25027,2985,Storrs,4843786.0,United States,US,grid.63054.34,University of Connecticut,"Department of Computer Science & Engineering, ...",Connecticut,...,,,High income,4.0,University of Connecticut - Hartford Campus,University of Connecticut-Hartford Campus,Pub 4yr,Public 4yr,AANAPISI,AANAPISI
7584,25541,3499,Austin,4671654.0,United States,US,grid.89336.37,The University of Texas at Austin,"Institute of Cellular and Molecular Biology, D...",Texas,...,male,0.0,High income,4.0,Not Minority,The University of Texas at Austin,Not Minority,Public 4yr,Not Minority,AANAPISI
7585,25542,3500,Austin,4671654.0,United States,US,grid.89336.37,The University of Texas at Austin,"Institute of Cellular and Molecular Biology, D...",Texas,...,female,1.0,High income,4.0,Not Minority,The University of Texas at Austin,Not Minority,Public 4yr,Not Minority,AANAPISI


In [99]:
# Concatenate the 2020 and 2022 matches for control and compare into one frame
# each, renumbering rows from 0 (ignore_index=True). A row selected by both the
# 2020 and the 2022 filter appears twice in the result.
controlMaster = pd.concat([controlInsti, controlInsti2], ignore_index = True)
mimicMaster = pd.concat([mimInsti, mimInsti2], ignore_index = True)

In [100]:
# Build a "first last" name string per row for both cohorts.
# Both frames are cast to str before joining so that a missing first or last
# name becomes the text "nan" instead of making ' '.join raise a TypeError
# (the compare-side line previously lacked this cast, unlike the control side).
mimicMaster['full_name'] = mimicMaster[['first_name', 'last_name']].astype(str).apply(lambda x: ' '.join(x), axis=1)
controlMaster['full_name'] = controlMaster[['first_name', 'last_name']].astype(str).apply(lambda x: ' '.join(x), axis=1)

In [101]:
# Reduce each cohort to one row per distinct full_name, keeping the first
# occurrence (pandas' default), to count distinct MSI-affiliated authors.
# NOTE(review): different people who share the same first and last name are
# collapsed into a single row by this step.
mimicMaster = mimicMaster.drop_duplicates(subset=['full_name'])
controlMaster = controlMaster.drop_duplicates(subset=['full_name'])

In [102]:
mimicMaster['gender'].value_counts()

male      135
female     35
Name: gender, dtype: int64

In [103]:
def isfloat(num):
    """Return True if ``float(num)`` succeeds, otherwise False.

    Parameters
    ----------
    num : object
        Any value; typically a string or number.

    Returns
    -------
    bool
        False for unparsable text (ValueError), for values float() cannot
        accept at all such as None or a list (TypeError), and for integers
        too large to represent as a float (OverflowError).
    """
    try:
        float(num)
    except (TypeError, ValueError, OverflowError):
        # Previously only ValueError was caught, so None or a list raised
        # instead of returning False.
        return False
    return True

In [104]:
# "first last" name string per row (names cast to str first); used as the
# lookup key for gender below.
controlGender['full_name'] = controlGender[['first_name', 'last_name']].astype(str).apply(' '.join, axis=1)

In [105]:
def genderFind(x):
    """Look up the gender recorded for the author name ``x``.

    Scans the notebook-level ``controlGender`` frame for rows whose
    ``full_name`` equals ``x`` and returns the ``gender`` value of the first
    such row. When no row matches, returns the string ``'None'`` (text, not
    the ``None`` object).
    """
    matches = controlGender.loc[controlGender['full_name'] == x, 'gender']
    if matches.empty:
        return 'None'
    return matches.iloc[0]

In [106]:
controlGender['gender'].dropna()

0        male
1        male
2        male
3        male
4        male
         ... 
26820    male
26821    male
26822    male
26823    male
26824    male
Name: gender, Length: 23004, dtype: object

In [107]:
# Overwrite gender with the value looked up by full_name via genderFind;
# names with no match in controlGender receive the string 'None'.
controlMaster['gender'] = controlMaster['full_name'].apply(genderFind)

In [109]:
# The control affiliation dataset I originally used did not include genders, so the
# genders were looked up from another dataset that has them for these authors.

controlMaster['gender'].value_counts()

male      163
female     86
Name: gender, dtype: int64

### Gender Results:
* MIMIC: 170 authors w/ non missing gender; 135 male; 35 female
* Control: 249 authors w/ non missing gender; 163 male; 86 female

# First vs Last Authors Affiliated w/ MSIs

In [120]:
# Lists of authors affiliated with MSIs (after manual verification).
# NOTE(review): these are absolute paths on one user's machine, whereas the
# loads at the top of the notebook use the relative "GuruData/" folder; the
# notebook will not run elsewhere until these point at a shared location.
mimic = pd.read_csv("/Users/gurucharanlingamallu/Documents/MIT Work/mimicAuthors(d).csv")
control = pd.read_csv("/Users/gurucharanlingamallu/Documents/MIT Work/controlAuthors(d).csv")

# Per-paper author tables; used below to find each author's position in the
# author sequence of a research paper.
conAuth = pd.read_csv("/Users/gurucharanlingamallu/Downloads/control_group_auths_v2.csv")
mimAuth = pd.read_csv("/Users/gurucharanlingamallu/Downloads/mimic_auths_v2.csv")

In [121]:
# Default position label for every compare row; the labelling cell below
# overwrites it with 'First' or 'Last' where applicable.
mimAuth['authorSeq'] = 'Any'

In [122]:
# Default position label for every control row; the labelling cell below
# overwrites it with 'First' or 'Last' where applicable.
conAuth['authorSeq'] = 'Any'

In [123]:
# Adding sequence of authors for control.
# A row is 'First' when its pub_id differs from the row above (or it is the
# very first row), otherwise 'Last' when its pub_id differs from the row below
# (or it is the very last row). Every other row keeps its existing label.
# Because the 'First' test wins, a single-author paper is labelled 'First',
# including when it is the final row of the table.
# Works for any number of rows (no hard-coded row count) and compares whole
# columns at once instead of looping with per-row .loc lookups.
# Assumes rows of the same paper are adjacent and in author order -- TODO confirm.
control_pub_ids = conAuth['pub_id']
control_starts_paper = control_pub_ids.ne(control_pub_ids.shift(1))   # shift leaves NaN at the top -> True
control_ends_paper = control_pub_ids.ne(control_pub_ids.shift(-1))    # shift leaves NaN at the bottom -> True
conAuth.loc[control_ends_paper & ~control_starts_paper, 'authorSeq'] = 'Last'
conAuth.loc[control_starts_paper, 'authorSeq'] = 'First'

In [124]:
# Adding sequence of authors for compare.
# A row is 'First' when its pub_id differs from the row above (or it is the
# very first row), otherwise 'Last' when its pub_id differs from the row below
# (or it is the very last row). Every other row keeps its existing label.
# Fix: the final row used to be stamped 'First' unconditionally, so the last
# author of the last multi-author paper was mislabelled; it now follows the
# same rule as every other row ('First' only if it starts a new paper).
# Works for any number of rows (no hard-coded row count) and compares whole
# columns at once instead of looping with per-row .loc lookups.
# Assumes rows of the same paper are adjacent and in author order -- TODO confirm.
compare_pub_ids = mimAuth['pub_id']
compare_starts_paper = compare_pub_ids.ne(compare_pub_ids.shift(1))   # shift leaves NaN at the top -> True
compare_ends_paper = compare_pub_ids.ne(compare_pub_ids.shift(-1))    # shift leaves NaN at the bottom -> True
mimAuth.loc[compare_ends_paper & ~compare_starts_paper, 'authorSeq'] = 'Last'
mimAuth.loc[compare_starts_paper, 'authorSeq'] = 'First'


In [125]:
# Add a full_name column ("first last", both parts cast to str) to each
# per-paper author table. Some authors don't have a unique researcher id
# associated with them, so the name is used as the mapping key instead.
mimAuth['full_name'] = mimAuth[['first_name', 'last_name']].astype(str).apply(' '.join, axis=1)
conAuth['full_name'] = conAuth[['first_name', 'last_name']].astype(str).apply(' '.join, axis=1)

In [126]:
#function for mapping MSI affiliated authors with their correct position
def authFindC(x):
    """Return the author-position label recorded in ``conAuth`` for full name ``x``.

    The label comes from the first ``conAuth`` row whose ``full_name`` equals ``x``;
    when no row matches, ``'Any'`` is returned.

    NOTE(review): matching is by name only, so an author who appears on several
    publications always receives the label from their first row.
    """
    labels = conAuth.loc[conAuth['full_name'] == x, 'authorSeq']
    if labels.empty:
        return 'Any'
    return labels.iloc[0]

def authFindM(x):
    """Return the author-position label recorded in ``mimAuth`` for full name ``x``.

    The label comes from the first ``mimAuth`` row whose ``full_name`` equals ``x``;
    when no row matches, ``'Any'`` is returned.

    NOTE(review): matching is by name only, so an author who appears on several
    publications always receives the label from their first row.
    """
    labels = mimAuth.loc[mimAuth['full_name'] == x, 'authorSeq']
    if labels.empty:
        return 'Any'
    return labels.iloc[0]

### Results

In [127]:
# Look up each MSI-affiliated control author's position label by full name (authFindC).
# NOTE(review): this reads control['full_name'] straight after the CSV load, so the column
# presumably already exists in the file -- confirm.
control['authorSeq'] = control['full_name'].apply(authFindC)

In [128]:
# Number of MSI-affiliated control authors per position label.
# 'Any' also includes names that authFindC could not find in conAuth.
control['authorSeq'].value_counts()

Any      182
Last      56
First     48
Name: authorSeq, dtype: int64

In [129]:
# Look up each MSI-affiliated MIMIC author's position label by full name (authFindM).
# NOTE(review): this reads mimic['full_name'] straight after the CSV load, so the column
# presumably already exists in the file -- confirm.
mimic['authorSeq'] = mimic['full_name'].apply(authFindM)

In [130]:
# Number of MSI-affiliated MIMIC authors per position label.
# 'Any' also includes names that authFindM could not find in mimAuth.
mimic['authorSeq'].value_counts()

Any      126
First     36
Last      28
Name: authorSeq, dtype: int64

# T-Tests

In [131]:
# Author affiliation datasets for the whole control and MIMIC populations.
# NOTE(review): the file names match the ones loaded from "GuruData/" at the top of the
# notebook, but these are absolute paths into a Downloads folder -- presumably the same
# files; confirm and point both loads at one location.
controlT = pd.read_csv("/Users/gurucharanlingamallu/Downloads/control_group_affils(7).csv", encoding='utf-8')
mimicT = pd.read_csv("/Users/gurucharanlingamallu/Downloads/mimic_affils(7).csv", encoding='utf-8')

In [132]:
# Lists of authors affiliated with MSIs and their corresponding information (after manual verification).
# These are the same paths read in the "First vs Last Authors" section; re-reading rebinds
# `mimic` and `control` to fresh frames without the 'authorSeq' column added there.
mimic = pd.read_csv("/Users/gurucharanlingamallu/Documents/MIT Work/mimicAuthors(d).csv")
control = pd.read_csv("/Users/gurucharanlingamallu/Documents/MIT Work/controlAuthors(d).csv")

In [133]:
# (Re)build 'full_name' as "<first_name> <last_name>" for the MSI author lists so it is
# constructed the same way as in the population-wide frames it is matched against.
control['full_name'] = control[['first_name', 'last_name']].astype(str).agg(' '.join, axis=1)
mimic['full_name'] = mimic[['first_name', 'last_name']].astype(str).agg(' '.join, axis=1)

In [134]:
# Plain Python list of MSI-affiliated control author names, used for membership tests by isMinCon.
controlList = control['full_name'].tolist()

In [135]:
# Plain Python list of MSI-affiliated MIMIC author names, used for membership tests by isMinMim.
mimicList = mimic['full_name'].tolist()

In [136]:
# Same "<first_name> <last_name>" key on the full control population, for matching against controlList.
controlT['full_name'] = controlT[['first_name', 'last_name']].astype(str).agg(' '.join, axis=1)

In [137]:
# Same "<first_name> <last_name>" key on the full MIMIC population, for matching against mimicList.
mimicT['full_name'] = mimicT[['first_name', 'last_name']].astype(str).agg(' '.join, axis=1)

In [138]:
#mapping the list of authors affiliated with MSIs to the overall datasets in order to do t-tests on the entire author population
def isMinCon(x):
    """Return 1 if full name ``x`` is in ``controlList`` (MSI-affiliated control authors), else 0."""
    return int(x in controlList)
def isMinMim(x):
    """Return 1 if full name ``x`` is in ``mimicList`` (MSI-affiliated MIMIC authors), else 0."""
    return int(x in mimicList)

In [139]:
# 1/0 flag per control row: is this author's full name on the verified MSI list?
controlT['minority'] = controlT['full_name'].map(isMinCon)

In [140]:
# 1/0 flag per MIMIC row: is this author's full name on the verified MSI list?
mimicT['minority'] = mimicT['full_name'].map(isMinMim)

### T-Test for MSI Author Distribution

In [142]:
from scipy.stats import ttest_ind

def get_ttest_any(df):
    """Return, per ``pub_id``, whether any row of that publication has a truthy ``minority`` value.

    The result is a boolean Series indexed by ``pub_id``.
    """
    per_publication = df.groupby("pub_id")["minority"]
    return per_publication.any()

# One-sided two-sample t-test on the per-publication "has an MSI-affiliated author" indicators.
# alternative='greater' tests whether the MIMIC mean (first sample) exceeds the control mean.
# equal_var is not passed, so SciPy's default applies.
print(f"t-test: minority {ttest_ind(get_ttest_any(mimicT), get_ttest_any(controlT), alternative= 'greater')}")

t-test: minority Ttest_indResult(statistic=5.125750776843266, pvalue=1.5432410937220337e-07)


### T-Test for First vs Last Author Distribution

In [143]:
# Default every control row to 'Any'; the sequencing cell below overwrites first/last rows.
controlT['authorSeq'] = 'Any'

In [144]:
# Default every MIMIC row to 'Any'; the sequencing cell below overwrites first/last rows.
mimicT['authorSeq'] = 'Any'

In [145]:
#adding author sequence to overall datasets
# Tag each row of mimicT and controlT with the author's position inside its publication.
# Assumes rows sharing a pub_id are contiguous and in author order, one row per author -- TODO confirm.
# The previous loops labelled the final row of each frame 'First' unconditionally, which
# miscounted the last author of the last publication as a first author in the t-tests below.
for frame in (mimicT, controlT):
    pub_ids = frame['pub_id']
    starts_pub = pub_ids.ne(pub_ids.shift(1))    # True on row 0 and wherever pub_id differs from the previous row
    ends_pub = pub_ids.ne(pub_ids.shift(-1))     # True on the final row and wherever pub_id differs from the next row
    # A sole author is both first and last; 'First' takes precedence, as in the original if/elif chain.
    frame.loc[ends_pub & ~starts_pub, 'authorSeq'] = 'Last'
    frame.loc[starts_pub, 'authorSeq'] = 'First'

        

In [149]:
# firstSeq = 1 when the row is an MSI-affiliated author in first position, else 0.
mimicT['firstSeq'] = (mimicT['minority'].eq(1) & mimicT['authorSeq'].eq('First')).astype(int)
controlT['firstSeq'] = (controlT['minority'].eq(1) & controlT['authorSeq'].eq('First')).astype(int)

In [150]:
# lastSeq = 1 when the row is an MSI-affiliated author in last position, else 0.
mimicT['lastSeq'] = (mimicT['minority'].eq(1) & mimicT['authorSeq'].eq('Last')).astype(int)
controlT['lastSeq'] = (controlT['minority'].eq(1) & controlT['authorSeq'].eq('Last')).astype(int)

In [152]:
#first authors
def get_ttest_any2(df):
    """Return, per ``pub_id``, whether any row of that publication has a truthy ``firstSeq`` value.

    The result is a boolean Series indexed by ``pub_id``.
    """
    per_publication = df.groupby("pub_id")["firstSeq"]
    return per_publication.any()

# One-sided t-test (MIMIC > control) on "publication has an MSI-affiliated first author".
# NOTE(review): the printed label still says "minority" although this is the first-author test.
print(f"t-test: minority {ttest_ind(get_ttest_any2(mimicT), get_ttest_any2(controlT), alternative= 'greater')}")

t-test: minority Ttest_indResult(statistic=2.969446562475738, pvalue=0.0014994760337273376)


In [153]:
#last authors
def get_ttest_any3(df):
    """Return, per ``pub_id``, whether any row of that publication has a truthy ``lastSeq`` value.

    The result is a boolean Series indexed by ``pub_id``.
    """
    per_publication = df.groupby("pub_id")["lastSeq"]
    return per_publication.any()

# One-sided t-test (MIMIC > control) on "publication has an MSI-affiliated last author".
# NOTE(review): the printed label still says "minority" although this is the last-author test.
print(f"t-test: minority {ttest_ind(get_ttest_any3(mimicT), get_ttest_any3(controlT), alternative= 'greater')}")

t-test: minority Ttest_indResult(statistic=1.7684355004194527, pvalue=0.038527582218135036)


### T-Test for Gender Distribution

In [154]:
# isFemale is True when the row is an MSI-affiliated author whose gender is recorded as 'female'.
# Rows with any other or missing gender value are False.
controlT['isFemale'] = controlT['minority'].eq(1) & controlT['gender'].eq('female')
mimicT['isFemale'] = mimicT['minority'].eq(1) & mimicT['gender'].eq('female')

In [156]:
def get_ttest_any4(df):
    """Return, per ``pub_id``, whether any row of that publication has a truthy ``isFemale`` value.

    The result is a boolean Series indexed by ``pub_id``.
    """
    per_publication = df.groupby("pub_id")["isFemale"]
    return per_publication.any()

# One-sided t-test (MIMIC > control) on "publication has a female MSI-affiliated author".
# NOTE(review): the printed label still says "minority" although this is the gender test.
print(f"t-test: minority {ttest_ind(get_ttest_any4(mimicT), get_ttest_any4(controlT), alternative= 'greater')}")

t-test: minority Ttest_indResult(statistic=1.099853909556267, pvalue=0.1357270025390229)
