# Import required libraries

In [1]:
# import basic libraries
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

# import libraries for machine learning models
from sklearn.model_selection import train_test_split
from skmultilearn.problem_transform import ClassifierChain
from sklearn.ensemble import RandomForestClassifier

# import libraries to solve LP
from pulp import *

# Output $p_i$ and $q_i$ for each candidate i $\in$ [n]

We process the data and train the selected algorithm with the training data. After that, we output $p_i$ and $q_i$ on the testing data.

In [2]:
# read data
df = pd.read_csv('clean_law_school.csv', index_col = 0)

# split data into training and testing part
target = ['admit', 'enroll']
y = df[target]
X = df.drop(target, axis = 1)
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.8, shuffle = True, random_state = 1)

# implement the machine learning model to predict p_i and q_i
# (the forest is seeded, like the split above, so that the predicted
# probabilities are reproducible across kernel restarts)
lg_clf = ClassifierChain(RandomForestClassifier(random_state = 1))
lg_clf.fit(X_train, y_train)
y_pred = lg_clf.predict_proba(X_test)

# attach the true labels and the predicted probabilities to the test rows;
# the probability columns follow the order of `target`: p_i <- 'admit', q_i <- 'enroll'
X_test = pd.merge(X_test, y_test, left_index = True, right_index = True)
X_test[['p_i', 'q_i']] = np.round(y_pred.toarray(), 3)

In APD-S, since we assume there is only one academic unit (e.g., a department) and one admission committee (AC), we only select applicants from colleges with similar interview pass rates and offer acceptance rates for the input instance.

In [3]:
# summary statistics of the 'cpir' column
# (presumably the college pass-interview rate -- confirm against the data dictionary)
X_test['cpir'].describe()

count    22754.000000
mean         0.347415
std          0.238890
min          0.000000
25%          0.192000
50%          0.315000
75%          0.432000
max          1.000000
Name: cpir, dtype: float64

In [4]:
# summary statistics of the 'caor' column
# (presumably the college accept-offer rate -- confirm against the data dictionary)
X_test['caor'].describe()

count    22754.000000
mean         0.356386
std          0.237919
min          0.000000
25%          0.180000
50%          0.285000
75%          0.433000
max          1.000000
Name: caor, dtype: float64

Based on the summary statistics of the colleges' interview pass rates and offer acceptance rates, we select applicants from colleges whose rates both lie between the 25th and 75th percentiles.

In [5]:
# keep only the applicants whose 'caor' and 'cpir' values both fall inside
# the interquartile range (bounds included) of the current test set
caor_low, caor_high = np.percentile(X_test['caor'], [25, 75])
cpir_low, cpir_high = np.percentile(X_test['cpir'], [25, 75])
X_test = X_test[X_test['caor'].between(caor_low, caor_high) &
                X_test['cpir'].between(cpir_low, cpir_high)]

# Generating input instance for APD-S

An input instance of APD can be characterized as $I = ([n], \{p_i, q_i |i \in [n]\}, \{B_g, g|g \in G_I\}, \{b_g, g|g \in G_E\}, \{w_{ig}, \tau_g|g \in G_P, i \in g\})$.

Note: 
- In this dataset, we assume all candidates are qualified for an interview and they will be automatically offered once passing it. Hence, it can be explained why we use the result of *admit* as *passing the interview* and *enroll* as *accepting the offer*.

Input details:
- [n] = 1000 candidates
- $p_i, q_i$ for each i $\in$ [n]
- $G_I$:
    - In APD-S, we have only a single interview-related constraint, which can be captured as {g, B} with g = [n] being the only group in $G_I$. So $G_I$ contains only 1 group with n candidates.
    - In reality, the dataset should miss a lot of candidates that fail to get an interview since it only includes those who qualify for an interview. In this case, we can simulate how colleges with similar acceptance rate (~20-30%) as schools in this data perform. We found [Admissions report of Oxford Law](https://www.law.ox.ac.uk/sites/files/oxlaw/ug_admissions_report_2021.pdf) and identified that we can use its application-to-interview success rate to set the cap on interview-related group g $B_g$ accordingly.
- $G_E$:
    - We have enrollment-related budget constraints {g, $b_g|g \in G_E$}. In this dataset, there are two groups inside $G_E$ which indicate candidates who are in-state and those who are out-of-state.
    - Since the data should not miss any candidate who successfully enrolls, we can set the cap on enrollment-related group g $b_g$ as the actual statistics of the dataset (of those who enroll, who are in-state applicants, who are out-of-state applicants?)
- $G_P$:
    - We capture the collection of protected groups of interest $G_P$ as the combination of the race and gender of the candidates. That means $G_P$ will have 10 groups by combining Race: {Black, Hispanic, Asian, White, Other} and Gender: {Male, Female}.
    - We identify the target quota $\tau_g$ for protected group g using the admission statistics of universities that are known for applying Affirmative Action in their admission process. 
        + Specifically, we would use the statistics of Harvard Law School, known for its [yearly commitment to Affirmative Action in the admission/employment process](https://hr.harvard.edu/files/humanresources/files/reaffirmation_statement.pdf)
        + Based on the [demographics of Harvard Fall 2020 applications](https://www.ilrg.com/rankings/law/view/49), we calculate the percentage of each protected group in the enrollment number and set $\tau_g$ accordingly. 
    - For each candidate $i \in g$ with $g \in G_P$, $w_{ig}$ (the degree of relevance of $i$ to $g$) is taken from the candidate's `es` value in the dataset.

In [6]:
# create 10 random input instances from the testing data set, each with n = 1000 applicants
# (every instance uses its own fixed seed, so the instances differ from each
# other but are reproduced exactly when the notebook is re-run)
n = 1000
test_data = [X_test.sample(n=n, random_state=num).reset_index(drop=True) for num in range(10)]

In [7]:
def generate_input(data, interview_rate=0.3705, target_quota=None):
    """Build the APD-S input instance for one sample of candidates.

    Parameters
    ----------
    data : pandas.DataFrame
        One row per candidate. The columns read here are 'resident', 'gender',
        'black', 'hispanic', 'asian', 'white', 'other_race', 'enroll' and 'es'.
    interview_rate : float, optional
        Fraction of the candidates that may be interviewed; the interview cap
        is ``int(len(data) * interview_rate)``. Defaults to 0.3705.
    target_quota : sequence of float, optional
        Share of the enrolled candidates targeted for each protected group, in
        the same order as ``G_P``. Defaults to
        ``[0.0345, 0.0415, 0.0535, 0.252, 0.1185]`` repeated for both genders.

    Returns
    -------
    tuple
        ``(G_I, G_E, G_P, B_g, b_g, tau_g, w_ig)`` where

        * ``G_I`` is ``data`` itself (the single interview-related group),
        * ``G_E`` is ``[rows with resident == 1, rows with resident == 0]``,
        * ``G_P`` is a list of 10 frames: gender 0 (female) then gender 1
          (male), each crossed with black, hispanic, asian, white, other_race,
        * ``B_g`` is the interview cap (int),
        * ``b_g`` is the list of enrolled counts for the two groups of ``G_E``,
        * ``tau_g`` is an integer array with the target quota of each group in
          ``G_P``,
        * ``w_ig`` is a list with, for each group of ``G_P``, the list of the
          'es' values of its rows (in row order).

    Raises
    ------
    ValueError
        If ``target_quota`` does not have one entry per protected group.

    Notes
    -----
    When rounding makes ``sum(tau_g)`` and ``sum(b_g)`` differ, one randomly
    chosen entry (drawn with ``np.random.randint``) of the smaller side is
    increased by the difference, so the result is random unless numpy's global
    generator is seeded.
    """
    if target_quota is None:
        target_quota = [0.0345, 0.0415, 0.0535, 0.252, 0.1185] * 2

    # collection of interview-related groups
    G_I = data

    # collection of enrollment-related groups
    in_state = (data['resident'] == 1)
    out_of_state = (data['resident'] == 0)
    G_E = [data[in_state], data[out_of_state]]

    # collection of protected groups: every gender x race combination,
    # gender 0 (female) first, then gender 1 (male)
    races = ['black', 'hispanic', 'asian', 'white', 'other_race']
    G_P = [data[(data['gender'] == gender) & (data[race] == 1)]
           for gender in (0, 1) for race in races]

    if len(target_quota) != len(G_P):
        raise ValueError('target_quota needs one entry per protected group '
                         '(expected %d, got %d)' % (len(G_P), len(target_quota)))

    # cap imposed on interview-related group g
    B_g = int(len(data) * interview_rate)

    # cap imposed on enrollment-related group g
    enroll = (data['enroll'] == 1)
    b_g = [int((in_state & enroll).sum()), int((out_of_state & enroll).sum())]

    # target quota for protected group g to achieve
    n_enrolled = int(enroll.sum())
    tau_g = np.array(np.round([n_enrolled * quota for quota in target_quota]), int)

    # if there is a difference in the sum of G_P and G_E due to rounding, randomly increase one group
    # in either G_P or G_E to balance the difference (as both cap on final enrollment)
    difference = int(tau_g.sum()) - sum(b_g)
    if difference > 0:
        index = np.random.randint(0, len(b_g))
        b_g[index] += difference
    elif difference < 0:
        index = np.random.randint(0, len(tau_g))
        tau_g[index] -= difference

    # relevance of i to protected group g: read the whole 'es' column of each
    # group at once instead of materialising every row
    w_ig = [g['es'].tolist() for g in G_P]

    return G_I, G_E, G_P, B_g, b_g, tau_g, w_ig

The objective of the model is $\max \min_{g \in G_P} \left(\sum_{i \in g} w_{ig} y_i q_i / \tau_g\right)$.

We can rewrite it as the following to solve:

max z

s.t. $\quad z \le \sum_{i \in g} w_{ig} y_i q_i / \tau_g \quad$ for all $g \in G_P$

Other constraints will be kept as original.

In [11]:
def solveLP(data):
    """Build and solve the LP relaxation of APD-S for one input instance.

    The model maximises ``z`` subject to

    * ``z <= sum_{i in g} w_ig * q_i * y_i / tau_g`` for every protected group
      ``g`` (groups whose ``tau_g`` is not positive are skipped, because
      dividing by it is undefined and such a group has no target to miss),
    * (2) ``sum_i x_i <= B_g``,
    * (3) ``y_i <= p_i * x_i`` for every candidate,
    * (4) ``sum_{i in g} q_i * y_i <= b_g`` for every enrollment group ``g``,

    with ``0 <= x_i, y_i <= 1``.

    Parameters
    ----------
    data : pandas.DataFrame
        One row per candidate with the columns 'p_i' and 'q_i' plus everything
        ``generate_input`` reads. The number of candidates is taken from the
        frame itself.

    Returns
    -------
    tuple of list
        ``(x_optimal, y_optimal)``: the solver values of ``x_i`` and ``y_i``
        rounded to 3 decimals, in the row order of ``data``.
    """
    # candidates are addressed by position everywhere below, so make sure the
    # index labels carried by the groups coincide with positions 0..n-1
    data = data.reset_index(drop=True)
    n_candidates = len(data)

    # create input instance
    G_I, G_E, G_P, B_g, b_g, tau_g, w_ig = generate_input(data)

    # create model
    model = LpProblem(name='APD-S', sense = LpMaximize)

    # define decision variables
    x = [LpVariable('x' + str(i), lowBound = 0, upBound = 1) for i in range(n_candidates)]
    y = [LpVariable('y' + str(i), lowBound = 0, upBound = 1) for i in range(n_candidates)]
    z = LpVariable(name='z')

    # add objective function to the model
    model += z

    # constraints for z
    for index_g, g in enumerate(G_P):
        tau = tau_g[index_g]
        if tau <= 0:
            continue
        q = g['q_i'].tolist()
        terms = [float(w * q_i / tau) * y[i]
                 for i, w, q_i in zip(g.index, w_ig[index_g], q)]
        model += z <= lpSum(terms)

    # constraints for (2) in LP: G_I is the whole instance, so every x_i counts
    model += lpSum(x) <= B_g

    # constraints for (3) in LP
    p = data['p_i'].tolist()
    for i in range(n_candidates):
        model += y[i] <= x[i] * p[i]

    # constraints for (4) in LP
    for index_g, g in enumerate(G_E):
        q = g['q_i'].tolist()
        terms = [q_i * y[i] for i, q_i in zip(g.index, q)]
        model += lpSum(terms) <= int(b_g[index_g])

    # solve the model
    model.solve(PULP_CBC_CMD(msg=0))

    # read the values straight from the variable objects; this does not depend
    # on how model.variables() happens to be ordered or what else it contains
    x_optimal = [round(var.varValue, 3) for var in x]
    y_optimal = [round(var.varValue, 3) for var in y]

    # return x*_i, y*_i for each candidate i
    return x_optimal, y_optimal

In [12]:
# verify if x* and y* of each input instance satisfy the constraints
# x* and y* are rounded to 3 decimals by solveLP, so every rounded value may be
# off by up to ROUND_ERR; the checks below allow for exactly that much slack
ROUND_ERR = 0.0005
x_optimal = []
y_optimal = []
flag = True  # stays True only if every instance passes every check
print('Verifying all input instances')
for index in range(len(test_data)):
    instance = test_data[index]

    # generate_input may draw a random number to balance b_g and tau_g; replay
    # the same draw for the second call so that the caps checked here are the
    # ones the LP was actually solved with
    rng_state = np.random.get_state()
    x, y = solveLP(instance)
    x_optimal.append(x)
    y_optimal.append(y)

    np.random.set_state(rng_state)
    G_I, G_E, G_P, B_g, b_g, tau_g, w_ig = generate_input(instance)

    # constraint 2
    if sum(x) > B_g + ROUND_ERR * len(x):
        flag = False

    # constraint 3 (both y_i and x_i are rounded, hence twice the slack)
    p = instance['p_i'].tolist()
    for i in range(len(x)):
        if y[i] > x[i] * p[i] + 2 * ROUND_ERR:
            flag = False

    # constraint 4: the cap applies to the sum over the whole group,
    # not to each candidate separately
    for index_g in range(len(G_E)):
        g = G_E[index_g]
        q = g['q_i'].tolist()
        expected_enrollment = sum(y[i] * q_i for i, q_i in zip(g.index, q))
        if expected_enrollment > b_g[index_g] + ROUND_ERR * len(g):
            flag = False
if flag:
    print('-> Every constraints has been satisfied throughout 10 input instances')
else:
    print('-> Not every constraints has been satisfied throughout 10 input instances')

Verifying all input instances
-> Every constraints has been satisfied throughout 10 input instances


In [10]:
def algorithm_1(data):
    x_optimal, y_optimal = solveLP(data)
    