# Import required libraries

In [32]:
# import basic libraries
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

# import libraries for machine learning models
from sklearn.model_selection import train_test_split
from skmultilearn.problem_transform import ClassifierChain
from sklearn.ensemble import RandomForestClassifier

# import libraries to solve LP
from pulp import *

# Output $p_i$ and $q_i$ for each candidate i $\in$ [n]

In [33]:
# read data
df = pd.read_csv('clean_law_school.csv', index_col = 0)

# split data into training and testing part
target = ['admit', 'enroll']
y = df[target]
X = df.drop(target, axis = 1)
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.8, shuffle = True, random_state = 1)

# implement the machine learning model to predict p_i and q_i
# NOTE: despite its name, lg_clf is a classifier chain of random forests.
# The forest is seeded like the split above so that p_i and q_i are reproducible on re-run.
lg_clf = ClassifierChain(RandomForestClassifier(random_state = 1))
lg_clf.fit(X_train, y_train)
y_pred = lg_clf.predict_proba(X_test)

# attach the true labels to the test features (joined on the row index), then store the
# predicted probabilities rounded to 3 decimals
# presumably the two probability columns follow the order of `target` (admit -> p_i, enroll -> q_i) - TODO confirm
X_test = pd.merge(X_test, y_test, left_index = True, right_index = True)
X_test[['p_i', 'q_i']] = np.round(y_pred.toarray(), 3)

# Generating input instance for APD-S

In APD-S, since we assume there is only one academic unit (e.g., a department) and one admission committee (AC), we only select applicants from colleges with similar acceptance rates and admission rates.

In [34]:
# descriptive statistics (count, mean, std, min, quartiles, max) of the college acceptance rate
X_test.loc[:, 'college_acceptance_rate'].describe()

count    22754.000000
mean         0.259164
std          0.080609
min          0.141930
25%          0.206761
50%          0.248110
75%          0.287721
max          0.479332
Name: college_acceptance_rate, dtype: float64

In [35]:
# descriptive statistics (count, mean, std, min, quartiles, max) of the college admission rate
X_test.loc[:, 'college_admission_rate'].describe()

count    22754.000000
mean         0.074805
std          0.039673
min          0.015370
25%          0.045344
50%          0.062960
75%          0.087537
max          0.182125
Name: college_admission_rate, dtype: float64

Based on the summary statistics of the acceptance and admission rates, we only select applicants whose college falls between the 25th and 75th percentiles (inclusive) of both rates.

In [36]:
# keep applicants whose college lies within the 25th-75th percentile band (bounds included)
# of both rates; every cut-off is computed on the frame as it is before this filter is applied
X_test = X_test[
    X_test['college_acceptance_rate'].between(*np.percentile(X_test['college_acceptance_rate'], [25, 75]))
    & X_test['college_admission_rate'].between(*np.percentile(X_test['college_admission_rate'], [25, 75]))
]

We generate the input instances as follows: we create 10 random input instances from the testing data set, each with 1000 applicants.

Note: we use the result of *admit* as *passing the interview* and *enroll* as *accepting the offer*.

The input instance will have the following elements:
- [n] = 1000 candidates
- $p_i, q_i$ for each i $\in$ [n]
- In APD-S, we only have single interview-related constraint that can be captured as {g, B} with g = [n] being the only group in $G_I$. So $G_I$ contains only 1 group with n candidates.
- We have enrollment-related budget constraints {g, $b_g|g \in G_E$}. In this dataset, there are two groups inside $G_E$ which indicate candidates who are in-state and those who are out-of-state.
- We capture the collection of protected groups of interest $G_P$ as the combination of the race and gender of the candidates. That means $G_P$ will have 10 groups by combining Race: {Black, Hispanic, Asian, White, Other} and Gender: {Male, Female}.
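- For each candidate $i \in g$ with $g \in G_P$, $w_{ig}$ (the degree of relevance of $i$ to $g$) is meant to be calculated by checking whether the candidate has the same race and gender as the protected group. In the code below, $w_{ig}$ is instead a synthetic value drawn uniformly at random from $[0, 1)$.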
- We set the cap $B_g$ on interview-related group $g$ and the cap $b_g$ on enrollment-related group $g$ using the actual statistics of the acceptance rate and admission rate from the dataset.
- We look for policies that define the target quota $\tau_g$ for each protected group $g$. However, the implementation of racial/gender quotas is either not public or is banned in some states. Alternatively, we can follow the admission statistics of universities that apply Affirmative Action in their admission process.
    + Specifically, we would use the statistics of Harvard Law School, known for its [yearly commitment to Affirmative Action in the admission/employment process](https://hr.harvard.edu/files/humanresources/files/reaffirmation_statement.pdf)
    + Based on the [demographics of Harvard Fall 2020 applications](https://www.ilrg.com/rankings/law/view/49), we calculate the percentage of each protected group in the enrollment number and set $\tau_g$ accordingly.

In [37]:
# create 10 random input instances from the testing data set, each with 1000 applicants
# each instance uses its own fixed seed (its position in the list), so the instances differ
# from one another but are identical after a kernel restart
n = 1000
test_data = [X_test.sample(n=n, random_state=num).reset_index(drop=True) for num in range(10)]

In [38]:
def generate_input(data, rng=None):
    """Derive the APD-S input instance from one sample of applicants.

    Parameters
    ----------
    data : pandas.DataFrame
        One row per applicant. The columns read here are 'admit', 'enroll',
        'resident', 'gender', 'black', 'hispanic', 'asian', 'white' and
        'other_race'.
    rng : optional
        Source of randomness for the synthetic relevance weights; anything with
        a ``uniform(size=...)`` method, e.g. ``np.random.default_rng(seed)``.
        When omitted, NumPy's global random state is used, exactly as before,
        so existing one-argument calls keep working.

    Returns
    -------
    tuple
        ``(G_I, G_E, G_P, B_g, b_g, tau_g, w_ig)`` where
        G_I is ``data`` itself (the single interview-related group),
        G_E is ``[in-state rows, out-of-state rows]``,
        G_P is the list of 10 gender x race sub-frames (gender 0 first, then 1;
        within each: black, hispanic, asian, white, other_race),
        B_g is the number of rows with ``admit == 1``,
        b_g is the number of rows with ``enroll == 1`` for each group of G_E,
        tau_g is an integer array of target quotas, one per group of G_P,
        w_ig is a list of weight arrays, one per group of G_P, each as long as
        its group.
    """
    if rng is None:
        # fall back to the legacy global state so default behaviour is unchanged
        rng = np.random

    # collection of interview-related groups: a single group holding every candidate
    G_I = data

    # cap imposed on interview-related group g
    B_g = int((data['admit'] == 1).sum())

    # collection of enrollment-related groups
    in_state = (data['resident'] == 1)
    out_of_state = (data['resident'] == 0)
    G_E = [data[in_state], data[out_of_state]]

    # cap imposed on enrollment-related group g
    enrolled = (data['enroll'] == 1)
    b_g = [int((in_state & enrolled).sum()), int((out_of_state & enrolled).sum())]

    # collection of protected groups, ordered gender-major then race
    # (gender 0 is treated as female and 1 as male here - confirm against the dataset's encoding)
    race_columns = ['black', 'hispanic', 'asian', 'white', 'other_race']
    G_P = [data[(data['gender'] == gender) & (data[race] == 1)]
           for gender in (0, 1)
           for race in race_columns]

    # target quota for protected group g to achieve: a fixed share of the instance size per
    # race, reused for both genders; the int dtype truncates fractional quotas toward zero
    race_shares = [0.0345, 0.0415, 0.0535, 0.252, 0.1185]
    tau_g = np.array([len(data) * share for share in race_shares * 2], int)

    # relevance of i to protected group g (synthetic: uniform random numbers)
    w_ig = [rng.uniform(size=len(g)) for g in G_P]

    return G_I, G_E, G_P, B_g, b_g, tau_g, w_ig

The objective is $\max \min_{g \in G_P} \left( \sum_{i \in g} w_{ig} y_i q_i / \tau_g \right)$.

Because a max-min objective is not linear, we introduce an auxiliary variable $z$ and rewrite it in the following equivalent form:

$\max z$

s.t. $z \le \sum_{i \in g} w_{ig} y_i q_i / \tau_g \quad$ for all $g \in G_P$

Other constraints will be kept as original.

In [48]:
def solveLP(data):
    """Build and solve the APD-S linear program for one input instance.

    The program maximises z subject to
      * z <= sum_{i in g} w_ig * y_i * q_i / tau_g   for every protected group g,
      * sum_i x_i <= B_g                             (constraint 2),
      * y_i <= x_i * p_i                             for every candidate (constraint 3),
      * sum_{i in g} y_i * q_i <= b_g                for every enrollment group (constraint 4),
    with 0 <= x_i, y_i <= 1.

    Parameters
    ----------
    data : pandas.DataFrame
        One row per candidate with columns 'p_i' and 'q_i' plus everything
        ``generate_input`` reads. The row index must be unique, because it is
        used to map each group member back to its variable.

    Returns
    -------
    (x_optimal, y_optimal) : tuple of lists
        Solution values rounded to 3 decimals, in the row order of ``data``.
        A variable for which the solver reported no value is returned as 0.

    The solver status is printed as a side effect.
    """
    # create input instance
    G_I, G_E, G_P, B_g, b_g, tau_g, w_ig = generate_input(data)

    # size the model from the instance itself instead of a notebook-level global
    num_candidates = len(data)
    p = data['p_i'].to_numpy(dtype=float)

    def member_positions(group):
        # row positions (0..num_candidates-1) of the group's members inside `data`;
        # this stays correct even when the index is not 0..n-1
        return data.index.get_indexer(group.index)

    # create model
    model = LpProblem(name='APD-S', sense = LpMaximize)

    # define decision variables
    x = [LpVariable('x' + str(i), lowBound = 0, upBound = 1) for i in range(num_candidates)]
    y = [LpVariable('y' + str(i), lowBound = 0, upBound = 1) for i in range(num_candidates)]
    z = LpVariable(name='z')

    # add objective function to the model
    model += z

    # constraints for z: one per protected group
    for group, weights, quota in zip(G_P, w_ig, tau_g):
        coefficients = weights * group['q_i'].to_numpy(dtype=float) / quota
        model += z <= lpSum(float(coefficient) * y[position]
                            for coefficient, position in zip(coefficients, member_positions(group)))

    # constraints for (2) in LP: G_I is the single group containing every candidate
    model += lpSum(x) <= B_g

    # constraints for (3) in LP
    for i in range(num_candidates):
        model += y[i] <= float(p[i]) * x[i]

    # constraints for (4) in LP: one per enrollment-related group
    for group, cap in zip(G_E, b_g):
        q = group['q_i'].to_numpy(dtype=float)
        model += lpSum(float(q_i) * y[position]
                       for q_i, position in zip(q, member_positions(group))) <= cap

    # solve the model
    model.solve(PULP_CBC_CMD(msg=0))
    print('-> The solution is', LpStatus[model.status])

    def rounded_value(variable):
        # varValue is None when the solver assigned nothing to the variable
        value = variable.varValue
        return round(value, 3) if value is not None else 0

    # read the values straight from the variable objects rather than relying on the
    # ordering and naming of model.variables()
    x_optimal = [rounded_value(variable) for variable in x]
    y_optimal = [rounded_value(variable) for variable in y]

    # return x*_i, y*_i for each candidate i
    return x_optimal, y_optimal

In [49]:
# verify if x* and y* of each input instance satisfy the constraints
# solveLP returns values rounded to 3 decimals, so each reported value may differ from the
# solver's value by up to 0.0005; every check below allows for that accumulated rounding error
ROUNDING_ERROR = 0.0005

x_optimal = []
y_optimal = []
for index in range(len(test_data)):
    print('Verifying input instance number', index)
    x, y = solveLP(test_data[index])
    x_optimal.append(x)
    y_optimal.append(y)

    # only the groups and caps are needed here; w_ig is regenerated by this call and is not used
    G_I, G_E, G_P, B_g, b_g, tau_g, w_ig = generate_input(test_data[index])
    x_values = np.asarray(x, dtype=float)
    y_values = np.asarray(y, dtype=float)
    p_values = test_data[index]['p_i'].to_numpy(dtype=float)
    flag = True

    # constraint 2: total interview mass must not exceed the cap
    if x_values.sum() > B_g + ROUNDING_ERROR * len(x_values):
        flag = False

    # constraint 3: y_i <= x_i * p_i for every candidate
    if np.any(y_values > x_values * p_values + ROUNDING_ERROR * (1 + p_values)):
        flag = False

    # constraint 4: the expected enrollment summed over the whole group must not exceed
    # the group's cap (comparing single terms against the cap would never detect a violation)
    for index_g in range(len(G_E)):
        g = G_E[index_g]
        q_values = g['q_i'].to_numpy(dtype=float)
        positions = test_data[index].index.get_indexer(g.index)
        group_total = float(np.dot(y_values[positions], q_values))
        if group_total > b_g[index_g] + ROUNDING_ERROR * q_values.sum():
            flag = False

    if flag:
        print('-> Every constraints has been satisfied \n')
    else:
        print('-> Not every constraints has been satisfied \n')

Verifying input instance number 0
-> The solution is Optimal
-> Every constraints has been satisfied 

Verifying input instance number 1
-> The solution is Optimal
-> Every constraints has been satisfied 

Verifying input instance number 2
-> The solution is Optimal
-> Every constraints has been satisfied 

Verifying input instance number 3
-> The solution is Optimal
-> Every constraints has been satisfied 

Verifying input instance number 4
-> The solution is Optimal
-> Every constraints has been satisfied 

Verifying input instance number 5
-> The solution is Optimal
-> Every constraints has been satisfied 

Verifying input instance number 6
-> The solution is Optimal
-> Every constraints has been satisfied 

Verifying input instance number 7
-> The solution is Optimal
-> Every constraints has been satisfied 

Verifying input instance number 8
-> The solution is Optimal
-> Every constraints has been satisfied 

Verifying input instance number 9
-> The solution is Optimal
-> Every con