# Import required libraries

In [3]:
# import basic libraries
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

# import libraries for machine learning models
from sklearn.model_selection import train_test_split
from skmultilearn.problem_transform import ClassifierChain
from sklearn.linear_model import LogisticRegression

# import libraries to solve LP
from pulp import *

# Output $p_i$ and $q_i$ for each candidate i $\in$ [n]

In [4]:
# read data
df = pd.read_csv('law_school_clean.csv')

# split data into training and testing part
target = ['admit', 'enroll']
y = df[target]
X = df.drop(target, axis = 1)
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.8, shuffle = True, random_state = 1)

# implement the machine learning model to predict p_i and q_i
lg_clf = ClassifierChain(LogisticRegression(max_iter = 1000))
lg_clf.fit(X_train, y_train)
y_pred = lg_clf.predict_proba(X_test)
X_test = pd.merge(X_test, y_test, left_index = True, right_index = True)
X_test[['p_i', 'q_i']] = y_pred.toarray()

In [5]:
X_test

Unnamed: 0.1,Unnamed: 0,lsat,gpa,resident,gender,black,hispanic,asian,white,college_tier1,college_tier2,college_tier3,college_tier4,admit,enroll,p_i,q_i
79344,79344,153.0,2.68,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.267583,0.081567
21732,21732,165.0,3.97,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.264140,0.066272
72769,72769,162.0,3.24,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.258187,0.071105
15266,15266,133.0,2.96,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.304971,0.105873
70309,70309,166.0,3.88,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.253991,0.066873
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
74322,74322,160.0,3.88,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.260215,0.073298
95826,95826,156.0,3.47,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.260922,0.078626
25294,25294,147.0,3.48,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.285260,0.086770
49226,49226,163.0,3.78,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.261359,0.069223


# Generating input instance for APD-S

We generate the input as follow:
- [n] = 1000 candidates
- $p_i, q_i$ for each i in the testing dataset
- In APD-S, we only have single interview-related constraint that can be captured as {g, B} with g = [n] being the only group in $G_I$. So $G_I$ contains 1 group only with n candidates.
- The cap $B_g$ imposed on an interview related group g is synthetically generated to be 0.4 of each group g in $G_I$. That is, the number of candidates getting an interview accounts for only 40% of all the total candidates.
- We have enrollment-related budget constraints {g, $b_g|g \in G_E$}. There are two groups inside $G_E$ imply candidates who are in-state and those who are out-of-state. '
- The cap $b_g$ of each group g $\in G_E$ is generated as the actual enrollment statistics of in-state and out-of-state students.
- We generate the collection of protected groups of interest $G_P$ as the combination of the race and gender of the candidates. That means $G_P$ will have 8 groups by combining Race: {Black, Hispanic, Asian, White} and Gender: {Male, Female}.
- For each candidate i $\in g$ of G_P, w_ig (the degree of relevance of i to g) is synthetically generated as random uniform number from [0,1]
- The target quota $\tau_g$ for each protected group g is generated as the actual enrollment statistics of that group + 1 (to avoid division by zero). For example, $\tau_g$ for group g (Black, Female) is set as the number of Black and Female candidates who choose to enroll + 1.
- We are finding policies that define $B_g, b_g$, and $\tau_g$.

In [6]:
# create 10 random input instances from the testing data set, each with 1000 applicants
n = 1000
test_data = [X_test.sample(n=n).reset_index(drop=True) for num in range(10)]

In [7]:
def generate_input(data):
    # collection of interview-related groups
    G_I = data
    
    # cap imposed on interview-related group g
    B_g = len(G_I)*0.4
    
    # collection of enrollment-related groups
    in_state = (data['resident'] == 1) & (data['enroll'] == 1)
    out_of_state = (data['resident'] == 0) & (data['enroll'] == 1)
    G_E = [data[in_state], data[out_of_state]]
    
    # cap imposed on enrollment-related group g
    b_g = [len(G_E[num]) for num in range(len(G_E))]
    
    # collection of protected groups
    female_black = (data['gender'] == 0) & (data['black']==1)
    female_hispanic = (data['gender'] == 0) & (data['hispanic'] == 1)
    female_asian = (data['gender'] == 0) & (data['asian'] == 1)
    female_white = (data['gender'] == 0) & (data['white'] == 1)
    
    male_black = (data['gender'] == 1) & (data['black'] == 1)
    male_hispanic = (data['gender'] == 1) & (data['hispanic'] == 1)
    male_asian = (data['gender'] == 1) & (data['asian'] == 1)
    male_white = (data['gender'] == 1) & (data['white'] == 1)

    G_P = [data[female_black], data[female_hispanic], data[female_asian], data[female_white],\
           data[male_black], data[male_hispanic], data[male_asian], data[male_white]] 
    
    # target quota for protected group g to achieve
    tau_g = [len(data[female_black & (data['enroll'] == 1)]) + 1, len(data[female_hispanic & (data['enroll'] == 1)]) + 1,\
             len(data[female_asian & (data['enroll'] == 1)]) + 1, len(data[female_white & (data['enroll'] == 1)]) + 1,\
             len(data[male_black & (data['enroll'] == 1)]) + 1, len(data[male_hispanic & (data['enroll'] == 1)]) + 1,\
             len(data[male_asian & (data['enroll'] == 1)]) + 1, len(data[male_white & (data['enroll'] == 1)]) + 1]
    
    # relevance of i to protected group g
    w_ig = [np.random.uniform(size=len(g)) for g in G_P] #synthetic w_ig
    
    return G_I, G_E, G_P, B_g, b_g, tau_g, w_ig

The objective model is max min$_{g \in G_P} (\sum_{i \in g} w_{ig} y_i q_i / \tau_g)$.

We can rewrite it as the following to solve:

max z

s.t $ \space \space$  z $\le \sum_{i \in g} w_{ig} y_i q_i / \tau_g \space \space \space \space \space \space \forall  g \in G_P$

In [28]:
def solveLP(data):
    # create input instance
    G_I, G_E, G_P, B_g, b_g, tau_g, w_ig = generate_input(data)
    
    # create model
    model = LpProblem(name='APD-S', sense=LpMaximize)

    # define decision variables
    x_name = []
    y_name = []
    for i in range(n):
        x_name.append('x' + str(i))
        y_name.append('y' + str(i))

    x = [LpVariable(x_name[i], lowBound = 0, upBound = 1) for i in range(n)]
    y = [LpVariable(y_name[i], lowBound = 0, upBound = 1) for i in range(n)]
    z = LpVariable(name='z')

    # add objective function to the model
    model += LpAffineExpression(z)

    # constraints for z
    for index_g in range(len(G_P)):
        constraint = []
        g = G_P[index_g]
        for index_i in range(len(g)):
            constraint.append(w_ig[index_g][index_i]*y[g.index[index_i]]*g.iloc[index_i]['q_i']/tau_g[index_g])
        model += z <= lpSum(constraint)
    
    # constraints for (2) in LP
    constraint = []
    for index_g in range(len(G_I)):
        constraint.append(x[index_g])
    model += lpSum(constraint) <= B_g
    
    # constraints for (3) in LP
    for i in range(n):
        model += y[i] <= x[i]*data.iloc[i]['p_i']
        
    # constraints for (4) in LP
    for index_g in range(len(G_E)):
        constraint = []
        g = G_E[index_g]
        for index_i in range(len(g)):
            constraint.append(y[g.index[index_i]]*g.iloc[index_i]['q_i'])
        model += lpSum(constraint) <= b_g[index_g]
    
    # solve the model 
    model.solve()
    
    # return x*_i, y*_i for each candidate i
    return [var.varValue for var in model.variables()[:n]], [var.varValue for var in model.variables()[n:-1]]

In [9]:
x_optimal, y_optimal = solveLP(test_data[0])

Welcome to the CBC MILP Solver 
Version: 2.10.3 
Build Date: Dec 15 2019 

command line - /usr/local/lib/python3.8/dist-packages/pulp/apis/../solverdir/cbc/linux/64/cbc /tmp/6c6ec930e7f141a5a06c91c957a1399c-pulp.mps max timeMode elapsed branch printingOptions all solution /tmp/6c6ec930e7f141a5a06c91c957a1399c-pulp.sol (default strategy 1)
At line 2 NAME          MODEL
At line 3 ROWS
At line 1016 COLUMNS
At line 5102 RHS
At line 6114 BOUNDS
At line 8116 ENDATA
Problem MODEL has 1011 rows, 2001 columns and 4084 elements
Coin0008I MODEL read with 0 errors
Option for timeMode changed from cpu to elapsed
Presolve 1009 (-2) rows, 2001 (0) columns and 4008 (-76) elements
Perturbing problem by 0.001% of 0.0054035179 - largest nonzero change 0 ( 0%) - largest zero change 4.9969129e-05
0  Obj 0.19537591 Primal inf 747.26061 (8)
95  Obj 0.12457438 Primal inf 403.27592 (656)
190  Obj 0.11508503 Primal inf 332.07782 (553)
285  Obj 0.10798178 Primal inf 273.34958 (467)
380  Obj 0.10308442 Primal inf

Problem: x* is mostly in binary form.

In [27]:
# verify if x* and y* satisfy the constraints in the first batch of test data
G_I, G_E, G_P, B_g, b_g, tau_g, w_ig = generate_input(test_data[0])

# constraint 2
print('Constraint 2 is:', sum(x_optimal) < B_g)
# print('Sum of x*_i:', sum(x_optimal))
# print('Cap B_g:', B_g)

# constraint 3
flag = True
for i in range(n):
    if y_optimal[i] > x_optimal[i]*test_data[0].iloc[i]['p_i']:
        flag = False
#       print('y*' + str(i) + ':', y_optimal[i])
#       print('x*' + str(i) + '.' + 'p' + str(i) + ':', x_optimal[i]*test_data[0].iloc[i]['p_i'])
print('Constraint 3 is:', flag)

# constraint 4
flag = True
for index_g in range(len(G_E)):
    g = G_E[index_g]
    for index_i in range(len(g)):
        if y_optimal[g.index[index_i]]*g.iloc[index_i]['q_i'] <= b_g[index_g]:
            flag = False
print('Constraint 4 is:', flag)

Constraint 2 is: False
Constraint 3 is: False
Constraint 4 is: False
