In [1]:
import pandas as pd
import seaborn as sb

# Loan Bucket Data

In [2]:
# Input CSV for the "Loan Bucket Data" section; read by the next cell.
filename = 'data-for-facctrec - Loan_Bucket_5.csv'

In [3]:
# The file is read without a header row, so pandas numbers the columns 0-5;
# they are given real names in a later cell.
raw_bucket_df = pd.read_csv(filepath_or_buffer=filename, header=None)
raw_bucket_df.head(n=5)

Unnamed: 0,0,1,2,3,4,5
0,BPR,,0.0658,-0.828,Original,
1,BMF,,0.0325,-0.628,Original,
2,SLIM,,0.0167,-0.773,Original,
3,RankALS,,0.0554,-0.723,Original,
4,,,,,,


In [4]:
# The frame was loaded with header=None, so name the six columns explicitly.
raw_bucket_df.columns = [
    'Base',
    'Reranker',
    'NDCG',
    'PSP',
    'Optimization',
    'Protected',
]

## Handle substitutions

In [5]:
# Discard rows in which every field is missing (the blank separator lines of
# the export, such as row 4 in the preview above). Partially filled rows stay.
raw_bucket_df.dropna(how='all', axis='index', inplace=True)

In [6]:
# Fill the remaining gaps column by column. fillna silently ignores dict keys
# that match no column, so every key must be spelled exactly like the column
# names assigned above -- the protected-feature column is called 'Protected'.
# NOTE(review): NDCG=1.0 and PSP=2.0 look like placeholder values for missing
# results -- confirm they are meant to end up in the saved data.
raw_bucket_df.fillna(
    {'Reranker': 'None', 'NDCG': 1.0, 'PSP': 2.0, 'Protected': 'None'},
    inplace=True,
)

In [7]:
# Rename the 'FairStar' reranker label to 'FA*IR'. The result is assigned back
# to the column: calling replace(..., inplace=True) on the Series returned by
# raw_bucket_df['Reranker'] is chained in-place mutation, which pandas warns
# about and which does not update the frame under Copy-on-Write.
raw_bucket_df['Reranker'] = raw_bucket_df['Reranker'].replace('FairStar', 'FA*IR')

In [8]:
# Preview the first five rows of the cleaned loan-bucket frame.
raw_bucket_df.head(n=5)

Unnamed: 0,Base,Reranker,NDCG,PSP,Optimization,Protected
0,BPR,,0.0658,-0.828,Original,
1,BMF,,0.0325,-0.628,Original,
2,SLIM,,0.0167,-0.773,Original,
3,RankALS,,0.0554,-0.723,Original,
5,BPR,FAR,0.0595,-0.328,Separate,Loan Bucket 5


## Melt to get _long_ data

In [9]:
# Reshape wide -> long: the NDCG and PSP columns become rows, with the metric
# name in 'Metric' and its number in 'Value'; the other columns identify the row.
long_bucket_df = pd.melt(
    raw_bucket_df,
    id_vars=['Base', 'Reranker', 'Optimization', 'Protected'],
    value_vars=['NDCG', 'PSP'],
    var_name='Metric',
    value_name='Value',
)

In [10]:
# Preview the first five rows of the long-format loan-bucket frame.
long_bucket_df.head(n=5)

Unnamed: 0,Base,Reranker,Optimization,Protected,Metric,Value
0,BPR,,Original,,NDCG,0.0658
1,BMF,,Original,,NDCG,0.0325
2,SLIM,,Original,,NDCG,0.0167
3,RankALS,,Original,,NDCG,0.0554
4,BPR,FAR,Separate,Loan Bucket 5,NDCG,0.0595


# Country data

In [11]:
# Input CSV for the "Country data" section; this rebinds `filename`, which
# earlier pointed at the loan-bucket file.
filename = 'data-for-facctrec - COUNTRY_low_pfr.csv'

In [12]:
# The file is read without a header row, so pandas numbers the columns 0-5;
# they are given real names in a later cell.
raw_country_df = pd.read_csv(filepath_or_buffer=filename, header=None)
raw_country_df.head(n=5)

Unnamed: 0,0,1,2,3,4,5
0,BPR,,0.0658,-0.965,Original,COUNTRY_low_pfr
1,BMF,,0.0325,-0.727,Original,COUNTRY_low_pfr
2,SLIM,,0.0167,-0.719,Original,COUNTRY_low_pfr
3,RankALS,,0.0554,-0.723,Original,COUNTRY_low_pfr
4,,,,,,


In [13]:
# The frame was loaded with header=None, so name the six columns explicitly
# (same schema as the loan-bucket frame).
raw_country_df.columns = [
    'Base',
    'Reranker',
    'NDCG',
    'PSP',
    'Optimization',
    'Protected',
]

## Handle substitutions

In [14]:
# Discard rows in which every field is missing (the blank separator lines of
# the export, such as row 4 in the preview above). Partially filled rows stay.
raw_country_df.dropna(how='all', axis='index', inplace=True)

In [15]:
# Fill the remaining gaps column by column. fillna silently ignores dict keys
# that match no column, so every key must be spelled exactly like the column
# names assigned above -- the protected-feature column is called 'Protected'.
# NOTE(review): NDCG=1.0 and PSP=2.0 look like placeholder values for missing
# results -- confirm they are meant to end up in the saved data.
raw_country_df.fillna(
    {'Reranker': 'None', 'NDCG': 1.0, 'PSP': 2.0, 'Protected': 'None'},
    inplace=True,
)

In [16]:
# Rename the 'FairStar' reranker label to 'FA*IR'. The result is assigned back
# to the column: calling replace(..., inplace=True) on the Series returned by
# raw_country_df['Reranker'] is chained in-place mutation, which pandas warns
# about and which does not update the frame under Copy-on-Write.
raw_country_df['Reranker'] = raw_country_df['Reranker'].replace('FairStar', 'FA*IR')

In [17]:
# Swap the raw 'COUNTRY_low_pfr' code for the readable label 'Country'.
# Assigned back to the column rather than mutated through the Series returned
# by raw_country_df['Protected'], because that chained in-place form is
# warned about by pandas and does not update the frame under Copy-on-Write.
raw_country_df['Protected'] = raw_country_df['Protected'].replace('COUNTRY_low_pfr', 'Country')

In [18]:
# Preview the first five rows of the cleaned country frame.
raw_country_df.head(n=5)

Unnamed: 0,Base,Reranker,NDCG,PSP,Optimization,Protected
0,BPR,,0.0658,-0.965,Original,Country
1,BMF,,0.0325,-0.727,Original,Country
2,SLIM,,0.0167,-0.719,Original,Country
3,RankALS,,0.0554,-0.723,Original,Country
5,BPR,FAR,0.0642,-0.512,Separate,Country


## Melt to get _long_ data

In [19]:
# Reshape wide -> long: the NDCG and PSP columns become rows, with the metric
# name in 'Metric' and its number in 'Value'; the other columns identify the row.
long_country_df = pd.melt(
    raw_country_df,
    id_vars=['Base', 'Reranker', 'Optimization', 'Protected'],
    value_vars=['NDCG', 'PSP'],
    var_name='Metric',
    value_name='Value',
)

In [20]:
# Preview the first five rows of the long-format country frame.
long_country_df.head(n=5)

Unnamed: 0,Base,Reranker,Optimization,Protected,Metric,Value
0,BPR,,Original,Country,NDCG,0.0658
1,BMF,,Original,Country,NDCG,0.0325
2,SLIM,,Original,Country,NDCG,0.0167
3,RankALS,,Original,Country,NDCG,0.0554
4,BPR,FAR,Separate,Country,NDCG,0.0642


# Combine and save

In [21]:
# Stack the loan-bucket rows on top of the country rows. Each input keeps its
# own row labels, so labels repeat in the result; the index is not written to
# the CSV saved below.
long_df = pd.concat(objs=[long_bucket_df, long_country_df], axis=0)

In [22]:
# Write the combined long-format table to disk, without the row index.
long_df.to_csv(path_or_buf='optimization-experiments.csv', index=False)

In [23]:
# Show the combined frame as the cell output (pandas truncates the middle rows).
long_df

Unnamed: 0,Base,Reranker,Optimization,Protected,Metric,Value
0,BPR,,Original,,NDCG,0.0658
1,BMF,,Original,,NDCG,0.0325
2,SLIM,,Original,,NDCG,0.0167
3,RankALS,,Original,,NDCG,0.0554
4,BPR,FAR,Separate,Loan Bucket 5,NDCG,0.0595
...,...,...,...,...,...,...
99,RankALS,FA*IR,Joint-2%,Country,PSP,2.0000
100,BPR,PFAR,Joint-2%,Country,PSP,2.0000
101,BMF,PFAR,Joint-2%,Country,PSP,2.0000
102,SLIM,PFAR,Joint-2%,Country,PSP,2.0000
