In [None]:
def create_binary_dataset(salary=None):
    """Build an AIF360 binary-label dataset from ``../company_x.csv``.

    The ``salary`` column is binarised against a threshold (1 when the
    employee's salary is strictly greater than the threshold, else 0) and
    used as the label. The ``sex`` column is encoded as 1 for ``'M'`` and 0
    for anything else and used as the protected attribute, with 1 as the
    privileged class.

    Args:
        salary: Numeric salary threshold. When omitted, the module-level
            variable ``salary`` is used instead, so existing calls of the
            form ``create_binary_dataset()`` keep working.

    Returns:
        A ``BinaryLabelDataset`` with favorable label 1, unfavorable label 0,
        ``'salary'`` as the label and ``'sex'`` as the protected attribute.
        ``'degree_level'`` and ``'dept'`` are passed to ``StandardDataset``
        as categorical features and ``'boss_id'`` is dropped.

    Raises:
        NameError: If no threshold is passed and no module-level ``salary``
            variable exists.
    """
    # The original body read `salary` as a free variable; keep that fallback
    # for callers that still rely on a notebook-level `salary`.
    if salary is None:
        try:
            salary = globals()['salary']
        except KeyError:
            raise NameError(
                "name 'salary' is not defined: pass the salary threshold "
                "as an argument"
            ) from None
    threshold = salary

    data = pd.read_csv('../company_x.csv', index_col='employee_id')

    # Work on a copy so the frame as read from disk is left untouched.
    data_with_label = data.copy()
    data_with_label['salary'] = (
        data_with_label['salary'] > threshold
    ).astype(int)
    data_with_label['sex'] = (data_with_label['sex'] == 'M').astype(int)

    std_data = StandardDataset(
        df=data_with_label,
        label_name='salary',
        favorable_classes=[1],
        protected_attribute_names=['sex'],
        privileged_classes=[[1]],
        categorical_features=['degree_level', 'dept'],
        features_to_drop=['boss_id'],
    )

    # convert_to_dataframe() returns a (DataFrame, attributes) pair; only the
    # processed frame is needed to rebuild a plain BinaryLabelDataset.
    processed_df, _ = std_data.convert_to_dataframe()
    binary_dataset = BinaryLabelDataset(
        favorable_label=1,
        unfavorable_label=0,
        df=processed_df,
        label_names=['salary'],
        protected_attribute_names=['sex'],
    )

    return binary_dataset