In [None]:
# coding=utf-8
#
# The copyright of this file belongs to Feedzai. The file cannot be
# reproduced in whole or in part, stored in a retrieval system,
# transmitted in any form, or by any means electronic, mechanical,
# photocopying, or otherwise, without the prior permission of the owner.
#
# (c) 2022 Feedzai, Strictly Confidential
# Mount Google Drive at /content/drive; the CSV paths read and written by this
# notebook all live under that mount point.
# NOTE(review): google.colab is presumably only importable inside a Colab
# runtime -- confirm before running this notebook elsewhere.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import numpy as np   # Seed generation
import pandas as pd  # Matrix operations

In [None]:
# Load the two inputs from Drive:
#   * the large (3M) merged synthetic sample that every variant is drawn from;
#   * the original dataset (same preprocessed features), used only as the
#     reference for month frequencies and fraud prevalence.
large_sample_path, original_sample_path = (
    "/content/drive/MyDrive/Colab Notebooks/ECE697/Project/Synthetic Data/synthetic-data-merged.csv",
    "/content/drive/MyDrive/Colab Notebooks/ECE697/Project/Base.csv",
)
large_sample_df, original_sample_df = map(
    pd.read_csv, (large_sample_path, original_sample_path)
)

In [None]:
# Show the column index before and after the drop so the effect is visible in
# the cell output.
print(large_sample_df.columns)

# Drop additional columns created through merging datasets.
# errors="ignore" keeps the cell idempotent: because the result is assigned
# back to the same name, a second execution would otherwise raise KeyError on
# the already-removed labels.
large_sample_df = large_sample_df.drop(
    columns=['Unnamed: 0.2', 'Unnamed: 0.1', 'Unnamed: 0'],
    errors="ignore",
)
print(large_sample_df.columns)

Index(['Unnamed: 0.2', 'Unnamed: 0.1', 'Unnamed: 0', 'fraud_bool', 'income',
       'name_email_similarity', 'prev_address_months_count',
       'current_address_months_count', 'customer_age', 'days_since_request',
       'intended_balcon_amount', 'payment_type', 'zip_count_4w', 'velocity_6h',
       'velocity_24h', 'velocity_4w', 'bank_branch_count_8w',
       'date_of_birth_distinct_emails_4w', 'employment_status',
       'credit_risk_score', 'email_is_free', 'housing_status',
       'phone_home_valid', 'phone_mobile_valid', 'bank_months_count',
       'has_other_cards', 'proposed_credit_limit', 'foreign_request', 'source',
       'session_length_in_minutes', 'device_os', 'keep_alive_session',
       'device_distinct_emails_8w', 'device_fraud_count', 'month'],
      dtype='object')
Index(['fraud_bool', 'income', 'name_email_similarity',
       'prev_address_months_count', 'current_address_months_count',
       'customer_age', 'days_since_request', 'intended_balcon_amount',
       'pa

In [None]:
# Sanity check: display the first rows of the large synthetic sample.
large_sample_df.head() # check df

In [None]:
# Sanity check: display the first rows of the original dataset.
original_sample_df.head() # check df

In [None]:
# Report the income range of both dataframes so the split threshold used for
# the protected groups can be checked against the observed values.

for frame_name, frame in (
    ("large_sample_df", large_sample_df),
    ("original_sample_df", original_sample_df),
):
    income_values = frame['income']
    print(f"Highest income for {frame_name}: {income_values.max()}")
    # The trailing newline separates the two reports in the output.
    print(f"Lowest income for {frame_name}: {income_values.min()}\n")

Highest income for large_sample_df: 0.9
Lowest income for large_sample_df: 0.1

Highest income for original_sample_df: 0.9
Lowest income for original_sample_df: 0.1



In [None]:
# Check if any samples have income exactly equal to the split value (0.7):
# such rows sit on the boundary of the group definition.
split_value_mask = large_sample_df['income'] == 0.7
exact_70_income = large_sample_df.loc[split_value_mask]
print(len(exact_70_income))

0


In [None]:
# Per-month statistics of the original data, stored as plain dicts keyed by
# month so the following cells can look values up directly:
#   month_frequency  -> share of rows that fall in each month
#   month_fraud_prev -> mean of fraud_bool (fraud prevalence) in each month
original_months = original_sample_df["month"]
month_frequency = original_months.value_counts(normalize=True).to_dict()

fraud_by_month = original_sample_df.groupby("month")["fraud_bool"]
month_fraud_prev = fraud_by_month.mean().to_dict()

In [None]:
# Expected number of positive (fraud) and negative instances per month for a
# sample of `sample_size` rows, given the month frequency and the per-month
# fraud prevalence observed in the original data.

sample_size = 1e6

expected_positives = {}
expected_negatives = {}

for month, prevalence in month_fraud_prev.items():
    # Rows this month contributes to the target sample.
    month_rows = sample_size * month_frequency[month]
    expected_positives[month] = round(month_rows * prevalence)
    expected_negatives[month] = round(month_rows * (1-prevalence))

In [None]:
# Sampling the "Base" dataset: Same month frequency and fraud rate per month.
base_dfs = []

SEED = 42

# Seed NumPy's global generator before drawing the per-month seeds. Without
# this, SEED is only an offset added to seeds that are themselves random, so
# every kernel run would produce different samples. Seeding here also makes
# any later np.random.choice call deterministic under Restart & Run All.
np.random.seed(SEED)

months = large_sample_df["month"].unique()
num_months = len(months)
seed_possible_values = list(range(1_000_000))
# One distinct seed per month (drawn without replacement).
seed_list = np.random.choice(seed_possible_values, size=num_months, replace=False)

for month, seed in zip(months, seed_list):
    in_month = large_sample_df["month"] == month
    positive_pool = large_sample_df[in_month & (large_sample_df["fraud_bool"] == 1)]
    negative_pool = large_sample_df[in_month & (large_sample_df["fraud_bool"] == 0)]

    # Draw the expected number of rows of each label for this month.
    # The negative draw uses a shifted seed so the two draws do not share a
    # random state.
    positive_sample = positive_pool.sample(expected_positives[month], random_state=seed)
    negative_sample = negative_pool.sample(expected_negatives[month], random_state=seed+SEED)

    base_dfs.extend([positive_sample, negative_sample])

In [None]:
# Concatenate the filtered samples to obtain the final dataset.
# base_dfs holds, for each month, the positive sample followed by the negative
# sample; pd.concat stacks them in that order.
# NOTE(review): base_df is not written to disk in the cells visible here --
# confirm whether it is saved elsewhere.
base_df = pd.concat(base_dfs)

In [None]:
# Now generating the biased samples.
# First step: label every row of the large sample with its protected group.
# Rows with income strictly below 0.7 are "Minority"; all others "Majority".
is_minority = large_sample_df["income"] < 0.7
large_sample_df["group"] = is_minority.map({True:"Minority", False: "Majority"})

In [None]:
# Helper method to define the joint probability of each combination of
# group and label.

def calculate_probabilities(
    original_prevalence: float,
    prev_ratio: float,
    maj_pct: float,
):
    """Return the joint probabilities P(group, label) for two groups.

    The overall positive rate is held at ``original_prevalence`` while the
    positive rate of the majority group is ``prev_ratio`` times that of the
    minority group.

    Args:
        original_prevalence: Overall probability of a positive label.
        prev_ratio: Majority prevalence divided by minority prevalence.
        maj_pct: Probability of belonging to the majority group, P(A=maj).

    Returns:
        Tuple ``(p_min_and_pos, p_maj_and_pos, p_min_and_neg, p_maj_and_neg)``.
        The four values sum to 1 (up to floating-point error).

    Only the joint probabilities are computed. Conditional probabilities
    (given the label) are not part of the result, and dividing by
    ``original_prevalence`` or ``1 - original_prevalence`` to obtain them
    would fail for a prevalence of exactly 0 or 1 even though the joint
    probabilities are well defined there.
    """
    # Probability notation (p_maj = P(A=maj))
    p_maj = maj_pct
    p_min = 1 - p_maj

    # Solve  prev_maj * p_maj + prev_min * p_min = original_prevalence
    # with   prev_maj = prev_ratio * prev_min   for the group prevalences.
    prev_min = original_prevalence / (prev_ratio * p_maj + (1 - p_maj))
    prev_maj = prev_ratio * prev_min

    # Joint probabilities of the majority group
    p_maj_and_pos = prev_maj * p_maj
    p_maj_and_neg = p_maj - p_maj_and_pos

    # Joint probabilities of the minority group
    p_min_and_pos = prev_min * p_min
    p_min_and_neg = p_min - p_min_and_pos

    return p_min_and_pos, p_maj_and_pos, p_min_and_neg, p_maj_and_neg

In [None]:
# Helper method to obtain a dataframe from given group, month and label.
def get_filtered_df(large_sample_df, group, month, label):
    """Return the rows matching the given month, group and fraud label.

    Args:
        large_sample_df: DataFrame with "month", "group" and "fraud_bool" columns.
        group: Value to match in the "group" column.
        month: Value to match in the "month" column.
        label: Value to match in the "fraud_bool" column.

    Returns:
        The subset of ``large_sample_df`` where all three columns match.
    """
    month_matches = large_sample_df["month"] == month
    group_matches = large_sample_df["group"] == group
    label_matches = large_sample_df["fraud_bool"] == label
    selected_rows = month_matches & group_matches & label_matches
    return large_sample_df[selected_rows]


# Method to generate a biased sample controlling group size or prevalence (fraud rate)
def group_prevalence_disparity(large_sample_df, original_sample_df, majority_size, fraud_rate_disparity):
    """Sample ``large_sample_df`` month by month with controlled group bias.

    For every month, the number of rows drawn for each (group, label)
    combination is chosen so that the majority group has relative size
    ``majority_size`` and the minority fraud prevalence is
    ``fraud_rate_disparity`` times the majority fraud prevalence, while the
    month keeps the fraud prevalence and relative size observed in
    ``original_sample_df``.

    Relies on the module-level names ``seed_possible_values``, ``num_months``,
    ``sample_size`` and ``SEED`` and on the helpers ``calculate_probabilities``
    and ``get_filtered_df``.

    Args:
        large_sample_df: Pool to sample from; needs "month", "group" and
            "fraud_bool" columns.
        original_sample_df: Reference data; needs "month" and "fraud_bool".
        majority_size: Relative size of the majority group. Either a single
            number (applied to every month) or one value per month.
        fraud_rate_disparity: Minority prevalence / majority prevalence.
            Either a single number or one value per month.

    Returns:
        One DataFrame concatenating, for each month, the minority-positive,
        minority-negative, majority-positive and majority-negative samples.

    NOTE(review): per-month values are paired with months in the order of
    ``large_sample_df["month"].unique()`` -- confirm callers pass them in that
    order. ``DataFrame.sample`` is called without replacement, so a pool
    smaller than the requested count presumably makes it raise.
    """
    seed_list = np.random.choice(seed_possible_values, size=num_months, replace=False)

    # Statistics of the original data that do not depend on the loop variable.
    n_original_months = original_sample_df["month"].unique().shape[0]
    month_share = original_sample_df["month"].value_counts(normalize=True)

    # Allow for different majority sizes/fraud rates depending on the month of data.
    # This replicates a value if only one is passed. Both parameters accept
    # ints as well as floats, so e.g. majority_size=1 is replicated instead of
    # failing later inside zip().
    if isinstance(majority_size, (int, float)):
        majority_size = [majority_size] * n_original_months
    if isinstance(fraud_rate_disparity, (int, float)):
        fraud_rate_disparity = [fraud_rate_disparity] * n_original_months

    bias_dfs = []

    for month, seed, maj_size, fr_disp in zip(large_sample_df["month"].unique(), seed_list, majority_size, fraud_rate_disparity):
        month_prevalence = original_sample_df[original_sample_df["month"]==month]["fraud_bool"].mean()
        # calculate_probabilities expects majority/minority prevalence, i.e.
        # the inverse of fraud_rate_disparity.
        (
            p_min_and_pos,
            p_maj_and_pos,
            p_min_and_neg,
            p_maj_and_neg,
        ) = calculate_probabilities(month_prevalence, 1/fr_disp, maj_size)

        month_size = month_share[month]*sample_size

        # Needed amount of each combination of group/label to satisfy the
        # disparities in this month. The position in this tuple determines the
        # seed offset, so each draw gets its own random state.
        targets = (
            ("Minority", 1, p_min_and_pos),
            ("Minority", 0, p_min_and_neg),
            ("Majority", 1, p_maj_and_pos),
            ("Majority", 0, p_maj_and_neg),
        )

        # Sample the large sample with expected values.
        for offset, (group, label, joint_probability) in enumerate(targets):
            n_rows = int(round(month_size*joint_probability, 0))
            pool = get_filtered_df(large_sample_df, group, month, label)
            bias_dfs.append(pool.sample(n_rows, random_state=seed+offset*SEED))

    return pd.concat(bias_dfs)

In [None]:
# Params for the generated sample (Type I variant).
majority_size = 0.9      # Relative size of the majority group
fraud_rate_disparity = 1 # fraud prevalence in minority / fraud prevalence in majority (1 = same prevalence in both groups)

# For Type I we want to test group size disparity only.
# Majority will have 90% of instances, Minority 10% of instances.

In [None]:
# Type I variant: built from the current majority_size / fraud_rate_disparity
# values and written to Drive as CSV.
typeI_df = group_prevalence_disparity(
    large_sample_df=large_sample_df,
    original_sample_df=original_sample_df,
    majority_size=majority_size,
    fraud_rate_disparity=fraud_rate_disparity,
)
typeI_df.to_csv(
    "/content/drive/MyDrive/Colab Notebooks/ECE697/Project/Income Data Variants 1m/income_07_type1.csv"
)

In [None]:
# Type II variant: majority is 90% of instances and the minority fraud
# prevalence is 3x the majority fraud prevalence.
majority_size, fraud_rate_disparity = 0.9, 3
typeII_df = group_prevalence_disparity(
    large_sample_df=large_sample_df,
    original_sample_df=original_sample_df,
    majority_size=majority_size,
    fraud_rate_disparity=fraud_rate_disparity,
)
typeII_df.to_csv(
    "/content/drive/MyDrive/Colab Notebooks/ECE697/Project/Income Data Variants 1m/income_07_type2.csv"
)

In [None]:
# Type III variant: majority is 70% of instances, equal fraud prevalence in
# both groups.
majority_size, fraud_rate_disparity = 0.7, 1
typeIII_df = group_prevalence_disparity(
    large_sample_df=large_sample_df,
    original_sample_df=original_sample_df,
    majority_size=majority_size,
    fraud_rate_disparity=fraud_rate_disparity,
)
typeIII_df.to_csv(
    "/content/drive/MyDrive/Colab Notebooks/ECE697/Project/Income Data Variants 1m/income_07_type3.csv"
)

In [None]:
# Type IV variant: majority is 70% of instances and the minority fraud
# prevalence is 3x the majority fraud prevalence.
majority_size, fraud_rate_disparity = 0.7, 3
typeIV_df = group_prevalence_disparity(
    large_sample_df=large_sample_df,
    original_sample_df=original_sample_df,
    majority_size=majority_size,
    fraud_rate_disparity=fraud_rate_disparity,
)
typeIV_df.to_csv(
    "/content/drive/MyDrive/Colab Notebooks/ECE697/Project/Income Data Variants 1m/income_07_type4.csv"
)

In [None]:
# Type V variant: both groups have the same size (50% each) and the same
# fraud prevalence.
majority_size, fraud_rate_disparity = 0.5, 1
typeV_df = group_prevalence_disparity(
    large_sample_df=large_sample_df,
    original_sample_df=original_sample_df,
    majority_size=majority_size,
    fraud_rate_disparity=fraud_rate_disparity,
)
typeV_df.to_csv(
    "/content/drive/MyDrive/Colab Notebooks/ECE697/Project/Income Data Variants 1m/income_07_type5.csv"
)

In [None]:
# Type VI variant: both groups have the same size (50% each) and the minority
# fraud prevalence is 3x the majority fraud prevalence.
majority_size, fraud_rate_disparity = 0.5, 3
typeVI_df = group_prevalence_disparity(
    large_sample_df=large_sample_df,
    original_sample_df=original_sample_df,
    majority_size=majority_size,
    fraud_rate_disparity=fraud_rate_disparity,
)
typeVI_df.to_csv(
    "/content/drive/MyDrive/Colab Notebooks/ECE697/Project/Income Data Variants 1m/income_07_type6.csv"
)