### In this notebook we will be taking the original South German Credit dataset and adding bias against foreign workers

In [None]:
%%capture 
#Cell Magic to hide output

#For Synthetic Data Generation
!pip install sdv
#Imports
import pandas as pd
from sagemaker.s3 import S3Downloader
from sdv.tabular import GaussianCopula

### Download data

First,  __download__ the data and save it in the `data` folder.


$^{[2]}$ Ulrike Grömping
Beuth University of Applied Sciences Berlin
Website with contact information: https://prof.beuth-hochschule.de/groemping/.

In [None]:
#Fetch the raw South German Credit file from S3 into the local "data" folder
S3Downloader.download(
    "s3://sagemaker-sample-files/datasets/tabular/uci_statlog_german_credit_data/SouthGermanCredit.asc",
    "data",
)

#English column names, in file order (see "Explanation of Data Features" below)
credit_columns = [
    "status", "duration", "credit_history", "purpose", "amount",
    "savings", "employment_duration", "installment_rate",
    "personal_status_sex", "other_debtors", "present_residence",
    "property", "age", "other_installment_plans", "housing",
    "number_credits", "job", "people_liable", "telephone",
    "foreign_worker", "credit_risk",
]

#Parse the space-separated file; header=0 together with `names` swaps the
#header row of the file for the English names above, and "?" is read as missing
credit_raw = pd.read_csv(
    "data/SouthGermanCredit.asc",
    sep=r" ",
    header=0,
    names=credit_columns,
    na_values="?",
    engine="python",
)

#Keep only complete rows
training_data = credit_raw.dropna()

#Preview the first 5 rows
training_data.head()

### Explanation of Data Features

$`laufkont = status`
                                               
 1 : no checking account                       
 2 : ... < 0 DM                                
 3 : 0<= ... < 200 DM                          
 4 : ... >= 200 DM / salary for at least 1 year

$`laufzeit = duration`
     

$`moral = credit_history`
                                                
 0 : delay in paying off in the past            
 1 : critical account/other credits elsewhere   
 2 : no credits taken/all credits paid back duly
 3 : existing credits paid back duly till now   
 4 : all credits at this bank paid back duly    

$`verw = purpose`
                        
 0 : others             
 1 : car (new)          
 2 : car (used)         
 3 : furniture/equipment
 4 : radio/television   
 5 : domestic appliances
 6 : repairs            
 7 : education          
 8 : vacation           
 9 : retraining         
 10 : business          

$`hoehe = amount`
     

$`sparkont = savings`
                               
 1 : unknown/no savings account
 2 : ... <  100 DM             
 3 : 100 <= ... <  500 DM      
 4 : 500 <= ... < 1000 DM      
 5 : ... >= 1000 DM            

$`beszeit = employment_duration`
                     
 1 : unemployed      
 2 : < 1 yr          
 3 : 1 <= ... < 4 yrs
 4 : 4 <= ... < 7 yrs
 5 : >= 7 yrs        

$`rate = installment_rate`
                   
 1 : >= 35         
 2 : 25 <= ... < 35
 3 : 20 <= ... < 25
 4 : < 20          

$`famges = personal_status_sex`
                                         
 1 : male : divorced/separated           
 2 : female : non-single or male : single
 3 : male : married/widowed              
 4 : female : single                     

$`buerge = other_debtors`
                 
 1 : none        
 2 : co-applicant
 3 : guarantor   

$`wohnzeit = present_residence`
                     
 1 : < 1 yr          
 2 : 1 <= ... < 4 yrs
 3 : 4 <= ... < 7 yrs
 4 : >= 7 yrs        

$`verm = property`
                                              
 1 : unknown / no property                    
 2 : car or other                             
 3 : building soc. savings agr./life insurance
 4 : real estate                              

$`alter = age`
     

$`weitkred = other_installment_plans`
           
 1 : bank  
 2 : stores
 3 : none  

$`wohn = housing`
             
 1 : for free
 2 : rent    
 3 : own     

$`bishkred = number_credits`
         
 1 : 1   
 2 : 2-3 
 3 : 4-5 
 4 : >= 6

$`beruf = job`
                                               
 1 : unemployed/unskilled - non-resident       
 2 : unskilled - resident                      
 3 : skilled employee/official                 
 4 : manager/self-empl./highly qualif. employee

$`pers = people_liable`
              
 1 : 3 or more
 2 : 0 to 2   

$`telef = telephone`
                              
 1 : no                       
 2 : yes (under customer name)

$`gastarb = foreign_worker`
        
 1 : yes
 2 : no 

$`kredit = credit_risk`
         
 0 : bad 
 1 : good


### Synthetic Data Generation 

Synthetic data generation is an effective tool when studying model bias. Data can be biased towards or against specific groups or sub-groups. By generating biased data we can observe how our AI models behave towards a specific group or sub-group. One such sub-group we can examine is foreign workers. Because the model under inspection labels its predictions as 0 (bad credit) or 1 (good credit), we can artificially lower the creditworthiness of foreign workers relative to other groups through synthetic data generation. 

For the data generation we will be using the Synthetic Data Vault (https://sdv.dev/SDV/), or SDV for short. SDV is a Python library that allows you to generate synthetic data that is consistent with the statistical properties of the original dataset.

In [None]:
#Parameters for generated data
#How many synthetic rows to generate (1000 matches the size of the original data, see the concat below)
Rows = 1000

#Increase loan request amount (multiplier)
LoanMod = 1.2

#Decrease savings (decrease by integer). Savings are measured in value from 1 (worst) - 5 (best)
SavingsMod = 0

#Decrease credit history value (decrease by integer). Credit history is measured in value from 0 (worst) - 4 (best)
CreditHistoryMod = 1

#Select all foreign workers who were declined (foreign_worker value 1, credit_risk value 0)
ForeignWorkerData = training_data.loc[
    (training_data['foreign_worker'] == 1) & (training_data['credit_risk'] == 0)
]
if ForeignWorkerData.empty:
    #Fail early with a clear message instead of an obscure error inside the model fit
    raise ValueError("No declined foreign workers found in training_data; cannot fit the synthetic data model")

#Fit Foreign Worker data to SDV model (ignore warning)
model = GaussianCopula()
model.fit(ForeignWorkerData)

#Generate Synthetic foreign worker data based on rows stated
SynthForeignWorkers = model.sample(Rows)

#Apply Loan Modifier (vectorised multiplication)
SynthForeignWorkers['amount'] = SynthForeignWorkers['amount'] * LoanMod

#Apply Savings Modifier: subtract, then clamp at the lowest valid code (1) so that
#any modifier size keeps the value inside the documented 1-5 range
SynthForeignWorkers['savings'] = (SynthForeignWorkers['savings'] - SavingsMod).clip(lower=1)

#Apply Credit History Modifier: subtract, then clamp at the lowest valid code (0)
SynthForeignWorkers['credit_history'] = (
    SynthForeignWorkers['credit_history'] - CreditHistoryMod
).clip(lower=0)

#Create new dataset with Synthetic Foreign workers (1000) + Original South German Credit Risk (1000)
#ignore_index=True gives the combined frame a fresh 0..n-1 index instead of
#repeating the row labels of both inputs
frames = [training_data, SynthForeignWorkers]
BiasedData1 = pd.concat(frames, ignore_index=True)
BiasedData1.info()

In [None]:
#Store dataframe for use in Notebook 2: the %store magic persists BiasedData1 so
#that another notebook can reload it with `%store -r BiasedData1`
%store BiasedData1

### Data inspection

In [None]:
#Preview the top 5 rows of the biased dataset
BiasedData1.head(n=5)

In [None]:
#Pie Charts for good credit/bad credit labels for Foreign Workers vs. Non-Foreign Workers

#Imports
import matplotlib.pyplot as plt
import seaborn as sns

#Wedge label and colour for each credit_risk code (0 : bad, 1 : good)
CREDIT_RISK_STYLE = {0: ("Bad Credit", "red"), 1: ("Good Credit", "green")}


def plot_credit_risk_pie(frame, title):
    """Draw a pie chart of the bad/good credit split of the rows in ``frame``.

    Parameters
    ----------
    frame : pandas.DataFrame
        Rows to summarise; must contain a ``credit_risk`` column coded 0/1.
    title : str
        Title shown above the chart.

    Rows are counted per ``credit_risk`` value. Labels and colours are looked
    up per code, so a group that contains only one class still gets the right
    colour and name.
    """
    counts = frame['credit_risk'].value_counts().sort_index()
    labels = [CREDIT_RISK_STYLE[code][0] for code in counts.index]
    colors = [CREDIT_RISK_STYLE[code][1] for code in counts.index]
    fig, ax = plt.subplots()
    ax.pie(counts.values, labels=labels, colors=colors, autopct='%1.1f%%')
    ax.set_title(title)
    plt.show()


#Select all Non-Foreign workers in the dataset
BiasedDataNonForeign = BiasedData1[BiasedData1['foreign_worker'] == 2]
#Select all Foreign workers in the dataset
BiasedDataForeign = BiasedData1[BiasedData1['foreign_worker'] == 1]

#Pie Chart for Non-Foreign workers good credit/bad credit
plot_credit_risk_pie(BiasedDataNonForeign, 'Credit Risk Ratio for Non-Foreign Workers')
#Pie Chart for Foreign workers good credit/bad credit
plot_credit_risk_pie(BiasedDataForeign, 'Credit Risk Ratio for Foreign Workers')

In [None]:
#Check how many Foreign workers there are vs. Non-Foreign Workers
#Count once, and use .get(..., 0) so that a category with no rows prints 0 instead of raising KeyError
worker_counts = BiasedData1['foreign_worker'].value_counts()
print("Number of Foreign Workers: ", worker_counts.get(1, 0))
print("Number of Non-Foreign Workers: ", worker_counts.get(2, 0))

In [None]:
#Savings distribution of dataset
plt.figure(figsize=(8, 8))
plt.title('Biased Data Savings')
#The column is passed as keyword `x`: seaborn 0.12+ accepts only `data` positionally,
#so the former positional 'savings' argument raises a TypeError there
sns.countplot(x='savings', data=BiasedData1)
plt.show()

In [None]:
#Credit History distribution of dataset
plt.figure(figsize=(8, 8))
plt.title('Biased Data Credit History')
#The column is passed as keyword `x`: seaborn 0.12+ accepts only `data` positionally,
#so the former positional 'credit_history' argument raises a TypeError there
sns.countplot(x='credit_history', data=BiasedData1)
plt.show()

### After ensuring every cell in this notebook has run correctly, you can now make your way over to the Notebook named "3-Single-AMT.ipynb" on the left menu