In [1]:
!pip install pyreadr



In [2]:
import pyreadr

## Helper functions

In [3]:
def convert_nonzero_to_one(feature, data=None):
    """Binarize a column: every value that is not equal to 0 becomes 1.

    Parameters
    ----------
    feature : str
        Name of the column to binarize.
    data : pandas.DataFrame, optional
        Frame to transform. When given, it is left untouched and a modified
        copy is returned. When omitted, the notebook-level ``df`` is modified
        in place (the original behaviour), so existing one-argument calls
        keep working unchanged.

    Returns
    -------
    pandas.DataFrame
        The frame whose ``feature`` column has been binarized.

    Notes
    -----
    Missing values compare unequal to 0 and are therefore also turned into 1.
    """
    # Fall back to the notebook-global frame only when no frame is passed in.
    frame = df if data is None else data.copy()
    column = frame[feature]
    # Vectorized form of "1 if x != 0 else x": keep zeros, replace the rest.
    frame[feature] = column.where(column == 0, 1)
    return frame

In [4]:
def drop_features(drop_features, data=None):
    """Return a frame without the given columns.

    Parameters
    ----------
    drop_features : str or list of str
        Column label(s) to remove. (The parameter shares its name with the
        function; it is kept for backward compatibility with keyword calls.)
    data : pandas.DataFrame, optional
        Frame to drop the columns from. When omitted, the notebook-level
        ``df`` is used, which is the original behaviour.

    Returns
    -------
    pandas.DataFrame
        A new frame without the listed columns; the input frame is not
        modified.

    Raises
    ------
    KeyError
        If any of the listed columns is not present in the frame.
    """
    # Fall back to the notebook-global frame only when no frame is passed in.
    frame = df if data is None else data
    return frame.drop(columns=drop_features)

## Load dataset

In [5]:
# Base name shared by the input .rda file, the R object inside it, and the CSV export.
dataset_name = "credit"

# Read every object stored in "<dataset_name>.rda"; the result is indexed by object name.
result = pyreadr.read_r(f"{dataset_name}.rda")

# Pick the frame stored under the dataset name
# (assumes the R object is named like the file).
df = result[dataset_name]

# Preview the first rows.
df.head()

Unnamed: 0,checking_status,duration,credit_history,purpose,credit_amount,savings,employment,installment_rate,personal_status,other_parties,...,property_magnitude,age,other_payment_plans,housing,existing_credits,job,num_dependents,telephone,foreign_worker,class
0,A11,6,A34,A43,1169,A65,A75,4,A93,A101,...,A121,67,A143,A152,2,A173,1,A192,A201,0.0
1,A12,48,A32,A43,5951,A61,A73,2,A92,A101,...,A121,22,A143,A152,1,A173,1,A191,A201,1.0
2,A14,12,A34,A46,2096,A61,A74,2,A93,A101,...,A121,49,A143,A152,1,A172,2,A191,A201,0.0
3,A11,42,A32,A42,7882,A61,A74,2,A93,A103,...,A122,45,A143,A153,1,A173,2,A191,A201,0.0
4,A11,24,A33,A40,4870,A61,A73,3,A93,A101,...,A124,53,A143,A153,2,A173,2,A191,A201,1.0


In [6]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

Unnamed: 0,duration,credit_amount,installment_rate,residence_since,age,existing_credits,num_dependents,class
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,20.903,3271.258,2.973,2.845,35.546,1.407,1.155,0.3
std,12.058814,2822.736876,1.118715,1.103718,11.375469,0.577654,0.362086,0.458487
min,4.0,250.0,1.0,1.0,19.0,1.0,1.0,0.0
25%,12.0,1365.5,2.0,2.0,27.0,1.0,1.0,0.0
50%,18.0,2319.5,3.0,3.0,33.0,1.0,1.0,0.0
75%,24.0,3972.25,4.0,4.0,42.0,2.0,1.0,1.0
max,72.0,18424.0,4.0,4.0,75.0,4.0,2.0,1.0


In [32]:
# True if at least one cell in the frame is missing; False means no missing values.
df.isnull().values.any()

False

## Preprocessing

In [7]:
# Column holding the outcome label; it is binarized and used for the base rates.
y_label = "class"
# Columns that are removed from the frame during preprocessing.
features_to_drop = ["purpose", "duration", "installment_rate" ]

In [8]:
# Map each `personal_status` code to a binary gender flag.
# 1 is reported as "Female" and 0 as "Male" when the base rates are printed.
personal_status_to_gender = {
    'A91': 0,
    'A92': 1,
    'A93': 0,
    'A94': 0,
    'A95': 1
}

# Guard on the source column so the cell can be re-run safely: once
# `personal_status` has been replaced there is nothing left to do
# (an unconditional drop would raise KeyError on the second run).
if 'personal_status' in df.columns:
    gender = df['personal_status'].map(personal_status_to_gender)

    # Codes missing from the mapping come back as NaN; fail loudly instead of
    # silently carrying missing values into the Gender column.
    unmapped = gender.isna()
    if unmapped.any():
        unknown_codes = sorted(df.loc[unmapped, 'personal_status'].astype(str).unique())
        raise ValueError(f"Unmapped personal_status codes: {unknown_codes}")

    # Add the Gender column and remove the source column in one step,
    # producing a new frame rather than mutating the loaded one in place.
    df = df.assign(Gender=gender.astype('int64')).drop(columns=['personal_status'])

In [9]:
# Collapse the label column to 0/1: every non-zero label becomes 1.
df = convert_nonzero_to_one(y_label)
# Number of rows per label value after binarization.
df[y_label].value_counts()

class
0.0    700
1.0    300
Name: count, dtype: int64

In [10]:
# Remove the columns listed in features_to_drop and rebind df to the result.
# Not re-runnable as is: on a second run the columns are already gone and the drop fails.
df = drop_features(features_to_drop)

In [11]:
# Inspect the column dtypes of the preprocessed frame.
df.dtypes

checking_status        category
credit_history         category
credit_amount             int32
savings                category
employment             category
other_parties          category
residence_since           int32
property_magnitude     category
age                       int32
other_payment_plans    category
housing                category
existing_credits          int32
job                    category
num_dependents            int32
telephone              category
foreign_worker         category
class                   float64
Gender                    int64
dtype: object

In [12]:
# Value of the label column that is counted as the "positive" outcome.
positive_outcome = 1

# Number of rows per Gender value.
total_counts = df['Gender'].value_counts()

# Number of positive rows per Gender value. Re-index onto the groups found in
# total_counts so that a group without any positive row counts as 0 instead of
# being absent (which would give a NaN rate and a KeyError when printing).
positive_counts = (
    df.loc[df[y_label] == positive_outcome, 'Gender']
    .value_counts()
    .reindex(total_counts.index, fill_value=0)
)

# Base rate = share of positive rows within each gender group.
base_rates = positive_counts / total_counts

# Display results
for gender, rate in base_rates.items():
    gender_label = 'Female' if gender == 1 else 'Male'
    print(f"Gender: {gender_label}")
    print(f"  Positive outcome ({y_label} = {positive_outcome}) rate: {rate:.2%}")
    print(f"  Total count: {total_counts[gender]}")
    print(f"  Positive count: {positive_counts[gender]}")
    print()

Gender: Male
  Positive outcome (class = 1) rate: 27.68%
  Total count: 690
  Positive count: 191

Gender: Female
  Positive outcome (class = 1) rate: 35.16%
  Total count: 310
  Positive count: 109



## Save as a CSV

In [38]:
# Write the preprocessed frame to "<dataset_name>.csv" in the working directory.
# The row index is written as well (to_csv default), so it shows up as an
# unnamed first column when the file is read back.
df.to_csv(dataset_name + '.csv')