In [None]:
!pip install pyreadr

In [None]:
from pathlib import Path

import pyreadr

## Helper functions

In [None]:
def convert_nonzero_to_one(feature, data=None):
    """Binarize a column in place: non-zero values become 1, zeros are kept.

    Parameters
    ----------
    feature : str
        Name of the column to binarize.
    data : pandas.DataFrame, optional
        Frame to modify. When omitted, the notebook-level ``df`` is used, so
        existing ``convert_nonzero_to_one(feature)`` calls keep working.

    Returns
    -------
    pandas.DataFrame
        The same frame object that was modified (not a copy).
    """
    if data is None:
        # Backward-compatible fallback to the notebook's global frame.
        data = df
    # Each value is tested with ``!= 0``; anything not equal to 0 is replaced by 1.
    data[feature] = data[feature].apply(lambda value: 1 if value != 0 else value)
    return data

In [None]:
def balance_variable(df, features, random_state=None):
    """Down-sample ``df`` so every combination of ``features`` has equally many rows.

    Rows are grouped by the string-joined values of the given columns, and each
    group is randomly sampled (without replacement) down to the size of the
    smallest group. The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to balance.
    features : sequence of str
        One or more column names whose value combinations define the groups.
    random_state : int or numpy.random.RandomState, optional
        Seed forwarded to the sampler; pass a value for reproducible results.

    Returns
    -------
    pandas.DataFrame
        Sampled rows with their original index labels and the same columns as
        ``df``.
    """
    features = list(features)

    if df.empty:
        # No groups exist, so there is no minimum count to sample down to.
        return df.copy()

    # Build the combined key as a standalone Series instead of a temporary
    # column, so the caller's frame never gains a 'group_key' column.
    group_key = df[features[0]].astype(str)
    for column in features[1:]:
        group_key = group_key + '_' + df[column].astype(str)

    # Size of the smallest group: every group is sampled down to this many rows.
    min_count = int(group_key.value_counts().min())

    # GroupBy.sample draws per group and returns all original columns, which
    # avoids groupby.apply operating on (and later dropping) a grouping column.
    return df.groupby(group_key, group_keys=False).sample(
        n=min_count, random_state=random_state
    )

In [None]:
def drop_features(drop_features, data=None):
    """Return a copy of a frame without the given columns.

    Parameters
    ----------
    drop_features : str or list of str
        Column label(s) to remove.
    data : pandas.DataFrame, optional
        Frame to drop the columns from. When omitted, the notebook-level ``df``
        is used, so existing ``drop_features(cols)`` calls keep working.

    Returns
    -------
    pandas.DataFrame
        New frame without the listed columns; the input frame is unchanged.
    """
    if data is None:
        # Backward-compatible fallback to the notebook's global frame.
        data = df
    return data.drop(columns=drop_features)

## Upload dataset

In [None]:
dataset_name = "norauto"

# Read the .rda file that sits next to the notebook; pyreadr returns a
# mapping from R object names to dataframes.
rda_path = f"{dataset_name}.rda"
result = pyreadr.read_r(rda_path)

# The object stored in the file is looked up under the dataset's own name.
df = result[dataset_name]

# Show the first rows as a quick sanity check.
df.head()

In [None]:
# Column used as the outcome/target in the cells below.
y_label = "ClaimAmount"
# Columns removed from the dataframe during preprocessing.
features_to_drop = ["NbClaim"]

In [None]:
# True if at least one cell anywhere in the dataframe is missing.
df.isna().to_numpy().any()

## Preprocessing

In [None]:
# Fixed seed so the shuffle (and therefore which rows are discarded) is the
# same on every Restart & Run All.
SHUFFLE_SEED = 42
# Number of shuffled rows removed from the dataset.
N_ROWS_TO_DROP = 700

# NOTE: this cell reassigns `df`, so re-running it on its own removes another
# N_ROWS_TO_DROP rows; reload the dataset first if a re-run is needed.
df = df.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)  # Shuffle the DataFrame
df = df.iloc[N_ROWS_TO_DROP:].reset_index(drop=True)  # Drop the first N_ROWS_TO_DROP rows after shuffling

In [None]:
# Binarize the target column: any non-zero value becomes 1, zeros are kept.
df = convert_nonzero_to_one(y_label)
# Show how many rows fall into each resulting value.
df[y_label].value_counts()

In [None]:
# Remove the columns listed in features_to_drop.
df = drop_features(features_to_drop)

In [None]:
# Inspect the column dtypes after preprocessing.
df.dtypes

In [None]:
positive_outcome = 1  # Assuming 1 represents the positive outcome

# Number of rows for each value of the 'Male' indicator.
total_counts = df['Male'].value_counts().sort_index()

# Rows with the positive outcome per group. Reindexing on the full set of
# groups makes a group with no positive rows count as 0 instead of raising a
# KeyError / producing NaN below.
positive_counts = (
    df.loc[df[y_label] == positive_outcome, 'Male']
    .value_counts()
    .reindex(total_counts.index, fill_value=0)
)

# Share of positive outcomes within each group.
base_rates = positive_counts / total_counts

# Display results
for gender, rate in base_rates.items():
    # The column is named 'Male', so a value of 1 is taken to mean male.
    # NOTE(review): confirm the coding against the dataset documentation.
    gender_label = 'Male' if gender == 1 else 'Female'
    print(f"Gender: {gender_label}")
    print(f"  Positive outcome ({y_label} = {positive_outcome}) rate: {rate:.2%}")
    print(f"  Total count: {total_counts[gender]}")
    print(f"  Positive count: {positive_counts[gender]}")
    print()

## Save as a CSV

In [None]:
# The CSV is written into a folder named after the dataset; create it first so
# the export does not fail on a fresh checkout where the folder is missing.
output_dir = Path(dataset_name)
output_dir.mkdir(parents=True, exist_ok=True)

# The index is still written as the first column, as before.
df.to_csv(output_dir / f"{dataset_name}.csv")