In [1]:
import pandas as pd

In [None]:
url = 'https://s3.amazonaws.com/cfpb-hmda-public/prod/three-year-data/2019/2019_public_lar_three_year_csv.zip'
# See information about data fields of the original dataset here:
# https://ffiec.cfpb.gov/documentation/publications/loan-level-datasets/lar-data-fields

# Number of raw CSV rows to read. The row filter below is applied AFTER reading,
# so the resulting frame usually holds fewer rows than this.
num_rows_to_read = 700000

# Row filters (applied to whole columns after loading):
#   condition  -> keep rows where 'loan_purpose' == 1
#   condition2 -> keep rows where 'business_or_commercial_purpose' == 2
# Per the field documentation linked above, loan_purpose 1 is "home purchase" and
# business_or_commercial_purpose 2 is "not primarily for a business or commercial
# purpose" -- TODO confirm the codes against the documentation if the source file changes.
condition = lambda x: x == 1
condition2 = lambda x: x == 2

# List of column names we want to keep in the resulting frame:
columns_to_read = [
    'loan_type',
    'loan_amount',
    'action_taken',
    'occupancy_type',
    'census_tract',
    'applicant_ethnicity_1',
    'co_applicant_ethnicity_1',
    'applicant_race_1',
    'co_applicant_race_1',
    'applicant_sex',
    'co_applicant_sex',
    'applicant_age',
    'co_applicant_age',
    'income',
    'lien_status',
    'applicant_credit_score_type',
    'co_applicant_credit_score_type',
    'origination_charges',
    'interest_rate',
    'debt_to_income_ratio',
    'combined_loan_to_value_ratio',
    'loan_term',
    'property_value',
    'manufactured_home_secured_property_type',
    'total_units',
    'aus_1',
    'reverse_mortgage',
    'open_end_line_of_credit',
    'manufactured_home_land_property_interest',
    'total_loan_costs',
    'negative_amortization',
    'interest_only_payment',
    'balloon_payment',
    'other_nonamortizing_features',

    'co_applicant_race_2',  # Need those for the EDA in '3', remove later.
    'applicant_race_2',  # Need those for the EDA in '3', remove later.
]

# Columns that are only needed to apply the row filters; dropped again afterwards.
filter_columns = ['loan_purpose', 'business_or_commercial_purpose']

# Read the CSV file with the wanted columns plus the filter columns.
# NOTE: the previous version passed `condition`/`condition2` as `converters` for the
# two filter columns while leaving those columns out of `usecols`. Converters only
# transform cell values (they cannot drop rows), so no filtering ever took place.
raw_data = pd.read_csv(
    url,
    usecols=columns_to_read + filter_columns,
    nrows=num_rows_to_read,
    low_memory=False,
)

# Coerce to numeric so the comparison also works if a code column was parsed as text.
loan_purpose_codes = pd.to_numeric(raw_data['loan_purpose'], errors='coerce')
business_purpose_codes = pd.to_numeric(raw_data['business_or_commercial_purpose'], errors='coerce')
keep_rows = condition(loan_purpose_codes) & condition2(business_purpose_codes)

# Keep only matching rows, remove the helper columns (so `data` has exactly the
# columns listed in `columns_to_read`, in file order) and restore a clean 0..n-1 index.
data = raw_data.loc[keep_rows].drop(columns=filter_columns).reset_index(drop=True)

In [None]:
# Preview the first rows of the loaded frame as a quick sanity check.
data.head()

In [None]:
# Show column dtypes and non-null counts of the loaded frame.
data.info()

## Check that the target variable (`action_taken`) has enough data in each category

In [None]:
# We want enough rows in categories 1 and 3 of the target column,
# with at least 30% of them in category 3.
data['action_taken'].value_counts()

In [None]:
# Work on a copy so the frame loaded above stays untouched by later steps.
df_test = data.copy()

In [None]:
# Create a Boolean mask to identify rows with missing values in 'object' columns
missing_values_mask = df_test.select_dtypes(include=['object']).isna().any(axis=1)

# Create a Boolean mask to identify rows without missing values in 'object' columns or with 'action_taken' equal to 3
no_missing_values_mask = ~missing_values_mask | (df_test['action_taken'] == 3)

# Use the mask to filter the DataFrame and get rows without missing values in 'object' columns, or with 'action_taken' equal to 3
rows_without_missing_values = df_test[no_missing_values_mask]

# Print the number of rows without missing values in 'object' columns or with 'action_taken' equal to 3
print(len(rows_without_missing_values))


In [None]:
# Target distribution of the loaded frame `data` (before the missing-value filter), for comparison.
data['action_taken'].value_counts()

In [None]:
# Column dtypes and non-null counts after the missing-value filter.
rows_without_missing_values.info()

In [None]:
# Preview the first rows of the filtered frame.
rows_without_missing_values.head()

In [None]:
# Names of all text-typed ('object' dtype) columns in the filtered frame.
object_frame = rows_without_missing_values.select_dtypes(include=['object'])
categorical_columns = list(object_frame.columns)
categorical_columns

In [None]:
# Names of all 'float64' / 'int64' columns in the filtered frame.
numeric_frame = rows_without_missing_values.select_dtypes(include=['float64', 'int64'])
numerical_columns = list(numeric_frame.columns)
numerical_columns

In [None]:
# Target distribution after the missing-value filter.
rows_without_missing_values['action_taken'].value_counts()

In [None]:
# Filter rows to keep only categories 1 and 3
df = rows_without_missing_values[rows_without_missing_values['action_taken'].isin([1, 3])]

# Calculate the counts for each category using len
category_1_count = len(df[df['action_taken'] == 1])
category_3_count = len(df[df['action_taken'] == 3])

# Ensure that counts are the same for categories 1 and 3
if category_1_count != category_3_count:
    if category_1_count > category_3_count:
        # Randomly sample rows from category 1 to match the count of category 3
        category_1_rows = df[df['action_taken'] == 1]
        sampled_rows = category_1_rows.sample(n=category_3_count, random_state=1)
        df = pd.concat([df[df['action_taken'] == 3], sampled_rows])
    elif category_3_count > category_1_count:
        # Randomly sample rows from category 3 to match the count of category 1
        category_3_rows = df[df['action_taken'] == 3]
        sampled_rows = category_3_rows.sample(n=category_1_count, random_state=1)
        df = pd.concat([df[df['action_taken'] == 1], sampled_rows])
# Now, df contains the same number of values in categories 1 and 3
df['action_taken'].value_counts()


In [None]:
# Column dtypes and non-null counts of the balanced frame.
df.info()

In [None]:
# Write the balanced frame to a CSV file in the working directory; the index is not written.
df.to_csv('1_downloaded_data.csv', index=False)