In [1]:
import pandas as pd

In [2]:
# Source file: zipped CSV of the 2019 HMDA public LAR ("three-year-data" release).
url = 'https://s3.amazonaws.com/cfpb-hmda-public/prod/three-year-data/2019/2019_public_lar_three_year_csv.zip'
# See information about data fields of the original dataset here:
# https://ffiec.cfpb.gov/documentation/publications/loan-level-datasets/lar-data-fields

# Only the first `num_rows_to_read` data rows of the file are loaded (not a random sample).
num_rows_to_read = 700000

# Predicates originally meant for 'loan_purpose' (== 1) and
# 'business_or_commercial_purpose' (== 2).
# NOTE(review): these used to be passed to `pd.read_csv(converters=...)`, but they never ran:
# neither column is listed in `columns_to_read`, and pandas only applies a converter to a
# column it actually parses. Even with the columns loaded they would not filter anything:
# a converter receives each raw cell as a string (so `x == 1` is False for every row) and it
# rewrites values instead of dropping rows. The no-op `converters` argument has therefore been
# removed; the loaded data is unchanged. To really filter, add both columns to
# `columns_to_read` and apply a boolean mask after loading.
# The two names are kept defined only in case other cells still reference them.
condition = lambda x: x == 1
condition2 = lambda x: x == 2

# List of column names we want to read:
columns_to_read = [
    'loan_type',
    'loan_amount',
    'action_taken',
    'occupancy_type',
    'census_tract',
    'applicant_ethnicity_1',
    'co_applicant_ethnicity_1',
    'applicant_race_1',
    'co_applicant_race_1',
    'applicant_sex',
    'co_applicant_sex',
    'applicant_age',
    'co_applicant_age',
    'income',
    'lien_status',
    'applicant_credit_score_type',
    'co_applicant_credit_score_type',
    'origination_charges',
    'interest_rate',
    'debt_to_income_ratio',
    'combined_loan_to_value_ratio',
    'loan_term',
    'property_value',
    'manufactured_home_secured_property_type',
    'total_units',
    'aus_1',
    'reverse_mortgage',
    'open_end_line_of_credit',
    'manufactured_home_land_property_interest',
    'total_loan_costs',
    'negative_amortization',
    'interest_only_payment',
    'balloon_payment',
    'other_nonamortizing_features',

    'co_applicant_race_2',  # Need those for the EDA in '3', remove later.
    'applicant_race_2',  # Need those for the EDA in '3', remove later.
]

# Read only the selected columns of the first `num_rows_to_read` rows.
# low_memory=False makes pandas infer each column's dtype from the whole column at once
# instead of chunk by chunk.
data = pd.read_csv(
    url,
    usecols=columns_to_read,
    nrows=num_rows_to_read,
    low_memory=False,
)

In [3]:
# Preview the first rows of the loaded frame.
data.head()

Unnamed: 0,census_tract,action_taken,loan_type,lien_status,reverse_mortgage,open_end_line_of_credit,loan_amount,combined_loan_to_value_ratio,interest_rate,total_loan_costs,...,co_applicant_ethnicity_1,applicant_race_1,applicant_race_2,co_applicant_race_1,co_applicant_race_2,applicant_sex,co_applicant_sex,applicant_age,co_applicant_age,aus_1
0,9003416000.0,3,2,1,2,2,115000,98.188,,,...,5.0,3.0,,8.0,,1,5,35-44,9999,1
1,42017100000.0,2,1,1,2,2,345000,79.254,3.5,,...,2.0,6.0,,6.0,,1,2,45-54,55-64,1
2,34039030000.0,5,2,1,2,2,225000,,,,...,1.0,5.0,,5.0,,1,2,45-54,45-54,1
3,42055010000.0,5,2,1,2,2,125000,,,,...,5.0,5.0,,8.0,,2,5,65-74,9999,1
4,30111000000.0,5,1,1,2,2,125000,,,,...,5.0,5.0,,8.0,,2,5,55-64,9999,1


In [4]:
# Per-column non-null counts and dtypes of the loaded frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 700000 entries, 0 to 699999
Data columns (total 36 columns):
 #   Column                                    Non-Null Count   Dtype  
---  ------                                    --------------   -----  
 0   census_tract                              698778 non-null  float64
 1   action_taken                              700000 non-null  int64  
 2   loan_type                                 700000 non-null  int64  
 3   lien_status                               700000 non-null  int64  
 4   reverse_mortgage                          700000 non-null  int64  
 5   open_end_line_of_credit                   700000 non-null  int64  
 6   loan_amount                               700000 non-null  int64  
 7   combined_loan_to_value_ratio              377661 non-null  object 
 8   interest_rate                             492908 non-null  object 
 9   total_loan_costs                          473993 non-null  object 
 10  origination_charges 

## Check that the target variable has enough data

In [5]:
# Class counts of the target column. We want enough rows in categories 1 and 3,
# with category 3 making up at least 30%.
data['action_taken'].value_counts()

action_taken
1    297049
6    191999
4    118062
3     83560
5      5254
2      3167
8       856
7        53
Name: count, dtype: int64

In [6]:
# Work on a copy so the originally loaded `data` frame stays untouched.
df_test = data.copy()

In [7]:
# Look only at the columns pandas stored with 'object' dtype.
object_columns = df_test.select_dtypes(include=['object'])

# True for every row that has at least one NaN in any of those 'object' columns.
missing_values_mask = object_columns.isna().any(axis=1)

# Rows to keep: complete in all 'object' columns, OR action_taken == 3.
# Rows with action_taken == 3 are kept even when they DO have missing values,
# so the two names below describe a slightly broader set than they suggest.
is_action_3 = df_test['action_taken'] == 3
no_missing_values_mask = is_action_3 | ~missing_values_mask

# Apply the mask to obtain the retained rows.
rows_without_missing_values = df_test[no_missing_values_mask]

# Report how many rows were retained.
print(len(rows_without_missing_values))


360281


In [8]:
# NOTE(review): same expression as the earlier class-count cell. `data` is only copied
# (never modified) in between, so this repeats the unfiltered counts — presumably kept as a
# reference for comparison with the filtered counts further down.
data['action_taken'].value_counts()

action_taken
1    297049
6    191999
4    118062
3     83560
5      5254
2      3167
8       856
7        53
Name: count, dtype: int64

In [9]:
# Per-column non-null counts and dtypes after the missing-value filter.
rows_without_missing_values.info()

<class 'pandas.core.frame.DataFrame'>
Index: 360281 entries, 0 to 699986
Data columns (total 36 columns):
 #   Column                                    Non-Null Count   Dtype  
---  ------                                    --------------   -----  
 0   census_tract                              360137 non-null  float64
 1   action_taken                              360281 non-null  int64  
 2   loan_type                                 360281 non-null  int64  
 3   lien_status                               360281 non-null  int64  
 4   reverse_mortgage                          360281 non-null  int64  
 5   open_end_line_of_credit                   360281 non-null  int64  
 6   loan_amount                               360281 non-null  int64  
 7   combined_loan_to_value_ratio              358926 non-null  object 
 8   interest_rate                             276876 non-null  object 
 9   total_loan_costs                          276876 non-null  object 
 10  origination_charges      

In [10]:
# Preview the first retained rows (the original row index is preserved, so it has gaps).
rows_without_missing_values.head()

Unnamed: 0,census_tract,action_taken,loan_type,lien_status,reverse_mortgage,open_end_line_of_credit,loan_amount,combined_loan_to_value_ratio,interest_rate,total_loan_costs,...,co_applicant_ethnicity_1,applicant_race_1,applicant_race_2,co_applicant_race_1,co_applicant_race_2,applicant_sex,co_applicant_sex,applicant_age,co_applicant_age,aus_1
0,9003416000.0,3,2,1,2,2,115000,98.188,,,...,5.0,3.0,,8.0,,1,5,35-44,9999,1
7,42091200000.0,1,1,1,2,2,355000,75.822,4.25,8352.71,...,2.0,5.0,,5.0,,1,2,35-44,>74,1
8,6067006000.0,1,1,1,2,2,265000,77.612,3.5,1983.0,...,5.0,5.0,,8.0,,2,5,45-54,9999,6
13,55133200000.0,1,2,1,2,2,285000,84.777,4.125,6854.68,...,2.0,5.0,,5.0,,1,2,35-44,35-44,6
16,55133200000.0,1,1,1,2,2,295000,95.0,3.5,4700.6,...,5.0,5.0,,8.0,,2,5,35-44,9999,6


In [11]:
# Names of all columns stored with 'object' dtype.
# NOTE(review): despite the variable name, several of these (e.g. 'interest_rate',
# 'combined_loan_to_value_ratio') show numeric-looking values in the preview — presumably
# they are 'object' because of non-numeric markers in the raw file; verify before treating
# them as categorical.
object_dtype_frame = rows_without_missing_values.select_dtypes(include=['object'])
categorical_columns = list(object_dtype_frame.columns)
categorical_columns

['combined_loan_to_value_ratio',
 'interest_rate',
 'total_loan_costs',
 'origination_charges',
 'loan_term',
 'property_value',
 'total_units',
 'debt_to_income_ratio',
 'applicant_age',
 'co_applicant_age']

In [12]:
# Names of all columns stored as float64 or int64.
# Note that this list includes the target column 'action_taken'.
numeric_dtype_frame = rows_without_missing_values.select_dtypes(include=['float64', 'int64'])
numerical_columns = list(numeric_dtype_frame.columns)
numerical_columns

['census_tract',
 'action_taken',
 'loan_type',
 'lien_status',
 'reverse_mortgage',
 'open_end_line_of_credit',
 'loan_amount',
 'negative_amortization',
 'interest_only_payment',
 'balloon_payment',
 'other_nonamortizing_features',
 'occupancy_type',
 'manufactured_home_secured_property_type',
 'manufactured_home_land_property_interest',
 'income',
 'applicant_credit_score_type',
 'co_applicant_credit_score_type',
 'applicant_ethnicity_1',
 'co_applicant_ethnicity_1',
 'applicant_race_1',
 'applicant_race_2',
 'co_applicant_race_1',
 'co_applicant_race_2',
 'applicant_sex',
 'co_applicant_sex',
 'aus_1']

In [13]:
# Class counts of the target after the missing-value filter.
rows_without_missing_values['action_taken'].value_counts()

action_taken
1    275989
3     83560
6       441
4       205
2        49
5        19
8        14
7         4
Name: count, dtype: int64

In [14]:
# Restrict the data to the two target classes of interest (action_taken 1 and 3).
target = rows_without_missing_values['action_taken']
df = rows_without_missing_values[target.isin([1, 3])]

# Split once per class and record the class sizes.
category_1_rows = df[df['action_taken'] == 1]
category_3_rows = df[df['action_taken'] == 3]
category_1_count = len(category_1_rows)
category_3_count = len(category_3_rows)

# Balance the classes by downsampling the larger one to the size of the smaller one.
# The result is the full smaller class followed by the sampled rows of the larger class;
# random_state=1 keeps the sample reproducible. Nothing changes if the sizes already match.
if category_1_count != category_3_count:
    if category_1_count > category_3_count:
        smaller_class, larger_class = category_3_rows, category_1_rows
    else:
        smaller_class, larger_class = category_1_rows, category_3_rows
    sampled_rows = larger_class.sample(n=len(smaller_class), random_state=1)
    df = pd.concat([smaller_class, sampled_rows])

# df now holds the same number of rows for categories 1 and 3.
df['action_taken'].value_counts()


action_taken
3    83560
1    83560
Name: count, dtype: int64

In [15]:
# Per-column non-null counts and dtypes of the balanced frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 167120 entries, 0 to 343818
Data columns (total 36 columns):
 #   Column                                    Non-Null Count   Dtype  
---  ------                                    --------------   -----  
 0   census_tract                              167000 non-null  float64
 1   action_taken                              167120 non-null  int64  
 2   loan_type                                 167120 non-null  int64  
 3   lien_status                               167120 non-null  int64  
 4   reverse_mortgage                          167120 non-null  int64  
 5   open_end_line_of_credit                   167120 non-null  int64  
 6   loan_amount                               167120 non-null  int64  
 7   combined_loan_to_value_ratio              165765 non-null  object 
 8   interest_rate                             83715 non-null   object 
 9   total_loan_costs                          83715 non-null   object 
 10  origination_charges      

In [17]:
# Write the balanced frame to CSV; index=False leaves the (non-contiguous) row index out.
# Presumably consumed by later notebooks (the column list mentions an EDA in '3') — confirm.
df.to_csv('1_downloaded_data.csv', index=False)