# Imports

In [1]:
import pandas as pd

# Data Source

In [2]:
url = 'https://s3.amazonaws.com/cfpb-hmda-public/prod/three-year-data/2019/2019_public_lar_three_year_csv.zip'
# See information about data fields of the original dataset here:
# https://ffiec.cfpb.gov/documentation/publications/loan-level-datasets/lar-data-fields

# How many rows
- there are potentionally millions.
- you can not load all! the computer will die.

In [3]:
num_rows_to_read = 1000000

# Conditions - which columns we want

In [4]:
# 'loan_purpose': condition, 'business or commercial_purpose, condition two is home owners.
condition = lambda x: x == 1
condition2 = lambda x: x == 2

# List of column names we want to read:
columns_to_read = [
'loan_type',
'loan_amount',
'action_taken',
'occupancy_type',
'applicant_ethnicity_1',
'co_applicant_ethnicity_1',
'applicant_race_1',
'co_applicant_race_1',
'applicant_sex',
'co_applicant_sex',
'applicant_age',
'co_applicant_age',
'income',
'lien_status',
'applicant_credit_score_type',
'co_applicant_credit_score_type',
'origination_charges',
'interest_rate',
'debt_to_income_ratio',
'combined_loan_to_value_ratio',
'loan_term',
'property_value',
'manufactured_home_secured_property_type',
'total_units',
'open_end_line_of_credit',
'manufactured_home_land_property_interest',
'total_loan_costs',
'negative_amortization',
'interest_only_payment',
'balloon_payment',
'other_nonamortizing_features',
    
'co_applicant_race_2', # Need those for the EDA in '3', remove later.
'applicant_race_2'  # Need those for the EDA in '3', remove later.
]


In [5]:
# Read the CSV file with the specified columns
data = pd.read_csv(url, usecols=columns_to_read, nrows=num_rows_to_read, converters={'loan_purpose': condition, 'business_or_commercial_purpose': condition2}, low_memory=False)

In [6]:
data.head()

Unnamed: 0,action_taken,loan_type,lien_status,open_end_line_of_credit,loan_amount,combined_loan_to_value_ratio,interest_rate,total_loan_costs,origination_charges,loan_term,...,applicant_ethnicity_1,co_applicant_ethnicity_1,applicant_race_1,applicant_race_2,co_applicant_race_1,co_applicant_race_2,applicant_sex,co_applicant_sex,applicant_age,co_applicant_age
0,3,2,1,2,115000,98.188,,,,360,...,2.0,5.0,3.0,,8.0,,1,5,35-44,9999
1,2,1,1,2,345000,79.254,3.5,,,180,...,2.0,2.0,6.0,,6.0,,1,2,45-54,55-64
2,5,2,1,2,225000,,,,,360,...,1.0,1.0,5.0,,5.0,,1,2,45-54,45-54
3,5,2,1,2,125000,,,,,360,...,2.0,5.0,5.0,,8.0,,2,5,65-74,9999
4,5,1,1,2,125000,,,,,360,...,2.0,5.0,5.0,,8.0,,2,5,55-64,9999


In [7]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 33 columns):
 #   Column                                    Non-Null Count    Dtype  
---  ------                                    --------------    -----  
 0   action_taken                              1000000 non-null  int64  
 1   loan_type                                 1000000 non-null  int64  
 2   lien_status                               1000000 non-null  int64  
 3   open_end_line_of_credit                   1000000 non-null  int64  
 4   loan_amount                               1000000 non-null  int64  
 5   combined_loan_to_value_ratio              534599 non-null   object 
 6   interest_rate                             736975 non-null   object 
 7   total_loan_costs                          702224 non-null   object 
 8   origination_charges                       716826 non-null   object 
 9   loan_term                                 999137 non-null   object 
 10  negativ

## Target Variable.

the target variable: action_taken.

- 1 - Loan originated
- 2 - Application approved but not accepted
- 3 - Application denied
- 4 - Application withdrawn by applicant
- 5 - File closed for incompleteness
- 6 - Purchased loan
- 7 - Preapproval request denied
- 8 - Preapproval request approved but not accepted

# we want enough data in the target column in the categories 1 and 3. at least 30% in category 3.
data['action_taken'].value_counts()

In [8]:
# Copy of the data for safety
df_test = data.copy()

## How many rows have missing values when target variable is 'declined'

In [9]:
# Create a Boolean mask to identify rows with missing values in 'object' columns
missing_values_mask = df_test.select_dtypes(include=['object']).isna().any(axis=1)
print(missing_values_mask)

0          True
1          True
2          True
3          True
4          True
          ...  
999995     True
999996    False
999997     True
999998    False
999999     True
Length: 1000000, dtype: bool


### ['action_taken'] == 3

When the target variable category is 3, then the loan is declined.(1 is accepted)
Many rows had missing values in many places for those rows. This makes predictions harder. As we have a lot of data, we can remove rows with missing values where ['action_taken'] == 3

# Pick rows with no missing values when target variable is 3

In [10]:
# Create a Boolean mask to identify rows without missing values in 'object' columns or with 'action_taken' equal to 3
no_missing_values_mask = ~missing_values_mask | (df_test['action_taken'] == 3)

# Use the mask to filter the DataFrame and get rows without missing values in 'object' columns, or with 'action_taken' equal to 3
rows_without_missing_values = df_test[no_missing_values_mask]

# Print the number of rows without missing values in 'object' columns or with 'action_taken' equal to 3
print(len(rows_without_missing_values))

506692


In [11]:
rows_without_missing_values.info()

<class 'pandas.core.frame.DataFrame'>
Index: 506692 entries, 0 to 999999
Data columns (total 33 columns):
 #   Column                                    Non-Null Count   Dtype  
---  ------                                    --------------   -----  
 0   action_taken                              506692 non-null  int64  
 1   loan_type                                 506692 non-null  int64  
 2   lien_status                               506692 non-null  int64  
 3   open_end_line_of_credit                   506692 non-null  int64  
 4   loan_amount                               506692 non-null  int64  
 5   combined_loan_to_value_ratio              500164 non-null  object 
 6   interest_rate                             397324 non-null  object 
 7   total_loan_costs                          397324 non-null  object 
 8   origination_charges                       397324 non-null  object 
 9   loan_term                                 506494 non-null  object 
 10  negative_amortization    

In [12]:
rows_without_missing_values.head()

Unnamed: 0,action_taken,loan_type,lien_status,open_end_line_of_credit,loan_amount,combined_loan_to_value_ratio,interest_rate,total_loan_costs,origination_charges,loan_term,...,applicant_ethnicity_1,co_applicant_ethnicity_1,applicant_race_1,applicant_race_2,co_applicant_race_1,co_applicant_race_2,applicant_sex,co_applicant_sex,applicant_age,co_applicant_age
0,3,2,1,2,115000,98.188,,,,360,...,2.0,5.0,3.0,,8.0,,1,5,35-44,9999
7,1,1,1,2,355000,75.822,4.25,8352.71,4961.44,360,...,2.0,2.0,5.0,,5.0,,1,2,35-44,>74
8,1,1,1,2,265000,77.612,3.5,1983.0,0.0,360,...,2.0,5.0,5.0,,8.0,,2,5,45-54,9999
13,1,2,1,2,285000,84.777,4.125,6854.68,0.0,360,...,2.0,2.0,5.0,,5.0,,1,2,35-44,35-44
16,1,1,1,2,295000,95.0,3.5,4700.6,1630.2,360,...,2.0,5.0,5.0,,8.0,,2,5,35-44,9999


# Categorical columns

In [13]:
categorical_columns = rows_without_missing_values.select_dtypes(include=['object']).columns.tolist()
categorical_columns

['combined_loan_to_value_ratio',
 'interest_rate',
 'total_loan_costs',
 'origination_charges',
 'loan_term',
 'property_value',
 'total_units',
 'debt_to_income_ratio',
 'applicant_age',
 'co_applicant_age']

# Numerical columns

In [14]:
numerical_columns = rows_without_missing_values.select_dtypes(include=['float64', 'int64']).columns.tolist()
numerical_columns

['action_taken',
 'loan_type',
 'lien_status',
 'open_end_line_of_credit',
 'loan_amount',
 'negative_amortization',
 'interest_only_payment',
 'balloon_payment',
 'other_nonamortizing_features',
 'occupancy_type',
 'manufactured_home_secured_property_type',
 'manufactured_home_land_property_interest',
 'income',
 'applicant_credit_score_type',
 'co_applicant_credit_score_type',
 'applicant_ethnicity_1',
 'co_applicant_ethnicity_1',
 'applicant_race_1',
 'applicant_race_2',
 'co_applicant_race_1',
 'co_applicant_race_2',
 'applicant_sex',
 'co_applicant_sex']

In [15]:
rows_without_missing_values['action_taken'].value_counts()

action_taken
1    393274
3    109885
6      2851
4       423
2       132
5        52
8        52
7        23
Name: count, dtype: int64

# Filter all rows that do not have the target variable as either 1 or 3.

In [16]:
# Filter rows to keep only categories 1 and 3
df = rows_without_missing_values[rows_without_missing_values['action_taken'].isin([1, 3])]

# Calculate the counts for each category using len
category_1_count = len(df[df['action_taken'] == 1])
category_3_count = len(df[df['action_taken'] == 3])

# Ensure that counts are the same for categories 1 and 3
if category_1_count != category_3_count:
    if category_1_count > category_3_count:
        # Randomly sample rows from category 1 to match the count of category 3
        category_1_rows = df[df['action_taken'] == 1]
        sampled_rows = category_1_rows.sample(n=category_3_count, random_state=1)
        df = pd.concat([df[df['action_taken'] == 3], sampled_rows])
    elif category_3_count > category_1_count:
        # Randomly sample rows from category 3 to match the count of category 1
        category_3_rows = df[df['action_taken'] == 3]
        sampled_rows = category_3_rows.sample(n=category_1_count, random_state=1)
        df = pd.concat([df[df['action_taken'] == 1], sampled_rows])
# Now, df contains the same number of values in categories 1 and 3
df['action_taken'].value_counts()


action_taken
3    109885
1    109885
Name: count, dtype: int64

In [17]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 219770 entries, 0 to 16536
Data columns (total 33 columns):
 #   Column                                    Non-Null Count   Dtype  
---  ------                                    --------------   -----  
 0   action_taken                              219770 non-null  int64  
 1   loan_type                                 219770 non-null  int64  
 2   lien_status                               219770 non-null  int64  
 3   open_end_line_of_credit                   219770 non-null  int64  
 4   loan_amount                               219770 non-null  int64  
 5   combined_loan_to_value_ratio              213242 non-null  object 
 6   interest_rate                             110402 non-null  object 
 7   total_loan_costs                          110402 non-null  object 
 8   origination_charges                       110402 non-null  object 
 9   loan_term                                 219572 non-null  object 
 10  negative_amortization     

In [18]:
df.to_csv('1_downloaded_data.csv', index=False)