# Imports

In [1]:
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from itertools import product
import matplotlib.pyplot as plt

# load dataset 9

In [2]:
df = pd.read_csv('9_loan_term.csv', low_memory=False)

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54832 entries, 0 to 54831
Data columns (total 33 columns):
 #   Column                                    Non-Null Count  Dtype  
---  ------                                    --------------  -----  
 0   action_taken                              54832 non-null  int64  
 1   loan_type                                 54832 non-null  int64  
 2   lien_status                               54832 non-null  int64  
 3   open_end_line_of_credit                   54832 non-null  int64  
 4   loan_amount                               54832 non-null  int64  
 5   combined_loan_to_value_ratio              54832 non-null  object 
 6   interest_rate                             54832 non-null  object 
 7   total_loan_costs                          54832 non-null  object 
 8   origination_charges                       54832 non-null  object 
 9   loan_term                                 54832 non-null  object 
 10  negative_amortization             

In [4]:
# Select object (string) and category columns from your DataFrame
object_category_columns = df.select_dtypes(include=['object'])

# Print the names of the selected columns
print(object_category_columns.columns)

Index(['combined_loan_to_value_ratio', 'interest_rate', 'total_loan_costs',
       'origination_charges', 'loan_term', 'total_units',
       'debt_to_income_ratio'],
      dtype='object')


# Convert interest_rate to numerical:

In [5]:
print(df['interest_rate'].value_counts())

interest_rate
3.5-4.0     35596
4.0-4.5      7277
3.0-3.5      5905
4.5-5.0      2939
2.0-3.0      1582
5.0-6.0      1325
6.0-7.0       170
7.0-8.0        15
8.0-9.0        13
9.9-15.0        5
1.0-2.0         3
0.0-1.0         2
Name: count, dtype: int64


In [6]:
# Define a dictionary to map the interest rate categories to numerical values
interest_rate_mapping = {
    '0.0-1.0': 1,
    '1.0-2.0': 2,
    '2.0-3.0': 3,
    '3.0-3.5': 4,
    '3.5-4.0': 5,
    '4.0-4.5': 6,
    '4.5-5.0': 7,
    '5.0-6.0': 8,
    '6.0-7.0': 9,
    '7.0-8.0': 10,
    '8.0-9.0': 11
}

# Map the values to the 'interest_rate' column
df['interest_rate'] = df['interest_rate'].map(interest_rate_mapping)

In [7]:
print(df['interest_rate'].value_counts())

interest_rate
5.0     35596
6.0      7277
4.0      5905
7.0      2939
3.0      1582
8.0      1325
9.0       170
10.0       15
11.0       13
2.0         3
1.0         2
Name: count, dtype: int64


# Convert loan_term to numerical:

In [8]:
print(df['loan_term'].value_counts())

loan_term
27-31 years    42129
12-16 years     7872
17-21 years     3220
7-11 years      1154
22-26 years      290
2-6 years        159
0-1 years          5
37-41 years        2
1-2 years          1
Name: count, dtype: int64


In [9]:
# Define a dictionary to map the loan term categories to numerical values
loan_term_mapping = {
    '2-6 years': 1,
    '7-11 years': 2,
    '12-16 years': 3,
    '17-21 years': 4,
    '22-26 years': 5,
    '27-31 years': 6,
    '37-41 years': 7
}

# Map the values to the 'loan_term' column
df['loan_term'] = df['loan_term'].map(loan_term_mapping)

In [10]:
print(df['loan_term'].value_counts())

loan_term
6.0    42129
3.0     7872
4.0     3220
2.0     1154
5.0      290
1.0      159
7.0        2
Name: count, dtype: int64


# Convert debt_to_income_ratio to numerical:

In [11]:
print(df['debt_to_income_ratio'].value_counts())

debt_to_income_ratio
36%-50%     20445
20%-<30%    11873
30%-<36%     9669
<20%         4499
>60%         4460
50%-60%      3659
Name: count, dtype: int64


In [12]:
# Define a dictionary to map the debt to income ratio categories to numerical values
debt_to_income_mapping = {
    '<20%': 1,
    '20%-<30%': 2,
    '30%-<36%': 3,
    '36%-50%': 4,
    '50%-60%': 5,
    '>60%': 6
}

# Map the values to the 'debt_to_income_ratio' column
df['debt_to_income_ratio'] = df['debt_to_income_ratio'].map(debt_to_income_mapping)

In [13]:
print(df['debt_to_income_ratio'].value_counts())

debt_to_income_ratio
4.0    20445
2.0    11873
3.0     9669
1.0     4499
6.0     4460
5.0     3659
Name: count, dtype: int64


# Convert origination_charges to numerical:

In [14]:
print(df['origination_charges'].value_counts())

origination_charges
0-500        32956
1000-1500     8326
500-1000      5460
1500-2000     3032
2000-2500     1856
2500-3000     1333
3000-3500     1049
3500-4000      780
4000-4500       40
Name: count, dtype: int64


In [15]:
# Define a dictionary to map the origination_charges categories to numerical values
origination_charges_mapping = {
    '0-500': 1,
    '1000-1500': 2,
    '500-1000': 3,
    '1500-2000': 4,
    '2000-2500': 5,
    '2500-3000': 6,
    '3000-3500': 7,
    '3500-4000': 8
}

# Map the values to the 'origination_charges' column
df['origination_charges'] = df['origination_charges'].map(origination_charges_mapping)

In [16]:
print(df['origination_charges'].value_counts())

origination_charges
1.0    32956
2.0     8326
3.0     5460
4.0     3032
5.0     1856
6.0     1333
7.0     1049
8.0      780
Name: count, dtype: int64


# Convert total_loan_costs to numerical:

In [17]:
print(df['total_loan_costs'].value_counts())

total_loan_costs
0-500        25773
3000-3500     4176
2500-3000     4000
3500-4000     3346
2000-2500     2741
500-1000      2549
4000-4500     2532
4500-5000     1949
1500-2000     1566
5000-5500     1515
1000-1500     1138
5500-6000     1112
6000-6500      696
6500-7000      523
7000-7500      371
7500-8000      308
8000-8500      242
8500-9000      185
9000-9500      110
Name: count, dtype: int64


In [18]:
# Define a dictionary to map the total_loan_costs categories to numerical values
total_loan_costs_mapping = {
    '2500-3000': 1,
    '3000-3500': 2,
    '3500-4000': 3,
    '500-1000': 4,
    '2000-2500': 5,
    '4000-4500': 6,
    '0-500': 7,
    '4500-5000': 8,
    '1500-2000': 9,
    '5000-5500': 10,
    '1000-1500': 11,
    '5500-6000': 12,
    '6000-6500': 13,
    '6500-7000': 14,
    '7000-7500': 15,
    '7500-8000': 16,
    '8000-8500': 17,
    '8500-9000': 18
}

# Map the values to the 'origination_charges' column
df['total_loan_costs'] = df['total_loan_costs'].map(total_loan_costs_mapping)

In [19]:
print(df['total_loan_costs'].value_counts())

total_loan_costs
7.0     25773
2.0      4176
1.0      4000
3.0      3346
5.0      2741
4.0      2549
6.0      2532
8.0      1949
9.0      1566
10.0     1515
11.0     1138
12.0     1112
13.0      696
14.0      523
15.0      371
16.0      308
17.0      242
18.0      185
Name: count, dtype: int64


# Convert total_units to numerical:

In [20]:
print(df['total_units'].value_counts())

total_units
1       54079
2         529
4         120
3         103
5-24        1
Name: count, dtype: int64


In [21]:
# Define a dictionary to map the total_units categories to numerical values
total_units_mapping = {
    '1': 1,
    '2': 2,
    '3': 3,
    '4': 4,
    '5-24': 5,
}

# Map the values to the 'combined_loan_to_value_ratio' column
df['total_units'] = df['total_units'].map(total_units_mapping)

In [22]:
print(df['total_units'].value_counts())

total_units
1    54079
2      529
4      120
3      103
5        1
Name: count, dtype: int64


# Convert combined_loan_to_value_ratio to numerical:

In [23]:
print(df['combined_loan_to_value_ratio'].value_counts())

combined_loan_to_value_ratio
75.0-80.0      13229
70.0-75.0       5643
95.0-100.0      4687
65.0-70.0       4668
90.0-95.0       3951
85.0-90.0       3719
55.0-60.0       3477
60.0-65.0       2979
80.0-85.0       2885
50.0-55.0       2317
45.0-50.0       2133
40.0-45.0       1508
35.0-40.0       1222
100.0-120.0     1087
30.0-35.0        957
25.0-30.0        370
Name: count, dtype: int64


In [24]:
# Define a dictionary to map the combined_loan_to_value_ratio categories to numerical values
combined_loan_to_value_ratio_mapping = {
    '75.0-80.0': 1,
    '70.0-75.0': 2,
    '95.0-100.0': 3,
    '65.0-70.0': 4,
    '90.0-95.0': 5,
    '85.0-90.0': 6,
    '55.0-60.0': 7,
    '60.0-65.0': 8,
    '50.0-55.0': 9,
    '80.0-85.0': 10,
    '45.0-50.0': 11,
    '40.0-45.0': 12,
    '35.0-40.0': 13,
    '30.0-35.0': 14,
    '25.0-30.0': 15,
    '100.0-120.0': 16,
    '0.0-25.0': 17
}


# Map the values to the 'combined_loan_to_value_ratio' column
df['combined_loan_to_value_ratio'] = df['combined_loan_to_value_ratio'].map(combined_loan_to_value_ratio_mapping)

In [25]:
print(df['combined_loan_to_value_ratio'].value_counts())

combined_loan_to_value_ratio
1     13229
2      5643
3      4687
4      4668
5      3951
6      3719
7      3477
8      2979
10     2885
9      2317
11     2133
12     1508
13     1222
16     1087
14      957
15      370
Name: count, dtype: int64


In [26]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54832 entries, 0 to 54831
Data columns (total 33 columns):
 #   Column                                    Non-Null Count  Dtype  
---  ------                                    --------------  -----  
 0   action_taken                              54832 non-null  int64  
 1   loan_type                                 54832 non-null  int64  
 2   lien_status                               54832 non-null  int64  
 3   open_end_line_of_credit                   54832 non-null  int64  
 4   loan_amount                               54832 non-null  int64  
 5   combined_loan_to_value_ratio              54832 non-null  int64  
 6   interest_rate                             54827 non-null  float64
 7   total_loan_costs                          54722 non-null  float64
 8   origination_charges                       54792 non-null  float64
 9   loan_term                                 54826 non-null  float64
 10  negative_amortization             

In [27]:
df.head()

Unnamed: 0,action_taken,loan_type,lien_status,open_end_line_of_credit,loan_amount,combined_loan_to_value_ratio,interest_rate,total_loan_costs,origination_charges,loan_term,...,applicant_ethnicity_1,co_applicant_ethnicity_1,applicant_race_1,applicant_race_2,co_applicant_race_1,co_applicant_race_2,applicant_sex,co_applicant_sex,applicant_age,co_applicant_age
0,3,3,1,2,205000,16,5.0,7.0,1.0,6.0,...,2.0,2.0,5.0,,5.0,,1,2,7,4
1,3,2,1,2,155000,7,5.0,7.0,1.0,6.0,...,2.0,2.0,5.0,,5.0,,2,1,3,3
2,3,2,1,2,125000,8,5.0,7.0,1.0,6.0,...,2.0,2.0,5.0,,5.0,,1,2,4,3
3,3,2,1,2,55000,7,5.0,7.0,1.0,6.0,...,2.0,2.0,3.0,,3.0,,1,2,2,2
4,3,1,1,2,345000,1,5.0,7.0,1.0,3.0,...,2.0,2.0,5.0,,5.0,,1,2,2,2


# Make data the same data type

## Remove nan first

In [28]:
columns_with_nan = df.columns[df.isna().any()].tolist()
print("Columns with NaN values:", columns_with_nan)

Columns with NaN values: ['interest_rate', 'total_loan_costs', 'origination_charges', 'loan_term', 'property_value', 'income', 'debt_to_income_ratio', 'applicant_race_2', 'co_applicant_race_2']


In [29]:
# Specify columns with NaN values and their respective fill methods
columns_to_fill = {
    'interest_rate': 'mean',  # Fill with mean
    'loan_term': 'mode',  # Fill with mode
    'income': 'mean',  # Fill with mean
    'applicant_race_1': 'mode',  # Fill with mode
    'applicant_race_2': 'mode',  # Fill with mode
    'co_applicant_race_1': 'mode',  # Fill with mode
    'co_applicant_race_2': 'mode',  # Fill with mode
    'applicant_ethnicity_1': 'mode',  # Fill with mode
    'co_applicant_ethnicity_1': 'mode',  # Fill with mode
    'total_loan_costs': 'mean',  # Fill with mean
    'origination_charges': 'mean',  # Fill with mean
    'property_value': 'mean',  # Fill with mean
    'debt_to_income_ratio': 'mean'  # Fill with mean
}

# Fill missing values according to the specified method for each column
for column, fill_method in columns_to_fill.items():
    if fill_method == 'mean':
        mean_value = df[column].mean()
        df[column].fillna(mean_value, inplace=True)
    elif fill_method == 'mode':
        mode_value = df[column].mode().values[0]
        df[column].fillna(mode_value, inplace=True)
    else:
        df[column].fillna(fill_method, inplace=True)


In [30]:
columns_with_nan = df.columns[df.isna().any()].tolist()
print("Columns with NaN values:", columns_with_nan)

Columns with NaN values: []


In [31]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54832 entries, 0 to 54831
Data columns (total 33 columns):
 #   Column                                    Non-Null Count  Dtype  
---  ------                                    --------------  -----  
 0   action_taken                              54832 non-null  int64  
 1   loan_type                                 54832 non-null  int64  
 2   lien_status                               54832 non-null  int64  
 3   open_end_line_of_credit                   54832 non-null  int64  
 4   loan_amount                               54832 non-null  int64  
 5   combined_loan_to_value_ratio              54832 non-null  int64  
 6   interest_rate                             54832 non-null  float64
 7   total_loan_costs                          54832 non-null  float64
 8   origination_charges                       54832 non-null  float64
 9   loan_term                                 54832 non-null  float64
 10  negative_amortization             

In [32]:
df.to_csv('10_all_numerical_32bit.csv', index=False)