In [1]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import LabelEncoder

## Load Data 

In [2]:
df = pd.read_csv('dataset.csv', low_memory = False)


In [3]:
df.head()

Unnamed: 0,loan_type,loan_amount,action_taken,occupancy_type,census_tract,applicant_ethnicity_1,co_applicant_ethnicity_1,applicant_race_1,co_applicant_race_1,applicant_sex,...,total_units,aus_1,reverse_mortgage,open_end_line_of_credit,manufactured_home_land_property_interest,total_loan_costs,negative_amortization,interest_only_payment,balloon_payment,other_nonamortizing_features
0,1,355000,1,1,42091200000.0,2.0,2.0,5.0,5.0,1,...,1,1,2,2,5,8352.71,2,2,2,2
1,1,265000,1,1,6067006000.0,2.0,5.0,5.0,8.0,2,...,1,6,2,2,5,1983.0,2,2,2,2
2,2,285000,1,1,55133200000.0,2.0,2.0,5.0,5.0,1,...,1,6,2,2,5,6854.68,2,2,2,2
3,1,295000,1,1,55133200000.0,2.0,5.0,5.0,8.0,2,...,1,6,2,2,5,4700.6,2,2,2,2
4,1,475000,1,1,6073010000.0,3.0,3.0,6.0,6.0,1,...,1,6,2,2,5,2288.0,2,2,2,2


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 804666 entries, 0 to 804665
Data columns (total 34 columns):
 #   Column                                    Non-Null Count   Dtype  
---  ------                                    --------------   -----  
 0   loan_type                                 804666 non-null  int64  
 1   loan_amount                               804666 non-null  int64  
 2   action_taken                              804666 non-null  int64  
 3   occupancy_type                            804666 non-null  int64  
 4   census_tract                              804666 non-null  float64
 5   applicant_ethnicity_1                     804666 non-null  float64
 6   co_applicant_ethnicity_1                  804666 non-null  float64
 7   applicant_race_1                          804666 non-null  float64
 8   co_applicant_race_1                       804666 non-null  float64
 9   applicant_sex                             804666 non-null  int64  
 10  co_applicant_sex    

In [5]:
df['applicant_ethnicity_1'].value_counts()

2.0     589626
3.0     148014
1.0      60039
14.0      3298
11.0      2246
12.0       843
13.0       421
4.0        179
Name: applicant_ethnicity_1, dtype: int64

In [6]:
#Remove 4.0 - not applicable and 3.0 - information not provided
df = df[df['applicant_ethnicity_1'] != 4.0 ]
df = df[df['applicant_ethnicity_1'] != 3.0]
df['applicant_ethnicity_1'].value_counts()

2.0     589626
1.0      60039
14.0      3298
11.0      2246
12.0       843
13.0       421
Name: applicant_ethnicity_1, dtype: int64

In [7]:
df['co_applicant_ethnicity_1'].value_counts()

5.0     340650
2.0     267558
1.0      25443
3.0      20325
14.0      1014
11.0       957
12.0       350
13.0       166
4.0         10
Name: co_applicant_ethnicity_1, dtype: int64

In [8]:
#Remove 3.0 and 4.0 from co_applicant_ethnicity_1 (same as applicant ethnicity). 
#Keep 5.0 as it is no co-applicant.

df = df[df['co_applicant_ethnicity_1'] != 4.0 ]
df = df[df['co_applicant_ethnicity_1'] != 3.0]
df['co_applicant_ethnicity_1'].value_counts()

5.0     340650
2.0     267558
1.0      25443
14.0      1014
11.0       957
12.0       350
13.0       166
Name: co_applicant_ethnicity_1, dtype: int64

In [9]:
df['applicant_race_1'].value_counts()

5.0     530082
2.0      42910
3.0      33573
6.0      15270
21.0      4248
1.0       3855
22.0      1507
4.0       1067
23.0      1048
27.0       946
25.0       484
26.0       465
44.0       308
24.0       231
41.0        61
42.0        34
7.0         25
43.0        24
Name: applicant_race_1, dtype: int64

In [10]:
# Remove 6.0 -information not provided and 7.0 -NA from applicant_race

df = df[df['applicant_race_1'] != 6.0 ]
df = df[df['applicant_race_1'] != 7.0]
df['applicant_race_1'].value_counts()


5.0     530082
2.0      42910
3.0      33573
21.0      4248
1.0       3855
22.0      1507
4.0       1067
23.0      1048
27.0       946
25.0       484
26.0       465
44.0       308
24.0       231
41.0        61
42.0        34
43.0        24
Name: applicant_race_1, dtype: int64

In [11]:
df['co_applicant_race_1'].value_counts()

8.0     331171
5.0     251366
2.0      20003
3.0       8722
6.0       2625
21.0      2075
1.0       1400
22.0       845
23.0       716
4.0        545
27.0       427
25.0       275
26.0       264
24.0       184
44.0       159
41.0        33
42.0        17
43.0        13
7.0          3
Name: co_applicant_race_1, dtype: int64

In [12]:
# Remove 6.0 -information not provided and 7.0 -NA from co_applicant_race

df = df[df['co_applicant_race_1'] != 6.0 ]
df = df[df['co_applicant_race_1'] != 7.0]
df['co_applicant_race_1'].value_counts()

8.0     331171
5.0     251366
2.0      20003
3.0       8722
21.0      2075
1.0       1400
22.0       845
23.0       716
4.0        545
27.0       427
25.0       275
26.0       264
24.0       184
44.0       159
41.0        33
42.0        17
43.0        13
Name: co_applicant_race_1, dtype: int64

In [13]:
df['applicant_sex'].value_counts()

1    418914
2    197636
3      1398
6       264
4         3
Name: applicant_sex, dtype: int64

In [14]:
# Remove 3 -information not provided, 4 -NA, 6 -Both from applicant_sex 
df = df[df['applicant_sex'] != 3]
df = df[df['applicant_sex'] != 4]
df = df[df['applicant_sex'] != 6]
df['applicant_sex'].value_counts()

1    418914
2    197636
Name: applicant_sex, dtype: int64

In [15]:
df['co_applicant_sex'].value_counts()

5    330175
2    214914
1     70851
3       428
6       173
4         9
Name: co_applicant_sex, dtype: int64

In [16]:
# Remove 3 -information not provided, 4 -NA, 6 -Both from applicant_sex 
df = df[df['co_applicant_sex'] != 3]
df = df[df['co_applicant_sex'] != 4]
df = df[df['co_applicant_sex'] != 6]
df['co_applicant_sex'].value_counts()

5    330175
2    214914
1     70851
Name: co_applicant_sex, dtype: int64

In [17]:
df['applicant_age'].value_counts()

35-44    158116
45-54    136362
25-34    127553
55-64    103725
65-74     56887
>74       16828
<25       16234
8888        235
Name: applicant_age, dtype: int64

In [18]:
#Age needs to be encoded then remove 8888
encoder = LabelEncoder()


In [19]:
df['applicant_age'] = encoder.fit_transform(df['applicant_age'])

In [20]:
df['applicant_age'].value_counts()

1    158116
2    136362
0    127553
3    103725
4     56887
7     16828
6     16234
5       235
Name: applicant_age, dtype: int64

In [21]:
df = df[df['applicant_age'] != 5]
df['applicant_age'].value_counts()

1    158116
2    136362
0    127553
3    103725
4     56887
7     16828
6     16234
Name: applicant_age, dtype: int64

In [22]:
df['co_applicant_age'].value_counts()

9999     329983
35-44     72527
25-34     60802
45-54     60316
55-64     49654
65-74     28701
>74        7042
<25        6625
8888         55
Name: co_applicant_age, dtype: int64

In [23]:
df['co_applicant_age'] = encoder.fit_transform(df['co_applicant_age'])
df['co_applicant_age'].value_counts()

6    329983
1     72527
0     60802
2     60316
3     49654
4     28701
8      7042
7      6625
5        55
Name: co_applicant_age, dtype: int64

In [24]:
df = df[df['co_applicant_age'] != 5]
df['co_applicant_age'].value_counts()

6    329983
1     72527
0     60802
2     60316
3     49654
4     28701
8      7042
7      6625
Name: co_applicant_age, dtype: int64

In [25]:
#Change income to reflect regular dollar amounts.
df['income'] = df['income'] * 1000

In [26]:
df['income'].head()

0    136000.0
1     60000.0
2    120000.0
3     91000.0
5    109000.0
Name: income, dtype: float64

### Encoding of Credit_score_type

In [27]:
df['applicant_credit_score_type'].value_counts()

1       196056
3       170132
2       146566
9        65587
1111     25151
7         5785
8         5281
5          787
4          304
6            1
Name: applicant_credit_score_type, dtype: int64

In [28]:
# Encode the credit_score_type
df['applicant_credit_score_type']= encoder.fit_transform(df['applicant_credit_score_type'])
df['applicant_credit_score_type'].value_counts()

0    196056
2    170132
1    146566
8     65587
9     25151
6      5785
7      5281
4       787
3       304
5         1
Name: applicant_credit_score_type, dtype: int64

In [29]:
df['co_applicant_credit_score_type'].value_counts()

10      317744
9       167069
1        38805
3        34086
2        28535
1111     25151
7         2062
8         1758
5          295
4          145
Name: co_applicant_credit_score_type, dtype: int64

In [30]:
df['co_applicant_credit_score_type'] = encoder.fit_transform(df['co_applicant_credit_score_type'])
df['co_applicant_credit_score_type'].value_counts()

8    317744
7    167069
0     38805
2     34086
1     28535
9     25151
5      2062
6      1758
4       295
3       145
Name: co_applicant_credit_score_type, dtype: int64

In [31]:
df['origination_charges']= encoder.fit_transform(df['origination_charges'])

In [32]:
df['origination_charges'].value_counts()

0         87499
143514    25677
12447     25500
143288     8838
142826     7296
          ...  
75134         1
122169        1
117117        1
93755         1
133778        1
Name: origination_charges, Length: 143515, dtype: int64

In [33]:
df['interest_rate'].value_counts()

3.875    60016
3.75     48104
3.625    43268
4.125    38506
3.99     37768
         ...  
5.662        1
9.65         1
4.11         1
11.99        1
5.325        1
Name: interest_rate, Length: 720, dtype: int64

In [34]:
df['interest_rate'] = encoder.fit_transform(df['interest_rate'])
df['interest_rate'].value_counts()

205    60016
188    48104
172    43268
243    38506
220    37768
       ...  
456        1
695        1
239        1
66         1
410        1
Name: interest_rate, Length: 720, dtype: int64

In [35]:
df['debt_to_income_ratio'].value_counts()

20%-<30%    127900
30%-<36%    114221
<20%         41414
44           26362
Exempt       25151
42           24209
43           23162
41           23092
39           22841
50%-60%      22841
40           22549
38           21869
37           21571
36           20835
49           17520
45           15918
48           14529
46           14330
47           14014
>60%          1322
Name: debt_to_income_ratio, dtype: int64

In [36]:
df_copy = df.copy()

In [37]:
df_copy['debt_to_income_ratio'].value_counts()

20%-<30%    127900
30%-<36%    114221
<20%         41414
44           26362
Exempt       25151
42           24209
43           23162
41           23092
39           22841
50%-60%      22841
40           22549
38           21869
37           21571
36           20835
49           17520
45           15918
48           14529
46           14330
47           14014
>60%          1322
Name: debt_to_income_ratio, dtype: int64

In [38]:
def category_1(ratio):
    if ratio in ['30%-<36%','36','37','38','39']:
        return '30%-<40%'
    return ratio


In [39]:
df_copy['debt_to_income_ratio'] = df_copy['debt_to_income_ratio'].apply(category_1)

print(df_copy['debt_to_income_ratio'].value_counts())

30%-<40%    201337
20%-<30%    127900
<20%         41414
44           26362
Exempt       25151
42           24209
43           23162
41           23092
50%-60%      22841
40           22549
49           17520
45           15918
48           14529
46           14330
47           14014
>60%          1322
Name: debt_to_income_ratio, dtype: int64


In [40]:
def category_2(ratio):
    if ratio in ['40','41','42','43','44','45','46','47','48','49']:
        return '40%-<50%'
    return ratio

In [41]:
df_copy['debt_to_income_ratio'] = df_copy['debt_to_income_ratio'].apply(category_2)

print(df_copy['debt_to_income_ratio'].value_counts())

30%-<40%    201337
40%-<50%    195685
20%-<30%    127900
<20%         41414
Exempt       25151
50%-60%      22841
>60%          1322
Name: debt_to_income_ratio, dtype: int64


In [42]:
df = df_copy
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 615650 entries, 0 to 804665
Data columns (total 34 columns):
 #   Column                                    Non-Null Count   Dtype  
---  ------                                    --------------   -----  
 0   loan_type                                 615650 non-null  int64  
 1   loan_amount                               615650 non-null  int64  
 2   action_taken                              615650 non-null  int64  
 3   occupancy_type                            615650 non-null  int64  
 4   census_tract                              615650 non-null  float64
 5   applicant_ethnicity_1                     615650 non-null  float64
 6   co_applicant_ethnicity_1                  615650 non-null  float64
 7   applicant_race_1                          615650 non-null  float64
 8   co_applicant_race_1                       615650 non-null  float64
 9   applicant_sex                             615650 non-null  int64  
 10  co_applicant_sex    

In [43]:
df['debt_to_income_ratio'].value_counts()

30%-<40%    201337
40%-<50%    195685
20%-<30%    127900
<20%         41414
Exempt       25151
50%-60%      22841
>60%          1322
Name: debt_to_income_ratio, dtype: int64

In [44]:
df['debt_to_income_ratio'] = encoder.fit_transform(df['debt_to_income_ratio'])
df['debt_to_income_ratio'].value_counts()

1    201337
2    195685
0    127900
4     41414
6     25151
3     22841
5      1322
Name: debt_to_income_ratio, dtype: int64

In [45]:
df['combined_loan_to_value_ratio'].value_counts()

80.0      85336
95.0      36847
96.5      27621
Exempt    25151
90.0      22194
          ...  
77.081        1
95.153        1
27.329        1
90.879        1
98.483        1
Name: combined_loan_to_value_ratio, Length: 62989, dtype: int64

In [46]:
# I encode the combined_loan_to_value_ratio to be able to include the 'Exempt' rows

df['combined_loan_to_value_ratio'] = encoder.fit_transform(df['combined_loan_to_value_ratio'])

df['combined_loan_to_value_ratio'].value_counts()

47453    85336
59372    36847
60450    27621
62988    25151
55309    22194
         ...  
44472        1
59471        1
7183         1
55882        1
61806        1
Name: combined_loan_to_value_ratio, Length: 62989, dtype: int64

In [47]:
#loan terms are given in months. 
df['loan_term'].value_counts()

360       472504
180        67545
Exempt     25677
240        25425
120         8916
           ...  
169            1
123            1
163            1
236            1
98             1
Name: loan_term, Length: 200, dtype: int64

In [53]:
df.loc[:, 'loan_term'] = df['loan_term'].apply(lambda x: int(x) // 12 if x != 'Exempt' else 'Exempt')

In [54]:
df['loan_term'].value_counts()

30        472699
15         67566
Exempt     25677
20         25429
10          8941
25          3444
27          3081
5           1648
31          1389
12          1033
29           781
28           451
26           406
14           346
8            333
13           282
7            244
23           205
24           204
3            196
11           162
1            146
18           143
16           141
19           123
6            121
17           104
22            85
9             69
21            59
2             49
0             44
4             22
32            16
40            11
Name: loan_term, dtype: int64

Categories to use.
30
25-<30
20-<25
15-<20
10-<15
5-<10
0-<5
Exempt

In [67]:
def categorize_loan_term(ratio):
    # Custom logic to categorize 'loan_term' values
    if ratio == 'Exempt':
        return 'Exempt'
    ratio = int(ratio)
    if ratio < 5:
        return '0-<5 years'
    elif 5 <= ratio < 10:
        return '5-< 10 years'
    elif 10 <= ratio < 15:
        return '10-<15 years'
    elif 15 <= ratio < 20:
        return '15-<20 years'
    elif 20 <= ratio < 25:
        return '20-<25 years'
    elif 25 <= ratio < 30 :
        return '25-<30 years'
    elif ratio >=30:
        return '30+ years'
    return 'Unknown'

In [71]:
df['loan_term'] = df['loan_term'].apply(categorize_loan_term)

In [72]:
df['loan_term'].value_counts()

30+ years       474115
15-<20 years     68077
20-<25 years     25982
Exempt           25677
10-<15 years     10764
25-<30 years      8163
5-< 10 years      2415
0-<5 years         457
Name: loan_term, dtype: int64

In [73]:
df['loan_term'] = encoder.fit_transform(df['loan_term'])
df['loan_term'].value_counts()

5    474115
2     68077
3     25982
7     25677
1     10764
4      8163
6      2415
0       457
Name: loan_term, dtype: int64

In [74]:
df['property_value'].value_counts()

Exempt      25614
255000      14778
225000      14577
235000      14180
265000      13868
            ...  
5975000         1
11355000        1
5645000         1
5635000         1
8765000         1
Name: property_value, Length: 758, dtype: int64

In [81]:
df['property_value'] = encoder.fit_transform(df['property_value'])
df['property_value'].value_counts()

757    25614
240    14778
203    14577
215    14180
251    13868
       ...  
588        1
31         1
562        1
561        1
716        1
Name: property_value, Length: 758, dtype: int64

In [82]:
df['total_units'].value_counts()

1        606149
2          7233
3          1234
4           889
5-24        129
25-49         9
50-99         5
>149          2
Name: total_units, dtype: int64

In [83]:
def total_units(ratio):
    if ratio in ['5-24', '25-49', '50-99', '>149']:
        return '5+'
    return ratio

In [89]:
df['total_units'] = df['total_units'].apply(total_units)

df['total_units'].value_counts()

1     606149
2       7233
3       1234
4        889
5+       145
Name: total_units, dtype: int64

In [90]:
df['total_units'] = encoder.fit_transform(df['total_units'])

In [91]:
df['total_units'].value_counts()

0    606149
1      7233
2      1234
3       889
4       145
Name: total_units, dtype: int64

In [92]:
df['total_loan_costs'].value_counts()

Exempt      25677
0.0         13387
695.0        3050
635.43       1943
15.43        1252
            ...  
2781.94         1
2656.47         1
3428.97         1
2389.97         1
11483.15        1
Name: total_loan_costs, Length: 326681, dtype: int64

In [93]:
df['total_loan_costs'] = encoder.fit_transform(df['total_loan_costs'])

In [94]:
df['total_loan_costs'].value_counts()

326680    25677
0         13387
271069     3050
253500     1943
32862      1252
          ...  
88152         1
81317         1
125282        1
67592         1
13820         1
Name: total_loan_costs, Length: 326681, dtype: int64

In [95]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 615650 entries, 0 to 804665
Data columns (total 34 columns):
 #   Column                                    Non-Null Count   Dtype  
---  ------                                    --------------   -----  
 0   loan_type                                 615650 non-null  int64  
 1   loan_amount                               615650 non-null  int64  
 2   action_taken                              615650 non-null  int64  
 3   occupancy_type                            615650 non-null  int64  
 4   census_tract                              615650 non-null  float64
 5   applicant_ethnicity_1                     615650 non-null  float64
 6   co_applicant_ethnicity_1                  615650 non-null  float64
 7   applicant_race_1                          615650 non-null  float64
 8   co_applicant_race_1                       615650 non-null  float64
 9   applicant_sex                             615650 non-null  int64  
 10  co_applicant_sex    

In [98]:
df['income'].value_counts()

60000.0      6092
65000.0      5575
75000.0      5508
50000.0      5505
62000.0      5392
             ... 
1621000.0       1
1413000.0       1
4927000.0       1
1388000.0       1
2052000.0       1
Name: income, Length: 2529, dtype: int64

In [105]:
df['action_taken'].value_counts()

1    609255
3      2771
4      2287
2       849
5       357
6        67
8        49
7        15
Name: action_taken, dtype: int64

In [104]:
df.to_csv('processed_data_fin.csv', index=False)