In [1]:
import pandas as pd
from sklearn.impute import KNNImputer


In [29]:
# Read the train and test CSV files from the Resources folder into DataFrames.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        'Resources/train_u6lujuX_CVtuZ9i.csv',
        'Resources/test_Y3wMUE5_7gLdaTN.csv',
    )
)

In [30]:
# Count the missing values in each column of the training data.
train_df.isna().sum()

Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

In [31]:
# Count the missing values in each column of the test data.
test_df.isna().sum() 

Loan_ID               0
Gender               11
Married               0
Dependents           10
Education             0
Self_Employed        23
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount            5
Loan_Amount_Term      6
Credit_History       29
Property_Area         0
dtype: int64

In [32]:
# Frequency of each Self_Employed value in the training data
# (value_counts leaves out missing entries by default).
train_df['Self_Employed'].value_counts()

No     500
Yes     82
Name: Self_Employed, dtype: int64

In [33]:
# Frequency of each Loan_Amount_Term value in the training data.
train_df['Loan_Amount_Term'].value_counts()

360.0    512
180.0     44
480.0     15
300.0     13
240.0      4
84.0       4
120.0      3
60.0       2
36.0       2
12.0       1
Name: Loan_Amount_Term, dtype: int64

In [34]:
# Only these two columns are handed to the imputer, so neighbours are chosen
# from these two features alone.
columns_for_imputation = ['Credit_History', 'LoanAmount']

# Fit a 5-neighbour KNN imputer and compute the filled-in values.
imputer = KNNImputer(n_neighbors=5)
imputed_values = imputer.fit_transform(train_df[columns_for_imputation])

# Store the filled columns in a copy so that train_df itself stays untouched.
train_df_imputed = train_df.copy()
train_df_imputed[columns_for_imputation] = imputed_values

# Missing values left in each column after imputation.
train_df_imputed.isna().sum()

Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount            0
Loan_Amount_Term     14
Credit_History        0
Property_Area         0
Loan_Status           0
dtype: int64

In [35]:
# Imputed Credit_History entries may be fractional; round them so the column
# only holds whole-number flags again, then show the resulting distribution.
rounded_history = train_df_imputed['Credit_History'].round()
train_df_imputed['Credit_History'] = rounded_history

rounded_history.value_counts()

1.0    525
0.0     89
Name: Credit_History, dtype: int64

In [36]:
# Credit_History distribution in the original (un-imputed) training data, for comparison.
train_df['Credit_History'].value_counts()

1.0    475
0.0     89
Name: Credit_History, dtype: int64

In [37]:
# LoanAmount distribution after imputation.
train_df_imputed['LoanAmount'].value_counts()

115.0    25
120.0    20
110.0    17
100.0    15
160.0    12
         ..
54.0      1
78.0      1
436.0     1
207.0     1
253.0     1
Name: LoanAmount, Length: 205, dtype: int64

In [38]:
# LoanAmount distribution in the original (un-imputed) training data, for comparison.
train_df['LoanAmount'].value_counts()

120.0    20
110.0    17
100.0    15
160.0    12
187.0    12
         ..
240.0     1
214.0     1
59.0      1
166.0     1
253.0     1
Name: LoanAmount, Length: 203, dtype: int64

In [39]:
# Save the frame with Credit_History and LoanAmount imputed.
# No `index` argument is passed, so the row index is also written as the first, unnamed column.
# NOTE(review): this runs before the Dependents fill further down, so the saved file
# will not contain that fill - confirm this ordering is intended.
train_df_imputed.to_csv('Output/train_knn_imputed.csv')

In [40]:
# Frequency of each Dependents label; the values are text labels, including the open-ended '3+'.
train_df_imputed['Dependents'].value_counts()

0     345
1     102
2     101
3+     51
Name: Dependents, dtype: int64

In [45]:
# Keep only the married applicants.
# An explicit copy is taken because this frame is modified in a later cell;
# writing into a boolean-filtered selection without a copy raises
# SettingWithCopyWarning and makes it unclear which frame is being changed.
is_married = train_df_imputed['Married'] == 'Yes'
married = train_df_imputed.loc[is_married].copy()

# Sanity check: only 'Yes' should remain.
married['Married'].value_counts()

Yes    398
Name: Married, dtype: int64

In [50]:
# Convert Dependents from text labels ('0', '1', '2', '3+') to whole numbers,
# counting the open-ended '3+' category as 3.
#
# A new frame is built with assign() instead of writing through .loc, because
# `married` comes from a boolean filter and in-place writes on such a selection
# trigger SettingWithCopyWarning.
#
# to_numeric raises on any unexpected label instead of silently leaving it as text,
# and the nullable Int64 dtype gives a real integer column in which missing entries
# stay missing (<NA>), so value_counts() and mean() still skip them.
dependents_numeric = pd.to_numeric(married['Dependents'].replace('3+', '3'))
married = married.assign(Dependents=dependents_numeric.astype('Int64'))

married['Dependents'].value_counts()

0    174
2     93
1     79
3     44
Name: Dependents, dtype: int64

In [51]:
# Mean number of dependents among married applicants (mean() skips missing values by default).
dependent_married_avg = married['Dependents'].mean()
dependent_married_avg

1.0179487179487179

In [52]:
# Fill missing Dependents according to marital status.
#
# The Dependents column of train_df_imputed still holds the original text labels
# ('0', '1', '2', '3+'), so the fill values are written as strings too. Assigning
# the integers 1 / 0 here would leave the column with both 1 and '1' (and 0 and '0')
# as separate values for the same category.
missing_dependents = train_df_imputed['Dependents'].isna()
is_married = train_df_imputed['Married'] == 'Yes'
is_unmarried = train_df_imputed['Married'] == 'No'

# Married applicants get '1': the mean for married applicants computed above is about 1.02.
train_df_imputed.loc[missing_dependents & is_married, 'Dependents'] = '1'
# NOTE(review): '0' for unmarried applicants is not derived anywhere in this notebook -
# presumably the typical value for that group; confirm.
train_df_imputed.loc[missing_dependents & is_unmarried, 'Dependents'] = '0'

# Rows where Married itself is missing match neither mask and stay unfilled.
train_df_imputed


Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,115.0,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y
...,...,...,...,...,...,...,...,...,...,...,...,...,...
609,LP002978,Female,No,0,Graduate,No,2900,0.0,71.0,360.0,1.0,Rural,Y
610,LP002979,Male,Yes,3+,Graduate,No,4106,0.0,40.0,180.0,1.0,Rural,Y
611,LP002983,Male,Yes,1,Graduate,No,8072,240.0,253.0,360.0,1.0,Urban,Y
612,LP002984,Male,Yes,2,Graduate,No,7583,0.0,187.0,360.0,1.0,Urban,Y


In [53]:
# Missing values per column after the Dependents fill.
train_df_imputed.isna().sum()

Loan_ID               0
Gender               13
Married               3
Dependents            3
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount            0
Loan_Amount_Term     14
Credit_History        0
Property_Area         0
Loan_Status           0
dtype: int64