In [124]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
import numpy as np

In [125]:
df = pd.read_csv('train_u6lujuX_CVtuZ9i.csv')
df = df.drop('Loan_ID', axis=1)

In [126]:
for col in ['Gender', 'Married', 'Dependents', 'Self_Employed', 'Loan_Amount_Term']:
    df[col] = df[col].fillna(df[col].mode()[0])

In [127]:
df['LoanAmount'] = df['LoanAmount'].fillna(df['LoanAmount'].median())
df['Credit_History'] = df['Credit_History'].fillna(df['Credit_History'].mode()[0])


In [128]:
df['Loan_Status'] = df['Loan_Status'].fillna(df['Loan_Status'].mode()[0])
df['Loan_Status'] = df['Loan_Status'].map({'Y': 1, 'N': 0})

In [129]:
categorical_features = df.select_dtypes(include=['object']).columns
df = pd.get_dummies(df, columns=categorical_features, drop_first=True)

In [130]:
X = df.drop('Loan_Status', axis=1)
y = df['Loan_Status']

In [131]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [132]:
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train, y_train)

In [133]:
y_pred = dt_model.predict(X_test)

In [134]:
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

In [135]:
print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1-Score: {f1:.2f}")
print("Confusion Matrix:\n", conf_matrix)

Accuracy: 0.67
Precision: 0.73
Recall: 0.79
F1-Score: 0.76
Confusion Matrix:
 [[20 23]
 [17 63]]


In [137]:
from sklearn.ensemble import RandomForestClassifier

In [138]:
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

In [140]:
y_pred_rf = rf_model.predict(X_test)

accuracy_rf = accuracy_score(y_test, y_pred_rf)
precision_rf = precision_score(y_test, y_pred_rf)
recall_rf = recall_score(y_test, y_pred_rf)
f1_rf = f1_score(y_test, y_pred_rf)
conf_matrix_rf = confusion_matrix(y_test, y_pred_rf)
print("Random Forest Results:")
print(f"Accuracy: {accuracy_rf:.2f}")
print(f"Precision: {precision_rf:.2f}")
print(f"Recall: {recall_rf:.2f}")
print(f"F1-Score: {f1_rf:.2f}")
print("Confusion Matrix:\n", conf_matrix_rf)

Random Forest Results:
Accuracy: 0.78
Precision: 0.76
Recall: 0.97
F1-Score: 0.85
Confusion Matrix:
 [[18 25]
 [ 2 78]]


In [141]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10]
}

grid_search = GridSearchCV(estimator=RandomForestClassifier(random_state=42),
                           param_grid=param_grid,
                           cv=5,
                           scoring='f1_weighted',
                           n_jobs=-1,
                           verbose=1)

grid_search.fit(X, y)

print(f"best param: {grid_search.best_params_}")

best_model = grid_search.best_estimator_
final_pred = best_model.predict(X_test)

final_accuracy = accuracy_score(y_test, final_pred)
final_precision = precision_score(y_test, final_pred)
final_recall = recall_score(y_test, final_pred)
final_f1 = f1_score(y_test, final_pred)
final_conf_matrix = confusion_matrix(y_test, final_pred)

print(f"best Accuracy : {final_accuracy:.2f}")
print(f"best Precision: {final_precision:.2f}")
print(f"best Recall: {final_recall:.2f}")
print(f"best F1-Score: {final_f1:.2f}")
print(f"final Confusion Matrix :\n", final_conf_matrix)

Fitting 5 folds for each of 36 candidates, totalling 180 fits
best param: {'max_depth': 10, 'min_samples_split': 10, 'n_estimators': 200}
best Accuracy : 0.84
best Precision: 0.80
best Recall: 1.00
best F1-Score: 0.89
final Confusion Matrix :
 [[23 20]
 [ 0 80]]


In [145]:
import pandas as pd
import numpy as np

df_test = pd.read_csv('test_Y3wMUE5_7gLdaTN.csv')

test_ids = df_test['Loan_ID']

df_test = df_test.drop('Loan_ID', axis=1)

for col in ['Gender', 'Married', 'Dependents', 'Self_Employed', 'Loan_Amount_Term']:
    df_test[col] = df_test[col].fillna(df_test[col].mode()[0])

df_test['LoanAmount'] = df_test['LoanAmount'].fillna(df_test['LoanAmount'].median())
df_test['Credit_History'] = df_test['Credit_History'].fillna(df_test['Credit_History'].mode()[0])

categorical_features_test = df_test.select_dtypes(include=['object']).columns
df_test = pd.get_dummies(df_test, columns=categorical_features_test, drop_first=True)

X_train_cols = X.columns
df_test = df_test.reindex(columns=X_train_cols, fill_value=0)

final_predictions = best_model.predict(df_test)

final_predictions = np.where(final_predictions == 1, 'Y', 'N')

submission = pd.DataFrame({
    "Loan_ID": test_ids,
    "Loan_Status": final_predictions
})


submission.to_csv('submission.csv', index=False)
