In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
import joblib

# Load data
data = pd.read_csv('loan_approval_dataset.csv')
print(data.head())

# Preprocess data: every column except the row identifier and the target is a feature.
X = data.drop(columns=['loan_id', 'loan_status'])
y = data['loan_status'].map({'Approved': 1, 'Rejected': 0})

# Series.map leaves NaN for any label that is not a key of the mapping.
# Fail here, naming the offending values, instead of passing NaN targets on
# to the classifier.
unmapped_labels = data.loc[y.isna(), 'loan_status'].unique()
if len(unmapped_labels) > 0:
    raise ValueError(
        "Unexpected loan_status values (expected 'Approved' or 'Rejected'): "
        f"{unmapped_labels.tolist()}"
    )

# Define categorical and numerical columns.
# Every feature column not listed as categorical is treated as numerical.
categorical_features = ['education', 'self_employed']
numerical_features = X.columns.difference(categorical_features)

# Preprocessing pipelines for both numerical and categorical data.
# Numerical: fill missing values with the column mean, then standardize.
numerical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# Categorical: fill missing values with the most frequent category, then
# one-hot encode; handle_unknown='ignore' lets categories that were not seen
# during fit pass through transform without raising.
categorical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Route each group of columns to its matching preprocessing pipeline.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_pipeline, numerical_features),
        ('cat', categorical_pipeline, categorical_features)
    ])

# Split data: hold out 20% for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define models (candidates keyed by display name; only the first is enabled).
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    # 'Random Forest': RandomForestClassifier(),
    # 'Support Vector Machine': SVC()
}

# Train and evaluate models.
# NOTE(review): candidates are ranked by accuracy on the held-out test split,
# so once more than one candidate is enabled the reported best score is an
# optimistic estimate. Consider ranking by cross-validation on the training
# split and using the test split only for the final report.
best_model = None
# Start below any achievable accuracy so that a candidate scoring 0.0 is still
# recorded instead of leaving best_model as None.
best_score = float('-inf')
for name, model in models.items():
    pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                               ('classifier', model)])
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    score = accuracy_score(y_test, y_pred)
    print(f'{name} Accuracy: {score:.4f}')

    if score > best_score:
        best_score = score
        best_model = pipeline

# Refuse to write a placeholder: pickling None would only fail later, when the
# file is loaded and .predict() is called on it.
if best_model is None:
    raise RuntimeError('No model was trained; nothing to save.')

# Save the best model (the whole fitted pipeline, preprocessing included).
joblib.dump(best_model, 'best_model.joblib')
print(f'Best model saved with accuracy: {best_score:.4f}')


   loan_id  no_of_dependents     education self_employed  income_annum  \
0        1                 2      Graduate            No       9600000   
1        2                 0  Not Graduate           Yes       4100000   
2        3                 3      Graduate            No       9100000   
3        4                 3      Graduate            No       8200000   
4        5                 5  Not Graduate           Yes       9800000   

   loan_amount  loan_term  cibil_score  residential_assets_value  \
0     29900000         12          778                   2400000   
1     12200000          8          417                   2700000   
2     29700000         20          506                   7100000   
3     30700000          8          467                  18200000   
4     24200000         20          382                  12400000   

   commercial_assets_value  luxury_assets_value  bank_asset_value loan_status  
0                 17600000             22700000           8000000 

In [None]:
import pandas as pd
import joblib

# One hand-written applicant record, using the same column names as the
# training CSV (identifier and target included; both are dropped below).
sample_data = {
    'loan_id': [1],
    'no_of_dependents': [2],
    'education': ['Graduate'],
    'self_employed': ['No'],
    'income_annum': [9600000],
    'loan_amount': [29900000],
    'loan_term': [12],
    'cibil_score': [675],
    'residential_assets_value': [2400000],
    'commercial_assets_value': [17600000],
    'luxury_assets_value': [22700000],
    'bank_asset_value': [8000000],
    'loan_status': ['Approved'],
}

# Wrap the record in a one-row DataFrame.
sample_df = pd.DataFrame(data=sample_data)

# Restore the fitted pipeline from disk.
best_model = joblib.load('best_model.joblib')

# Separate features from the target, as the training cell does.
X_sample = sample_df.drop(['loan_id', 'loan_status'], axis=1)
# Encoded expected label; kept for reference, not used by the prediction below.
y_sample = sample_df.loan_status.map({'Approved': 1, 'Rejected': 0})

# Run the pipeline on the single row.
y_pred_sample = best_model.predict(X_sample)

# Report the predicted class of the only row.
print(f"Predicted Approval Status: {y_pred_sample[0]}")

Predicted Approval Status: 1
