In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, classification_report
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.metrics import make_scorer, recall_score,fbeta_score
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV

In [None]:
#os.chdir(r'D:\use case\Loan Default Prediction\data')

In [2]:
df = pd.read_csv('processed_data.csv')

In [3]:
df.head(10)

Unnamed: 0,Client_Income_category,Credit_Amount_category,Credit_to_Income_Category,Loan_Annuity_category,Client_Family_Members_Category,Car_Owned,Bike_Owned,Active_Loan,Accompany_Client,Client_Income_Type,...,Cleint_City_Rating,Application_Process_Day,Application_Hour_Category,Type_Organization_Grouped,Score_Source_2,Score_Source_3,Social_Circle_Default,Phone_Change_category,Credit_Bureau_Category,Default
0,income_band2,credit_band6,Upto_10_times,upto_6_percent,2_members,0,0,1,Alone,Commercial,...,2.0,6.0,upto_18_hours,Self-employed,0.478787,0.51118,1,same_year,no_search,0
1,income_band5,credit_band1,Upto_2_times,more_than_8_percent,2_members,1,0,1,Alone,Service,...,2.0,3.0,upto_12_hours,Public Sector,0.215068,0.51118,0,same_year,no_search,0
2,income_band4,credit_band5,Upto_4_times,upto_6_percent,2_members,0,0,1,Alone,Service,...,2.0,4.0,upto_12_hours,Self-employed,0.552795,0.329655,1,same_year,no_search,0
3,income_band4,credit_band5,Upto_4_times,upto_6_percent,2_members,0,0,1,Alone,Retired,...,3.0,2.0,upto_18_hours,Self-employed,0.135182,0.631355,0,four_year,more_than_two_search,0
4,income_band5,credit_band6,Upto_4_times,upto_4_percent,more_than_3_members,1,0,1,Alone,Commercial,...,1.0,3.0,upto_12_hours,Business,0.301182,0.355639,1,one_year,one_search,0
5,income_band3,credit_band1,Upto_2_times,upto_6_percent,2_members,0,1,1,Alone,Service,...,2.0,2.0,upto_12_hours,Other,0.697928,0.420611,1,two_year,no_search,0
6,income_band4,credit_band6,Upto_10_times,upto_4_percent,upto_1_member,1,1,0,Alone,Retired,...,2.0,3.0,upto_18_hours,Self-employed,0.602545,0.511892,1,same_year,more_than_two_search,0
7,income_band3,credit_band6,Upto_5_times,upto_6_percent,2_members,0,0,1,Alone,Retired,...,2.0,4.0,upto_18_hours,Self-employed,0.657508,0.549597,0,four_year,more_than_two_search,0
8,income_band3,credit_band4,Upto_4_times,upto_4_percent,3_members,1,1,0,Relative,Commercial,...,2.0,4.0,upto_18_hours,Self-employed,0.637594,0.553165,1,four_year,no_search,0
9,income_band3,credit_band1,Upto_2_times,upto_8_percent,upto_1_member,0,0,0,Alone,Retired,...,2.0,2.0,upto_12_hours,Self-employed,0.063343,0.08065,0,one_year,more_than_two_search,0


In [None]:
df.Default.dtype

In [4]:
# Step 1: Convert all categorical columns to object, excluding 'Score_Source_2', 'Score_Source_3'
categorical_columns = [col for col in df.columns if col not in ['Score_Source_2', 'Score_Source_3']]
df[categorical_columns] = df[categorical_columns].astype('object')


In [5]:
# If it's numeric but string-like:
df['Default'] = df['Default'].astype(int)

In [6]:
# Step 2: Split into features and target
X = df.drop('Default', axis=1)
y = df['Default']

In [7]:
# Step 3: Define preprocessing pipelines
numeric_features = ['Score_Source_2', 'Score_Source_3']
categorical_features = X.select_dtypes(include=['object']).columns.tolist()

In [8]:
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

In [9]:
# Step 4: Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


In [None]:
X_train.shape
print(X_test.shape)

In [34]:
import pickle
pickle.dump(preprocessor, open('data_scaling_latest.pkl', 'wb'))

In [33]:
# Step 5: Fit preprocessing pipeline to training data
X_train_prepared = preprocessor.fit_transform(X_train)
X_test_prepared = preprocessor.transform(X_test)

In [None]:
# Step 6: Train logistic regression on preprocessed data
model = LogisticRegression(max_iter=1000, random_state=42)
model.fit(X_train_prepared, y_train)

In [None]:
# Step 7: Make predictions
y_pred = model.predict(X_test_prepared)
y_prob = model.predict_proba(X_test_prepared)[:, 1]  # for ROC-AUC

In [None]:
# Step 8: Evaluate
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print("F1 Score:", f1_score(y_test, y_pred))
print("ROC AUC Score:", roc_auc_score(y_test, y_prob))


In [None]:
# Step 9: Confusion Matrix
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(6,4))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=['No Default', 'Default'], yticklabels=['No Default', 'Default'])
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()

In [None]:
xgb1 = XGBClassifier(use_label_encoder=False, eval_metric='auc', random_state=42)

In [None]:
xgb1.fit(X_train_prepared,y_train)

In [None]:
 # Predict
y_pred = model.predict(X_test_prepared)
y_prob = model.predict_proba(X_test_prepared)[:, 1]
  
results = []
    # Evaluate
results.append({
        'Model': 'XGBoost',
        'Accuracy': accuracy_score(y_test, y_pred),
        'Precision': precision_score(y_test, y_pred),
        'Recall': recall_score(y_test, y_pred),
        'F1 Score': f1_score(y_test, y_pred),
        'ROC AUC': roc_auc_score(y_test, y_prob)
    })

# Create a DataFrame of results
results_df = pd.DataFrame(results).sort_values(by='ROC AUC', ascending=False)

In [None]:
results

In [None]:
# Define models
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
    'Naive Bayes': GaussianNB(),
    'SVM': SVC(probability=True, random_state=42),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'XGBoost': XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
}

# Store results
results = []

# Loop through models
for name, model in models.items():
    # For Naive Bayes: it doesn't work with sparse matrix (from OneHotEncoder), convert to dense
    X_train_mod = X_train_prepared.toarray() if name == 'Naive Bayes' else X_train_prepared
    X_test_mod = X_test_prepared.toarray() if name == 'Naive Bayes' else X_test_prepared
    
    # Train
    model.fit(X_train_mod, y_train)
    
    # Predict
    y_pred = model.predict(X_test_mod)
    y_prob = model.predict_proba(X_test_mod)[:, 1]
    
    # Evaluate
    results.append({
        'Model': name,
        'Accuracy': accuracy_score(y_test, y_pred),
        'Precision': precision_score(y_test, y_pred),
        'Recall': recall_score(y_test, y_pred),
        'F1 Score': f1_score(y_test, y_pred),
        'ROC AUC': roc_auc_score(y_test, y_prob)
    })

# Create a DataFrame of results
results_df = pd.DataFrame(results).sort_values(by='ROC AUC', ascending=False)

In [None]:
# Step 1: Compute scale_pos_weight for imbalance
# Formula: count(negative class) / count(positive class)
scale_pos_weight = (y_train == 0).sum() / (y_train == 1).sum()
print(f"Scale pos weight: {scale_pos_weight:.2f}")

In [None]:
# Step 2: Define XGBoost model
xgb = XGBClassifier(
    objective='binary:logistic',
    use_label_encoder=False,
    eval_metric='logloss',
    scale_pos_weight=scale_pos_weight,
    random_state=42
)

In [None]:
# Step 3: Hyperparameter grid (basic but effective)
param_grid = {
    'n_estimators': [100],
    'max_depth': [3],
    'learning_rate': [0.01],
    'subsample': [0.8],
    'colsample_bytree': [0.8]
}


In [None]:
# Step 4: Use recall as scoring metric
recall_scorer = make_scorer(recall_score)

In [None]:
# Step 4: Define F-beta scorer with beta = 3
fbeta_scorer = make_scorer(fbeta_score, beta=4)

In [None]:
# Step 5: Cross-validation with GridSearchCV
grid_search = GridSearchCV(
    estimator=xgb,
    param_grid=param_grid,
    scoring=fbeta_scorer,
    cv=5,
    n_jobs=-1,
    verbose=1
)

In [None]:
# Step 6: Fit model
grid_search.fit(X_train_prepared, y_train)

In [None]:
# Step 7: Best model and recall score
print("Best Parameters:\n", grid_search.best_params_)
print("Best Recall Score on CV:", grid_search.best_score_)




In [None]:
# Step 8: Predict on test set using best estimator

y_pred_test = grid_search.best_estimator_.predict(X_test_prepared)

# Step 9: Evaluate on test data
print("Test Recall Score:", recall_score(y_test, y_pred_test))

In [None]:
results.append({
        'Model': 'grid_search_XGBoost',
        'Accuracy': accuracy_score(y_test, y_pred_test),
        'Precision': precision_score(y_test, y_pred_test),
        'Recall': recall_score(y_test, y_pred_test),
        'F1 Score': f1_score(y_test, y_pred_test),
        'ROC AUC': roc_auc_score(y_test, y_pred_test)
    })

In [None]:
# Step 9: Confusion Matrix
cm = confusion_matrix(y_test, y_pred_test)
plt.figure(figsize=(6,4))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=['No Default', 'Default'], yticklabels=['No Default', 'Default'])
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()

In [None]:
# Expanded hyperparameter grid for random search
param_distributions = {
    'n_estimators': [100, 200, 300,400,500],
    'max_depth': [3, 5, 7,9],
    'learning_rate': [0.01, 0.05, 0.1,0.3],
    'subsample': [0.2,0,4,0.6, 0.8],
    'colsample_bytree': [0.4,0.6, 0.8]
}

In [None]:
# RandomizedSearchCV with more parameter combinations
random_search = RandomizedSearchCV(
    estimator=xgb,
    param_distributions=param_distributions,
    scoring=fbeta_scorer,
    cv=5,
    n_iter=50,  # You can adjust this depending on how many random combos you want to try
    n_jobs=-1,
    verbose=2,
    random_state=42
)

In [None]:
# Step 6: Fit model
random_search.fit(X_train_prepared, y_train)

In [None]:
# Step 8: Predict on test set using best estimator

y_pred_test_random = random_search.best_estimator_.predict(X_test_prepared)

# Step 9: Evaluate on test data
print("Test Recall Score:", recall_score(y_test, y_pred_test_random))

In [None]:
# Step 9: Confusion Matrix
cm = confusion_matrix(y_test, y_pred_test_random)
plt.figure(figsize=(6,4))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=['No Default', 'Default'], yticklabels=['No Default', 'Default'])
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()

In [None]:
results.append({
        'Model': 'random_search_XGBoost_modified',
        'Accuracy': accuracy_score(y_test, y_pred_test_random),
        'Precision': precision_score(y_test, y_pred_test_random),
        'Recall': recall_score(y_test, y_pred_test_random),
        'F1 Score': f1_score(y_test, y_pred_test_random),
        'ROC AUC': roc_auc_score(y_test, y_pred_test_random)
    })

In [None]:
# Create a DataFrame of results
results_df = pd.DataFrame(results).sort_values(by='ROC AUC', ascending=False)

In [143]:
results_df

Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score,ROC AUC,Beta_value
0,random_search_XGBoost_modified1,0.690155,0.157452,0.656229,0.253968,0.674674,4
1,random_search_CATBoost_modified,0.690155,0.157452,0.656229,0.253968,0.674674,5
3,CATBoost_modified_recall,0.679049,0.152728,0.658271,0.247933,0.669568,0
2,CATBoost_modified_micro_recall,0.875181,0.284426,0.364874,0.319666,0.642326,0


In [None]:
random_search.best_estimator_

In [None]:
random_search.best_params_

In [None]:
param_grid = {
    'n_estimators': [500,450],
    'max_depth': [3, 5,7],
    'learning_rate': [0.05],
    'subsample': [0.6],
    'colsample_bytree': [0.6],
    'reg_alpha': [ 0.1, 0.2,0.3],          # L1 regularization
    'reg_lambda': [1, 1.5, 2]        # L2 regularization
}

print(1)

In [None]:
# Step 5: Cross-validation with GridSearchCV
grid_search = GridSearchCV(
    estimator=xgb,
    param_grid=param_grid,
    scoring=fbeta_scorer,
    cv=3,
    n_jobs=1,
    verbose=3
)

In [None]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
grid_search.fit(X_train_prepared, y_train)

In [None]:
grid_search.best_score_

In [None]:
grid_search.best_params_

In [None]:
# Step 8: Predict on test set using best estimator

y_pred_test_grid = grid_search.best_estimator_.predict(X_test_prepared)

# Step 9: Evaluate on test data
print("Test Recall Score:", recall_score(y_test, y_pred_test_grid))

In [None]:
# Step 9: Confusion Matrix
cm = confusion_matrix(y_test, y_pred_test_grid)
plt.figure(figsize=(6,4))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=['No Default', 'Default'], yticklabels=['No Default', 'Default'])
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()

In [None]:
results.append({
        'Model': 'random_search_XGBoost_modified1',
        'Accuracy': accuracy_score(y_test, y_pred_test_grid),
        'Precision': precision_score(y_test, y_pred_test_grid),
        'Recall': recall_score(y_test, y_pred_test_grid),
        'F1 Score': f1_score(y_test, y_pred_test_grid),
        'ROC AUC': roc_auc_score(y_test, y_pred_test_grid)
    })

In [None]:
# Create a DataFrame of results
results_df = pd.DataFrame(results).sort_values(by='ROC AUC', ascending=False)

In [None]:
results_df

In [None]:
pip install --upgrade numpy


In [None]:
pip install --upgrade catboost


In [None]:
# Step 4: Define F-beta scorer with beta = 3
fbeta_scorer = make_scorer(fbeta_score, beta=0.7)

In [None]:
from catboost import CatBoostClassifier


# Define the CatBoostClassifier
catboost_model = CatBoostClassifier(
    verbose=0,
    random_state=42,
    auto_class_weights='Balanced'  # Automatically handles class imbalance
)

# Define the hyperparameter distributions
param_distributions = {
    'depth': [4, 6, 8, 10],
    'learning_rate': [0.01, 0.03, 0.05, 0.1],
    'iterations': [100, 200, 300],
    'l2_leaf_reg': [1, 3, 5, 7],
    'bagging_temperature': [0, 0.5, 1.0],
    'border_count': [32, 64, 128]
}
#{'learning_rate': 0.1, 'l2_leaf_reg': 3, 'iterations': 200, 
 #'depth': 4, 'border_count': 32, 'bagging_temperature': 0.5}
# param_distributions = {
#     'depth': [4],
#     'learning_rate': [0.1],
#     'iterations': [ 200],
#     'l2_leaf_reg': [3],
#     'bagging_temperature': [0.5],
#     'border_count': [32]
# }

# Randomized search
random_search = RandomizedSearchCV(
    estimator=catboost_model,
    param_distributions=param_distributions,
    scoring='recall_micro',  # or 'f1_weighted'
    cv=5,
    n_iter=100,              # Try 20 random combinations
    n_jobs=1,
    verbose=3,
    random_state=42
)

# Fit to training data
random_search.fit(X_train_prepared, y_train)

# Output best results
print("Best Parameters:", random_search.best_params_)
print("Best Score:", random_search.best_score_)


In [None]:
# Best Parameters: {'learning_rate': 0.1, 'l2_leaf_reg': 1, 'iterations': 300, 'depth': 10, 'border_count': 64, 'bagging_temperature': 0}
# Best Score: 0.8874195865435504

In [None]:
random_search.best_score_

In [None]:
y_pred1 = random_search.best_estimator_.predict(X_test_prepared)
y_prob1 = random_search.best_estimator_.predict_proba(X_test_prepared)[:, 1]

In [None]:
y1 = pd.DataFrame(y_pred1)
y1.value_counts()

In [None]:
# Step 8: Predict on test set using best estimator

y_pred_test_grid = random_search.best_estimator_.predict(X_test_prepared)



In [None]:
cm

In [None]:
# Step 9: Confusion Matrix
cm = confusion_matrix(y_test, y_pred1)
plt.figure(figsize=(6,4))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=['No Default', 'Default'], yticklabels=['No Default', 'Default'])
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()

In [None]:
results=[]

In [None]:


results.append({
        'Model': 'CATBoost_modified_micro_recall',
        'Accuracy': accuracy_score(y_test, y_pred1),
        'Precision': precision_score(y_test, y_pred1),
        'Recall': recall_score(y_test, y_pred1),
        'F1 Score': f1_score(y_test, y_pred1),
        'ROC AUC': roc_auc_score(y_test, y_pred1),
        'Beta_value': 0
    })

In [None]:
results_df = pd.DataFrame(results).sort_values(by='ROC AUC', ascending=False)
print(results_df)

In [None]:
from catboost import CatBoostClassifier


# Define the CatBoostClassifier
catboost_model = CatBoostClassifier(
    verbose=0,
    random_state=42,
    auto_class_weights='Balanced'  # Automatically handles class imbalance
)

# Define the hyperparameter distributions
param_distributions = {
    'depth': [4, 6, 8, 10],
    'learning_rate': [0.01, 0.03, 0.05, 0.1],
    'iterations': [100, 200, 300],
    'l2_leaf_reg': [1, 3, 5, 7],
    'bagging_temperature': [0, 0.5, 1.0],
    'border_count': [32, 64, 128]
}
#{'learning_rate': 0.1, 'l2_leaf_reg': 3, 'iterations': 200, 
 #'depth': 4, 'border_count': 32, 'bagging_temperature': 0.5}
# param_distributions = {
#     'depth': [4],
#     'learning_rate': [0.1],
#     'iterations': [ 200],
#     'l2_leaf_reg': [3],
#     'bagging_temperature': [0.5],
#     'border_count': [32]
# }

# Randomized search
random_search1 = RandomizedSearchCV(
    estimator=catboost_model,
    param_distributions=param_distributions,
    scoring='recall',  # or 'f1_weighted'
    cv=5,
    n_iter=50,              # Try 20 random combinations
    n_jobs=1,
    verbose=3,
    random_state=42
)

# Fit to training data
random_search1.fit(X_train_prepared, y_train)

# Output best results
print("Best Parameters:", random_search1.best_params_)
print("Best Score:", random_search1.best_score_)


In [None]:
# Step 8: Predict on test set using best estimator

y_pred_test_recall = random_search1.best_estimator_.predict(X_test_prepared)



In [None]:
# Step 9: Confusion Matrix
cm = confusion_matrix(y_test, y_pred_test_recall)
plt.figure(figsize=(6,4))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=['No Default', 'Default'], yticklabels=['No Default', 'Default'])
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()

In [None]:
If you're experiencing "greying out" of output in Visual Studio Code (VSCode), it usually happens when the editor is unable to properly render the output or when there are issues with the Python extension or Jupyter Notebook integration. Here are some steps to resolve this issue:

### 1. **Update VSCode and Extensions**
    - Ensure that you are using the latest version of VSCode.
    - Update the Python and Jupyter extensions in VSCode.

### 2. **Check Python Environment**
    - Ensure that the correct Python interpreter is selected in VSCode.
    - You can select the interpreter by clicking on the Python version in the bottom-left corner of VSCode and choosing the appropriate environment.

### 3. **Clear Output**
    - If the output is greyed out, try clearing the output by clicking the "Clear All Outputs" button in the Jupyter Notebook toolbar.

### 4. **Restart Kernel**
    - Restart the Jupyter kernel by clicking the "Restart Kernel" button in the Jupyter Notebook toolbar.

### 5. **Reinstall Jupyter and IPython**
    - If the issue persists, try reinstalling Jupyter and IPython in your Python environment:
      ```bash
      pip install --upgrade jupyter ipython
      ```

### 6. **Check Notebook File**
    - Ensure that the `.ipynb` file is not corrupted. You can try opening it in another Jupyter Notebook interface (e.g., JupyterLab or Jupyter Notebook in the browser) to verify.

### 7. **Disable Extensions**
    - Sometimes, other VSCode extensions can interfere with the Jupyter Notebook rendering. Try disabling unnecessary extensions to see if the issue resolves.

### 8. **Switch to Native Notebook Editor**
    - VSCode supports a native notebook editor. Ensure that you are using the native editor by enabling the following setting:
      - Go to `File > Preferences > Settings`.
      - Search for `Jupyter: Use Notebook Editor` and enable it.

### 9. **Check Output Format**
    - Ensure that the output format of your notebook cells is compatible with VSCode. For example, large outputs or unsupported MIME types might cause rendering issues.

### 10. **Reinstall VSCode**
    - If none of the above steps work, consider reinstalling VSCode to ensure a clean setup.

By following these steps, you should be able to resolve the "greying out" issue in VSCode.

In [None]:


results.append({
        'Model': 'CATBoost_modified_recall',
        'Accuracy': accuracy_score(y_test, y_pred_test_recall),
        'Precision': precision_score(y_test, y_pred_test_recall),
        'Recall': recall_score(y_test, y_pred_test_recall),
        'F1 Score': f1_score(y_test, y_pred_test_recall),
        'ROC AUC': roc_auc_score(y_test, y_pred_test_recall),
        'Beta_value': 0
    })

In [None]:
results_df = pd.DataFrame(results).sort_values(by='ROC AUC', ascending=False)
results_df

In [None]:
from catboost import CatBoostClassifier


# Define the CatBoostClassifier
catboost_model = CatBoostClassifier(
    verbose=0,
    random_state=42,
    auto_class_weights='Balanced'  # Automatically handles class imbalance
)

# Define the hyperparameter distributions
param_distributions = {
    'depth': [4],
    'learning_rate': [ 0.03],
    'iterations': [300],
    'l2_leaf_reg': [5],
    'bagging_temperature': [1.0],
    'border_count': [32]
}
#{'learning_rate': 0.1, 'l2_leaf_reg': 3, 'iterations': 200, 
 #'depth': 4, 'border_count': 32, 'bagging_temperature': 0.5}
# param_distributions = {
#     'depth': [4],
#     'learning_rate': [0.1],
#     'iterations': [ 200],
#     'l2_leaf_reg': [3],
#     'bagging_temperature': [0.5],
#     'border_count': [32]
# }

# Randomized search
random_search1 = RandomizedSearchCV(
    estimator=catboost_model,
    param_distributions=param_distributions,
    scoring='recall',  # or 'f1_weighted'
    cv=5,
    n_iter=50,              # Try 20 random combinations
    n_jobs=1,
    verbose=3,
    random_state=42
)

# Fit to training data
random_search1.fit(X_train_prepared, y_train)

# Output best results
print("Best Parameters:", random_search1.best_params_)
print("Best Score:", random_search1.best_score_)


In [None]:
import pickle
pickle.dump(random_search1, open('catboost_model_version_2.pkl', 'wb'))


In [13]:
import pickle
# load the model from file
loaded_model = pickle.load(open('catboost_model_version_2.pkl', 'rb'))

In [22]:
loaded_pipe = pickle.load(open('data_scaling_new.pkl', 'rb'))

In [32]:
loaded_pipe.fit_transform(X_train)[0:1]

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 29 stored elements and shape (1, 152)>

In [None]:
loaded_model.predict(preprocessor.transform(X_train)[0].reshape(1, -1))

In [None]:
print(X_train.shape)

In [10]:
first_row_dict = X_train.iloc[0].to_dict()
print(first_row_dict)

{'Client_Income_category': 'income_band4', 'Credit_Amount_category': 'credit_band3', 'Credit_to_Income_Category': 'Upto_2_times', 'Loan_Annuity_category': 'upto_4_percent', 'Client_Family_Members_Category': 'upto_1_member', 'Car_Owned': 1, 'Bike_Owned': 0, 'Active_Loan': 1, 'Accompany_Client': 'Relative', 'Client_Income_Type': 'Service', 'Client_Education': 'Secondary', 'Client_Marital_Status': 'S', 'Loan_Contract_Type': 'CL', 'Client_Housing_Type': 'Home', 'Registration_Years_Category': 'upto_30_years', 'ID_Years_Category': 'upto_10_years', 'Employed_Days_Category': 'upto_5_years', 'Age_Days_Category': 'upto_30_years', 'House_Own': 1.0, 'Client_Occupation': 'Laborers', 'Cleint_City_Rating': 2.0, 'Application_Process_Day': 2.0, 'Application_Hour_Category': 'upto_12_hours', 'Type_Organization_Grouped': 'Business', 'Score_Source_2': 0.728828041, 'Score_Source_3': 0.466864006, 'Social_Circle_Default': 1, 'Phone_Change_category': 'one_year', 'Credit_Bureau_Category': 'two_search'}


In [11]:
import pandas as pd
import json

# Assuming X_train is a DataFrame
first_row_json = X_train.iloc[0].to_json()
print(first_row_json)


{"Client_Income_category":"income_band4","Credit_Amount_category":"credit_band3","Credit_to_Income_Category":"Upto_2_times","Loan_Annuity_category":"upto_4_percent","Client_Family_Members_Category":"upto_1_member","Car_Owned":1,"Bike_Owned":0,"Active_Loan":1,"Accompany_Client":"Relative","Client_Income_Type":"Service","Client_Education":"Secondary","Client_Marital_Status":"S","Loan_Contract_Type":"CL","Client_Housing_Type":"Home","Registration_Years_Category":"upto_30_years","ID_Years_Category":"upto_10_years","Employed_Days_Category":"upto_5_years","Age_Days_Category":"upto_30_years","House_Own":1.0,"Client_Occupation":"Laborers","Cleint_City_Rating":2.0,"Application_Process_Day":2.0,"Application_Hour_Category":"upto_12_hours","Type_Organization_Grouped":"Business","Score_Source_2":0.728828041,"Score_Source_3":0.466864006,"Social_Circle_Default":1,"Phone_Change_category":"one_year","Credit_Bureau_Category":"two_search"}


In [36]:
df = pd.read_csv('dataset.csv')
df_clean = df.dropna()
X = df_clean.drop(columns=['Default'], errors='ignore')




  df = pd.read_csv('dataset.csv')


In [37]:
first_row_json = X.iloc[0].to_json()
print(first_row_json)

{"ID":12127046,"Client_Income":"27000","Car_Owned":1.0,"Bike_Owned":0.0,"Active_Loan":0.0,"House_Own":1.0,"Child_Count":3.0,"Credit_Amount":"53366.85","Loan_Annuity":"4003.2","Accompany_Client":"Alone","Client_Income_Type":"Commercial","Client_Education":"Secondary","Client_Marital_Status":"M","Client_Gender":"Female","Loan_Contract_Type":"CL","Client_Housing_Type":"Home","Population_Region_Relative":"0.032561","Age_Days":"12323","Employed_Days":"1117","Registration_Days":"5420","ID_Days":"3900","Own_House_Age":2.0,"Mobile_Tag":1,"Homephone_Tag":0,"Workphone_Working":1,"Client_Occupation":"Laborers","Client_Family_Members":5.0,"Cleint_City_Rating":1.0,"Application_Process_Day":4.0,"Application_Process_Hour":10.0,"Client_Permanent_Match_Tag":"Yes","Client_Contact_Work_Tag":"Yes","Type_Organization":"Business Entity Type 3","Score_Source_1":0.47716857,"Score_Source_2":0.67744661,"Score_Source_3":0.581483706,"Social_Circle_Default":0.033,"Phone_Change":1805.0,"Credit_Bureau":4.0}


In [None]:
sample = preprocessor.transform(X_train[0:1])
print(sample.shape)

In [None]:
# Choosing CatBoost model with optimized parameters 
# {'learning_rate': 0.03, 'l2_leaf_reg': 5, 
# 'iterations': 300, 'depth': 4, 'border_count': 32, 'bagging_temperature': 1.0}
# Optimization metric is recall  
# Best Score achieved during training : 0.6584634763080928 i.e 65.85%
# Final Model Evaluation on Test Data and the score  is 0.6582716049382716 i.e 65.83% 
# Model is properly trained without overfitting and is ready for deployment.


# Save the model
import joblib
joblib.dump(random_search1.best_estimator_, 'catboost_model_version_1.pkl')
# Load the model
loaded_model = joblib.load('catboost_model_version_1.pkl')
# Predict using the loaded model
y_pred_loaded = loaded_model.predict(X_test_prepared)
# Evaluate the loaded model
print("Loaded Model Test Recall Score:", recall_score(y_test, y_pred_loaded))
print("Loaded Model Test Accuracy Score:", accuracy_score(y_test, y_pred_loaded))
print("Loaded Model Test F1 Score:", f1_score(y_test, y_pred_loaded))
print("Loaded Model Test ROC AUC Score:", roc_auc_score(y_test, y_pred_loaded))
print("Loaded Model Test Precision Score:", precision_score(y_test, y_pred_loaded))
print("Loaded Model Test Confusion Matrix:\n", confusion_matrix(y_test, y_pred_loaded))


In [None]:
df1 = df.copy()

In [30]:
sample = preprocessor.transform(X_train)[0:1]
print(sample.shape)

NotFittedError: This ColumnTransformer instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.

In [None]:
preprocessor.transform(X_test)[0].shape