In [1]:
import pandas as pd
import numpy as np

# Set a random seed for reproducibility
np.random.seed(42)

# Number of samples
n_samples = 1000

# Generate synthetic data
data = pd.DataFrame({
    'age': np.random.randint(18, 70, n_samples),
    'income': np.random.randint(20000, 150000, n_samples),
    'debt': np.random.randint(0, 50000, n_samples),
    'credit_history_length': np.random.randint(1, 30, n_samples),
    'num_of_credit_accounts': np.random.randint(1, 10, n_samples),
    'payment_history': np.random.rand(n_samples) * 100,
    'credit_score': np.random.randint(0, 2, n_samples)  # 0 or 1
})

# Display the first few rows of the dataset
data.head()


Unnamed: 0,age,income,debt,credit_history_length,num_of_credit_accounts,payment_history,credit_score
0,56,125186,18546,17,2,16.036541,0
1,69,54674,19129,5,3,67.364542,0
2,46,55854,1591,20,6,17.925326,1
3,32,66271,11303,21,9,69.394959,0
4,60,93688,30561,8,9,22.959816,0


In [2]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Check for missing values
missing_values = data.isnull().sum()

# Standardize the numerical features
features = data.drop('credit_score', axis=1)
target = data['credit_score']

scaler = StandardScaler()
features_scaled = scaler.fit_transform(features)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features_scaled, target, test_size=0.3, random_state=42)

missing_values, X_train.shape, X_test.shape


(age                       0
 income                    0
 debt                      0
 credit_history_length     0
 num_of_credit_accounts    0
 payment_history           0
 credit_score              0
 dtype: int64,
 (700, 6),
 (300, 6))

In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# Initialize the models
log_reg = LogisticRegression(random_state=42)
decision_tree = DecisionTreeClassifier(random_state=42)
random_forest = RandomForestClassifier(random_state=42)

# Train the models
log_reg.fit(X_train, y_train)
decision_tree.fit(X_train, y_train)
random_forest.fit(X_train, y_train)

# Predict on the test set
y_pred_log_reg = log_reg.predict(X_test)
y_pred_decision_tree = decision_tree.predict(X_test)
y_pred_random_forest = random_forest.predict(X_test)

# Evaluate the models
def evaluate_model(y_test, y_pred):
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred)
    return accuracy, precision, recall, f1, roc_auc

log_reg_metrics = evaluate_model(y_test, y_pred_log_reg)
decision_tree_metrics = evaluate_model(y_test, y_pred_decision_tree)
random_forest_metrics = evaluate_model(y_test, y_pred_random_forest)

log_reg_metrics, decision_tree_metrics, random_forest_metrics


((0.5066666666666667,
  0.4959349593495935,
  0.41496598639455784,
  0.45185185185185184,
  0.5048686141123117),
 (0.55,
  0.538961038961039,
  0.564625850340136,
  0.5514950166112957,
  0.5502867813792183),
 (0.56, 0.5641025641025641, 0.4489795918367347, 0.5, 0.5578231292517007))

In [9]:
from sklearn.model_selection import GridSearchCV

# Hyperparameter grid for Random Forest
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Initialize Grid Search
grid_search = GridSearchCV(estimator=random_forest, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1, verbose=2)

# Perform Grid Search
grid_search.fit(X_train, y_train)

# Best parameters and best score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

best_params, best_score


Fitting 5 folds for each of 108 candidates, totalling 540 fits


({'max_depth': None,
  'min_samples_leaf': 2,
  'min_samples_split': 2,
  'n_estimators': 200},
 0.5842857142857143)

In [10]:
from sklearn.model_selection import GridSearchCV

# Hyperparameter grid for Random Forest
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Initialize Grid Search
grid_search = GridSearchCV(estimator=random_forest, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1, verbose=2)

# Perform Grid Search
grid_search.fit(X_train, y_train)

# Best parameters and best score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

best_params, best_score


Fitting 5 folds for each of 108 candidates, totalling 540 fits


({'max_depth': None,
  'min_samples_leaf': 2,
  'min_samples_split': 2,
  'n_estimators': 200},
 0.5842857142857143)