In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# Read the credit-risk records from the CSV file in the working directory.
csv_path = 'credit_risk_data.csv'
data = pd.read_csv(csv_path)

# Print the leading rows (DataFrame.head with its default row count) as a
# quick sanity check of what was loaded.
preview = data.head()
print(preview)

# Step 1: Data Cleaning
# Fill missing values (if any) with each column's most frequent value.
# NOTE(review): the imputer and the encoders below are fitted on the full
# dataset, i.e. before the train/test split performed later in this script,
# so test rows influence the fitted statistics. Consider fitting them on the
# training portion only.
imputer = SimpleImputer(strategy='most_frequent')
# On a frame with mixed column types the imputer hands back one object-typed
# array, so every rebuilt column would come out as `object`. Restoring the
# original dtypes keeps numeric columns numeric; otherwise the loop below
# would label-encode them as well and replace their values with rank codes.
data_cleaned = pd.DataFrame(
    imputer.fit_transform(data),
    columns=data.columns,
    index=data.index,
).astype(data.dtypes.to_dict())

# Encode categorical (object-typed) feature columns as integer codes, keeping
# one fitted encoder per column so the mapping can be reused later.
label_encoders = {}
for column in data_cleaned.select_dtypes(include=['object']).columns:
    if column == 'class':
        # The target variable gets its own dedicated encoder below.
        continue
    le = LabelEncoder()
    data_cleaned[column] = le.fit_transform(data_cleaned[column])
    label_encoders[column] = le

# Encode the target variable
le_class = LabelEncoder()
data_cleaned['class'] = le_class.fit_transform(data_cleaned['class'])

# Step 2: Feature Selection and Scaling
# The encoded 'class' column is the target; every remaining column is a feature.
y = data_cleaned['class']
X = data_cleaned.drop(columns=['class'])

# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Learn the scaling statistics from the training rows only, then apply that
# same transformation to both partitions.
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Step 3: Hyperparameter Tuning using GridSearchCV
# Candidate settings: K from 3 through 20, two distance metrics, and
# uniform vs distance-based neighbour weighting.
param_grid = dict(
    n_neighbors=range(3, 21),
    metric=['euclidean', 'manhattan'],
    weights=['uniform', 'distance'],
)

# Base KNN estimator whose hyperparameters are searched.
knn = KNeighborsClassifier()

# Search every combination with 5-fold cross-validation, scored by accuracy,
# using the training partition only.
grid_search = GridSearchCV(
    estimator=knn,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
)
grid_search.fit(X_train, y_train)

# Report the winning combination and its cross-validated score.
print("Best Parameters:", grid_search.best_params_)
print("Best Cross-Validation Accuracy:", grid_search.best_score_)

# Step 4: Model Training with Best Parameters
# Take the estimator that the grid search selected as best.
best_knn = grid_search.best_estimator_

# Predict labels for the held-out test rows.
y_pred = best_knn.predict(X_test)

# Step 5: Model Evaluation
# Compute each metric on the test partition, then print them in order.
test_accuracy = accuracy_score(y_test, y_pred)
test_confusion = confusion_matrix(y_test, y_pred)
test_report = classification_report(y_test, y_pred)

print("Test Set Accuracy:", test_accuracy)
print("Confusion Matrix:\n", test_confusion)
print("Classification Report:\n", test_report)

# Optional: Save the best model together with every fitted preprocessing
# object, so the same transformations can be re-applied to new data later.
import joblib
joblib.dump(best_knn, 'credit_risk_model_knn_best.pkl')
joblib.dump(label_encoders, 'label_encoders_knn_best.pkl')
# The target encoder and the imputer are fitted earlier in this script too.
# Without them the model's integer predictions cannot be mapped back to the
# original class labels, and new rows cannot be imputed the same way.
joblib.dump(le_class, 'label_encoder_class_knn_best.pkl')
joblib.dump(imputer, 'imputer_knn_best.pkl')
# Kept last so the cell's final expression value is unchanged.
joblib.dump(scaler, 'scaler_knn_best.pkl')


  checking_status  duration                  credit_history  \
0              <0       6.0  critical/other existing credit   
1        0<=X<200      48.0                   existing paid   
2     no checking      12.0  critical/other existing credit   
3              <0      42.0                   existing paid   
4              <0      24.0              delayed previously   

               purpose  credit_amount    savings_status employment  \
0             radio/tv         1169.0  no known savings        >=7   
1             radio/tv         5951.0              <100     1<=X<4   
2            education         2096.0              <100     4<=X<7   
3  furniture/equipment         7882.0              <100     4<=X<7   
4              new car         4870.0              <100     1<=X<4   

   installment_commitment     personal_status other_parties  ...  \
0                     4.0         male single          none  ...   
1                     2.0  female div/dep/mar          none  ...

['scaler_knn_best.pkl']