## German Credit Risk

Required Libraries and Data:

In [65]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

import warnings 
warnings.filterwarnings('ignore')

In [67]:
df = pd.read_csv('german_credit_data.csv')

In [69]:
df

Unnamed: 0.1,Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,Risk
0,0,67,male,2,own,,little,1169,6,radio/TV,good
1,1,22,female,2,own,little,moderate,5951,48,radio/TV,bad
2,2,49,male,1,own,little,,2096,12,education,good
3,3,45,male,2,free,little,little,7882,42,furniture/equipment,good
4,4,53,male,2,free,little,little,4870,24,car,bad
...,...,...,...,...,...,...,...,...,...,...,...
995,995,31,female,1,own,little,,1736,12,furniture/equipment,good
996,996,40,male,3,own,little,little,3857,30,car,good
997,997,38,male,2,own,little,,804,12,radio/TV,good
998,998,23,male,2,free,little,little,1845,45,radio/TV,bad


In [71]:
df.columns

Index(['Unnamed: 0', 'Age', 'Sex', 'Job', 'Housing', 'Saving accounts',
       'Checking account', 'Credit amount', 'Duration', 'Purpose', 'Risk'],
      dtype='object')

In [None]:
df.shape

In [None]:
df.info()

In [None]:
##  there are many non Numeric Columns in dataset


In [None]:
# Get all column names
columns = df.columns

# Initialize an empty list for object-type columns
obj_col = []

# Loop through columns and check for object type
for col in columns:
    if df[col].dtype == 'O':  
        obj_col.append(col)

# Print the list of object columns
print(obj_col)


In [None]:
# Initialize an empty list for numeric columns
numeric_col = []

# Loop through columns and check for numeric types
for col in columns:
    if df[col].dtype in ['int64', 'float64']:  # Check for numeric types
        numeric_col.append(col)

# Print the list of numeric columns
print(numeric_col)


## Data Cleaning

In [None]:
## Drop the "Unnamed columns" no need of it

df.drop(["Unnamed: 0"], axis=1, inplace=True)


In [None]:
df.head()

In [None]:
df['Saving accounts'].unique()

In [None]:
df.isnull().sum()

In [None]:
df.duplicated().sum()

In [None]:
# Impute missing values for 'Saving accounts' and 'Checking account' with the mode
## df['Saving accounts'].fillna(df['Saving accounts'].mode()[0], inplace=True)
## df['Checking account'].fillna(df['Checking account'].mode()[0], inplace=True)

# Alternatively, drop rows with missing values
# df.dropna(subset=['Saving accounts', 'Checking account'], inplace=True)
##  but after droping this columns our data becomes in (522, 10) shape so this is not good idea

In [None]:
## try another methode to impute missing value because model is not giving better accurecy


In [None]:
# Manually encoding 'Saving accounts' column
saving_accounts_mapping = {'little': 0, 'moderate': 1, 'rich': 2, 'quite rich': 3}
df['Saving accounts'] = df['Saving accounts'].map(saving_accounts_mapping)

# Manually encoding 'Checking account' column
checking_account_mapping = {'little': 0, 'moderate': 1, 'rich': 2}
df['Checking account'] = df['Checking account'].map(checking_account_mapping)


In [None]:
from sklearn.impute import KNNImputer

# Initialize KNN imputer with n_neighbors
imputer = KNNImputer(n_neighbors=5)

# Impute the missing values
df[['Saving accounts', 'Checking account']] = imputer.fit_transform(df[['Saving accounts', 'Checking account']])


In [None]:
df.shape

In [None]:
df.isnull().sum()

## Descriptive Statistics

In [None]:
df.describe()

In [None]:
# Get the frequency distribution for categorical columns
print(df['Sex'].value_counts())
print()
print(df['Housing'].value_counts())
print()
print(df['Saving accounts'].value_counts())
print()
print(df['Checking account'].value_counts())

## Data Visualization

In [None]:
# Histogram for numerical features
df[['Age', 'Job', 'Credit amount', 'Duration']].hist(bins=20, figsize=(10, 6))
plt.show()



In [None]:
# Box plot for numerical features to check for outliers
plt.figure(figsize=(10, 6))
sns.boxplot(data=df[['Age', 'Job', 'Credit amount', 'Duration']])
plt.show()

In [None]:
## outliers in Credit amount

In [None]:
# Bar plot for categorical features
plt.figure(figsize=(10, 6))
sns.countplot(x='Sex', data=df)
plt.show()

plt.figure(figsize=(10, 6))
sns.countplot(x='Housing', data=df)
plt.show()

plt.figure(figsize=(10, 6))
sns.countplot(x='Risk', data=df)
plt.show()


In [None]:
## imbalance data (good/bad)

In [None]:
# Scatter plot between two numerical features
sns.scatterplot(x='Age', y='Credit amount', data=df)
plt.show()




In [None]:
## Age between 20 and 30  have credit amount 2500 and most of people belong to this age gap

In [None]:
# Correlation heatmap for numerical columns
correlation = df[['Age', 'Job', 'Credit amount', 'Duration']].corr()
plt.figure(figsize=(8, 6))
sns.heatmap(correlation, annot=True, cmap='coolwarm')
plt.show()

In [None]:
# Box plot between numerical and categorical variables
sns.boxplot(x='Risk', y='Credit amount', data=df)
plt.show()



In [None]:
## risk free (good)  credit amount mostly lie between 2500 and 5000

In [None]:
sns.violinplot(x='Risk', y='Age', data=df)
plt.show()


In [None]:
# Cross-tabulation between two categorical features
print(pd.crosstab(df['Sex'], df['Housing']))

# Stacked bar plot
pd.crosstab(df['Sex'], df['Housing']).plot(kind='bar', stacked=True, figsize=(10, 6))
plt.show()


In [None]:
## most of house own by male

In [None]:
sns.pairplot(df[['Age', 'Credit amount', 'Duration']])
plt.show()


In [None]:
# Boxplot to check for outliers
sns.boxplot(data=df[['Age', 'Job', 'Credit amount', 'Duration']])
plt.show()

# Z-score method to identify outliers
from scipy import stats
import numpy as np

z_scores = np.abs(stats.zscore(df[['Age', 'Credit amount', 'Duration']]))
outliers = (z_scores > 3).sum(axis=0)
print("Outliers detected in columns:", outliers)


In [None]:
# Check skewness of numeric columns
print(df[['Age', 'Job', 'Credit amount', 'Duration']].skew())

# Apply log transformation or other methods if needed
df['Credit amount'] = np.log1p(df['Credit amount'])  # Log transformation to handle skewness


In [None]:
df.head()

In [None]:
df['Checking account'].unique()

In [None]:
df.head()

In [None]:
labels = ("student", "young", "adult", "senior")
groups = pd.cut(df["Age"], labels=labels, bins=(18, 25, 35, 60, 120), ordered=True)
df["Age group"] = groups

sns.countplot(data=df, y="Age group", hue="Sex")
plt.show()

In [None]:
fig = px.histogram(data_frame=df, x="Purpose", color="Sex", histnorm="percent", barmode="group", width=800)
fig.update_layout(title="Distribution of Purpose by Sex", title_x=0.5)
fig.show()

In [None]:
fig = px.box(data_frame=df, x="Purpose", y="Credit amount", color="Sex", width=900)
fig.update_layout(title="Credit amount by Purpose and Risk", title_x=0.5)
fig.show()


In [None]:
fig, (ax0, ax1) = plt.subplots(1, 2, figsize=(8, 4))
sns.boxplot(data=df, x="Risk", y="Credit amount", ax=ax0)
sns.boxplot(data=df, x="Risk", y="Duration", ax=ax1)
fig.suptitle("Credit amount and duration influence on credit risk")
fig.show()

In [None]:
fig, (ax0, ax1, ax2, ax3) = plt.subplots(1, 4, figsize=(16, 4))
sns.histplot(data=df, x="Age", hue="Risk", multiple="fill", bins=6, ax=ax0)
sns.histplot(data=df, x="Sex", hue="Risk", multiple="fill", ax=ax1)
sns.histplot(data=df, x="Purpose", hue="Risk", multiple="fill", ax=ax2)
sns.histplot(data=df, x="Housing", hue="Risk", multiple="fill", ax=ax3)
ax2.set_xticklabels(ax2.get_xticklabels(), rotation= 90)
fig.suptitle("Risk proportion by age and sex")
plt.show()

In [None]:
df.head(10)

## 1. Data Preprocessing


In [None]:
print(df.columns.tolist())


In [None]:
df.columns = df.columns.str.strip()


In [None]:
categorical_cols = ['Sex', 'Housing', 'Saving accounts', 'Checking account', 'Purpose', 'Age group']

for col in categorical_cols:
    if col in df.columns:
        print(f"Unique values in {col}: {df[col].unique()}")
    else:
        print(f"Column {col} not found in the DataFrame.")


In [None]:
## try to change the incoding methods of features to get get better model

In [None]:
!pip install category_encoders

In [None]:
from sklearn.preprocessing import LabelEncoder

# Binary Encoding for 'Sex'
df['Sex'] = df['Sex'].apply(lambda x: 1 if x == 'male' else 0)
df['Risk'] = df['Risk'].apply(lambda x: 1 if x == 'good' else 0)

# Columns to apply Label Encoding
label_encoding_cols = ['Housing', 'Saving accounts', 'Checking account', 'Purpose', 'Age group']

# Initialize Label Encoder
label_encoder = LabelEncoder()

# Apply Label Encoding to each column
for col in label_encoding_cols:
    df[col] = label_encoder.fit_transform(df[col])

# Verify the encoding
df.head()

In [None]:
df['Risk'].unique()

In [None]:
print(type(df))


In [None]:
df.columns


In [None]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
cols = ['Age', 'Sex','Job','Housing','Saving accounts','Checking account','Purpose','Age group','Credit amount', 'Duration']  # Columns to scale
df[cols] = scaler.fit_transform(df[cols])


In [None]:
df.head()

In [None]:
fig, axes = plt.subplots(2, 3, figsize=(15, 10))  # Adjust the grid size based on your number of features
axes = axes.flatten()

# Plot scatter plots for each numerical feature
numerical_features = ['Age', 'Credit amount', 'Duration', 'Saving accounts', 'Checking account', 'Housing']
for i, feature in enumerate(numerical_features):
    sns.scatterplot(x=df[feature], y=df['Risk'], ax=axes[i])
    axes[i].set_title(f'Scatter Plot: {feature} vs Risk')
    axes[i].set_xlabel(feature)
    axes[i].set_ylabel('Risk')

# Adjust layout for better visualization
plt.tight_layout()
plt.show()

In [None]:
## No Linear Relationship with target veriable

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant

# Step 1: Select the independent variables (exclude the target variable 'Risk')
X = df.drop(columns=['Risk'])

# Step 2: Add a constant to the data (for the intercept in the VIF calculation)
X = add_constant(X)

# Step 3: Calculate the VIF for each feature
vif_data = pd.DataFrame()
vif_data["feature"] = X.columns
vif_data["VIF"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]

# Step 4: Display the results
print(vif_data)

In [None]:
# VIF is below 5 so no problem

## MODEL TRAINING

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV
X = df.drop('Risk', axis=1)  # Features
y = df['Risk']  # Target variable

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [None]:
print(df['Risk'].value_counts(normalize=True))


In [None]:
## Imbalance DataSet

In [None]:
import sklearn

In [None]:
!pip install --upgrade imbalanced-learn


In [None]:
print(sklearn.__version__)

In [None]:
from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)


In [None]:
X_resampled.shape

In [None]:
y_resampled.shape

In [None]:
# Check the distribution of the target variable after SMOTE
from collections import Counter

print("Class distribution after SMOTE:", Counter(y_resampled))


In [None]:
print(df['Risk'].value_counts(normalize=True))

In [None]:
from sklearn.metrics import classification_report, roc_auc_score,confusion_matrix

## 1 Random Forest Classifier

## 2 Gradient Boosting Classifier (XGBoost)

## 3 Support Vector Machine (SVM)

## 4 K-Nearest Neighbors (KNN)

## 5 Decision Tree Classifier

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

# Prepare data
X = df.drop(columns=['Risk'])
y = df['Risk']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# 1 Random Forest Classifier¶

In [None]:
# Hyperparameter grid for Random Forest
param_grid_rf = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

rf_model = RandomForestClassifier(random_state=42)
grid_search_rf = GridSearchCV(estimator=rf_model, param_grid=param_grid_rf, cv=5, n_jobs=-1, verbose=2)
grid_search_rf.fit(X_train, y_train)

# Best parameters and evaluation
best_rf = grid_search_rf.best_estimator_
y_pred_rf = best_rf.predict(X_test)

print("Random Forest Best Params:", grid_search_rf.best_params_)
print("Random Forest Accuracy:", accuracy_score(y_test, y_pred_rf))
print(classification_report(y_test, y_pred_rf))

# 2 Gradient Boosting Classifier (XGBoost)

In [None]:
!pip install xgboost


In [None]:
# Hyperparameter grid for XGBoost
param_grid_xgb = {
    'learning_rate': [0.01, 0.1, 0.2],
    'n_estimators': [100, 200],
    'max_depth': [3, 5, 7],
    'subsample': [0.8, 1],
    'colsample_bytree': [0.8, 1]
}

xgb_model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')
grid_search_xgb = GridSearchCV(estimator=xgb_model, param_grid=param_grid_xgb, cv=3, n_jobs=-1, verbose=2)
grid_search_xgb.fit(X_train, y_train)

# Best parameters and evaluation
best_xgb = grid_search_xgb.best_estimator_
y_pred_xgb = best_xgb.predict(X_test)

print("XGBoost Best Params:", grid_search_xgb.best_params_)
print("XGBoost Accuracy:", accuracy_score(y_test, y_pred_xgb))
print(classification_report(y_test, y_pred_xgb))


## 3. Support Vector Machine (SVM) with Hyperparameter Tuning

In [None]:
# Hyperparameter grid for SVM
param_grid_svm = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'rbf'],
    'gamma': ['scale', 'auto'],
    'degree': [3, 4, 5]
}

svm_model = SVC(random_state=42)
grid_search_svm = GridSearchCV(estimator=svm_model, param_grid=param_grid_svm, cv=3, n_jobs=-1, verbose=2)
grid_search_svm.fit(X_train, y_train)

# Best parameters and evaluation
best_svm = grid_search_svm.best_estimator_
y_pred_svm = best_svm.predict(X_test)

print("SVM Best Params:", grid_search_svm.best_params_)
print("SVM Accuracy:", accuracy_score(y_test, y_pred_svm))
print(classification_report(y_test, y_pred_svm))


## 4. K-Nearest Neighbors (KNN) with Hyperparameter Tuning


In [None]:
# Hyperparameter grid for KNN
param_grid_knn = {
    'n_neighbors': [3, 5, 7, 9],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan', 'minkowski']
}

knn_model = KNeighborsClassifier()
grid_search_knn = GridSearchCV(estimator=knn_model, param_grid=param_grid_knn, cv=3, n_jobs=-1, verbose=2)
grid_search_knn.fit(X_train, y_train)

# Best parameters and evaluation
best_knn = grid_search_knn.best_estimator_
y_pred_knn = best_knn.predict(X_test)

print("KNN Best Params:", grid_search_knn.best_params_)
print("KNN Accuracy:", accuracy_score(y_test, y_pred_knn))
print(classification_report(y_test, y_pred_knn))


## 5. Decision Tree Classifier with Hyperparameter Tuning

In [None]:
# Hyperparameter grid for Decision Tree
param_grid_dt = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['auto', 'sqrt', 'log2']
}

dt_model = DecisionTreeClassifier(random_state=42)
grid_search_dt = GridSearchCV(estimator=dt_model, param_grid=param_grid_dt, cv=3, n_jobs=-1, verbose=2)
grid_search_dt.fit(X_train, y_train)

# Best parameters and evaluation
best_dt = grid_search_dt.best_estimator_
y_pred_dt = best_dt.predict(X_test)

print("Decision Tree Best Params:", grid_search_dt.best_params_)
print("Decision Tree Accuracy:", accuracy_score(y_test, y_pred_dt))
print(classification_report(y_test, y_pred_dt))


In [None]:
from sklearn.ensemble import GradientBoostingClassifier
# Define the model
gb = GradientBoostingClassifier()

# Hyperparameters grid to search
param_grid = {
    'n_estimators': [50, 100, 200, 300],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'max_depth': [3, 4, 5, 6],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Setup GridSearchCV
grid_search = GridSearchCV(estimator=gb, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2, scoring='accuracy')

# Fit GridSearchCV
grid_search.fit(X_train, y_train)

# Get the best parameters
print("Best Hyperparameters: ", grid_search.best_params_)

# Use the best estimator
best_gb = grid_search.best_estimator_

# Evaluate the model on the test set
y_pred = best_gb.predict(X_test)

# Calculate the accuracy
from sklearn.metrics import accuracy_score
print("Accuracy: ", accuracy_score(y_test, y_pred))

In [None]:
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Define individual models
model1 = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1)
model2 = RandomForestClassifier(n_estimators=100, max_depth=5)
model3 = SVC(kernel='linear', probability=True)
model4 = LogisticRegression()

# Create a voting classifier (soft voting: predicted probability, can use hard for majority voting)
voting_clf = VotingClassifier(estimators=[('gb', model1), ('rf', model2), ('svc', model3), ('lr', model4)], voting='soft')

# Train the voting classifier
voting_clf.fit(X_train, y_train)

# Predict using the voting classifier
y_pred = voting_clf.predict(X_test)

# Calculate accuracy
print("Voting Classifier Accuracy: ", accuracy_score(y_test, y_pred))


In [None]:
from sklearn.ensemble import StackingClassifier

estimators = [
    ('rf', RandomForestClassifier(random_state=42)),
    ('gb', GradientBoostingClassifier(random_state=42)),
    ('xgb', XGBClassifier(random_state=42))
]
clf = StackingClassifier(estimators=estimators, final_estimator=LogisticRegression())
clf.fit(X_train, y_train)


In [None]:
!pip install xgboost
from xgboost import XGBClassifier

In [None]:
# Make predictions
y_pred = clf.predict(X_test)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

In [None]:
from pymongo.mongo_client import MongoClient
import pandas as pd
import json
import os
from dotenv import load_dotenv
# Step 1: Load environment variables from the .env file
load_dotenv()

# Retrieve MongoDB URL, Database Name, and Collection Name from environment variables
uri = "mongodb+srv://acashtech28:akash123@sambhajinagar.a2pwo.mongodb.net/?retryWrites=true&w=majority&appName=sambhajinagar"
DATABASE_NAME = "German_Bank"
COLLECTION_NAME = "credit_risk"

# Step 2: Ensure the MongoDB URI is available
if uri is None:
    raise ValueError("MONGO_DB_URL environment variable is not set.")
if DATABASE_NAME is None:
    raise ValueError("MONGO_DATABASE_NAME environment variable is not set.")
if COLLECTION_NAME is None:
    raise ValueError("MONGO_COLLECTION_NAME environment variable is not set.")

# Step 3: Establish connection with MongoDB
client = MongoClient(uri)

# Step 4: Path to the CSV file you want to upload
csv_file_path = r"german_credit_data.csv"  # Update the path if necessary

# Step 5: Read CSV file into a Pandas DataFrame
try:
    df = pd.read_csv(csv_file_path)
except FileNotFoundError as e:
    print(f"Error: {e}")
    exit(1)

# Step 6: Check and drop any unnamed columns (typically, index columns in CSV files)
if "Unnamed: 0" in df.columns:
    df = df.drop("Unnamed: 0", axis=1)

# Step 7: Convert the DataFrame to JSON format that MongoDB accepts (list of dictionaries)
json_record = df.to_dict(orient="records")

# Step 8: Insert the records into MongoDB collection
try:
    # Insert data into the specified collection
    client[DATABASE_NAME][COLLECTION_NAME].insert_many(json_record)
    print("Data inserted successfully.")
except Exception as e:
    print(f"Error inserting data into MongoDB: {e}")


## data preprocessing pipeline

In [9]:
df = pd.read_csv('german_credit_data.csv')

In [11]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder, RobustScaler
from sklearn.impute import KNNImputer
import pandas as pd

class LabelEncoderTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X_copy = X.copy()
        for col in self.columns:
            if col in X_copy.columns:
                le = LabelEncoder()
                X_copy[col] = le.fit_transform(X_copy[col])
        return X_copy

class BinaryEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X_copy = X.copy()
        for col in self.columns:
            if col in X_copy.columns:
                X_copy[col] = X_copy[col].apply(lambda x: 1 if x == 'good' or x == 'male' else 0)
        return X_copy

# ... (rest of the pipeline and transformation code)

label_encode_columns = ['Housing', 'Saving accounts', 'Checking account', 'Purpose']

# Define the pipeline steps
from sklearn.impute import SimpleImputer

pipeline_steps = [
    ('label_encoder', LabelEncoderTransformer(columns=label_encode_columns)),
    ('binary_encoder', BinaryEncoder(columns=['Sex', 'Risk'])),
    ('imputer', SimpleImputer(strategy='mean')),  # Use SimpleImputer
    ('scaler', RobustScaler())
]
# Initialize the pipeline
pipeline = Pipeline(pipeline_steps)

# Apply the pipeline to the data
df_transformed = pipeline.fit_transform(df)  # df is your original dataframe

# Convert transformed data back to a DataFrame for easier inspection
df_transformed = pd.DataFrame(df_transformed, columns=df.columns) 

# Output the transformed DataFrame
df_transformed


# import pandas as pd

# # Assuming your data string is stored in a variable named 'data_string'
# data_list = data_string.split('\n')

# # Define your desired column names
# column_names = ['Age', 'Sex', 'Job', 'Housing', 'Saving accounts', 'Checking account', 'Credit amount', 'Duration', 'Purpose', 'Risk']  # Replace with your actual column names

# # Create a DataFrame with the specified column names
# df = pd.DataFrame([x.split() for x in data_list], columns=column_names)

# # Now, your DataFrame will have the desired column names
# print(df.head())


Unnamed: 0.1,Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,Risk
0,-1.000000,2.266667,0.0,0.0,0.0,2.0,-0.333333,-0.441354,-1.00,0.50,0.0
1,-0.997998,-0.733333,-1.0,0.0,0.0,0.0,0.000000,1.393114,2.50,0.50,-1.0
2,-0.995996,1.066667,0.0,-1.0,0.0,0.0,0.666667,-0.085739,-0.50,0.00,0.0
3,-0.993994,0.800000,0.0,0.0,-1.0,0.0,-0.333333,2.133883,2.00,0.25,0.0
4,-0.991992,1.333333,0.0,0.0,-1.0,0.0,-0.333333,0.978421,0.50,-0.50,-1.0
...,...,...,...,...,...,...,...,...,...,...,...
995,0.991992,-0.133333,-1.0,-1.0,0.0,0.0,0.666667,-0.223842,-0.50,0.25,0.0
996,0.993994,0.466667,0.0,1.0,0.0,0.0,-0.333333,0.589815,1.00,-0.50,0.0
997,0.995996,0.333333,0.0,0.0,0.0,0.0,0.666667,-0.581375,-0.50,0.50,0.0
998,0.997998,-0.666667,0.0,0.0,-1.0,0.0,-0.333333,-0.182027,2.25,0.50,-1.0
