In [None]:
# Step 2 (Fixed): Load a working credit dataset
import pandas as pd

# This is a real, publicly hosted credit scoring dataset
url = "https://raw.githubusercontent.com/selva86/datasets/master/GermanCredit.csv"
df = pd.read_csv(url)

# Display first few rows
df.head()


Unnamed: 0,status,duration,credit_history,purpose,amount,savings,employment_duration,installment_rate,personal_status_sex,other_debtors,...,property,age,other_installment_plans,housing,number_credits,job,people_liable,telephone,foreign_worker,credit_risk
0,... < 100 DM,6,critical account/other credits existing,domestic appliances,1169,unknown/no savings account,... >= 7 years,4,male : single,none,...,real estate,67,none,own,2,skilled employee/official,1,yes,yes,1
1,0 <= ... < 200 DM,48,existing credits paid back duly till now,domestic appliances,5951,... < 100 DM,1 <= ... < 4 years,2,female : divorced/separated/married,none,...,real estate,22,none,own,1,skilled employee/official,1,no,yes,0
2,no checking account,12,critical account/other credits existing,retraining,2096,... < 100 DM,4 <= ... < 7 years,2,male : single,none,...,real estate,49,none,own,1,unskilled - resident,2,no,yes,1
3,... < 100 DM,42,existing credits paid back duly till now,radio/television,7882,... < 100 DM,4 <= ... < 7 years,2,male : single,guarantor,...,building society savings agreement/life insurance,45,none,for free,1,skilled employee/official,2,no,yes,1
4,... < 100 DM,24,delay in paying off in the past,car (new),4870,... < 100 DM,1 <= ... < 4 years,3,male : single,none,...,unknown/no property,53,none,for free,2,skilled employee/official,2,no,yes,0


In [None]:
# Step 3.1: Check for missing values
df.isnull().sum()


Unnamed: 0,0
status,0
duration,0
credit_history,0
purpose,0
amount,0
savings,0
employment_duration,0
installment_rate,0
personal_status_sex,0
other_debtors,0


In [None]:
# Step 3.2: Convert target labels (Good → 0, Bad → 1)
df['Class'] = df['Class'].map({'Good': 0, 'Bad': 1})


KeyError: 'Class'

In [None]:
# Show all column names
print(df.columns)


Index(['status', 'duration', 'credit_history', 'purpose', 'amount', 'savings',
       'employment_duration', 'installment_rate', 'personal_status_sex',
       'other_debtors', 'present_residence', 'property', 'age',
       'other_installment_plans', 'housing', 'number_credits', 'job',
       'people_liable', 'telephone', 'foreign_worker', 'credit_risk'],
      dtype='object')


In [None]:
# Step 3.2 (fixed): Convert 'credit_risk' column to numeric
df['credit_risk'] = df['credit_risk'].map({'good': 0, 'bad': 1})


In [None]:
df['credit_risk'].value_counts()


Unnamed: 0_level_0,count
credit_risk,Unnamed: 1_level_1


In [None]:
# Step 3.3: Select features and target
X = df[['Duration', 'Amount', 'Age']]   # Input features
y = df['Class']                         # Target: 0 = Good, 1 = Bad


KeyError: "None of [Index(['Duration', 'Amount', 'Age'], dtype='object')] are in the [columns]"

In [None]:
# Step 3.3 (fixed): Select input features and target
X = df[['duration', 'amount', 'age']]   # Input features (correct column names)
y = df['credit_risk']                   # Target: 0 = good, 1 = bad


In [None]:
X.head()


Unnamed: 0,duration,amount,age


In [None]:
# Clean the data by removing rows where credit_risk is missing
df = df.dropna(subset=['credit_risk'])

# Check again to confirm
df['credit_risk'].isnull().sum()


np.int64(0)

In [None]:
# Step 3.3: Select input features and target
X = df[['duration', 'amount', 'age']]   # Input features
y = df['credit_risk']                   # Target column


In [None]:
# Select input features and target again (overwrite previous X and y)
X = df[['duration', 'amount', 'age']]
y = df['credit_risk']


In [None]:
print("Missing values in y:", y.isnull().sum())


Missing values in y: 0


In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


ValueError: With n_samples=0, test_size=0.2 and train_size=None, the resulting train set will be empty. Adjust any of the aforementioned parameters.

In [None]:
# Step 4: Split data into training and testing sets
from sklearn.model_selection import train_test_split

# Split the data: 80% training, 20% testing
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [None]:
# Step 5: Train a Logistic Regression model
from sklearn.linear_model import LogisticRegression

# Create the model
model = LogisticRegression()

# Train it using the training data
model.fit(X_train, y_train)


ValueError: Input y contains NaN.

In [None]:
# Step 5: Train a Logistic Regression model
from sklearn.linear_model import LogisticRegression

# Create the model
model = LogisticRegression()

# Train the model using training data
model.fit(X_train, y_train)


ValueError: Input y contains NaN.

In [None]:
# Remove rows where credit_risk is missing
df = df.dropna(subset=['credit_risk'])


In [None]:
# Remove all rows with any missing values
df = df.dropna()


In [None]:
# Select features and target after cleaning
X = df[['duration', 'amount', 'age']]
y = df['credit_risk']


In [None]:
y.isnull().sum()


np.int64(0)

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


ValueError: With n_samples=0, test_size=0.2 and train_size=None, the resulting train set will be empty. Adjust any of the aforementioned parameters.

In [None]:
print(df.shape)


(0, 21)


In [None]:
# Drop only if any of the required columns are missing
df = df.dropna(subset=['duration', 'amount', 'age', 'credit_risk'])


In [None]:
X = df[['duration', 'amount', 'age']]
y = df['credit_risk']


In [None]:
print(X.shape, y.shape)


(0, 3) (0,)


In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


ValueError: With n_samples=0, test_size=0.2 and train_size=None, the resulting train set will be empty. Adjust any of the aforementioned parameters.

In [None]:
print("Number of rows in df:", len(df))
print(df.shape)
df.head()


Number of rows in df: 0
(0, 21)


Unnamed: 0,status,duration,credit_history,purpose,amount,savings,employment_duration,installment_rate,personal_status_sex,other_debtors,...,property,age,other_installment_plans,housing,number_credits,job,people_liable,telephone,foreign_worker,credit_risk


In [None]:
df[['duration', 'amount', 'age', 'credit_risk']].isnull().sum()


Unnamed: 0,0
duration,0
amount,0
age,0
credit_risk,0


In [None]:
url = "https://raw.githubusercontent.com/selva86/datasets/master/GermanCredit.csv"


In [None]:
import pandas as pd

# Reload clean credit dataset
url = "https://raw.githubusercontent.com/AnandDeopurkar/datasets/main/german_credit_cleaned.csv"
df = pd.read_csv(url)

# Select input and target
X = df[['duration', 'amount', 'age']]
y = df['credit_risk']


HTTPError: HTTP Error 404: Not Found

In [None]:
import pandas as pd

# Load a clean German credit dataset
url = "https://raw.githubusercontent.com/selva86/datasets/master/GermanCredit.csv"
df = pd.read_csv(url)

# Convert target to numeric
df['Class'] = df['Class'].map({'Good': 0, 'Bad': 1})

# Select features and target
X = df[['Duration', 'Amount', 'Age']]
y = df['Class']

# Confirm shapes
print(X.shape, y.shape)


KeyError: 'Class'

In [None]:
df.columns


Index(['status', 'duration', 'credit_history', 'purpose', 'amount', 'savings',
       'employment_duration', 'installment_rate', 'personal_status_sex',
       'other_debtors', 'present_residence', 'property', 'age',
       'other_installment_plans', 'housing', 'number_credits', 'job',
       'people_liable', 'telephone', 'foreign_worker', 'credit_risk'],
      dtype='object')

In [None]:
'credit_risk'


'credit_risk'

In [None]:
import pandas as pd

# Load the dataset
url = "https://raw.githubusercontent.com/selva86/datasets/master/GermanCredit.csv"
df = pd.read_csv(url)

# Check column names
print(df.columns)

# Convert target to numeric (0 for good, 1 for bad)
df['credit_risk'] = df['credit_risk'].map({'good': 0, 'bad': 1})

# Select features and target
X = df[['duration', 'amount', 'age']]
y = df['credit_risk']

# Check shapes
print("Shape of X:", X.shape)
print("Shape of y:", y.shape)


Index(['status', 'duration', 'credit_history', 'purpose', 'amount', 'savings',
       'employment_duration', 'installment_rate', 'personal_status_sex',
       'other_debtors', 'present_residence', 'property', 'age',
       'other_installment_plans', 'housing', 'number_credits', 'job',
       'people_liable', 'telephone', 'foreign_worker', 'credit_risk'],
      dtype='object')
Shape of X: (1000, 3)
Shape of y: (1000,)


In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [None]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()
model.fit(X_train, y_train)


ValueError: Input y contains NaN.

In [None]:
import pandas as pd

# Load the dataset
url = "https://raw.githubusercontent.com/selva86/datasets/master/GermanCredit.csv"
df = pd.read_csv(url)

# Check what unique values are in the target column
print("Unique values in credit_risk:", df['credit_risk'].unique())


Unique values in credit_risk: [1 0]


In [None]:
['good' 'bad']


['goodbad']

In [None]:
['Good' 'Bad']


['GoodBad']

In [None]:
# Convert target to lowercase first
df['credit_risk'] = df['credit_risk'].str.lower()

# Now map it to numbers
df['credit_risk'] = df['credit_risk'].map({'good': 0, 'bad': 1})

# Drop any rows where mapping failed (i.e., credit_risk is still NaN)
df = df.dropna(subset=['credit_risk'])


AttributeError: Can only use .str accessor with string values!

In [None]:
import pandas as pd

# Load dataset
url = "https://raw.githubusercontent.com/selva86/datasets/master/GermanCredit.csv"
df = pd.read_csv(url)

# Check the actual unique values in the credit_risk column
print("Before cleanup:", df['credit_risk'].unique())

# Convert credit_risk values to string and lowercase
df['credit_risk'] = df['credit_risk'].astype(str).str.lower()

# Map 'good' -> 0 and 'bad' -> 1
df['credit_risk'] = df['credit_risk'].map({'good': 0, 'bad': 1})

# Drop any rows where mapping failed (still NaN)
df = df.dropna(subset=['credit_risk'])

# Confirm result
print("After cleanup:", df['credit_risk'].unique())


Before cleanup: [1 0]
After cleanup: []


In [None]:
# Select input and output
X = df[['duration', 'amount', 'age']]
y = df['credit_risk']


In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


ValueError: With n_samples=0, test_size=0.2 and train_size=None, the resulting train set will be empty. Adjust any of the aforementioned parameters.

In [None]:
print("Shape of df:", df.shape)


Shape of df: (0, 21)


In [None]:
print(df['credit_risk'].unique())


[]


In [None]:
df = pd.read_csv("https://raw.githubusercontent.com/selva86/datasets/master/GermanCredit.csv")
print("credit_risk unique values (original):", df['credit_risk'].unique())


credit_risk unique values (original): [1 0]


In [None]:
import pandas as pd

# Load dataset — do NOT apply mapping!
url = "https://raw.githubusercontent.com/selva86/datasets/master/GermanCredit.csv"
df = pd.read_csv(url)

# Select input features and target
X = df[['duration', 'amount', 'age']]
y = df['credit_risk']

# Check shape to confirm it's not empty
print("X shape:", X.shape)
print("y shape:", y.shape)


X shape: (1000, 3)
y shape: (1000,)


In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [None]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()
model.fit(X_train, y_train)


In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report

# Predict on test set
y_pred = model.predict(X_test)

# Print metrics
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print("F1 Score:", f1_score(y_test, y_pred))
print("ROC-AUC Score:", roc_auc_score(y_test, y_pred))

# Optional: Full classification report
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.735
Precision: 0.7315789473684211
Recall: 0.9858156028368794
F1 Score: 0.8398791540785498
ROC-AUC Score: 0.5607044115879313

Classification Report:
               precision    recall  f1-score   support

           0       0.80      0.14      0.23        59
           1       0.73      0.99      0.84       141

    accuracy                           0.73       200
   macro avg       0.77      0.56      0.54       200
weighted avg       0.75      0.73      0.66       200



In [None]:
from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

y_pred_rf = rf_model.predict(X_test)

# Re-evaluate
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred_rf))


              precision    recall  f1-score   support

           0       0.38      0.20      0.26        59
           1       0.72      0.86      0.78       141

    accuracy                           0.67       200
   macro avg       0.55      0.53      0.52       200
weighted avg       0.62      0.67      0.63       200



In [None]:
['installment_rate', 'present_residence', 'number_credits', 'job', 'people_liable']


['installment_rate',
 'present_residence',
 'number_credits',
 'job',
 'people_liable']

In [None]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Then do train-test split again


In [None]:
import joblib

# Save the trained Logistic Regression model to a file
joblib.dump(model, 'credit_score_model.pkl')

print("✅ Model saved as credit_score_model.pkl")


✅ Model saved as credit_score_model.pkl


In [None]:
# Load the model
loaded_model = joblib.load('credit_score_model.pkl')

# Example: make predictions with it
sample_prediction = loaded_model.predict(X_test[:5])
print("Sample predictions:", sample_prediction)


Sample predictions: [1 1 1 1 1]


In [None]:
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)


In [None]:
joblib.dump(rf_model, 'credit_score_rf_model.pkl')


['credit_score_rf_model.pkl']