In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier

# Location of the raw training CSV; kept in one named constant so the data
# source is easy to spot and swap.
RAW_TRAIN_CSV_URL = "https://raw.githubusercontent.com/alvarofavale/week7_ml/refs/heads/main/data/raw/train.csv"

# Untouched copy of the download; later cells derive their working frames from it.
df_original = pd.read_csv(RAW_TRAIN_CSV_URL)


In [None]:
# Work on a copy so the freshly loaded frame stays untouched.
df = df_original.copy()

# Normalise headers to snake_case: trim surrounding whitespace, lower-case,
# and turn inner spaces into underscores.
df.columns = [header.strip().lower().replace(" ", "_") for header in df.columns]

# Summary statistics of the numeric columns.
df.describe()

In [None]:
# Distinct target labels present before any cleaning.
df["credit_score"].unique()

In [None]:
# Drop every row that contains at least one missing value.
# Rebinding `df` (instead of inplace=True) leaves the previously displayed
# object unmodified and makes the data lineage explicit in this cell.
df = df.dropna()

# Column names that remain available to the cells below.
df.columns

In [None]:
# Draw a reproducible random subset of customers and keep all of their rows.
N_SAMPLED_CUSTOMERS = 1000  # upper bound on the number of distinct customers to sample
SAMPLE_SEED = 42            # fixed seed so Restart & Run All yields the same subset

# Step 1: Get the unique customer IDs
unique_customer_ids = df['customer_id'].unique()

# Step 2: Randomly sample customer IDs without replacement.
# The size is capped at the number of available IDs; asking for more than exist
# with replace=False would raise a ValueError.
sample_rng = np.random.default_rng(SAMPLE_SEED)
sample_size = min(N_SAMPLED_CUSTOMERS, len(unique_customer_ids))
sampled_customer_ids = sample_rng.choice(unique_customer_ids, size=sample_size, replace=False)

# Step 3: Keep every row belonging to a sampled customer. The result can hold
# more rows than sampled IDs if a customer appears on several rows.
df_random_1000_clients = df[df['customer_id'].isin(sampled_customer_ids)]

# Display the resulting DataFrame
df_random_1000_clients.head()


In [None]:
# Train a random forest on two features and draw its decision regions together
# with the sampled observations.

FEATURE_COLS = ['annual_income', 'outstanding_debt']
CLASS_ORDER = ['Good', 'Standard', 'Poor']  # index in this list == encoded class
CLASS_CODES = {label: code for code, label in enumerate(CLASS_ORDER)}
CLASS_COLORS = {'Good': 'green', 'Standard': 'yellow', 'Poor': 'red'}
GRID_POINTS = 300  # grid resolution per axis; keeps the grid size independent of data scale

# NOTE(review): assumes both feature columns already have a numeric dtype; the
# min()/max() arithmetic and the scaler below require it. Confirm against the raw data.
X = df_random_1000_clients[FEATURE_COLS]

# Encode the target (Good = 0, Standard = 1, Poor = 2). Fail early with a clear
# message if a label outside CLASS_ORDER shows up instead of training on a
# partially encoded target.
y = df_random_1000_clients['credit_score'].map(CLASS_CODES)
if y.isna().any():
    unexpected_labels = df_random_1000_clients.loc[y.isna(), 'credit_score'].unique()
    raise ValueError(f"Unexpected credit_score labels: {list(unexpected_labels)}")
y = y.astype(int)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Standardize the features (statistics are learned on the training split only)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train a RandomForestClassifier
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train_scaled, y_train)

# Build the evaluation grid in the original (unscaled) feature units
x_min, x_max = X['annual_income'].min() - 1000, X['annual_income'].max() + 1000
y_min, y_max = X['outstanding_debt'].min() - 1000, X['outstanding_debt'].max() + 1000
xx, yy = np.meshgrid(np.linspace(x_min, x_max, GRID_POINTS),
                     np.linspace(y_min, y_max, GRID_POINTS))

# The scaler was fitted on a DataFrame, so hand it a DataFrame with the same
# column names; this avoids sklearn's feature-name mismatch warning.
grid_frame = pd.DataFrame(np.c_[xx.ravel(), yy.ravel()], columns=FEATURE_COLS)

# Predict the class for each point in the grid
Z = clf.predict(scaler.transform(grid_frame)).reshape(xx.shape)

# Plotting
fig, ax = plt.subplots(figsize=(10, 6))

# One filled band per encoded class, coloured exactly like the scatter points
# (level edges sit halfway between the integer class codes).
ax.contourf(xx, yy, Z,
            levels=[-0.5, 0.5, 1.5, 2.5],
            colors=[CLASS_COLORS[label] for label in CLASS_ORDER],
            alpha=0.3)

# Plot the actual data points. The hue column holds the original string labels,
# so the palette must be keyed by those labels (seaborn rejects a palette dict
# that lacks an entry for a hue level).
sns.scatterplot(x='annual_income', y='outstanding_debt', hue='credit_score',
                hue_order=CLASS_ORDER, data=df_random_1000_clients,
                palette=CLASS_COLORS, s=100, edgecolor='black', ax=ax)

# Customize the plot
ax.set_title('Decision Boundary for Credit Score Prediction')
ax.set_xlabel('Annual Income')
ax.set_ylabel('Outstanding Debt')
# Only set the title: the entries come from seaborn's own handles, so label and
# colour cannot get out of sync.
ax.legend(title='Credit Score')

plt.show()


In [None]:
# Scatter of the sampled customers, coloured by credit-score label.

# First, check the unique values in the credit_score column to make sure the
# palette below covers every label.
print(df['credit_score'].unique())

# The palette is keyed by the string labels 'Good', 'Standard' and 'Poor'.
color_palette = {'Good': 'green', 'Standard': 'yellow', 'Poor': 'red'}

# NOTE(review): the original plotted `dfc`, which is not defined anywhere in the
# visible notebook and fails on a fresh kernel. The sampled frame is used
# instead; confirm this is the intended data.
fig, ax = plt.subplots()
sns.scatterplot(x='annual_income', y='outstanding_debt', hue='credit_score',
                hue_order=list(color_palette), data=df_random_1000_clients,
                palette=color_palette, s=100, edgecolor='black', ax=ax)

# Customize the plot (no decision boundary is drawn in this cell, so the title
# describes the scatter itself)
ax.set_title('Annual Income vs. Outstanding Debt by Credit Score')
ax.set_xlabel('Annual Income')
ax.set_ylabel('Outstanding Debt')
ax.legend(title='Credit Score')

plt.show()


In [None]:
# Distinct target labels in the working frame at this point of the notebook.
df["credit_score"].unique()