In [None]:
import pandas as pd
import seaborn as sns
import statistics as stat
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor

from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

# Location of the raw training CSV (fetched over HTTP every time this cell runs).
DATA_URL = "https://raw.githubusercontent.com/alvarofavale/week7_ml/refs/heads/main/data/raw/train.csv"

# Keep the loaded frame untouched; the next cell works on a copy of it.
df_original = pd.read_csv(filepath_or_buffer=DATA_URL)


In [None]:
# Work on a copy so that df_original stays exactly as loaded.
df = df_original.copy()

# Normalise the headers: trim whitespace, lower-case, spaces -> underscores.
df.columns = [
    header.strip().lower().replace(" ", "_")
    for header in df.columns
]

# Quick statistical summary of the frame.
df.describe()

In [None]:
# Keep only the rows whose customer_id equals 3392 and display them.
dfc = df.loc[df["customer_id"].eq(3392)]
dfc

In [None]:
# Distinct values found in the credit_score column.
df["credit_score"].unique()

In [None]:
# Show the column labels currently on df (already lower-cased / underscored above).
df.columns

In [None]:
# Remove the "ssn" and "name" columns (presumably personal identifiers with no
# predictive use — confirm). Reassigning instead of inplace=True, together with
# errors="ignore", lets this cell be re-run after the columns are already gone
# without raising KeyError. `axis` is not needed when `columns=` is given.
df = df.drop(columns=["ssn", "name"], errors="ignore")

In [None]:
# drop_duplicates() returns a NEW frame; without the assignment the duplicated
# rows stay in df and every later cell still sees them.
df = df.drop_duplicates()

# Row/column count after de-duplication.
df.shape

In [None]:
# Work on a random 1,000-row subset. The fixed random_state makes the sample —
# and therefore every split, fit and metric derived from it — reproducible
# across kernel restarts.
df_limit = df.sample(n=1000, random_state=0)
#sns.pairplot(df_limit, hue="credit_score")

In [None]:
# Target: the credit_score column of the sampled rows.
target = df_limit["credit_score"]

# Predictors: every other column of the sample that has a numeric dtype.
features = (
    df_limit
    .drop(columns=["credit_score"])
    .select_dtypes(include="number")
)

In [None]:
# Encode the class labels as integers (Good=1, Standard=2, Poor=3).
# replace() returns a new Series, so the result must be assigned back —
# otherwise `target` keeps its string labels. Re-running is harmless: values
# that are already encoded are left untouched.
target = target.replace({"Good" : 1, "Standard" : 2, "Poor" : 3})
target


In [None]:
# Hold out 20% of the sample for testing; random_state pins the split.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.20,
    random_state=0,
)

In [None]:
# Min-max scaler, fitted on the training features only.
normalizer = MinMaxScaler().fit(X_train)
normalizer

In [None]:
# Standard (z-score) scaler, fitted on the training features only.
scaler = StandardScaler().fit(X_train)
scaler

In [None]:
# Apply the fitted min-max scaler to the train and test features, in that order.
X_train_norm_np, X_test_norm_np = (
    normalizer.transform(split) for split in (X_train, X_test)
)

In [None]:
# Wrap the scaled training array in a DataFrame that carries the original
# column labels and row index, then preview the first rows.
X_train_norm_df = pd.DataFrame(
    data=X_train_norm_np,
    index=X_train.index,
    columns=X_train.columns,
)
X_train_norm_df.head()

In [None]:
# Same wrapping for the scaled test array: original column labels and row index.
X_test_norm_df = pd.DataFrame(
    data=X_test_norm_np,
    index=X_test.index,
    columns=X_test.columns,
)
X_test_norm_df.head()

In [None]:
# Transform both splits with the fitted `scaler` ...
X_train_standarized_np, X_test_standarized_np = (
    scaler.transform(split) for split in (X_train, X_test)
)

# ... and wrap each result in a DataFrame with the original labels and index.
X_train_standarized_df = pd.DataFrame(
    data=X_train_standarized_np,
    index=X_train.index,
    columns=X_train.columns,
)
X_test_standarized_df = pd.DataFrame(
    data=X_test_standarized_np,
    index=X_test.index,
    columns=X_test.columns,
)


DECISION TREE

In [None]:
# Regression tree limited to 10 levels. scikit-learn permutes the candidate
# features at every split, so ties between equally good splits can be resolved
# differently from run to run; fixing random_state makes the fitted tree (and
# the metrics / importances derived from it) reproducible.
tree = DecisionTreeRegressor(max_depth=10, random_state=0)

- Training the model

Note: Decision Trees are not "distance"-based models. Therefore, we don't need to scale the data before training them. If we do so, the model will adapt to the transformed data and the changes in the predictions will be small.

Here, for simplicity we will train a Decision Tree with transformed data

In [None]:
# Encode the training labels as integers (Good=1, Standard=2, Poor=3).
# Reassigning the result instead of using inplace=True avoids mutating the
# Series handed back by train_test_split; the cell is idempotent because labels
# that are already integers are left as they are.
y_train = y_train.replace({"Good" : 1, "Standard" : 2, "Poor" : 3})

In [None]:
# Inspect the training labels after the encoding step in the previous cell.
y_train

In [None]:
# Fit the regression tree on the min-max-scaled training features and the
# integer-encoded training labels.
tree.fit(X_train_norm_df, y_train)

- Evaluate the model

In [None]:
# Display the min-max-scaled training features.
X_train_norm_df

In [None]:
# Encode the test labels with the same mapping used for training
# (Good=1, Standard=2, Poor=3). Reassignment replaces the inplace=True mutation
# and keeps the cell idempotent: already-encoded values pass through unchanged.
y_test = y_test.replace({"Good" : 1, "Standard" : 2, "Poor" : 3})
y_test

In [None]:
# Predict on the held-out split (scaled with the same min-max scaler as the
# training data).
y_pred_test_dt = tree.predict(X_test_norm_df)

# scikit-learn metrics take (y_true, y_pred), in that order.
mae = mean_absolute_error(y_test, y_pred_test_dt)

# RMSE computed as sqrt(MSE): the `squared=False` shortcut was deprecated in
# scikit-learn 1.4 and removed in 1.6, whereas this form works on every version.
rmse = mean_squared_error(y_test, y_pred_test_dt) ** 0.5

print(f"MAE, {mae: .2f}")
print(f"RMSE, {rmse: .2f}")
print(f"R2 score, {tree.score(X_test_norm_df, y_test): .2f}")

Often we check which features are the most relevant, like we did before in Linear Regression. However, here we don't have coefficients. Therefore, to do this kind of analysis, we look at how the features were used to split the data: scikit-learn scores each feature by the total reduction of the split criterion that its splits achieve, normalized so that all scores sum to 1. The intuition behind this method is that the most important features will be selected first, while less important features will be used in later splits.

Fortunately for us, this information is stored in the instance of the class and it can be accessed through the attribute `.feature_importances_` once the model has been trained.

In [None]:
# Pair each feature name with the importance score stored on the fitted tree.
tree_importance = dict(zip(X_train_norm_df.columns, tree.feature_importances_))
tree_importance

In [None]:
from sklearn.tree import export_text

# Plain-text rendering of the fitted tree, labelled with the feature names.
tree_viz = export_text(
    tree,
    feature_names=X_train_norm_df.columns.tolist(),
)
print(tree_viz)


A bit overwhelming to see, so let's use the graphviz library.

**Note**: you will need to install graphviz

* pip install graphviz
* conda install graphviz -y

depending on your environment and package manager.

In [None]:
# Uncomment the next line to install the graphviz Python package from within the notebook.
#!pip install graphviz

- We will train a decision tree, in this case with max_depth=2 to better see the diagram

In [None]:
from sklearn.tree import DecisionTreeRegressor, export_graphviz
import graphviz

# A shallow tree (depth 2) keeps the diagram readable. It gets its own name so
# that the depth-10 `tree` evaluated earlier is not silently replaced when this
# cell runs.
tree_shallow = DecisionTreeRegressor(max_depth=2, random_state=0)
tree_shallow.fit(X_train_norm_df, y_train)

# With out_file=None, export_graphviz returns the DOT source as a string.
# (When a file name is passed it writes the file and returns None, which is why
# the result used to be unusable and the file had to be read back.)
dot_data = export_graphviz(
    tree_shallow,
    out_file=None,
    filled=True,
    rounded=True,
    feature_names=list(X_train_norm_df.columns),
)

# Render the diagram inline.
graphviz.Source(dot_data)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Self-contained toy example illustrating decision regions of a classifier.
# Every name carries a `toy_` prefix so that this demo does not overwrite the
# notebook's real `df`, `X_train`, `y_train`, `scaler`, ... defined earlier.

# Example data (Replace this with your actual dataset)
toy_data = {
    'income': [50000, 40000, 80000, 60000, 75000, 120000, 95000, 110000, 45000, 60000],
    'debt': [20000, 15000, 50000, 25000, 40000, 30000, 35000, 20000, 18000, 22000],
    'credit_score': [2, 0, 2, 1, 2, 2, 1, 2, 0, 1]  # Already encoded as 0, 1, 2
}
toy_df = pd.DataFrame(toy_data)

toy_feature_cols = ['income', 'debt']
toy_class_names = {0: 'Bad', 1: 'Standard', 2: 'Good'}
toy_class_colors = {0: 'red', 1: 'orange', 2: 'green'}

# Features and target
toy_X = toy_df[toy_feature_cols]
toy_y = toy_df['credit_score']

# Split the data into training and testing sets
toy_X_train, toy_X_test, toy_y_train, toy_y_test = train_test_split(
    toy_X, toy_y, test_size=0.3, random_state=42
)

# Scale the features (optional, but useful for some models)
toy_scaler = StandardScaler()
toy_X_train_scaled = toy_scaler.fit_transform(toy_X_train)
toy_X_test_scaled = toy_scaler.transform(toy_X_test)

# Train a classifier (RandomForest in this case)
toy_model = RandomForestClassifier(random_state=42)
toy_model.fit(toy_X_train_scaled, toy_y_train)

# Generate a meshgrid to define the decision boundary
x_min, x_max = toy_X['income'].min() - 1000, toy_X['income'].max() + 1000
y_min, y_max = toy_X['debt'].min() - 1000, toy_X['debt'].max() + 1000
xx, yy = np.meshgrid(np.arange(x_min, x_max, 500),
                     np.arange(y_min, y_max, 500))

# Predict the class for each point in the grid. The scaler was fitted on a
# DataFrame, so the grid is passed as a DataFrame with the same column names
# (a bare array triggers scikit-learn's "feature names" warning).
toy_grid = pd.DataFrame(np.c_[xx.ravel(), yy.ravel()], columns=toy_feature_cols)
Z = toy_model.predict(toy_scaler.transform(toy_grid)).reshape(xx.shape)

# Create a plot
fig, ax = plt.subplots(figsize=(10, 6))

# Plot the decision regions. Explicit levels give each class code (0, 1, 2) its
# own band, so the colours stay tied to the classes even if the model predicts
# only some of them.
ax.contourf(xx, yy, Z, levels=[-0.5, 0.5, 1.5, 2.5], alpha=0.3, cmap=plt.cm.RdYlGn)  # Red for bad, Yellow for standard, Green for good

# Plot the actual data points on top
sns.scatterplot(x='income', y='debt', hue='credit_score', data=toy_df, palette=toy_class_colors,
                s=100, edgecolor='black', ax=ax)

# Customize the plot
ax.set_title('Credit Score Decision Regions (Good, Standard, Bad)')
ax.set_xlabel('Income')
ax.set_ylabel('Debt')

# Rename the legend entries through the handles seaborn created. Passing only
# `labels=` to legend() pairs the texts with whatever artists are on the axes,
# in drawing order, which can attach the wrong name to a colour.
legend_handles, legend_codes = ax.get_legend_handles_labels()
legend_names = {str(code): name for code, name in toy_class_names.items()}
ax.legend(
    handles=legend_handles,
    labels=[legend_names.get(code, code) for code in legend_codes],
    title='Credit Score',
)

plt.show()


In [None]:
# Decision regions for the single-customer subset `dfc`, drawn over two features.
# NOTE(review): 'income' / 'debt' are placeholder names carried over from the
# toy example — point FEATURE_X / FEATURE_Y at two numeric columns of `dfc`.
# The classifier is trained on exactly these two columns, because a 2-D grid
# can only be scored by a model that was fitted on two features.
FEATURE_X = 'income'
FEATURE_Y = 'debt'
TARGET_COL = 'credit_score'

missing_cols = [col for col in (FEATURE_X, FEATURE_Y, TARGET_COL) if col not in dfc.columns]
if missing_cols:
    raise KeyError(
        f"dfc has no column(s) {missing_cols}; set FEATURE_X / FEATURE_Y to existing numeric columns"
    )

# Integer class codes used for colouring: 0 = Bad (Poor), 1 = Standard, 2 = Good.
# Assumes the labels in dfc are the strings 'Good' / 'Standard' / 'Poor', as in
# the encoding cells earlier in the notebook — TODO confirm.
LABEL_CODES = {'Poor': 0, 'Standard': 1, 'Good': 2}
CLASS_NAMES = {0: 'Bad', 1: 'Standard', 2: 'Good'}
CLASS_COLORS = {0: 'red', 1: 'yellow', 2: 'green'}

# Rows with a missing feature or label cannot be plotted or encoded, so drop them.
cust_df = dfc[[FEATURE_X, FEATURE_Y, TARGET_COL]].dropna().copy()
cust_df[TARGET_COL] = cust_df[TARGET_COL].replace(LABEL_CODES).astype(int)

# Features and target (prefixed so the notebook's earlier X_train, y_train, ...
# are not overwritten)
cust_X = cust_df[[FEATURE_X, FEATURE_Y]]
cust_y = cust_df[TARGET_COL]

# Split the data into training and testing sets
cust_X_train, cust_X_test, cust_y_train, cust_y_test = train_test_split(
    cust_X, cust_y, test_size=0.3, random_state=42
)

# Standardize the features
cust_scaler = StandardScaler()
cust_X_train_scaled = cust_scaler.fit_transform(cust_X_train)
cust_X_test_scaled = cust_scaler.transform(cust_X_test)

# Train a RandomForestClassifier
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(cust_X_train_scaled, cust_y_train)

# Create a meshgrid for decision boundaries: a fixed 200 x 200 resolution with a
# 5% margin, so the grid adapts to the scale of whichever columns are chosen
# (a hard-coded step of 500 is only sensible for income-sized values).
x_lo, x_hi = cust_X[FEATURE_X].min(), cust_X[FEATURE_X].max()
y_lo, y_hi = cust_X[FEATURE_Y].min(), cust_X[FEATURE_Y].max()
x_pad = 0.05 * (x_hi - x_lo) or 1.0  # fall back to 1.0 when the column is constant
y_pad = 0.05 * (y_hi - y_lo) or 1.0
xx, yy = np.meshgrid(np.linspace(x_lo - x_pad, x_hi + x_pad, 200),
                     np.linspace(y_lo - y_pad, y_hi + y_pad, 200))

# Predict the class for each point in the grid (passed as a DataFrame with the
# column names the scaler was fitted on)
cust_grid = pd.DataFrame(np.c_[xx.ravel(), yy.ravel()], columns=[FEATURE_X, FEATURE_Y])
Z = clf.predict(cust_scaler.transform(cust_grid)).reshape(xx.shape)

# Plotting
fig, ax = plt.subplots(figsize=(10, 6))

# Plot the decision regions; explicit levels give each class code its own band
# and also work when the model predicts a single class everywhere.
ax.contourf(xx, yy, Z, levels=[-0.5, 0.5, 1.5, 2.5], alpha=0.3, cmap=plt.cm.RdYlGn)

# Plot the actual data points
sns.scatterplot(x=FEATURE_X, y=FEATURE_Y, hue=TARGET_COL, data=cust_df, palette=CLASS_COLORS,
                s=100, edgecolor='black', ax=ax)

# Customize the plot
ax.set_title('Decision Boundary for Credit Score Prediction')
ax.set_xlabel(FEATURE_X)
ax.set_ylabel(FEATURE_Y)

# Rename legend entries via the handles seaborn created, so every name stays
# attached to the right colour.
legend_handles, legend_codes = ax.get_legend_handles_labels()
legend_names = {str(code): name for code, name in CLASS_NAMES.items()}
ax.legend(
    handles=legend_handles,
    labels=[legend_names.get(code, code) for code in legend_codes],
    title='Credit Score',
)

plt.show()