In [1]:
import os
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error

In [2]:
url = 'https://raw.githubusercontent.com/alvarofavale/week7_ml/refs/heads/main/data/encoded/encoded_data.csv'
df = pd.read_csv(url)

# Set display option to show all columns
pd.set_option('display.max_columns', None)

df.head()

Unnamed: 0,id,customer_id,month,name,age,ssn,occupation,annual_income,monthly_inhand_salary,num_bank_accounts,num_credit_card,interest_rate,num_of_loan,type_of_loan,delay_from_due_date,num_of_delayed_payment,changed_credit_limit,num_credit_inquiries,credit_mix,outstanding_debt,credit_utilization_ratio,credit_history_age,payment_of_min_amount,total_emi_per_month,amount_invested_monthly,payment_behaviour,monthly_balance,credit_score
0,5634,3392,1,Aaron Maashoh,23.0,821000265.0,1,19114.12,1824.843333,3.0,4.0,3.0,4.0,760.0,3.0,7.0,11.27,4.0,1,809.98,26.82262,265.0,1,49.574949,21.46538,1,312.494089,0
1,5635,3392,2,Aaron Maashoh,23.0,821000265.0,1,19114.12,1824.843333,3.0,4.0,3.0,4.0,760.0,3.0,4.0,11.27,4.0,1,809.98,31.94496,266.0,1,49.574949,21.46538,2,284.629162,0
2,5636,3392,3,Aaron Maashoh,23.0,821000265.0,1,19114.12,1824.843333,3.0,4.0,3.0,4.0,760.0,3.0,7.0,11.27,4.0,1,809.98,28.609352,267.0,1,49.574949,21.46538,3,331.209863,0
3,5637,3392,4,Aaron Maashoh,23.0,821000265.0,1,19114.12,1824.843333,3.0,4.0,3.0,4.0,760.0,5.0,4.0,6.27,4.0,1,809.98,31.377862,268.0,1,49.574949,21.46538,4,223.45131,0
4,5638,3392,5,Aaron Maashoh,23.0,821000265.0,1,19114.12,1824.843333,3.0,4.0,3.0,4.0,760.0,6.0,4.0,11.27,4.0,1,809.98,24.797347,269.0,1,49.574949,21.46538,5,341.489231,0


In [3]:
# Define features (X) and target (y) for classification
# Assuming 'credit_score' is the target column (modify if needed)
features = df.drop(columns=["credit_score", 'name'])  # Drop the target column from features
target = df["credit_score"]  # Use the 'credit_score' column as the target

# Split the data into training and test sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.20, random_state=0)

In [4]:
# Create instances of the normalizer and scaler
normalizer = MinMaxScaler()
scaler = StandardScaler()

# Normalize the training and test data using MinMaxScaler
X_train_norm_np = normalizer.fit_transform(X_train)
X_test_norm_np = normalizer.transform(X_test)

# Convert the normalized data back to DataFrame with the original columns and index
X_train_norm_df = pd.DataFrame(X_train_norm_np, columns=X_train.columns, index=X_train.index)
X_test_norm_df = pd.DataFrame(X_test_norm_np, columns=X_test.columns, index=X_test.index)

# Standardize the training and test data using StandardScaler
X_train_standarized_np = scaler.fit_transform(X_train)
X_test_standarized_np = scaler.transform(X_test)

# Convert the standardized data back to DataFrame with the original columns and index
X_train_standarized_df = pd.DataFrame(X_train_standarized_np, columns=X_train.columns, index=X_train.index)
X_test_standarized_df = pd.DataFrame(X_test_standarized_np, columns=X_test.columns, index=X_test.index)

# Display the first few rows of the normalized and standardized DataFrames as an example
print("Normalized Training Data:")
X_train_norm_df.head()

Normalized Training Data:


Unnamed: 0,id,customer_id,month,age,ssn,occupation,annual_income,monthly_inhand_salary,num_bank_accounts,num_credit_card,interest_rate,num_of_loan,type_of_loan,delay_from_due_date,num_of_delayed_payment,changed_credit_limit,num_credit_inquiries,credit_mix,outstanding_debt,credit_utilization_ratio,credit_history_age,payment_of_min_amount,total_emi_per_month,amount_invested_monthly,payment_behaviour,monthly_balance
10382,0.103803,0.180905,0.857143,0.190476,0.432327,0.928571,0.008969,0.033117,0.727273,0.545455,0.636364,0.222222,0.133482,0.596774,0.6,0.242198,0.411765,1.0,0.353565,0.180027,0.352357,1.0,0.007402,0.032886,0.6,0.22348
73171,0.731724,0.28702,0.428571,0.547619,0.110725,1.0,0.073493,0.092115,0.909091,0.818182,0.545455,0.666667,0.027809,0.435484,0.44,0.421303,0.764706,0.5,0.411644,0.288082,0.486352,1.0,0.036011,0.068547,0.6,0.24147
30938,0.309384,0.504631,0.285714,0.452381,0.072287,0.071429,0.144755,0.159307,0.545455,0.818182,0.878788,0.777778,0.020022,0.16129,0.4,0.566147,0.647059,0.5,0.266705,0.335129,0.186104,1.0,0.095425,0.073536,0.8,0.240969
99310,0.993113,0.316944,0.857143,0.666667,0.170509,0.142857,0.323563,0.336714,0.0,0.272727,0.333333,0.0,0.629588,0.145161,0.0,0.188263,0.176471,0.0,0.013692,0.688726,0.471464,0.0,0.0,0.134882,0.8,0.600811
58959,0.589586,0.708819,1.0,0.595238,0.718464,0.285714,0.293747,0.306059,0.636364,0.636364,0.606061,0.222222,0.467186,0.903226,0.64,0.121777,0.705882,1.0,0.469911,0.596088,0.42928,1.0,0.041431,0.238643,0.6,0.259788


In [5]:
print("\nNormalized Testing Data:")
X_test_norm_df.head()


Normalized Testing Data:


Unnamed: 0,id,customer_id,month,age,ssn,occupation,annual_income,monthly_inhand_salary,num_bank_accounts,num_credit_card,interest_rate,num_of_loan,type_of_loan,delay_from_due_date,num_of_delayed_payment,changed_credit_limit,num_credit_inquiries,credit_mix,outstanding_debt,credit_utilization_ratio,credit_history_age,payment_of_min_amount,total_emi_per_month,amount_invested_monthly,payment_behaviour,monthly_balance
3582,0.035801,0.135639,0.857143,0.285714,0.299906,0.642857,0.322812,0.341558,0.818182,0.727273,0.575758,0.777778,0.18465,0.645161,0.72,0.043758,0.588235,1.0,0.81503,0.415187,0.064516,1.0,0.160183,0.13666,0.8,0.271849
60498,0.604993,0.409697,0.285714,0.285714,0.073171,0.642857,0.217518,0.209095,0.909091,0.545455,0.969697,0.555556,0.580645,0.241935,0.72,0.779512,0.352941,1.0,0.682075,0.644214,0.188586,1.0,0.08664,0.107928,0.6,0.070154
53227,0.532278,0.898926,0.428571,0.642857,0.590862,0.857143,0.121727,0.14052,0.454545,0.454545,0.181818,0.111111,0.938821,0.403226,0.04,0.313433,0.176471,0.0,0.167997,0.553091,0.53598,0.0,0.008339,0.047825,1.0,0.374216
21333,0.21332,0.588162,0.714286,0.857143,0.242392,0.857143,0.520312,0.528457,0.090909,0.636364,0.060606,0.0,0.629588,0.16129,0.04,0.007123,0.411765,0.0,0.049743,0.503717,0.898263,0.5,0.0,0.227371,0.8,0.783524
3885,0.038835,0.182366,0.714286,0.547619,0.190767,0.785714,0.470573,0.456986,0.181818,0.636364,0.333333,0.444444,0.859844,0.16129,0.12,0.290366,0.176471,0.0,0.254694,0.194102,0.935484,0.0,0.136157,0.208394,0.8,0.435348


In [6]:
print("\nStandardized Training Data:")
X_train_standarized_df.head()


Standardized Training Data:


Unnamed: 0,id,customer_id,month,age,ssn,occupation,annual_income,monthly_inhand_salary,num_bank_accounts,num_credit_card,interest_rate,num_of_loan,type_of_loan,delay_from_due_date,num_of_delayed_payment,changed_credit_limit,num_credit_inquiries,credit_mix,outstanding_debt,credit_utilization_ratio,credit_history_age,payment_of_min_amount,total_emi_per_month,amount_invested_monthly,payment_behaviour,monthly_balance
10382,-1.372176,-1.109867,1.092356,-1.052142,-0.233403,1.401226,-1.093296,-1.065023,1.011609,0.223839,0.851446,-0.628477,-1.367095,1.074422,0.268507,-0.428716,0.310446,1.451406,0.291733,-1.345607,-0.782487,0.901684,-0.717121,-1.046651,0.141044,-0.632408
73171,0.802661,-0.740132,-0.217882,0.34175,-1.338111,1.632599,-0.801168,-0.788487,1.782958,1.674163,0.508184,1.008314,-1.74893,0.398603,-0.373226,0.36911,1.86153,0.087479,0.542327,-0.712063,-0.241341,0.901684,-0.329834,-0.64818,0.141044,-0.526901
30938,-0.660138,0.018086,-0.654627,-0.029955,-1.470144,-1.375254,-0.478539,-0.47354,0.240261,1.674163,1.766812,1.417512,-1.777065,-0.750291,-0.53366,1.014322,1.344502,0.087479,-0.083039,-0.436217,-1.453908,0.901684,0.474473,-0.592434,0.790679,-0.529838
99310,1.707995,-0.635867,1.092356,0.80638,-1.132752,-1.143881,0.330998,0.358007,-2.073785,-1.226484,-0.292761,-1.446872,0.425519,-0.817872,-2.137994,-0.668971,-0.72361,-1.276448,-1.174713,1.636996,-0.301469,-1.266217,-0.81732,0.093038,0.790679,1.580499
58959,0.310359,0.729538,1.529102,0.527602,0.749477,-0.681134,0.19601,0.214318,0.625935,0.707281,0.737025,-0.628477,-0.161301,2.358479,0.428941,-0.965134,1.603016,1.451406,0.793732,1.093844,-0.471829,0.901684,-0.256459,1.252435,0.141044,-0.419472


In [7]:
print("\nStandardized Testing Data:")
X_test_standarized_df.head()


Standardized Testing Data:


Unnamed: 0,id,customer_id,month,age,ssn,occupation,annual_income,monthly_inhand_salary,num_bank_accounts,num_credit_card,interest_rate,num_of_loan,type_of_loan,delay_from_due_date,num_of_delayed_payment,changed_credit_limit,num_credit_inquiries,credit_mix,outstanding_debt,credit_utilization_ratio,credit_history_age,payment_of_min_amount,total_emi_per_month,amount_invested_monthly,payment_behaviour,monthly_balance
3582,-1.607705,-1.267588,1.092356,-0.680437,-0.688273,0.475732,0.327596,0.380708,1.397284,1.190722,0.622605,1.417512,-1.182207,1.277168,0.749808,-1.312672,1.085988,1.451406,2.282814,0.033179,-1.944947,0.901684,1.351133,0.112898,0.790679,-0.34874
60498,0.363722,-0.312689,-0.654627,-0.680437,-1.467109,0.475732,-0.149111,-0.240173,1.782958,0.223839,2.110074,0.599117,0.248669,-0.412381,0.749808,1.964763,0.051932,1.451406,1.709152,1.376012,-1.443887,0.901684,0.355558,-0.208143,0.141044,-1.531606
53227,0.111867,1.391925,-0.217882,0.713454,0.311163,1.169853,-0.582796,-0.5616,-0.145414,-0.259602,-0.864865,-1.037674,1.542888,0.263439,-1.977561,-0.111399,-0.72361,-1.276448,-0.508936,0.841744,-0.040917,-1.266217,-0.704439,-0.879721,1.440314,0.251603
21333,-0.992858,0.309135,0.65561,1.549789,-0.885831,1.169853,1.221759,1.256747,-1.688111,0.707281,-1.322548,-1.446872,0.425519,-0.750291,-1.977561,-1.475863,0.310446,-1.276448,-1.019162,0.552253,1.42218,-0.182266,-0.81732,1.126489,0.790679,2.652043
3885,-1.597199,-1.104779,0.65561,0.34175,-1.063164,0.938479,0.996573,0.921748,-1.302437,0.707281,-0.292761,0.189919,1.257517,-0.750291,-1.656694,-0.214149,-0.72361,-1.276448,-0.134863,-1.263084,1.572498,-1.266217,1.025875,0.914438,0.790679,0.610119


In [8]:
# Initialize and train the Decision Tree Regressor on normalized data
tree = DecisionTreeRegressor(max_depth=10)
tree.fit(X_train_norm_df, y_train)

In [9]:
# Predict on the test set using the decision tree
y_pred_test_dt = tree.predict(X_test_norm_df)

In [10]:
# Calculate and print performance metrics for the decision tree model
mae = mean_absolute_error(y_test, y_pred_test_dt)
rmse = mean_squared_error(y_test, y_pred_test_dt, squared=False)
r2_score = tree.score(X_test_norm_df, y_test)



In [11]:
print(f"MAE: {mae:.2f}")
print(f"RMSE: {rmse:.2f}")
print(f"R2 score: {r2_score:.2f}")

MAE: 0.35
RMSE: 0.50
R2 score: 0.45


In [12]:
from sklearn.tree import export_text

# Extract the feature importances from the trained decision tree
tree_importance = {feature: importance for feature, importance in zip(X_train_norm_df.columns, tree.feature_importances_)}

# Print the feature importance
print("Feature Importances:")
print(tree_importance)

# Visualize the decision tree using export_text
tree_viz = export_text(tree, feature_names=list(X_train_norm_df.columns))

# Print the decision tree visualization
print("\nDecision Tree Visualization:")
print(tree_viz)


Feature Importances:
{'id': 0.004244454662722455, 'customer_id': 0.00590471966107709, 'month': 0.009124058388060002, 'age': 0.004359419219772789, 'ssn': 0.02081519006338409, 'occupation': 0.0024983612298363042, 'annual_income': 0.013684435920752016, 'monthly_inhand_salary': 0.005699438438771812, 'num_bank_accounts': 0.01557777496840465, 'num_credit_card': 0.05395977435438765, 'interest_rate': 0.07498541838218942, 'num_of_loan': 0.0024410247070034137, 'type_of_loan': 0.009546536412094958, 'delay_from_due_date': 0.05706123824319856, 'num_of_delayed_payment': 0.024057738167615594, 'changed_credit_limit': 0.028296580451061672, 'num_credit_inquiries': 0.0030870241354032713, 'credit_mix': 0.13324369703313083, 'outstanding_debt': 0.4942383765399862, 'credit_utilization_ratio': 0.001967581077868934, 'credit_history_age': 0.004419209305794029, 'payment_of_min_amount': 0.0009897217824080707, 'total_emi_per_month': 0.015149003118183943, 'amount_invested_monthly': 0.007647115669636638, 'payment_be

In [13]:
import os
import pydotplus
from sklearn.tree import DecisionTreeRegressor, export_graphviz
from IPython.display import Image

# Ensure the 'figures_usable' directory exists
output_dir = "figures_usable"
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# Train the decision tree model
tree = DecisionTreeRegressor(max_depth=2)
tree.fit(X_train_norm_df, y_train)

# Export the decision tree in DOT format
dot_data = export_graphviz(tree, out_file=None, 
                           feature_names=X_train_norm_df.columns,  
                           filled=True, rounded=True,  
                           special_characters=True)  

# Create a graph from the DOT data
graph = pydotplus.graph_from_dot_data(dot_data)

# Define the output file path
output_path = os.path.join(output_dir, 'decision_tree.png')

# Save the decision tree as a PNG file
graph.write_png(output_path)

# Display the decision tree image (optional)
Image(output_path)

# Print the path to the saved file for reference
print(f"Decision tree saved to: {output_path}")

InvocationException: GraphViz's executables not found