In [31]:
import os
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, r2_score

In [24]:
url = 'https://raw.githubusercontent.com/alvarofavale/week7_ml/refs/heads/main/data/encoded/encoded_data.csv'
df = pd.read_csv(url)

# Set display option to show all columns
pd.set_option('display.max_columns', None)

df.head()

Unnamed: 0,id,customer_id,month,name,age,ssn,occupation,annual_income,monthly_inhand_salary,num_bank_accounts,num_credit_card,interest_rate,num_of_loan,type_of_loan,delay_from_due_date,num_of_delayed_payment,changed_credit_limit,num_credit_inquiries,credit_mix,outstanding_debt,credit_utilization_ratio,credit_history_age,payment_of_min_amount,total_emi_per_month,amount_invested_monthly,payment_behaviour,monthly_balance,credit_score
0,5634,3392,1,Aaron Maashoh,23.0,821000265.0,1,19114.12,1824.843333,3.0,4.0,3.0,4.0,760.0,3.0,7.0,11.27,4.0,1,809.98,26.82262,265.0,1,49.574949,21.46538,1,312.494089,0
1,5635,3392,2,Aaron Maashoh,23.0,821000265.0,1,19114.12,1824.843333,3.0,4.0,3.0,4.0,760.0,3.0,4.0,11.27,4.0,1,809.98,31.94496,266.0,1,49.574949,21.46538,2,284.629162,0
2,5636,3392,3,Aaron Maashoh,23.0,821000265.0,1,19114.12,1824.843333,3.0,4.0,3.0,4.0,760.0,3.0,7.0,11.27,4.0,1,809.98,28.609352,267.0,1,49.574949,21.46538,3,331.209863,0
3,5637,3392,4,Aaron Maashoh,23.0,821000265.0,1,19114.12,1824.843333,3.0,4.0,3.0,4.0,760.0,5.0,4.0,6.27,4.0,1,809.98,31.377862,268.0,1,49.574949,21.46538,4,223.45131,0
4,5638,3392,5,Aaron Maashoh,23.0,821000265.0,1,19114.12,1824.843333,3.0,4.0,3.0,4.0,760.0,6.0,4.0,11.27,4.0,1,809.98,24.797347,269.0,1,49.574949,21.46538,5,341.489231,0


In [25]:
# Select only numerical columns
numerical_df = df.select_dtypes(include=['number'])

# Create the figures folder if it doesn't exist
output_folder = 'figuresb'
os.makedirs(output_folder, exist_ok=True)

#Thank StackOverflow for teaching me I don't have to keep the plots open in the notebook

# Plot box plots for each numerical column based on credit_score
for col in numerical_df.columns:
    if col != 'credit_score':  # Avoid plotting credit_score against itself
        plt.figure(figsize=(8, 6))
        sns.boxplot(x='credit_score', y=col, data=df)
        plt.title(f'{col} vs Credit Score')
        plt.xlabel('Credit Score')
        plt.ylabel(col)
        plt.xticks(rotation=45)
        
        # Save the plot in the 'figures' folder
        plt.savefig(os.path.join(output_folder, f'{col}_vs_credit_score_boxplot.png'))
        plt.close()  # Close the plot to free memory

# Plot violin plots for each numerical column based on credit_score
for col in numerical_df.columns:
    if col != 'credit_score':  # Avoid plotting credit_score against itself
        plt.figure(figsize=(8, 6))
        sns.violinplot(x='credit_score', y=col, data=df)
        plt.title(f'{col} vs Credit Score')
        plt.xlabel('Credit Score')
        plt.ylabel(col)
        plt.xticks(rotation=45)
        
        # Save the plot in the 'figures' folder
        plt.savefig(os.path.join(output_folder, f'{col}_vs_credit_score_violinplot.png'))
        plt.close()  # Close the plot to free memory

In [27]:
# Separate features (X) and target (y)
X = df.select_dtypes(include=['number'])  # Select only numerical columns
y = df['credit_score']  # Target column

# Split the data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the numerical features (important for KNN)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Initialize the KNN classifier (you can tweak the n_neighbors parameter)
knn = KNeighborsClassifier(n_neighbors=5)

# Train the model
knn.fit(X_train_scaled, y_train)

# Predict the target on the test set
y_pred = knn.predict(X_test_scaled)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

# Print evaluation results
print("Accuracy: ", accuracy)
print("\nConfusion Matrix:\n", conf_matrix)
print("\nClassification Report:\n", class_report)

Accuracy:  0.9128

Confusion Matrix:
 [[3142  385    0]
 [ 310 9632  657]
 [   0  392 5482]]

Classification Report:
               precision    recall  f1-score   support

           0       0.91      0.89      0.90      3527
           1       0.93      0.91      0.92     10599
           2       0.89      0.93      0.91      5874

    accuracy                           0.91     20000
   macro avg       0.91      0.91      0.91     20000
weighted avg       0.91      0.91      0.91     20000



In [37]:
# Separate features (X) and target (y)
X = df.select_dtypes(include=['number'])  # Select only numerical columns
y = df['credit_score']  # Target column

# Split the data into training and test sets (70% train, 30% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Standardize the features using StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Apply KNN Regressor (since 'credit_score' is continuous now)
knn_regressor = KNeighborsRegressor(n_neighbors=5)
knn_regressor.fit(X_train_scaled, y_train)

# Predict the target values for the test set
y_pred = knn_regressor.predict(X_test_scaled)

# Evaluate the KNN Regressor model using Mean Squared Error (MSE) and R-squared
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Calculate accuracy for regression as the percentage of predictions within a tolerance (e.g., 0.1)
tolerance = 0.1
accuracy = (abs(y_pred - y_test) <= tolerance).mean() * 100  # Percentage of predictions within tolerance

# Print the evaluation results
print(f"KNN Regressor - Mean Squared Error: {mse}")
print(f"KNN Regressor - R-squared: {r2}")
print(f"KNN Regressor - Accuracy (within tolerance of {tolerance}): {accuracy:.2f}%")

KNN Regressor - Mean Squared Error: 0.06578533333333333
KNN Regressor - R-squared: 0.856182021518232
KNN Regressor - Accuracy (within tolerance of 0.1): 59.34%


In [39]:
# Separate features (X) and target (y)
X = df.select_dtypes(include=['number'])  # Select only numerical columns
y = df['credit_score']  # Target column

# Convert 'credit_score' into categorical if it's continuous
# Optionally, you can bin the continuous values into categories (if needed)
# For now, we assume 'credit_score' is categorical or already a discrete class

# Split the data into training and test sets (70% train, 30% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Standardize the features using StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Apply KNN Classifier
knn_classifier = KNeighborsClassifier(n_neighbors=5)
knn_classifier.fit(X_train_scaled, y_train)

# Predict the target values for the test set
y_pred = knn_classifier.predict(X_test_scaled)

# Evaluate the KNN Classifier model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

# Print the evaluation results
print(f"KNN Classifier - Accuracy: {accuracy:.2f}")
print("KNN Classifier - Classification Report:")
print(report)

KNN Classifier - Accuracy: 0.92
KNN Classifier - Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.90      0.91      5322
           1       0.92      0.91      0.92     15873
           2       0.90      0.93      0.91      8805

    accuracy                           0.92     30000
   macro avg       0.91      0.91      0.91     30000
weighted avg       0.92      0.92      0.92     30000



- Changing the train test split to test diff accuracy rates

In [43]:
# Separate features (X) and target (y)
X = df.select_dtypes(include=['number'])  # Select only numerical columns
y = df['credit_score']  # Target column

# Convert 'credit_score' into categorical if it's continuous
# Optionally, you can bin the continuous values into categories (if needed)
# For now, we assume 'credit_score' is categorical or already a discrete class

# Split the data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state=0)

# Standardize the features using StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Apply KNN Classifier
knn_classifier = KNeighborsClassifier(n_neighbors=5)
knn_classifier.fit(X_train_scaled, y_train)

# Predict the target values for the test set
y_pred = knn_classifier.predict(X_test_scaled)

# Evaluate the KNN Classifier model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

# Print the evaluation results
print(f"KNN Classifier - Accuracy: {accuracy:.2f}")
print("KNN Classifier - Classification Report:")
print(report)

KNN Classifier - Accuracy: 0.91
KNN Classifier - Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.90      0.90      3589
           1       0.93      0.91      0.92     10585
           2       0.90      0.93      0.91      5826

    accuracy                           0.91     20000
   macro avg       0.91      0.91      0.91     20000
weighted avg       0.91      0.91      0.91     20000

