Feedback
- Make sure you can select feature variables, not only target variables
- ROC curve code was apparently not working
- Much more detailed code comments
- Update portfolio readme to include info on all projects

- Still include a requirements.txt

In [23]:
#installing kagglehub so I can import the data to my notebook
%pip install kagglehub #this line is specific to Jupyter notebooks, use it in terminal if you are not in this environment
import kagglehub #importing kagglehub to use its functions

#downloading latests version of the dataset from kagglehub
path = kagglehub.dataset_download("subhamjain/loan-prediction-based-on-customer-behavior") #setting it equal to path to recall more easily later

#displaying the path to the downloaded dataset
print("Path to dataset: ", path) #printing the path to the dataset so that my output is easily readable

import streamlit as st #importing streamlit to create a web app later
import pandas as pd #importing pandas to work with the dataset
import numpy as np #importing numpy to work with arrays and numerical data
import matplotlib.pyplot as plt #importing matplotlib to create visualizations
import seaborn as sns #importing seaborn to create more advanced visualizations
import os #importing os to work with the file system

Note: you may need to restart the kernel to use updated packages.
Path to dataset:  /Users/ardenjennings/.cache/kagglehub/datasets/subhamjain/loan-prediction-based-on-customer-behavior/versions/1


In [24]:
#assuming the training CSV is located inside the downloaded dataset directory
TRAINING_CSV = 'Training Data.csv'

#only append the file name when `path` does not already end with it.
#Without this check, re-running the cell would join the name a second time
#(".../Training Data.csv/Training Data.csv") and read_csv would fail.
if os.path.basename(path) != TRAINING_CSV:
    path = os.path.join(path, TRAINING_CSV) #from here on `path` points at the CSV file itself

#read the CSV file into a pandas DataFrame
data = pd.read_csv(path)

#show the first few rows of the DataFrame to check that loading worked
data.head()

Unnamed: 0,Id,Income,Age,Experience,Married/Single,House_Ownership,Car_Ownership,Profession,CITY,STATE,CURRENT_JOB_YRS,CURRENT_HOUSE_YRS,Risk_Flag
0,1,1303834,23,3,single,rented,no,Mechanical_engineer,Rewa,Madhya_Pradesh,3,13,0
1,2,7574516,40,10,single,rented,no,Software_Developer,Parbhani,Maharashtra,9,13,0
2,3,3991815,66,4,married,rented,no,Technical_writer,Alappuzha,Kerala,4,10,0
3,4,6256451,41,2,single,rented,yes,Software_Developer,Bhubaneswar,Odisha,2,12,1
4,5,5768871,47,11,single,rented,no,Civil_servant,Tiruchirappalli[10],Tamil_Nadu,3,14,1


In [30]:
#setting the title of the web app
st.title("Interactive Unsupervised ML App: Predicting Loans Based on Customer Behavior")

#adding a description of the app's functionality
st.write("This app uses unsupervised machine learning techniques to predict loan approvals based on customer behavior data.")

#sidebar section where the user picks the data source
st.sidebar.header("1. Upload or Select Dataset")

#built-in sample datasets, keyed by the label shown in the dropdown.
#Both CSVs are read from the current working directory every time the script runs.
sample_dataset = {
    "Prediction Dataset (only features)": pd.read_csv("Sample Prediction Dataset.csv"),
    "Test Dataset": pd.read_csv("Test Data.csv"),
}
dataset_option = st.sidebar.selectbox("Choose a dataset", options=["Upload your own"] + list(sample_dataset.keys()))

#branch on the user's choice: either read the uploaded file or use one of the sample datasets
if dataset_option == "Upload your own":
    uploaded_file = st.sidebar.file_uploader("Upload CSV", type=["csv"])
    if uploaded_file:
        df = pd.read_csv(uploaded_file)
    else:
        #nothing uploaded yet: tell the user and halt this script run so later steps never see a missing `df`
        st.warning("Please upload a CSV to continue.")
        st.stop()
else:
    df = sample_dataset[dataset_option] #the user picked one of the sample datasets

st.write("You selected:", dataset_option , ". Here is a preview of your data.") #displaying the name of the selected dataset
#preview the dataset the user actually selected (`df`), not the Kaggle training frame (`data`) loaded in an earlier cell
st.dataframe(df.head())

DeltaGenerator()

In [None]:
#In this step, I will use PCA to reduce the dimensionality of the dataset and make it easier to use
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

#preprocessing for PCA: keep only the numeric columns and drop rows with missing values
X = df.select_dtypes(include=[np.number]).dropna()

#PCA with at least 2 components needs at least two numeric columns and two complete rows;
#stop with a readable message instead of letting the slider or PCA raise further down
if min(X.shape) < 2:
    st.error("The selected dataset needs at least two numeric columns and two rows without missing values to run PCA.")
    st.stop()

#ask users for input on choosing the number of PCA components and K-Means clusters
st.sidebar.header("2. Set Model Parameters")

# PCA Components
#the upper bound is capped at 10 and can never exceed the number of columns or rows available
max_components = min(10, *X.shape)
if max_components > 2:
    #by having a slider the user is able to easily visualize the number of components they are selecting
    n_components = st.sidebar.slider("Number of PCA components", 2, max_components, 2)
else:
    #only one valid choice, so skip the widget (a slider whose min equals its max may be rejected - TODO confirm for the installed Streamlit version)
    n_components = 2

# K-Means Clusters
#the slider is effective in the same way for selecting the number of clusters
n_clusters = st.sidebar.slider("Number of K-Means clusters", 2, 10, 3)

#centering and scaling the data because PCA is sensitive to the scale of the data
    #Centering ensures that each feature has a mean of zero, and scaling ensures that each feature has unit variance.
    #This prevents features with larger numerical ranges from dominating the PCA results.
scaler = StandardScaler() #the scaler has to be created before it can be fitted
X_scaled = scaler.fit_transform(X)

#reducing the data to the number of components chosen in the sidebar, for visualization and further analysis
pca = PCA(n_components=n_components)
X_pca = pca.fit_transform(X_scaled)

#displaying the Explained Variance Ratio. This tells us the proportion of the variance that is explained by each of the selected components.
explained_variance = pca.explained_variance_ratio_
print("Explained Variance Ratio:", explained_variance)
print("Cumulative Explained Variance:", np.cumsum(explained_variance))

In [None]:
#in this next step I will run K means. I chose to do KMeans clustering instead of hierarchical clustering because KMeans is more scalable
#importing KMeans
from sklearn.cluster import KMeans

#cluster the PCA-reduced data into the number of clusters chosen with the sidebar slider.
#(An earlier draft first fitted a fixed k=2 model on X_scaled and then immediately replaced it;
#that extra fit was discarded, so only the model that is actually used is kept.)
kmeans = KMeans(n_clusters=n_clusters, random_state=42) #random_state fixes the seed so results are repeatable
clusters = kmeans.fit_predict(X_pca)

#outputting the centroids (in PCA space) and the first few cluster assignments of the model used for plotting
print("Centroids:\n", kmeans.cluster_centers_)
print("First 10 cluster labels:", clusters[:10])

# Add cluster labels to DataFrame for plotting
#column names are derived from the actual width of X_pca so they always match the data
df_clustered = pd.DataFrame(X_pca, columns=[f"PC{i+1}" for i in range(X_pca.shape[1])])
df_clustered['Cluster'] = clusters

In [None]:
#now I am going to plot results

#scatter plot of the first two principal components, coloured by cluster label
if n_components >= 2:
    fig, ax = plt.subplots()
    sns.scatterplot(x="PC1", y="PC2", hue="Cluster", data=df_clustered, palette="tab10", ax=ax)
    ax.set_title("K-Means Clusters (PCA-reduced)")
    st.pyplot(fig)
    plt.close(fig) #release the figure; the script re-runs on every interaction and open figures would pile up

#this helps evaluate the right number of clusters
st.sidebar.header("3. Optional: Elbow Plot")
if st.sidebar.button("Generate Elbow Plot"):
    distortions = []
    #try 1..10 clusters, but never more clusters than there are samples (KMeans would raise)
    K_range = range(1, min(10, len(X_pca)) + 1)
    for k in K_range:
        km = KMeans(n_clusters=k, random_state=42)
        km.fit(X_pca)
        distortions.append(km.inertia_) #inertia_ of the fitted model is the value plotted on the y-axis

    fig2, ax2 = plt.subplots()
    ax2.plot(K_range, distortions, marker='o')
    ax2.set_title("Elbow Method for Optimal Clusters")
    ax2.set_xlabel("Number of clusters")
    ax2.set_ylabel("Inertia")
    st.pyplot(fig2)
    plt.close(fig2) #same clean-up as for the scatter plot
