# Midterm

First, import the necessary libraries and read the CSV file in as a DataFrame.

In [618]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

# load the data
# skiprows=1: the first line of the file is skipped, so the second line is
# used as the header row.
# low_memory=False: parse the file in a single pass so each column gets one
# consistently inferred dtype instead of chunk-by-chunk guesses.
df = pd.read_csv('data/LoanStats3a.csv', skiprows=1, low_memory=False)

# Preview only the first rows instead of dumping the whole frame as cell output.
df.head()



Let's take inventory of features we have to work with.

In [619]:
# Column labels as a plain Python list, in file order.
df.columns.tolist()



Let's take inventory of the unique values in the columns.

In [620]:
# Collect the distinct values of every column, keyed by column name.
unique_values = {}
for col in df.columns:
    unique_values[col] = df[col].unique()

# Report each column's distinct values, one block per column.
for col in unique_values:
    values = unique_values[col]
    print(f"Unique values in '{col}' column: {values}\n")



Let's check the datatypes to know how the data was read in by Pandas.

In [621]:
# Show the dtype pandas inferred for each column (in column order).
print(df.dtypes.tolist())



## 1. What kind of interest rate are borrowers paying? (min, max and mean rate values)

In [622]:
# Convert percent strings (e.g. "10.65%") to floats expressed in percent.
# - Plain column assignment is used so the column takes the new float dtype;
#   on recent pandas versions `df.loc[:, col] = ...` writes into the existing
#   object column and can leave the dtype unchanged.
# - The values are kept in percent throughout: dividing by 100 and multiplying
#   back only introduces floating-point noise.
# - The dtype guard keeps the cell safe to re-run once the column is numeric.
if not pd.api.types.is_numeric_dtype(df['int_rate']):
    df['int_rate'] = df['int_rate'].str.strip().str.rstrip('%').astype('float')

# get int rate summary statistics (printed explicitly: a bare expression in the
# middle of a cell is evaluated but never displayed)
print("Summary statistics for the interest rate column:")
print(df['int_rate'].describe())

# print the answer.
print("\nThe lowest interest rate (%) is: ", df['int_rate'].min())
print("The highest interest rate (%) is: ", df['int_rate'].max())
print("The average interest rate (%) is: ", round(df['int_rate'].mean(), 2))



## 2. How long are the loan terms? (Min, Max and Mean term values)

In [623]:
# count the number of NaN values in the 'term' column
nan_count = df['term'].isna().sum()
print(f"Number of NaN values in 'term' column: {nan_count}")

# print the unique values in the 'term' column
print("Unique values in 'term' column: ", df['term'].unique())

## drop the NaN values from the 'term' column
# print the number of rows before dropping NaN values
pre = df.shape[0]
print(f"\nNumber of rows before dropping NaN values: {pre}")

# drop the NaN values from the 'term' column
df = df.dropna(subset=['term'])

# print the number of rows after dropping NaN values
post = df.shape[0]
print(f"Number of rows after dropping NaN values: {post}")

# Convert values such as " 36 months" to integers for computations.
# `assign` builds a new frame, so the column really takes an integer dtype;
# writing through `df.loc[:, 'term'] = ...` can leave it as an object column on
# recent pandas versions, which makes describe() report count/unique/top/freq
# instead of numeric statistics. The astype(str) step keeps the cell re-runnable
# after the column has already been converted.
df = df.assign(
    term=df['term'].astype(str).str.replace(' months', '', regex=False).astype(int)
)

# calculate the term feature's statistics summary
print("\nSummary statistics for the 'term' column:")
print(df['term'].describe())

# print the answer
print(f"\nThe shortest term is: {df['term'].min()} months")
print(f"The longest term is: {df['term'].max()} months")
print(f"The average term is: {df['term'].mean().round()} months") # rounded to nearest whole number to account for significant figures



## 3. How much are people borrowing? (Min, Max and Mean amounts)

In [624]:
# Work from a single handle on the loan amount column.
loan_amounts = df.loc[:, 'loan_amnt']

# Report how many amounts are missing before summarising.
nan_count = loan_amounts.isna().sum()
print(f"Number of NaN values in the 'loan_amnt' column: {nan_count}\n")

# Full descriptive statistics for the column.
print(loan_amounts.describe())

# print the answer
smallest_loan = loan_amounts.min()
largest_loan = loan_amounts.max()
average_loan = loan_amounts.mean().round(2)
print("\nThe smallest loan amount is: $", smallest_loan)
print("The largest loan amount is: $", largest_loan)
print("The average loan amount is: $", average_loan)



## 4. What are people taking these loans out for? (list the items)

In [625]:
# Distinct loan purposes, in order of first appearance.
loan_purposes = df['purpose'].unique().tolist()
print("Unique values in 'purpose' column:", loan_purposes)



## 5. Are the borrowers renters or homeowners? (list the ownership status items)

In [626]:
# Distinct home-ownership categories, in order of first appearance.
ownership_statuses = df['home_ownership'].unique().tolist()
print("The borrowers are categorized into the following living situations:", ownership_statuses)



## 6. Where do these borrowers live? (list of states)

In [627]:
# Distinct borrower states, in order of first appearance.
borrower_states = df['addr_state'].unique().tolist()
print("The borrowers live in the following states:", borrower_states)



## 7. Use a predictor and evaluate your model. (You can use a machine learning classifier, and evaluate your model with metrics, e.g., you can predict the failed vs successful loans)

Let's start by categorizing the feature sets.

In [628]:
# Print non-numeric columns
# A column counts as non-numeric when pd.to_numeric cannot convert it.
# Only the conversion errors are caught (ValueError for unparseable strings,
# TypeError for unsupported objects) so that unrelated failures and
# KeyboardInterrupt are not silently swallowed by a bare `except:`.
for column in df.columns:
    try:
        pd.to_numeric(df[column])
    except (ValueError, TypeError):
        print(f"Column '{column}' is non-numeric.")



We can filter out non-numeric columns to focus on the numeric columns with plenty of valid data.

In [629]:
# specify the columns to be used
selected_features = [
    'loan_amnt',
    'annual_inc',
    'dti',
    'int_rate',
]

# Strip the credit-policy prefix from loan_status so only the outcome text
# remains. We just care if they defaulted or not.
policy_prefix = "Does not meet the credit policy. Status:"
status_text = df['loan_status'].str.replace(policy_prefix, "", regex=False)
df['loan_status'] = status_text

# defined function to convert loan_status to binary
def map_loan_status(status):
    """Encode a loan status as a binary label.

    Returns NaN when ``status`` is missing, 1 when it is a string containing
    'Fully Paid', and 0 for every other value.
    """
    # Missing statuses stay missing so they can be dropped later.
    if pd.isna(status):
        return np.nan
    # 1 for fully paid
    if isinstance(status, str) and 'Fully Paid' in status:
        return 1
    # 0 for everything else (e.g. charged off)
    return 0

# Apply the encoding to the loan_status column.
# The result is kept in its own Series instead of overwriting df['loan_status']:
# replacing the text column with numbers would make this cell fail on a second
# run, because the `.str.replace` step above needs string values.
loan_status_binary = df['loan_status'].apply(map_loan_status)

# Combine the selected features with the encoded target (stored under the
# 'loan_status' name), then drop rows with any NaN in those columns.
df_clean = df[selected_features].assign(loan_status=loan_status_binary).dropna()
df_clean





It was difficult to wrangle the 'loan_status' column into a binary label; however, isolating the status text worked.

In [630]:
# store dataFrame values in X and y
X_clean = df_clean[selected_features].copy()
# The encoded labels may be stored as floats (1.0 / 0.0) because the encoder
# uses NaN for missing statuses. NaN rows were dropped when df_clean was built,
# so casting to int is safe and the report shows classes 0 and 1.
y_clean = df_clean['loan_status'].astype(int)

# split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(
    X_clean, y_clean, test_size=0.2, random_state=42
)
# print unique values in the training and test sets to confirm the presence of predict labels.
print("Unique values in X_train:")
print(X_train.nunique())
print("\nUnique values in X_test:")
print(X_test.nunique())
print("\nUnique values in y_train:")
print(y_train.nunique())
print("\nUnique values in y_test:")
print(y_test.nunique())

# scale the features; the scaler is fit on the training split only, so no
# information from the test split leaks into the scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# train a KNN classifier
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train_scaled, y_train)
y_pred = knn.predict(X_test_scaled)

# evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"\nKNN Model Accuracy: {accuracy:.4f}")
print(classification_report(y_test, y_pred))





With a clear understanding of the data and careful preparation, the model could be fit and evaluated.

## 8. Visualize your findings.

In [631]:
# Build the confusion matrix from the held-out labels and the predictions.
matrix = confusion_matrix(y_test, y_pred)

# Draw it as an annotated heatmap on an explicitly created axes.
fig, ax = plt.subplots(figsize=(10, 7))
sns.heatmap(
    matrix,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=['Predicted 0', 'Predicted 1'],
    yticklabels=['Actual 0', 'Actual 1'],
    ax=ax,
)
ax.set_title('Confusion Matrix')
plt.show()




### Interpretation
The model leans heavily towards predicting loans as fully paid, and many of those predictions are wrong. This can have a huge impact on a business: categorizing risky loans as safe by predicting that they will be fully paid exposes the business to extensive losses through loan defaults. Based on the number of true negatives, I believe there was a class imbalance in the data set, which led the model to lean towards an outcome of fully paid whenever a loan was somewhere in the middle. Moving forward and building on this, analysts could look at oversampling or undersampling, along with trying different models in addition to the KNN model.