In [1]:
import pandas as pd

In [2]:
file_path = r'C:\Users\adith\Desktop\7th Sem\Predicitve Analytics\Loan_Dataset.csv'
data = pd.read_csv(file_path)

In [3]:
data.head()

Unnamed: 0,credit.policy,purpose,int.rate,installment,log.annual.inc,dti,fico,days.with.cr.line,revol.bal,revol.util,inq.last.6mths,delinq.2yrs,pub.rec,not.fully.paid
0,1,debt_consolidation,0.1189,829.1,11.350407,19.48,737,5639.958333,28854,52.1,0,0,0,0
1,1,credit_card,0.1071,228.22,11.082143,14.29,707,2760.0,33623,76.7,0,0,0,0
2,1,debt_consolidation,0.1357,366.86,10.373491,11.63,682,4710.0,3511,25.6,1,0,0,0
3,1,debt_consolidation,0.1008,162.34,11.350407,8.1,712,2699.958333,33667,73.2,1,0,0,0
4,1,credit_card,0.1426,102.92,11.299732,14.97,667,4066.0,4740,39.5,0,1,0,0


In [4]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.feature_selection import SelectKBest, f_classif

In [None]:
data = data.fillna(data.median()) # Check for missing values and handle them

In [8]:
data = data.drop_duplicates()

In [10]:
label_encoders = {}
for column in data.select_dtypes(include=['object']).columns:
    label_encoders[column] = LabelEncoder()
    data.loc[:, column] = label_encoders[column].fit_transform(data[column])

In [12]:
X = data.drop('not.fully.paid', axis=1) #Seperating features and target variables
y = data['not.fully.paid']

In [13]:
scaler = MinMaxScaler() # Scaling
X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

In [14]:
selector = SelectKBest(score_func=f_classif, k=8)  # Selecting top 8 features on the basis of correlation
X_selected = selector.fit_transform(X_scaled, y)

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.3, random_state=42)
#Splitting the dataset

In [17]:
X_train[:5], y_train[:5] #Displaying transformed and selected feature set

(array([[5.47817048e-01, 1.00000000e+00, 9.06215921e-01, 9.84622807e-01,
         5.90142672e-01, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
        [7.94178794e-01, 7.01812327e-04, 6.60305344e-01, 6.53263158e-01,
         4.26718547e-01, 0.00000000e+00, 1.00000000e+00, 0.00000000e+00],
        [7.41164241e-01, 2.37542831e-01, 4.41657579e-01, 5.40842105e-01,
         2.74967575e-01, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
        [8.66943867e-01, 3.54580358e-01, 4.10577972e-01, 0.00000000e+00,
         3.09987030e-01, 5.00000000e-01, 0.00000000e+00, 1.00000000e+00],
        [7.07900208e-01, 4.92259423e-01, 6.04689204e-01, 9.81052632e-01,
         8.41763943e-01, 5.00000000e-01, 0.00000000e+00, 1.00000000e+00]]),
 0     0
 4     0
 16    0
 5     0
 13    0
 Name: not.fully.paid, dtype: int64)

In [23]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.cluster import KMeans
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import numpy as np
from sklearn.metrics import silhouette_score
from sklearn.metrics import confusion_matrix, classification_report

In [24]:
# Linear Regression
linear_reg = LinearRegression()
linear_reg.fit(X_train, y_train)
y_pred_linear = linear_reg.predict(X_test)

In [22]:
linear_mse = mean_squared_error(y_test, y_pred_linear)
linear_rmse = np.sqrt(linear_mse)
linear_r2 = r2_score(y_test, y_pred_linear)
linear_mae = mean_absolute_error(y_test, y_pred_linear)

print("Linear Regression:")
print("Mean Squared Error (MSE):", linear_mse)
print("Root Mean Squared Error (RMSE):", linear_rmse)
print("R² Score:", linear_r2)
print("Mean Absolute Error (MAE):", linear_mae)

Linear Regression:
Mean Squared Error (MSE): 0.15947138454974089
Root Mean Squared Error (RMSE): 0.3993386840136338
R² Score: -0.6146477685661265
Mean Absolute Error (MAE): 0.2098964118876679


In [27]:
threshold = 0.5 # Calculation for confusion matrix and classification report
y_pred_linear_class = (y_pred_linear >= threshold).astype(int)  # Convert to binary classes
linear_confusion_matrix = confusion_matrix(y_test, y_pred_linear_class)
print("Confusion Matrix")
print(linear_confusion_matrix)
print("Classification Report")
print(classification_report(y_test, y_pred_linear_class))

Confusion Matrix
[[8 0]
 [1 0]]
Classification Report
              precision    recall  f1-score   support

           0       0.89      1.00      0.94         8
           1       0.00      0.00      0.00         1

    accuracy                           0.89         9
   macro avg       0.44      0.50      0.47         9
weighted avg       0.79      0.89      0.84         9



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [28]:
# Gradient Boosting Regressor
gboost_reg = GradientBoostingRegressor(random_state=42)
gboost_reg.fit(X_train, y_train)
y_pred_gboost = gboost_reg.predict(X_test)

In [29]:
gboost_mse = mean_squared_error(y_test, y_pred_gboost)
gboost_rmse = np.sqrt(gboost_mse)
gboost_r2 = r2_score(y_test, y_pred_gboost)
gboost_mae = mean_absolute_error(y_test, y_pred_gboost)

print("Gradient Boosting Regressor:")
print("Mean Squared Error (MSE):", gboost_mse)
print("Root Mean Squared Error (RMSE):", gboost_rmse)
print("R² Score:", gboost_r2)
print("Mean Absolute Error (MAE):", gboost_mae)

Gradient Boosting Regressor:
Mean Squared Error (MSE): 0.1852025706862612
Root Mean Squared Error (RMSE): 0.43035168256469175
R² Score: -0.8751760281983947
Mean Absolute Error (MAE): 0.23308208947452622


In [30]:
threshold = 0.5
y_pred_gboost_class = (y_pred_gboost >= threshold).astype(int)  # Convert to binary classes
gboost_confusion_matrix = confusion_matrix(y_test, y_pred_gboost_class)
print("Confusion Matrix")
print(gboost_confusion_matrix)
print("Classification Report")
print(classification_report(y_test, y_pred_gboost_class))

Confusion Matrix
[[7 1]
 [1 0]]
Classification Report
              precision    recall  f1-score   support

           0       0.88      0.88      0.88         8
           1       0.00      0.00      0.00         1

    accuracy                           0.78         9
   macro avg       0.44      0.44      0.44         9
weighted avg       0.78      0.78      0.78         9



In [31]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_scaled_kmeans = scaler.fit_transform(X)

kmeans = KMeans(n_clusters=3, random_state=42)  # Fitting KMeans while assuming 3 clusters
kmeans_labels = kmeans.fit_predict(X_scaled_kmeans)

kmeans_silhouette = silhouette_score(X_scaled_kmeans, kmeans_labels)
print("Silhouette Score:", kmeans_silhouette)



Silhouette Score: 0.11183384435562753


In [32]:
kmeans_confusion_matrix = confusion_matrix(y, kmeans_labels)
print("Confusion Matrix")
print(kmeans_confusion_matrix)
print("Classification Report for K-Means Clustering:")
print(classification_report(y, kmeans_labels))

Confusion Matrix
[[11 10  6]
 [ 0  2  1]
 [ 0  0  0]]
Classification Report for K-Means Clustering:
              precision    recall  f1-score   support

           0       1.00      0.41      0.58        27
           1       0.17      0.67      0.27         3
           2       0.00      0.00      0.00         0

    accuracy                           0.43        30
   macro avg       0.39      0.36      0.28        30
weighted avg       0.92      0.43      0.55        30



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
