In [19]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, callbacks
import warnings

warnings.filterwarnings('ignore')


In [2]:
# Step 1: Load data
def load_data(data_path):
    """Read the CSV file at ``data_path`` into a DataFrame.

    Prints a one-line row/column summary on success. When the file does
    not exist, prints a message and returns ``None`` instead of raising.
    """
    try:
        frame = pd.read_csv(data_path)
    except FileNotFoundError:
        print(f'File was not found: {data_path}')
        return None

    n_rows, n_cols = frame.shape
    print(f'Dataset loaded: {n_rows} rows, {n_cols} columns.')
    return frame
    
def validate_data(df):
    """Summarise the structure and completeness of ``df``.

    Returns a dict with:
      - 'total_rows' / 'total_cols': dimensions of the frame,
      - 'missing_values': per-column null counts,
      - 'has_target': whether a 'not.fully.paid' column is present.
    """
    n_rows = len(df)
    n_cols = len(df.columns)
    null_counts = df.isnull().sum().to_dict()
    target_present = 'not.fully.paid' in df.columns

    print('Data validation completed')
    return {
        'total_rows': n_rows,
        'total_cols': n_cols,
        'missing_values': null_counts,
        'has_target': target_present,
    }

In [3]:
# Step 2: Display data information:
def display_data_info(data):
    """Print the shape, per-column dtypes and missing-value summary of ``data``."""
    rule = '=' * 60
    print('\n' + rule)
    print('Dataset Information')
    print(rule)

    print(f'Shape: {data.shape}')
    print('\n Column names and data type:')
    for name, kind in data.dtypes.items():
        print(f' {name}: {kind}')

    print('\n Missing values: ')
    null_counts = data.isnull().sum()
    has_nulls = null_counts > 0
    if has_nulls.any():
        # Only list the columns that actually contain nulls.
        print(null_counts[has_nulls])
    else:
        print('No missing values')
        

In [4]:
# Module 2: Exploratory Data Analysis (EDA)
def analyze_numerical_features(df):
    """Print and return ``describe()`` statistics for the numeric columns of ``df``."""
    numeric_names = df.select_dtypes(include=[np.number]).columns

    rule = '=' * 60
    print('\n' + rule)
    print('Numerical features analysis')
    print(rule)

    summary = df[numeric_names].describe()
    print(summary)
    return summary

def analyze_categorical_features(df):
    """Print value counts for every object-dtype column and return their names as a list."""
    object_columns = df.select_dtypes(include=['object']).columns

    rule = '=' * 60
    print('\n' + rule)
    print('Categorical features analysis')
    print(rule)

    for name in object_columns:
        print(f'\n {name} (Value Counts):')
        print(df[name].value_counts())

    return object_columns.tolist()

def analyze_target_distribution(df):
    """Print and return the class counts of the 'not.fully.paid' target column.

    Returns:
        The ``value_counts()`` Series of the target column, or ``None`` when
        the column is not present in ``df``.
    """
    target_col = 'not.fully.paid'

    if target_col not in df.columns:
        return None

    rule = '=' * 60
    print('\n' + rule)
    print('Target variable distribution')
    print(rule)

    distribution = df[target_col].value_counts()
    total = distribution.sum()

    # Look the two classes up by label with a 0 fallback so a dataset that
    # contains only one class is reported instead of raising KeyError.
    counts = distribution.reindex([0, 1], fill_value=0)
    paid, defaulted = counts.loc[0], counts.loc[1]
    paid_pct = paid / total * 100 if total else 0.0
    defaulted_pct = defaulted / total * 100 if total else 0.0

    print(f'Fully Paid (0): {paid} ({paid_pct:.2f}%)')
    print(f'Defaulted (1): {defaulted} ({defaulted_pct:.2f}%)')

    return distribution


def generate_eda_report(df):
    """Run the numerical, categorical and target analyses on ``df`` in that order."""
    rule = '=' * 60
    print('\n' + rule)
    print('Exploratory data Analysis report')
    print(rule)

    # Each step prints its own section; the return values are not used here.
    report_steps = (
        analyze_numerical_features,
        analyze_categorical_features,
        analyze_target_distribution,
    )
    for step in report_steps:
        step(df)

    print('Eda Report Generated')

def _save_histogram(values, title, xlabel, path, bins=50):
    """Plot a histogram of ``values`` and write it to ``path`` as a PNG."""
    plt.figure(figsize=(10, 5))
    plt.hist(values, bins=bins, edgecolor='black')
    plt.title(title, fontsize=14, fontweight='bold')
    plt.xlabel(xlabel)
    plt.ylabel('Frequency')
    plt.tight_layout()
    plt.savefig(path, dpi=100)
    plt.close()


def create_visualization(df, output_dir):
    """Create and save the basic EDA plots for the loan dataset.

    Writes three PNG files into ``output_dir`` (created if missing):
    target_distribution.png, interest_rate_distribution.png and
    fico_distribution.png.

    Args:
        df: DataFrame containing the 'not.fully.paid', 'int.rate' and
            'fico' columns.
        output_dir: Directory the images are written to.
    """
    # savefig does not create parent directories; without this the first
    # call fails when the output folder does not exist yet.
    os.makedirs(output_dir, exist_ok=True)

    # 1. Target distribution
    plt.figure(figsize=(10, 5))
    df['not.fully.paid'].value_counts().plot(kind='bar')
    plt.title('Loan default distribution', fontsize=14, fontweight='bold')
    plt.xlabel('Not full paid (0=Paid, 1=Defaulted)')
    plt.ylabel('Count')
    plt.xticks(rotation=0)
    plt.tight_layout()
    plt.savefig(f'{output_dir}/target_distribution.png', dpi=100)
    plt.close()

    # 2. Interest rate distribution
    _save_histogram(
        df['int.rate'],
        title='Interest rate distribution',
        xlabel='Interest Rate',
        path=f'{output_dir}/interest_rate_distribution.png',
    )

    # 3. FICO score distribution
    _save_histogram(
        df['fico'],
        title='FICO score distribution',
        xlabel='FICO score',
        path=f'{output_dir}/fico_distribution.png',
    )

    print(f'Visualizations saved to {output_dir}/')

    




In [5]:
# Module 3: Feature Correlation analysis
def calculate_correlation_matrix(df):
    """Calculate the Pearson correlation matrix of the numeric columns of ``df``.

    Returns:
        A square DataFrame indexed and labelled by the numeric column names.
    """
    # np.number is the abstract numeric scalar type understood by
    # select_dtypes (there is no attribute ``np.numbers``).
    numerical_cols = df.select_dtypes(include=[np.number]).columns
    corr_matrix = df[numerical_cols].corr()
    print('\n Correlation matrix calculated')
    return corr_matrix

def identify_high_correlation_features(df, threshold = 0.9):
    """Identify pairs of numeric columns whose absolute correlation exceeds ``threshold``.

    Args:
        df: DataFrame to analyse; non-numeric columns are ignored.
        threshold: Absolute Pearson correlation above which a pair is reported.

    Returns:
        A list of dicts with keys 'feature1', 'feature2' and 'correlation'
        (the signed coefficient), one per unordered column pair.
    """
    corr_matrix = df.corr(numeric_only = True)
    columns = corr_matrix.columns
    n_columns = len(columns)
    high_corr_pairs = []

    # Walk only the upper triangle so each unordered pair is reported once.
    for i in range(n_columns):
        for j in range(i + 1, n_columns):
            value = corr_matrix.iat[i, j]
            if abs(value) > threshold:
                high_corr_pairs.append({
                    'feature1': columns[i],
                    'feature2': columns[j],
                    'correlation': value
                })

    if high_corr_pairs:
        print(f'Found {len(high_corr_pairs)} highly correlation pairs (threshold= {threshold}):')
        for pair in high_corr_pairs:
            # Double-quoted f-string: reusing the outer quote character inside
            # the braces is a SyntaxError on Python versions before 3.12.
            print(f"{pair['feature1']} <-> {pair['feature2']}: {pair['correlation']:.3f}")
    else:
        print(f' No Highly correlated featues found (threshold = {threshold})')

    return high_corr_pairs

def analyze_feature_importance_with_target(df):
    """Rank numeric features by absolute correlation with 'not.fully.paid'.

    Prints the ten highest-ranked features and returns the full ranking.

    Returns:
        A list of ``{'feature': name, 'correlation': abs_corr}`` dicts sorted
        by descending absolute correlation. Features whose correlation is
        NaN (for example constant columns) are placed at the end.
    """
    target_col = 'not.fully.paid'
    feature_cols = [col for col in df.columns if col != target_col]

    correlations = []

    for col in feature_cols:
        series = df[col]
        # Accept any numeric width (int32, float32, ...) rather than only
        # int64/float64; booleans are left out, matching the numeric-only intent.
        is_numeric = (
            pd.api.types.is_numeric_dtype(series)
            and not pd.api.types.is_bool_dtype(series)
        )
        if is_numeric:
            corr = series.corr(df[target_col])
            correlations.append({'feature': col, 'correlation': abs(corr)})

    # Sort by descending absolute correlation. NaN compares false against
    # everything, which would leave the order arbitrary, so NaNs are keyed
    # to sort last explicitly.
    correlations = sorted(
        correlations,
        key = lambda x: (bool(np.isnan(x['correlation'])), -x['correlation'])
    )

    print('Feature importance (correlation with target): ')
    for item in correlations[:10]:
        # Double-quoted f-string keeps this valid on Python versions before 3.12.
        print(f"{item['feature']}: {item['correlation']:.4f}")

    return correlations

def visualize_correlation(df, output_dir = 'outputs'):
    """Render the numeric-column correlation matrix as a heatmap PNG.

    The image is written to ``<output_dir>/correlation_heatmap.png``;
    ``output_dir`` is created when it does not exist.
    """
    os.makedirs(output_dir, exist_ok = True)
    heatmap_path = f'{output_dir}/correlation_heatmap.png'

    # Correlate only the numeric columns.
    numeric_frame = df.select_dtypes(include = [np.number])
    correlations = numeric_frame.corr()

    # Annotated, zero-centred diverging heatmap.
    plt.figure(figsize = (12, 10))
    sns.heatmap(
        correlations,
        annot = True,
        fmt = '.2f',
        cmap = 'coolwarm',
        center = 0,
    )
    plt.title('Feature correlation matrix', fontsize = 14, fontweight = 'bold')
    plt.tight_layout()
    plt.savefig(heatmap_path, dpi = 100)
    plt.close()

    print(f'Correlation heatmap saved to {heatmap_path}')




In [6]:
# Module 4: Feature Reduction:
def select_best_features(df, threshold = 0.9):
    """Select features by dropping one column of every highly correlated pair.

    For each pair of numeric feature columns whose absolute Pearson
    correlation exceeds ``threshold``, the later column (in frame order) is
    dropped. Non-numeric features and the 'not.fully.paid' target are never
    part of the correlation check; non-numeric features are always kept.

    Args:
        df: DataFrame holding the features (and optionally the target).
        threshold: Absolute correlation above which a column is dropped.

    Returns:
        The list of retained feature names, in their original order.
    """
    target_col = 'not.fully.paid'
    features_to_keep = [col for col in df.columns if col != target_col]

    # Get correlation matrix
    corr_matrix = df[features_to_keep].corr(numeric_only = True)
    columns = corr_matrix.columns
    n_columns = len(columns)

    # Remove highly correlated features:
    to_drop = set()

    for i in range(n_columns):
        for j in range(i + 1, n_columns):
            # abs() must wrap the coefficient, not the comparison; otherwise
            # strong negative correlations are never detected.
            if abs(corr_matrix.iat[i, j]) > threshold:
                to_drop.add(columns[j])

    # Ordered views so the printed report is deterministic.
    removed_features = [col for col in features_to_keep if col in to_drop]
    selected_features = [col for col in features_to_keep if col not in to_drop]

    print(f'\nFeature Selection Complete:')
    print(f'Orignal Features: {len(features_to_keep)}')
    print(f'Removed Features: {len(to_drop)} {removed_features}')
    print(f'Selected Features: {len(selected_features)} {selected_features}')

    return selected_features

def reduce_features(df, correlation_threshold = 0.9):
    """Run the feature-reduction steps and return the selected feature names.

    Reports highly correlated pairs and feature/target correlations, then
    returns whatever ``select_best_features`` selects for the same threshold.
    """
    rule = '=' * 60
    print('\n' + rule)
    print('Feature Reduction')
    print(rule)

    # Diagnostic reports only: their return values are intentionally unused.
    identify_high_correlation_features(df, threshold=correlation_threshold)
    analyze_feature_importance_with_target(df)

    return select_best_features(df, threshold=correlation_threshold)


    

In [16]:
def encode_categorical_features(df, target_col = 'not.fully.paid'):
    """One-hot encode the object-dtype feature columns of ``df``.

    The first level of every encoded column is dropped (``drop_first=True``).
    The target column is never encoded, even when it has object dtype, so it
    stays available under its original name.

    Args:
        df: Input DataFrame; it is not modified.
        target_col: Name of the column to leave untouched.

    Returns:
        A new DataFrame with the categorical columns replaced by dummy columns.
    """
    df_encoded = df.copy()

    # Find categorical columns (excluding the target).
    categorical_cols = [
        col
        for col in df_encoded.select_dtypes(include= ['object']).columns
        if col != target_col
    ]
    print(f'Encoding categorical columns: {categorical_cols}')

    if categorical_cols:
        # A single call encodes every listed column; the default prefix is
        # the source column name.
        df_encoded = pd.get_dummies(df_encoded, columns=categorical_cols, drop_first=True)

    print(f'Categorical encoding completed.')
    return df_encoded


def handle_missing_values(df, strategy= 'drop'):
    """Remove or impute missing values in ``df``.

    Args:
        df: Input DataFrame; it is not modified.
        strategy: 'drop' removes every row containing a null. Any other
            value fills nulls in numeric columns with the column mean
            (non-numeric columns are left as they are).

    Returns:
        ``df`` itself when it contains no nulls, otherwise a new DataFrame.
    """
    if df.isnull().sum().sum() == 0:
        print('No Null values to handle.')
        return df

    if strategy == 'drop':
        df_clean = df.dropna()
        print(f'Dropped {len(df) - len(df_clean)} rows with missing values.')
    else:
        numerical_cols = df.select_dtypes(include= [np.number]).columns
        df_clean = df.copy()
        for col in numerical_cols:
            # Assign the result back instead of calling fillna(inplace=True)
            # on the column selection: the in-place form acts on a temporary
            # object and is not guaranteed to update df_clean.
            df_clean[col] = df_clean[col].fillna(df_clean[col].mean())
        print('Filled null column values with mean.')

    return df_clean

def scale_features(X_train, X_test):
    """Standardise both splits with a StandardScaler fitted on ``X_train`` only.

    Returns the scaled training data, the scaled test data and the fitted
    scaler, in that order.
    """
    # Fit on the training split, then apply the same transform to both splits.
    fitted_scaler = StandardScaler().fit(X_train)
    train_scaled = fitted_scaler.transform(X_train)
    test_scaled = fitted_scaler.transform(X_test)

    print('Features scaled using Standard Scaler')
    return train_scaled, test_scaled, fitted_scaler


def prepare_dataset(df, test_size = 0.2, random_state= 42):
    """Turn a raw frame into scaled train/test arrays ready for the model.

    Steps, in order: handle missing values (default strategy), encode
    categorical features, split off the 'not.fully.paid' target, perform a
    train/test split and scale the features.

    Returns:
        A dict with keys 'X_train', 'X_test', 'y_train', 'y_test', 'scaler'
        and 'input_shape' (the number of feature columns after scaling).
    """
    rule = '=' * 60
    print('\n' + rule)
    print('Data Preprocessing')
    print(rule)

    # Clean first, then encode the categorical columns.
    cleaned = handle_missing_values(df)
    encoded = encode_categorical_features(cleaned)

    # Separate the target vector from the feature matrix.
    target_col = 'not.fully.paid'
    y = encoded[target_col].values
    X = encoded.drop(columns=[target_col]).values

    # Hold out a test split.
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=test_size,
        random_state=random_state,
    )

    print('Data Split:')
    print(f'Training Set: {X_train.shape}')
    print(f'Testing Set: {X_test.shape}')

    # Scale using the helper, which also returns the fitted scaler.
    X_train_scaled, X_test_scaled, scaler = scale_features(X_train, X_test)

    prepared = {
        'X_train': X_train_scaled,
        'X_test': X_test_scaled,
        'y_train': y_train,
        'y_test': y_test,
        'scaler': scaler,
        'input_shape': X_train_scaled.shape[1],
    }
    return prepared
        

In [23]:
# Module 6: Deep Learning Model

def build_neural_network(input_dim):
    """Build and compile the binary-classification network.

    Architecture: Dense(128) -> Dropout(0.3) -> Dense(64) -> Dropout(0.3)
    -> Dense(32) -> Dropout(0.2) -> Dense(1, sigmoid), compiled with Adam
    (learning rate 0.001), binary cross-entropy loss and accuracy metric.

    Args:
        input_dim: Number of input features per sample.

    Returns:
        The compiled ``keras.Sequential`` model.
    """
    model = keras.Sequential([
        # Declare the input with an explicit Input object rather than the
        # deprecated ``input_dim=`` keyword on the first Dense layer.
        keras.Input(shape=(input_dim,)),
        layers.Dense(128, activation='relu'),
        layers.Dropout(0.3),
        layers.Dense(64, activation='relu'),
        layers.Dropout(0.3),
        layers.Dense(32, activation='relu'),
        layers.Dropout(0.2),
        layers.Dense(1, activation='sigmoid')
    ])

    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate=0.001),
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )

    print('Neural network model built')
    return model

In [27]:
def train_neural_network(model, X_train, y_train, X_test, y_test, epochs = 50, batch_size = 32):
    """Fit ``model`` with early stopping and learning-rate reduction on plateau.

    ``(X_test, y_test)` ` is handed to ``model.fit`` as ``validation_data``,
    so both callbacks monitor the ``val_loss`` computed on that split.

    Returns:
        The object returned by ``model.fit``.
    """
    # Stop after 5 epochs without val_loss improvement and roll back to the
    # best weights seen.
    stopper = callbacks.EarlyStopping(
        monitor='val_loss',
        patience=5,
        restore_best_weights=True,
    )

    # Halve the learning rate after 3 stagnant epochs, down to 1e-7.
    lr_scheduler = callbacks.ReduceLROnPlateau(
        monitor='val_loss',
        factor=0.5,
        patience=3,
        min_lr=1e-7,
    )

    rule = '=' * 60
    print('\n' + rule)
    print('Model Training')
    print(rule)

    fit_options = {
        'epochs': epochs,
        'batch_size': batch_size,
        'validation_data': (X_test, y_test),
        'callbacks': [stopper, lr_scheduler],
        'verbose': 1,
    }
    training_history = model.fit(X_train, y_train, **fit_options)

    print('Training completed')
    return training_history

In [28]:
def main(data_path = 'loan_data.csv', output_dir = 'outputs'):
    """Execute the complete loan-default pipeline.

    Loads the CSV, prints dataset information and the EDA report, saves the
    correlation heatmap, reduces correlated features, preprocesses the data,
    then builds and trains the neural network.

    Args:
        data_path: Path of the CSV file to load.
        output_dir: Directory that receives generated plots.

    Returns:
        None. The function stops early (after printing a message) when the
        data cannot be loaded or the target column is missing.
    """
    rule = '=' * 60
    print('\n' + rule)
    print('Lending club loan default Prediction')
    print(rule)

    # Step 1: Load and validate the data
    print('\n Step 1: Loading Data...')

    df = load_data(data_path)
    if df is None:
        # load_data returns None when the file is missing, not when it is empty.
        print('Aborting: dataset could not be loaded.')
        return

    display_data_info(df)
    validation = validate_data(df)
    if not validation['has_target']:
        # Every later step indexes this column, so fail with a clear message
        # here instead of a KeyError further down.
        print("Aborting: target column 'not.fully.paid' is missing.")
        return

    # Step 2: Exploratory Data Analysis
    print('\n Step 2: Perform EDA: ')
    generate_eda_report(df)
    # create_visualization(df, output_dir)

    # Step 3: Feature correlation analysis
    print('\n Step 3: Analyzing feature analysis: ')
    visualize_correlation(df, output_dir)

    # Step 4: Feature reduction
    print('\n Step 4: Feature Reduction: ')
    best_features = reduce_features(df, correlation_threshold = 0.9)

    # Keep target and selected features only
    df_reduced = df[best_features + ['not.fully.paid']].copy()
    print(f'Dataset reduced from {df.shape[1]} to {df_reduced.shape[1]} columns.')

    # Step 5: Data preprocessing
    print('\n Step 5: Preprocessing data')
    data_prep = prepare_dataset(df_reduced)

    # Step 6: Build model
    print('\n Step 6: Building Neural Network...')
    model = build_neural_network(data_prep['input_shape'])
    print('Model Summary:')
    model.summary()

    # Step 7: Train model
    print('\n Step 7: Training Model...')
    train_neural_network(
        model,
        data_prep['X_train'],
        data_prep['y_train'],
        data_prep['X_test'],
        data_prep['y_test'],
        epochs = 50,
        batch_size = 32,
    )




# Entry point: run the full pipeline with main()'s default arguments when
# this code executes as the top-level module (e.g. a notebook cell or script).
if __name__ == '__main__':
    main()


Lending club loan default Prediction

 Step 1: Loading Data...
Dataset loaded: 9578 rows, 14 columns.

Dataset Information
Shape: (9578, 14)

 Column names and data type:
 credit.policy: int64
 purpose: object
 int.rate: float64
 installment: float64
 log.annual.inc: float64
 dti: float64
 fico: int64
 days.with.cr.line: float64
 revol.bal: int64
 revol.util: float64
 inq.last.6mths: int64
 delinq.2yrs: int64
 pub.rec: int64
 not.fully.paid: int64

 Missing values: 
No missing values
Data validation completed

 Step 2: Perform EDA: 

Exploratory data Analysis report

Numerical features analysis
       credit.policy     int.rate  installment  log.annual.inc          dti  \
count    9578.000000  9578.000000  9578.000000     9578.000000  9578.000000   
mean        0.804970     0.122640   319.089413       10.932117    12.606679   
std         0.396245     0.026847   207.071301        0.614813     6.883970   
min         0.000000     0.060000    15.670000        7.547502     0.000000   
25


 Step 7: Training Model...

Model Training
Epoch 1/50
[1m240/240[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 6ms/step - accuracy: 0.8365 - loss: 0.4494 - val_accuracy: 0.8408 - val_loss: 0.4059 - learning_rate: 0.0010
Epoch 2/50
[1m240/240[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.8387 - loss: 0.4287 - val_accuracy: 0.8413 - val_loss: 0.4091 - learning_rate: 0.0010
Epoch 3/50
[1m240/240[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.8403 - loss: 0.4245 - val_accuracy: 0.8408 - val_loss: 0.4122 - learning_rate: 0.0010
Epoch 4/50
[1m240/240[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.8406 - loss: 0.4214 - val_accuracy: 0.8413 - val_loss: 0.4128 - learning_rate: 0.0010
Epoch 5/50
[1m240/240[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.8397 - loss: 0.4187 - val_accuracy: 0.8408 - val_loss: 0.4044 - learning_rate: 5.0000e-04
Epoch 6/50
[1m240/240