# Data Exploration for Credit Scoring Analysis

This notebook explores the accepted and rejected loan datasets to understand their structure, missing values, and basic statistics.

## 1. Setup and Data Loading

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Configure pandas display: set the column, row, and width options to None
for display_option in ('display.max_columns', 'display.max_rows', 'display.width'):
    pd.set_option(display_option, None)

In [None]:
# Load the raw CSV exports into one DataFrame per dataset
# (paths are relative to the notebook's working directory)
accepted_df = pd.read_csv(filepath_or_buffer='../data/raw/accepted_loans.csv')
rejected_df = pd.read_csv(filepath_or_buffer='../data/raw/rejected_loans.csv')

## 2. Accepted Loans Dataset Analysis

In [None]:
# Overview of the accepted-loans frame: shape, dtypes, per-column null summary, sample rows
print("ACCEPTED LOANS DATASET EXPLORATION")
print("="*80)
print(f"\nDataset Shape: {accepted_df.shape}")

print("\nColumn Types:")
print(accepted_df.dtypes)

print("\nMissing Values Analysis:")
# Count nulls per column once and reuse the result for both the count and the
# percentage columns, instead of scanning the whole frame twice.
accepted_null_counts = accepted_df.isnull().sum()
missing_accepted = pd.DataFrame({
    'Column': accepted_df.columns,
    'Data Type': accepted_df.dtypes,
    'Missing Values': accepted_null_counts,
    'Missing Percentage': (accepted_null_counts / len(accepted_df) * 100).round(2)
})
# Columns with the highest share of missing values are listed first
print(missing_accepted.sort_values('Missing Percentage', ascending=False))

print("\nSample of accepted loans (first 5 rows):")
# Bare last expression so the notebook renders the frame as the cell output
accepted_df.head()

## 3. Rejected Loans Dataset Analysis

In [None]:
# Overview of the rejected-loans frame: shape, dtypes, per-column null summary, sample rows
print("REJECTED LOANS DATASET EXPLORATION")
print("="*80)
print(f"\nDataset Shape: {rejected_df.shape}")

print("\nColumn Types:")
print(rejected_df.dtypes)

print("\nMissing Values Analysis:")
# Count nulls per column once and reuse the result for both the count and the
# percentage columns, instead of scanning the whole frame twice.
rejected_null_counts = rejected_df.isnull().sum()
missing_rejected = pd.DataFrame({
    'Column': rejected_df.columns,
    'Data Type': rejected_df.dtypes,
    'Missing Values': rejected_null_counts,
    'Missing Percentage': (rejected_null_counts / len(rejected_df) * 100).round(2)
})
# Columns with the highest share of missing values are listed first
print(missing_rejected.sort_values('Missing Percentage', ascending=False))

print("\nSample of rejected loans (first 5 rows):")
# Bare last expression so the notebook renders the frame as the cell output
rejected_df.head()

## 4. Missing Values Visualization

In [None]:
# Side-by-side null maps, one panel per dataset, drawn on explicit axes
fig, (ax_accepted, ax_rejected) = plt.subplots(1, 2, figsize=(15, 5))

panels = (
    (ax_accepted, accepted_df, 'Missing Values in Accepted Loans'),
    (ax_rejected, rejected_df, 'Missing Values in Rejected Loans'),
)
for ax, frame, panel_title in panels:
    # Boolean null mask as a heatmap; row tick labels and the colour bar are turned off
    sns.heatmap(frame.isnull(), yticklabels=False, cbar=False, ax=ax)
    ax.set_title(panel_title)

fig.tight_layout()
plt.show()

## 5. Dataset Comparison

In [None]:
# Work out which column names are shared by both datasets and which are exclusive to one
accepted_cols = set(accepted_df.columns)
rejected_cols = set(rejected_df.columns)

common_columns = accepted_cols.intersection(rejected_cols)
unique_to_accepted = accepted_cols.difference(rejected_cols)
unique_to_rejected = rejected_cols.difference(accepted_cols)

print("COLUMN COMPARISON")
print("="*80)

# Each group is reported as a count followed by the alphabetically sorted names
print(f"\nCommon columns between datasets: {len(common_columns)}")
print(sorted(common_columns))

print(f"\nColumns unique to accepted loans: {len(unique_to_accepted)}")
print(sorted(unique_to_accepted))

print(f"\nColumns unique to rejected loans: {len(unique_to_rejected)}")
print(sorted(unique_to_rejected))

## 6. Basic Statistics

In [None]:
# Row counts drive every figure in this cell, so compute them once up front
n_accepted = len(accepted_df)
n_rejected = len(rejected_df)
n_total = n_accepted + n_rejected

print("BASIC STATISTICS")
print("="*80)
print(f"\nTotal number of loan applications: {n_total}")
print(f"Number of accepted loans: {n_accepted}")
print(f"Number of rejected loans: {n_rejected}")

if n_total > 0:
    # Acceptance rate = accepted rows as a share of all rows across both datasets
    print(f"Acceptance rate: {(n_accepted / n_total * 100):.2f}%")

    # Visualize acceptance vs rejection ratio
    plt.figure(figsize=(8, 6))
    plt.pie([n_accepted, n_rejected],
            labels=['Accepted', 'Rejected'],
            autopct='%1.1f%%',
            colors=['lightgreen', 'lightcoral'])
    plt.title('Loan Application Distribution')
    plt.show()
else:
    # Both frames are empty: the rate would be a division by zero and the pie
    # chart has nothing to draw, so report that instead of raising.
    print("Acceptance rate: N/A (no loan applications loaded)")