In [1]:
import os
import sys

# Progress banner only: this statement creates nothing on disk itself.
print("Creating project structure...")

Creating project structure...


In [2]:
# Folder skeleton of the project, expressed relative to the current working
# directory. Nested entries such as "src/api" also create their parents.
directories = [
    ".github/workflows",
    "data/raw",
    "data/processed",
    "notebooks",
    "src",
    "src/api",
    "tests",
]

for directory in directories:
    # exist_ok=True keeps the cell re-runnable: existing folders are left alone.
    os.makedirs(directory, exist_ok=True)
    print("Created:", directory)

Created: .github/workflows
Created: data/raw
Created: data/processed
Created: notebooks
Created: src
Created: src/api
Created: tests


In [3]:
# Smallest document that is still a valid nbformat-4 notebook. A zero-byte
# .ipynb is not valid notebook JSON, so the EDA notebook is seeded with this
# skeleton instead of an empty string.
EMPTY_NOTEBOOK = (
    '{\n'
    ' "cells": [],\n'
    ' "metadata": {},\n'
    ' "nbformat": 4,\n'
    ' "nbformat_minor": 5\n'
    '}\n'
)

# (path, initial content) pairs for every scaffold file.
files_to_create = [
    ('.github/workflows/ci.yml', ''),
    ('notebooks/eda.ipynb', EMPTY_NOTEBOOK),
    ('src/__init__.py', ''),
    ('src/data_processing.py', ''),
    ('src/train.py', ''),
    ('src/predict.py', ''),
    ('src/api/main.py', ''),
    ('src/api/pydantic_models.py', ''),
    ('tests/test_data_processing.py', ''),
    ('Dockerfile', ''),
    ('docker-compose.yml', ''),
    ('requirements.txt', ''),
    ('.gitignore', ''),
    ('README.md', '')
]


def write_if_missing(file_path, content):
    """Create ``file_path`` containing ``content`` unless it already exists.

    Missing parent directories are created first, so the call does not depend
    on the folder skeleton having been built beforehand.

    Args:
        file_path: Path of the file to create.
        content: Text written to a newly created file (UTF-8).

    Returns:
        True if the file was created, False if something already existed at
        ``file_path`` (in which case it is left untouched).
    """
    parent = os.path.dirname(file_path)
    if parent:
        os.makedirs(parent, exist_ok=True)
    try:
        # Mode 'x' refuses to open an existing file, so re-running this cell
        # can never truncate work that was added after the first run.
        with open(file_path, 'x', encoding='utf-8') as f:
            f.write(content)
    except FileExistsError:
        return False
    return True


for file_path, content in files_to_create:
    if write_if_missing(file_path, content):
        print(f"Created: {file_path}")
    else:
        print(f"Skipped (already exists): {file_path}")

Created: .github/workflows/ci.yml
Created: notebooks/eda.ipynb
Created: src/__init__.py
Created: src/data_processing.py
Created: src/train.py
Created: src/predict.py
Created: src/api/main.py
Created: src/api/pydantic_models.py
Created: tests/test_data_processing.py
Created: Dockerfile
Created: docker-compose.yml
Created: requirements.txt
Created: .gitignore
Created: README.md


In [4]:
# Ignore patterns grouped by topic. Each group is rendered as a "# <title>"
# header followed by its patterns; groups are separated by one blank line.
gitignore_sections = {
    "Data": ["data/", "*.csv", "*.parquet", "*.feather"],
    "Python": [
        "__pycache__/",
        "*.py[cod]",
        "*$py.class",
        "*.so",
        ".Python",
        "env/",
        "venv/",
        ".venv/",
        "ENV/",
        "env.bak/",
        "venv.bak/",
    ],
    "IDE": [".vscode/", ".idea/", "*.swp", "*.swo"],
    "Jupyter": [".ipynb_checkpoints/"],
    "MLFlow": ["mlruns/", "mlartifacts/"],
    "OS": [".DS_Store", "Thumbs.db"],
    "Logs": ["*.log"],
}

gitignore_content = "\n\n".join(
    "\n".join([f"# {title}"] + patterns)
    for title, patterns in gitignore_sections.items()
) + "\n"

# Replace whatever .gitignore currently holds with the rendered rules.
with open('.gitignore', 'w', encoding='utf-8') as gitignore_file:
    gitignore_file.write(gitignore_content)

print(".gitignore created successfully!")

.gitignore created successfully!


In [5]:
# Pinned dependencies keyed by the section heading they appear under in
# requirements.txt. Every entry is rendered as "<package>==<version>".
requirement_groups = {
    "Core": {
        "pandas": "2.0.3",
        "numpy": "1.24.3",
        "scikit-learn": "1.3.0",
        "matplotlib": "3.7.2",
        "seaborn": "0.12.2",
        "jupyter": "1.0.0",
        "notebook": "6.5.4",
    },
    "Feature Engineering": {
        "category-encoders": "2.6.3",
        "xverse": "0.2.0",
    },
    "ML & MLOps": {
        "mlflow": "2.8.1",
        "xgboost": "1.7.6",
        "lightgbm": "4.1.0",
    },
    "API & Deployment": {
        "fastapi": "0.104.1",
        "uvicorn[standard]": "0.24.0",
        "pydantic": "2.5.0",
    },
    "Testing & Quality": {
        "pytest": "7.4.3",
        "black": "23.11.0",
        "flake8": "6.1.0",
    },
    "Data Processing": {
        "pyarrow": "14.0.1",
    },
}

requirements_content = "\n\n".join(
    "\n".join(
        [f"# {heading}"]
        + [f"{package}=={version}" for package, version in pins.items()]
    )
    for heading, pins in requirement_groups.items()
) + "\n"

# Replace whatever requirements.txt currently holds with the rendered pins.
with open('requirements.txt', 'w', encoding='utf-8') as requirements_file:
    requirements_file.write(requirements_content)

print("requirements.txt created successfully!")

requirements.txt created successfully!


In [6]:
# README.md text. The "Project Structure" section lists the folders and files
# this project scaffold consists of, in place of the former placeholder line.
readme_content = """# Credit Risk Probability Model for Alternative Data

## Project Overview
This project develops a credit risk scoring model for Bati Bank's buy-now-pay-later service using e-commerce transaction data. The model transforms behavioral data into predictive risk signals to assess customer creditworthiness.

## Credit Scoring Business Understanding

### 1. Basel II Accord and Model Interpretability
The Basel II Capital Accord emphasizes rigorous risk measurement and adequate capital allocation based on credit risk exposure. This influences our model development in three key ways:

- **Regulatory Compliance**: Banks must demonstrate that their risk models are robust, validated, and conceptually sound. Our model needs clear documentation of methodology and assumptions.

- **Capital Requirements**: Accurate risk probability estimates directly impact the amount of capital Bati Bank must hold. Underestimation could lead to insufficient capital buffers.

- **Explainability**: Regulators require models to be interpretable for validation purposes. We must be able to explain why a customer receives a particular risk score, which is crucial for compliance and customer communication.

### 2. Proxy Variable Necessity and Risks
Since we lack direct loan performance data, creating a proxy variable is essential:

**Why necessary:**
- No historical loan default data exists for e-commerce customers
- RFM (Recency, Frequency, Monetary) patterns serve as behavioral proxies for creditworthiness
- Disengaged customers (low frequency, low spending) may correlate with higher default risk

**Potential business risks:**
- **Misclassification Risk**: Customers labeled as high-risk based on shopping behavior may actually be creditworthy
- **Opportunity Cost**: Overly conservative models could reject profitable customers
- **Discrimination Risk**: Behavioral proxies might unintentionally exclude certain customer segments
- **Validation Challenges**: Proxy-based models are harder to validate against actual loan performance

### 3. Model Selection Trade-offs in Financial Context

**Simple, Interpretable Models (Logistic Regression with WoE):**
- **Advantages**: 
  - Easily explainable to regulators and business stakeholders
  - Linear relationships make risk factor contributions transparent
  - Well-established in credit scoring with proven regulatory acceptance
  - Lower computational requirements

- **Disadvantages**:
  - May capture fewer complex, non-linear patterns
  - Lower predictive power if relationships are non-linear
  - Requires careful feature engineering and transformation

**Complex, High-Performance Models (Gradient Boosting):**
- **Advantages**:
  - Higher predictive accuracy through complex pattern recognition
  - Automatic feature interaction detection
  - Better handling of non-linear relationships

- **Disadvantages**:
  - "Black box" nature challenges regulatory compliance
  - Harder to explain individual predictions
  - Potential overfitting without careful regularization
  - Higher computational costs and maintenance complexity

**Recommended Approach**: Start with interpretable models for regulatory acceptance, then explore ensemble methods if interpretability requirements allow and performance gains justify the complexity. Consider using SHAP values for model explainability with complex models.

## Project Structure
```text
.
├── .github/
│   └── workflows/
│       └── ci.yml
├── data/
│   ├── raw/
│   └── processed/
├── notebooks/
│   └── eda.ipynb
├── src/
│   ├── __init__.py
│   ├── data_processing.py
│   ├── train.py
│   ├── predict.py
│   └── api/
│       ├── main.py
│       └── pydantic_models.py
├── tests/
│   └── test_data_processing.py
├── Dockerfile
├── docker-compose.yml
├── requirements.txt
├── .gitignore
└── README.md
```

## Setup Instructions
1. Clone the repository
2. Install requirements: `pip install -r requirements.txt`
3. Download data from Kaggle to `data/raw/`
4. Run EDA: `jupyter notebook notebooks/eda.ipynb`
5. Train model: `python src/train.py`
"""

# Write to README.md (replaces the current file contents)
with open('README.md', 'w', encoding='utf-8') as f:
    f.write(readme_content)

print("README.md created successfully!")

README.md created successfully!


In [7]:
import os

def list_files(startpath, exclude_dirs=()):
    """Print an indented tree of the directories and files under ``startpath``.

    Every directory is printed as ``name/``; the files it contains follow one
    indent step (four spaces) deeper. Output goes to stdout and nothing is
    returned. Entries appear in the order ``os.walk`` yields them.

    Args:
        startpath: Directory to walk. A trailing path separator is accepted.
        exclude_dirs: Directory names (not paths) that are neither printed nor
            descended into, e.g. ``('.git',)``. The default excludes nothing.
    """
    startpath = os.fspath(startpath)
    skipped = set(exclude_dirs)
    for root, dirs, files in os.walk(startpath):
        if skipped:
            # Editing the list in place is what stops os.walk from descending.
            dirs[:] = [name for name in dirs if name not in skipped]
        # Depth is taken from the path relative to the start directory. Plain
        # substring removal miscounts when startpath ends with a separator or
        # when its text recurs deeper in the tree.
        relative = os.path.relpath(root, startpath)
        level = 0 if relative == os.curdir else relative.count(os.sep) + 1
        indent = ' ' * 4 * level
        # normpath drops a trailing separator so 'src/' is labelled 'src/'
        # rather than an empty name.
        label = os.path.basename(os.path.normpath(root))
        print(f'{indent}{label}/')
        subindent = ' ' * 4 * (level + 1)
        for file in files:
            print(f'{subindent}{file}')

# Dump the tree rooted at the current working directory (os.curdir is '.').
print("Current project structure:")
list_files(os.curdir)

Current project structure:
./
    .gitignore
    check_notebook.py
    docker-compose.yml
    Dockerfile
    fix_notebook.py
    project_setup.ipynb
    README.md
    requirements.txt
    .git/
        COMMIT_EDITMSG
        config
        description
        HEAD
        index
        hooks/
            applypatch-msg.sample
            commit-msg.sample
            fsmonitor-watchman.sample
            post-update.sample
            pre-applypatch.sample
            pre-commit.sample
            pre-merge-commit.sample
            pre-push.sample
            pre-rebase.sample
            pre-receive.sample
            prepare-commit-msg.sample
            push-to-checkout.sample
            sendemail-validate.sample
            update.sample
        info/
            exclude
        logs/
            HEAD
            refs/
                heads/
                    main
                remotes/
                    origin/
                        main
        objects/
            00/
