# Task 2: Data Version Control (DVC) Setup

## Objective
Establish a reproducible and auditable data pipeline using Data Version Control (DVC), a standard practice in regulated industries.

## Why DVC?
In finance and insurance, we must be able to reproduce any analysis or model result at any time for auditing, regulatory compliance, or debugging. DVC ensures our data inputs are as rigorously version-controlled as our code.

## Tasks:
1. Install DVC
2. Initialize DVC in the project
3. Set up local remote storage
4. Add data files to DVC tracking
5. Commit changes to version control
6. Push data to local remote storage


In [None]:
# Import necessary libraries
import os
import subprocess
import sys
from pathlib import Path

# Set project root
# Resolve the root only once per kernel session: this cell changes the working
# directory, so recomputing from the cwd on a re-run would climb two more
# levels above the real project root every time the cell is executed.
if 'project_root' not in globals():
    # Assumes the kernel starts two directory levels below the project root.
    project_root = Path().absolute().parent.parent
os.chdir(project_root)  # later cells use paths relative to the project root
print(f"Project root: {project_root}")


## Step 1: Install DVC


In [None]:
# Probe for the DVC command-line tool; install it with pip when the probe fails
try:
    result = subprocess.run(
        ['dvc', '--version'],
        capture_output=True,
        text=True,
        check=True,
    )
    installed_version = result.stdout.strip()
    print(f"✓ DVC is already installed: {installed_version}")
except (FileNotFoundError, subprocess.CalledProcessError):
    # Either the executable is missing or it exited with a non-zero status.
    print("DVC not found. Installing DVC...")
    pip_install = [sys.executable, '-m', 'pip', 'install', 'dvc']
    subprocess.run(pip_install, check=True)
    print("✓ DVC installed successfully!")


## Step 2: Initialize DVC


In [None]:
# Run `dvc init` unless a .dvc directory is already present in the cwd
if not os.path.exists('.dvc'):
    result = subprocess.run(['dvc', 'init'], capture_output=True, text=True)
    if result.returncode != 0:
        # Surface DVC's own error text instead of raising.
        print("✗ Error initializing DVC:")
        print(result.stderr)
    else:
        print("✓ DVC initialized successfully!")
        print(result.stdout)
else:
    print("⚠ DVC is already initialized.")
    print("If you want to reinitialize, delete the .dvc directory first.")


## Step 3: Set Up Local Remote Storage


In [None]:
# Ensure the directory that will back the local DVC remote exists
storage_path = project_root.joinpath('dvc_storage')
# exist_ok=True keeps this cell safe to re-run.
storage_path.mkdir(exist_ok=True)
print(f"✓ Storage directory created: {storage_path}")
print(f"  Absolute path: {storage_path.absolute()}")


In [None]:
# Show the remotes DVC currently knows about
result = subprocess.run(['dvc', 'remote', 'list'], capture_output=True, text=True)
print("Current DVC remotes:")
if result.stdout:
    print(result.stdout)
else:
    # Empty output means no remote has been configured.
    print("No remotes configured yet.")


In [None]:
# Add local remote storage (or repoint it if it is already configured)
storage_abs_path = str(storage_path.absolute())

# Check if remote already exists.
# `dvc remote list` prints one remote per line with the name as the first
# whitespace-separated field, followed by its URL. Compare parsed names
# exactly: a substring test over the whole output would also match a remote
# such as 'localstorage2' or any remote whose URL contains 'localstorage'.
result = subprocess.run(['dvc', 'remote', 'list'], capture_output=True, text=True)
remote_names = {
    line.split()[0] for line in result.stdout.splitlines() if line.strip()
}

if 'localstorage' in remote_names:
    print("⚠ Remote 'localstorage' already exists. Updating...")
    result = subprocess.run(['dvc', 'remote', 'modify', 'localstorage', 'url', storage_abs_path],
                          capture_output=True, text=True)
    if result.returncode == 0:
        print("✓ Remote storage updated successfully!")
    else:
        print("✗ Error updating remote:")
        print(result.stderr)
else:
    print("Adding local remote storage...")
    # -d also makes this remote the default one.
    result = subprocess.run(['dvc', 'remote', 'add', '-d', 'localstorage', storage_abs_path],
                          capture_output=True, text=True)
    if result.returncode == 0:
        print("✓ Local remote storage added successfully!")
        print(result.stdout)
    else:
        print("✗ Error adding remote:")
        print(result.stderr)


In [None]:
# Print the configured remotes, then the default remote if DVC reports one
result = subprocess.run(['dvc', 'remote', 'list'], capture_output=True, text=True)
print("Configured DVC remotes:")
print(result.stdout)

# Show default remote (skipped silently when the command fails)
result = subprocess.run(['dvc', 'remote', 'default'], capture_output=True, text=True)
if not result.returncode:
    default_remote = result.stdout.strip()
    print(f"\nDefault remote: {default_remote}")


## Step 4: Add Data Files to DVC

**Note:** Before running this step, make sure your data file is in the `data/raw/` directory.


In [None]:
# Report each regular file in data/raw/ with its size and DVC tracking state
data_raw_path = project_root / 'data' / 'raw'
if not data_raw_path.exists():
    print("⚠ data/raw/ directory does not exist yet.")
    print("Create it and add your data files before proceeding.")
else:
    data_files = list(data_raw_path.glob('*'))
    print("Files in data/raw/:")
    for entry in data_files:
        if not entry.is_file():
            continue  # sub-directories are not listed
        size_mb = entry.stat().st_size / (1024 * 1024)
        print(f"  - {entry.name} ({size_mb:.2f} MB)")
        # A pointer file named "<data file name>.dvc" next to the data file
        # is treated as the sign that the file is already tracked.
        dvc_file = entry.with_suffix(entry.suffix + '.dvc')
        if dvc_file.exists():
            print("    ✓ Already tracked by DVC")


In [None]:
# Add data file to DVC tracking
# Update this path to match your actual data file
data_file_path = 'data/raw/insurance_data.csv'  # Update this path

# Paths that must be committed to git after `dvc add`.
# DVC writes the pointer file next to the data file and adds the ignore entry
# to the .gitignore in that same directory (not the one in the project root),
# so the suggested git command has to name that file.
dvc_pointer_path = f"{data_file_path}.dvc"
gitignore_path = (Path(data_file_path).parent / '.gitignore').as_posix()

if os.path.exists(data_file_path):
    print(f"Adding {data_file_path} to DVC...")
    result = subprocess.run(['dvc', 'add', data_file_path],
                          capture_output=True, text=True)
    if result.returncode == 0:
        print("✓ Data file added to DVC successfully!")
        print(result.stdout)
        print("\nNext steps:")
        print("1. The .dvc file has been created")
        print(f"2. The data file has been added to {gitignore_path}")
        print("3. Commit the .dvc file to git:")
        print(f"   git add {dvc_pointer_path} {gitignore_path}")
        print("   git commit -m 'Add data file with DVC'")
    else:
        print("✗ Error adding file to DVC:")
        print(result.stderr)
else:
    print(f"⚠ File not found: {data_file_path}")
    print("Please update the data_file_path variable with the correct path to your data file.")
    print("\nExample command to add a file manually:")
    print("  dvc add data/raw/your_data_file.csv")


## Step 5: Commit Changes to Version Control


In [None]:
# Capture `git status` and echo its output under a heading
status_cmd = ['git', 'status']
result = subprocess.run(status_cmd, capture_output=True, text=True)
print("Git status:", result.stdout, sep="\n")


In [None]:
# Check for .dvc files that need to be committed
import glob


def git_add_targets(dvc_paths):
    """Return the paths to stage in git for the given .dvc pointer files.

    The result lists every pointer file first, followed by the `.gitignore`
    located in each pointer file's directory (deduplicated, in first-seen
    order). All paths use forward slashes.
    """
    pointers = [Path(p).as_posix() for p in dvc_paths]
    # dict.fromkeys drops duplicate directories while keeping their order.
    ignore_files = list(dict.fromkeys(
        (Path(p).parent / '.gitignore').as_posix() for p in dvc_paths
    ))
    return pointers + ignore_files


# The search is recursive, so pointer files may live in sub-directories such
# as data/raw/. A shell pattern like `*.dvc` only matches the project root,
# so the suggested command names the files that were actually found.
dvc_files = glob.glob('**/*.dvc', recursive=True)
if dvc_files:
    print("DVC files found:")
    for dvc_file in dvc_files:
        print(f"  - {dvc_file}")
    print("\nTo commit these files, run:")
    print(f"  git add {' '.join(git_add_targets(dvc_files))}")
    print("  git commit -m 'Add data files with DVC'")
else:
    print("No .dvc files found. Make sure you've added data files using 'dvc add' first.")


## Step 6: Push Data to Local Remote Storage


In [None]:
# Upload DVC-tracked data to the configured remote and report the outcome
print("Pushing data to local remote storage...")
result = subprocess.run(['dvc', 'push'], capture_output=True, text=True)
push_failed = result.returncode != 0
if push_failed:
    print("✗ Error pushing data:")
    print(result.stderr)
    # Remind the user of the steps that should come before a push.
    print("\nNote: Make sure you have:")
    print("1. Added data files using 'dvc add'")
    print("2. Committed the .dvc files to git")
else:
    print("✓ Data pushed to local storage successfully!")
    print(result.stdout)


## Step 7: Verify DVC Setup


In [None]:
# Check DVC status
# Return codes are checked so that a failing command is reported as an error
# instead of being mistaken for "nothing to report" (empty stdout).
print("DVC status:")
result = subprocess.run(['dvc', 'status'], capture_output=True, text=True)
if result.returncode != 0:
    print("✗ Error checking DVC status:")
    print(result.stderr)
else:
    print(result.stdout if result.stdout else "No changes detected.")

# List tracked files
# Plain `dvc list .` shows every file in the repository, including files that
# are only tracked by git. --dvc-only restricts the listing to DVC-tracked
# data and -R descends into sub-directories such as data/raw/.
print("\nFiles tracked by DVC:")
result = subprocess.run(['dvc', 'list', '.', '--dvc-only', '-R'],
                        capture_output=True, text=True)
if result.returncode != 0:
    print("✗ Error listing tracked files:")
    print(result.stderr)
else:
    print(result.stdout if result.stdout else "No files tracked yet.")


## Summary

### DVC Setup Complete! ✓

**What we've accomplished:**
1. ✓ Installed DVC
2. ✓ Initialized DVC in the project
3. ✓ Created local storage directory
4. ✓ Configured local remote storage
5. ✓ Added data files to DVC tracking
6. ✓ Committed changes to version control
7. ✓ Pushed data to local remote storage

### Next Steps:
- Continue with Task 3: A/B Hypothesis Testing
- Use `dvc pull` to retrieve data files when needed
- Use `dvc status` to check for changes
- Use `dvc diff` to compare data versions

### Useful DVC Commands:
- `dvc add <file>` - Add a file to DVC tracking
- `dvc push` - Push data to remote storage
- `dvc pull` - Pull data from remote storage
- `dvc status` - Check status of tracked files
- `dvc diff` - Compare data versions
- `dvc list . --dvc-only -R` - List files tracked by DVC (plain `dvc list <repo>` also shows Git-tracked files)
