# 🔧 Environment & Dataset Setup Test
## E-commerce Customer Churn Prediction

**Purpose:**
- Verify all libraries are installed correctly
- Test data loading capabilities
- Check dataset integrity
- Validate file system structure
- Ensure visualization tools work

**Run this notebook first before starting the project!**

---

## 1. System Information

In [None]:
import sys
import platform
from datetime import datetime

# Print the interpreter version, OS platform string, processor name, and the
# current local time, framed by 70-character rules.
rule = "=" * 70

print(rule)
print("SYSTEM INFORMATION")
print(rule)
# The tuple is evaluated left to right, so the timestamp is taken last,
# right before the facts are printed.
for label, value in (
    ("Python Version", sys.version),
    ("Platform", platform.platform()),
    ("Processor", platform.processor()),
    ("Date/Time", datetime.now().strftime('%Y-%m-%d %H:%M:%S')),
):
    print(f"{label}: {value}")
print(rule)

## 2. Test Library Imports

In [None]:
# Try to import every library the project needs and report, per library,
# whether the import worked and which version was found.
#
# Results are collected in `import_results` (one dict per library with the
# keys 'Library', 'Status', and 'Version'); `success_count` and `total_count`
# summarize them.
import importlib

print("\n" + "="*70)
print("TESTING LIBRARY IMPORTS")
print("="*70)

# (module to import, alias the module is bound to in this namespace)
libraries = [
    ('pandas', 'pd'),
    ('numpy', 'np'),
    ('matplotlib.pyplot', 'plt'),
    ('seaborn', 'sns'),
    ('sklearn', 'sklearn'),
    ('tensorflow', 'tf'),
    ('plotly.express', 'px'),
]

import_results = []

for lib_name, alias in libraries:
    try:
        # importlib replaces exec()/eval() on hand-built source strings.
        module = importlib.import_module(lib_name)
    except Exception as e:
        # A broken installation can fail with errors other than ImportError
        # (for example a binary-compatibility error raised at import time).
        # Record those as failures too instead of aborting the whole check.
        error = str(e) if isinstance(e, ImportError) else f"{type(e).__name__}: {e}"
        import_results.append({
            'Library': lib_name,
            'Status': '‚ùå FAILED',
            'Version': error
        })
        print(f"‚ùå {lib_name:30s} | Error: {error}")
        continue

    # Keep the side effect of `import <lib_name> as <alias>`: bind the alias
    # in the current namespace.
    globals()[alias] = module

    # Get version if available. Submodules such as matplotlib.pyplot carry no
    # __version__ of their own, so fall back to the top-level package, which
    # is already loaded at this point.
    version = getattr(module, '__version__', None)
    if version is None:
        top_level = importlib.import_module(lib_name.partition('.')[0])
        version = getattr(top_level, '__version__', "N/A")

    import_results.append({
        'Library': lib_name,
        'Status': '‚úÖ SUCCESS',
        'Version': version
    })
    print(f"‚úÖ {lib_name:30s} | Version: {version}")

# Summary
success_count = sum(1 for r in import_results if 'SUCCESS' in r['Status'])
total_count = len(import_results)

print(f"\n{'='*70}")
if success_count == total_count:
    print(f"‚úÖ ALL LIBRARIES IMPORTED SUCCESSFULLY ({success_count}/{total_count})")
else:
    print(f"‚ö†Ô∏è  SOME LIBRARIES FAILED ({success_count}/{total_count})")
print(f"{'='*70}")

## 3. Test Data Science Libraries

In [None]:
# Import core libraries
import pandas as pd
import numpy as np

# Smoke-test pandas and NumPy: build a small DataFrame from random data,
# print its basic facts, then run two reductions on a fixed array.
banner = "=" * 70

print("\n" + banner)
print("TESTING DATA MANIPULATION")
print(banner)

# Create test DataFrame: ten rows of random ints, random floats, and labels.
test_data = {
    'A': np.random.randint(1, 100, 10),
    'B': np.random.random(10),
    'C': [f'Category{i}' for i in range(10)],
}
df_test = pd.DataFrame(test_data)

print("‚úÖ Created test DataFrame:")
print(df_test.head())

print("\n‚úÖ DataFrame info:")
for label, value in (
    ("Shape", df_test.shape),
    ("Columns", df_test.columns.tolist()),
    ("Data types", df_test.dtypes.tolist()),
):
    print(f"   {label}: {value}")

print("\n‚úÖ NumPy array operations:")
arr = np.array([1, 2, 3, 4, 5])
for label, value in (
    ("Array", arr),
    ("Mean", arr.mean()),
    ("Sum", arr.sum()),
):
    print(f"   {label}: {value}")

print("\n" + banner)
print("‚úÖ DATA MANIPULATION TEST PASSED")
print(banner)

## 4. Test Visualization

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Smoke-test the plotting stack: draw a two-panel matplotlib figure (a sine
# curve and a bar chart), then a seaborn histogram with a KDE overlay.
banner = "=" * 70

print("\n" + banner)
print("TESTING VISUALIZATION")
print(banner)

# Set style
sns.set_style('whitegrid')

# Create test plot: one row, two panels.
fig, axes = plt.subplots(1, 2, figsize=(12, 4))
line_ax, bar_ax = axes

# Plot 1: Line plot of sin(x) on [0, 10]
x = np.linspace(0, 10, 100)
y = np.sin(x)
line_ax.plot(x, y, color='blue', linewidth=2)
line_ax.set_title('Test Plot 1: Sine Wave', fontweight='bold')
line_ax.set_xlabel('X')
line_ax.set_ylabel('sin(X)')
line_ax.grid(True, alpha=0.3)

# Plot 2: Bar plot of five fixed category values
categories = ['A', 'B', 'C', 'D', 'E']
values = [23, 45, 56, 78, 32]
bar_ax.bar(categories, values, color='steelblue', edgecolor='black')
bar_ax.set_title('Test Plot 2: Bar Chart', fontweight='bold')
bar_ax.set_xlabel('Category')
bar_ax.set_ylabel('Value')
bar_ax.grid(True, alpha=0.3, axis='y')

plt.tight_layout()
plt.show()

print("‚úÖ Matplotlib plots rendered successfully!")

# Test seaborn: histogram of 100 standard-normal samples with a KDE curve
plt.figure(figsize=(8, 4))
data = np.random.randn(100)
sns.histplot(data, kde=True, color='coral')
plt.title('Test Plot 3: Seaborn Histogram with KDE', fontweight='bold')
plt.xlabel('Value')
plt.ylabel('Frequency')
plt.grid(True, alpha=0.3)
plt.show()

print("‚úÖ Seaborn plots rendered successfully!")

print("\n" + banner)
print("‚úÖ VISUALIZATION TEST PASSED")
print(banner)

## 5. Test Machine Learning Libraries

In [None]:
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Smoke-test three scikit-learn utilities on random data: StandardScaler,
# LabelEncoder, and train_test_split.
banner = "=" * 70

print("\n" + banner)
print("TESTING MACHINE LEARNING LIBRARIES")
print(banner)

# Test StandardScaler: scale a random 100x3 matrix and print the overall
# mean and standard deviation of the result.
data = np.random.randn(100, 3)
scaler = StandardScaler()
scaled_data = scaler.fit_transform(data)
print(f"‚úÖ StandardScaler: Mean ‚âà {scaled_data.mean():.4f}, Std ‚âà {scaled_data.std():.4f}")

# Test LabelEncoder: map string labels to integer codes.
labels = ['cat', 'dog', 'cat', 'bird', 'dog', 'bird']
le = LabelEncoder()
encoded = le.fit_transform(labels)
print(f"‚úÖ LabelEncoder: {labels} ‚Üí {encoded.tolist()}")

# Test train_test_split: 80/20 split of 100 random samples with 5 features
# and binary targets; the split itself is seeded with random_state=42.
X = np.random.randn(100, 5)
y = np.random.randint(0, 2, 100)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
print(f"‚úÖ train_test_split: Train={X_train.shape}, Test={X_test.shape}")

print("\n" + banner)
print("‚úÖ MACHINE LEARNING LIBRARIES TEST PASSED")
print(banner)

## 6. Test TensorFlow/Keras

In [None]:
import tensorflow as tf
from tensorflow import keras

# Smoke-test TensorFlow/Keras: print the installed versions, list any GPUs
# TensorFlow can see, then build and compile a tiny binary classifier to
# confirm that model construction works.
print("\n" + "="*70)
print("TESTING TENSORFLOW/KERAS")
print("="*70)

print(f"‚úÖ TensorFlow version: {tf.__version__}")
# Guarded lookup so a missing __version__ attribute on the keras module
# cannot abort the rest of the test.
print(f"‚úÖ Keras version: {getattr(keras, '__version__', 'N/A')}")

# Check GPU availability
gpus = tf.config.list_physical_devices('GPU')
if gpus:
    print(f"‚úÖ GPU Available: {len(gpus)} GPU(s) detected")
    for gpu in gpus:
        print(f"   - {gpu}")
else:
    print("‚ÑπÔ∏è  GPU Not Available (CPU mode)")

# Create simple model test: 5 input features -> 10 ReLU units -> 1 sigmoid
# output. The input shape is declared with an explicit Input object, the
# form recommended for Sequential models; passing `input_shape=` to the
# first layer triggers a deprecation warning under Keras 3.
model = keras.Sequential([
    keras.Input(shape=(5,)),
    keras.layers.Dense(10, activation='relu'),
    keras.layers.Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
print(f"\n‚úÖ Created test model:")
model.summary()

print(f"\n{'='*70}")
print("‚úÖ TENSORFLOW/KERAS TEST PASSED")
print(f"{'='*70}")

## 7. Check File System Structure

In [None]:
import os

# Verify that the project's directory layout exists, creating any directory
# that is missing. Paths are relative to the current working directory.
print("\n" + "="*70)
print("CHECKING FILE SYSTEM STRUCTURE")
print("="*70)

# Define expected directories
expected_dirs = [
    '../data',
    '../data/raw',
    '../data/processed',
    '../data/model',
    '../notebooks',
    '../src',
]

print("\nChecking directories:")
for dir_path in expected_dirs:
    # isdir (not exists): a regular file sitting at the path must not be
    # reported as an existing directory.
    exists = os.path.isdir(dir_path)
    status = "‚úÖ" if exists else "‚ö†Ô∏è "
    print(f"{status} {dir_path:30s} - {'EXISTS' if exists else 'NOT FOUND'}")

    # Create if doesn't exist
    if not exists:
        try:
            os.makedirs(dir_path, exist_ok=True)
            print(f"   ‚Üí Created directory: {dir_path}")
        except OSError as e:
            # Covers permission problems and a non-directory occupying the
            # path; report it and continue with the remaining directories.
            print(f"   ‚Üí Error creating directory: {e}")

print(f"\n{'='*70}")
print("‚úÖ FILE SYSTEM CHECK COMPLETE")
print(f"{'='*70}")

## 8. Check Dataset

In [None]:
# Look for CSV files in the raw-data directory, list them with their sizes,
# and load one as a sanity check (shape, memory, columns, dtypes, missing
# values). Prints download instructions when no CSV is present.
import os

import pandas as pd

print("\n" + "="*70)
print("CHECKING DATASET")
print("="*70)

# Check for dataset in data/raw/
data_path = '../data/raw/'
# File name the download instructions below ask for, and the one the final
# summary cell checks; it is loaded in preference to any other CSV.
expected_csv = 'ecommerce_data.csv'

# isdir (not exists): os.listdir below requires a directory.
if os.path.isdir(data_path):
    files = os.listdir(data_path)
    # os.listdir returns entries in arbitrary order; sort so the listing and
    # the fallback choice are deterministic. Match the extension
    # case-insensitively so e.g. "DATA.CSV" is found too.
    csv_files = sorted(f for f in files if f.lower().endswith('.csv'))

    if csv_files:
        print(f"\n‚úÖ Found {len(csv_files)} CSV file(s):")
        for file in csv_files:
            file_path = os.path.join(data_path, file)
            size_mb = os.path.getsize(file_path) / (1024 * 1024)
            print(f"   - {file:40s} ({size_mb:.2f} MB)")

        # Load the expected dataset when present, otherwise the first CSV.
        dataset_file = expected_csv if expected_csv in csv_files else csv_files[0]
        print(f"\nüìä Loading dataset: {dataset_file}")
        try:
            df = pd.read_csv(os.path.join(data_path, dataset_file))
        except Exception as e:
            # Reading can fail in many ways (parse errors, encoding, I/O);
            # this is a diagnostic, so report the failure and carry on.
            print(f"\n‚ùå Error loading dataset: {e}")
        else:
            print(f"\n‚úÖ Dataset loaded successfully!")
            print(f"   Shape: {df.shape[0]:,} rows √ó {df.shape[1]} columns")
            print(f"   Memory: {df.memory_usage(deep=True).sum() / (1024**2):.2f} MB")
            print(f"\n   Columns: {df.columns.tolist()}")
            print(f"\n   First 3 rows:")
            try:
                display(df.head(3))
            except NameError:
                # `display` is injected by IPython/Jupyter; fall back to a
                # plain print when run as a regular script so the missing
                # name is not mistaken for a dataset problem.
                print(df.head(3))

            print(f"\n   Data types:")
            print(df.dtypes)

            print(f"\n   Missing values:")
            missing = df.isnull().sum()
            if missing.sum() > 0:
                print(missing[missing > 0])
            else:
                print("   ‚úÖ No missing values!")
    else:
        print(f"\n‚ö†Ô∏è  No CSV files found in {data_path}")
        print(f"\nüì• Please download dataset:")
        print(f"   1. Download from Kaggle or use converted CSV")
        print(f"   2. Place in: {data_path}")
        print(f"   3. Filename: ecommerce_data.csv")
else:
    print(f"\n‚ùå Data directory not found: {data_path}")

print(f"\n{'='*70}")
print("‚úÖ DATASET CHECK COMPLETE")
print(f"{'='*70}")

## 9. System Resources Check

In [None]:
import psutil

# Report CPU, memory, and disk figures via psutil, then print a simple
# recommendation for each based on fixed thresholds (2 GB of available
# memory, 5 GB of free disk, 2 CPU cores).
BYTES_PER_GB = 1024**3

print("\n" + "="*70)
print("SYSTEM RESOURCES")
print("="*70)

# CPU Info
# psutil.cpu_count() returns None when the count cannot be determined, so
# every use below has to tolerate None.
cpu_count = psutil.cpu_count(logical=True)
cpu_percent = psutil.cpu_percent(interval=1)  # measured over a 1-second window
print(f"\nüíª CPU:")
print(f"   Cores: {cpu_count if cpu_count is not None else 'unknown'}")
print(f"   Usage: {cpu_percent}%")

# Memory Info
memory = psutil.virtual_memory()
print(f"\nüß† Memory:")
print(f"   Total: {memory.total / BYTES_PER_GB:.2f} GB")
print(f"   Available: {memory.available / BYTES_PER_GB:.2f} GB")
print(f"   Used: {memory.percent}%")

# Disk Info
disk = psutil.disk_usage('/')
print(f"\nüíæ Disk:")
print(f"   Total: {disk.total / BYTES_PER_GB:.2f} GB")
print(f"   Free: {disk.free / BYTES_PER_GB:.2f} GB")
print(f"   Used: {disk.percent}%")

# Recommendations
print(f"\n{'='*70}")
print("üìä RECOMMENDATIONS:")
print(f"{'='*70}")

if memory.available / BYTES_PER_GB < 2:
    print("‚ö†Ô∏è  Low memory available. Consider closing other applications.")
else:
    print("‚úÖ Sufficient memory available for ML tasks")

if disk.free / BYTES_PER_GB < 5:
    print("‚ö†Ô∏è  Low disk space. Consider freeing up space.")
else:
    print("‚úÖ Sufficient disk space available")

if cpu_count is None:
    # Comparing None with an int would raise TypeError; say so explicitly.
    print("‚ö†Ô∏è  Could not determine the number of CPU cores.")
elif cpu_count < 2:
    print("‚ö†Ô∏è  Limited CPU cores. Training may be slow.")
else:
    print(f"‚úÖ {cpu_count} CPU cores available for parallel processing")

## 10. Final Setup Summary

In [None]:
# Print a per-component readiness table and the next steps.
#
# Component statuses are derived from what the earlier cells recorded rather
# than being hard-coded, so a failed import or a missing directory shows up
# here as FAILED and is listed under the action items:
#   * library groups  -> `import_results` from the library-import cell
#   * file system     -> the expected project directories on disk
#   * dataset         -> presence of ../data/raw/ecommerce_data.csv
import os
from datetime import datetime

print("\n" + "="*70)
print("üéâ SETUP TEST SUMMARY")
print("="*70)

_READY = "‚úÖ READY"
_FAILED = "‚ùå FAILED"
_PENDING = "‚ö†Ô∏è  PENDING"

# Library name -> True/False import outcome. Empty when the library-import
# cell has not been run in this session.
_import_outcomes = {
    r['Library']: 'SUCCESS' in r['Status']
    for r in globals().get('import_results', [])
}


def _libraries_status(*names):
    """Return the status string for a group of libraries.

    FAILED if any library in the group failed to import, READY if all of
    them imported, and PENDING if at least one was never checked.
    """
    outcomes = [_import_outcomes.get(name) for name in names]
    if any(outcome is False for outcome in outcomes):
        return _FAILED
    if all(outcomes):
        return _READY
    return _PENDING


# Directories to verify: reuse the list from the file-system cell when it
# has been run, otherwise fall back to the same layout.
_required_dirs = globals().get('expected_dirs', [
    '../data',
    '../data/raw',
    '../data/processed',
    '../data/model',
    '../notebooks',
    '../src',
])
_dirs_status = _READY if all(os.path.isdir(d) for d in _required_dirs) else _FAILED

summary = [
    # This cell is executing, so the interpreter itself works.
    ("Python Environment", _READY),
    ("Core Libraries", _libraries_status('pandas', 'numpy')),
    ("Visualization Tools", _libraries_status('matplotlib.pyplot', 'seaborn', 'plotly.express')),
    ("Machine Learning Libraries", _libraries_status('sklearn')),
    ("TensorFlow/Keras", _libraries_status('tensorflow')),
    ("File System Structure", _dirs_status),
]

# Check dataset status
dataset_status = _READY if os.path.exists('../data/raw/ecommerce_data.csv') else _PENDING
summary.append(("Dataset", dataset_status))

print("\nComponent Status:")
for component, status in summary:
    print(f"  {component:30s} | {status}")

all_ready = all("READY" in status for _, status in summary)

print(f"\n{'='*70}")
if all_ready:
    print("‚úÖ ALL SYSTEMS GO! Ready to start the project!")
    print("\nüìö Next Steps:")
    print("   1. Run: 01_data_preprocessing.ipynb")
    print("   2. Run: 02_exploratory_data_analysis.ipynb")
    print("   3. Continue with model building")
else:
    print("‚ö†Ô∏è  SOME COMPONENTS NEED ATTENTION")
    print("\nüìù Action Items:")
    for component, status in summary:
        if "PENDING" in status or "FAILED" in status:
            print(f"   - Fix: {component}")

print(f"{'='*70}")
print(f"\nTest completed at: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
print("\nüöÄ Happy ML Engineering!")