In [2]:
# Cell 1: Setup and Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from pathlib import Path
import psutil
import gc
import json
from datetime import datetime
import sys
import os

# Addding src to path for imports
sys.path.append(os.path.join(os.getcwd(), '..', '..', 'src'))

# Configuring display and warnings
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 50)
warnings.filterwarnings('ignore')

# Setting plot style
plt.style.use('default')
sns.set_palette("husl")

print("Environment Setup Complete")
print(f"Python version: {sys.version}")
print(f"Pandas version: {pd.__version__}")
print(f"NumPy version: {np.__version__}")
print(f"Available memory: {psutil.virtual_memory().available / (1024**3):.2f} GB")

Environment Setup Complete
Python version: 3.13.7 (tags/v3.13.7:bcee1c3, Aug 14 2025, 14:15:11) [MSC v.1944 64 bit (AMD64)]
Pandas version: 2.3.2
NumPy version: 2.2.6
Available memory: 6.76 GB


In [3]:
# Cell 2: Defining Paths and Configuration
# Project paths
PROJECT_ROOT = Path.cwd().parent.parent
DATA_RAW_PATH = PROJECT_ROOT / "data" / "raw"
DATA_PROCESSED_PATH = PROJECT_ROOT / "data" / "processed"
RESULTS_PATH = PROJECT_ROOT / "results" / "data_exploration"

# Ensuring directories exist
DATA_PROCESSED_PATH.mkdir(parents=True, exist_ok=True)
RESULTS_PATH.mkdir(parents=True, exist_ok=True)

# Dataset file paths
CICFLOW_FILE = DATA_RAW_PATH / "CICFlowMeter_out.csv"
DATA_FILE = DATA_RAW_PATH / "Data.csv"
LABEL_FILE = DATA_RAW_PATH / "Label.csv"

# Configuration for analysis
ANALYSIS_CONFIG = {
    'memory_limit_gb': 12,  # Conservative limit for 16GB system
    'chunk_size': 10000,    # For chunked processing
    'sample_size': 50000,   # For quick analysis
    'random_state': 42
}

print("Configuration loaded:")
for key, value in ANALYSIS_CONFIG.items():
    print(f"  {key}: {value}")
print(f"\nData files expected at:")
print(f"  CICFlowMeter: {CICFLOW_FILE}")
print(f"  Data: {DATA_FILE}")
print(f"  Labels: {LABEL_FILE}")

Configuration loaded:
  memory_limit_gb: 12
  chunk_size: 10000
  sample_size: 50000
  random_state: 42

Data files expected at:
  CICFlowMeter: C:\Users\Ashutosh\Documents\Projects\NIDS\data\raw\CICFlowMeter_out.csv
  Data: C:\Users\Ashutosh\Documents\Projects\NIDS\data\raw\Data.csv
  Labels: C:\Users\Ashutosh\Documents\Projects\NIDS\data\raw\Label.csv


In [4]:
# Cell 3: File System Analysis
def analyze_file_system():
    """Analyze the file system and dataset files"""
    
    analysis = {
        'timestamp': datetime.now().isoformat(),
        'files': {},
        'system_info': {
            'total_memory_gb': psutil.virtual_memory().total / (1024**3),
            'available_memory_gb': psutil.virtual_memory().available / (1024**3),
            'cpu_count': psutil.cpu_count(),
            'disk_usage_gb': psutil.disk_usage('.').free / (1024**3)
        }
    }
    
    # Checking each file
    files_to_check = {
        'cicflow': CICFLOW_FILE,
        'data': DATA_FILE,
        'labels': LABEL_FILE
    }
    
    for name, filepath in files_to_check.items():
        if filepath.exists():
            file_size = filepath.stat().st_size
            analysis['files'][name] = {
                'exists': True,
                'size_bytes': file_size,
                'size_mb': file_size / (1024**2),
                'size_gb': file_size / (1024**3),
                'path': str(filepath)
            }
        else:
            analysis['files'][name] = {
                'exists': False,
                'path': str(filepath)
            }
    
    return analysis

# Performing file system analysis
fs_analysis = analyze_file_system()

print("=== File System Analysis ===")
print(f"Analysis timestamp: {fs_analysis['timestamp']}")
print(f"\nSystem Information:")
for key, value in fs_analysis['system_info'].items():
    if 'gb' in key:
        print(f"  {key}: {value:.2f} GB")
    else:
        print(f"  {key}: {value}")

print(f"\nDataset Files:")
for name, info in fs_analysis['files'].items():
    if info['exists']:
        print(f"  {name.upper()}:")
        print(f"    Size: {info['size_gb']:.2f} GB ({info['size_mb']:.1f} MB)")
        print(f"    Path: {info['path']}")
    else:
        print(f"  {name.upper()}: NOT FOUND at {info['path']}")

=== File System Analysis ===
Analysis timestamp: 2025-09-28T15:38:53.794335

System Information:
  total_memory_gb: 15.35 GB
  available_memory_gb: 6.80 GB
  cpu_count: 16
  disk_usage_gb: 76.44 GB

Dataset Files:
  CICFLOW:
    Size: 1.78 GB (1825.6 MB)
    Path: C:\Users\Ashutosh\Documents\Projects\NIDS\data\raw\CICFlowMeter_out.csv
  DATA:
    Size: 0.18 GB (187.2 MB)
    Path: C:\Users\Ashutosh\Documents\Projects\NIDS\data\raw\Data.csv
  LABELS:
    Size: 0.00 GB (0.9 MB)
    Path: C:\Users\Ashutosh\Documents\Projects\NIDS\data\raw\Label.csv


In [5]:
# Cell 4: Memory-Safe Data Loading Functions
def get_memory_usage():
    """Get current memory usage"""
    process = psutil.Process()
    memory_info = process.memory_info()
    return {
        'rss_gb': memory_info.rss / (1024**3),
        'vms_gb': memory_info.vms / (1024**3),
        'available_gb': psutil.virtual_memory().available / (1024**3)
    }

def load_file_info(filepath, nrows=5):
    """Load basic information about a CSV file without loading full data"""
    try:
        # Reading just the header and a few rows
        sample_df = pd.read_csv(filepath, nrows=nrows, low_memory=False)
        
        # Getting total row count (memory efficient)
        total_rows = sum(1 for line in open(filepath)) - 1  # Subtract header
        
        info = {
            'columns': list(sample_df.columns),
            'total_rows': total_rows,
            'total_columns': len(sample_df.columns),
            'dtypes': sample_df.dtypes.to_dict(),
            'sample_data': sample_df.head(3).to_dict(),
            'memory_usage_mb': 0,  # Will calculate later
            'estimated_full_size_gb': 0  # Will estimate
        }
        
        # Estimating memory usage
        memory_per_row = sample_df.memory_usage(deep=True).sum() / len(sample_df)
        estimated_total_memory = (memory_per_row * total_rows) / (1024**3)
        info['estimated_full_size_gb'] = estimated_total_memory
        
        return info, None
        
    except Exception as e:
        return None, str(e)

print("=== Memory-Safe File Analysis ===")
print(f"Initial memory usage: {get_memory_usage()['rss_gb']:.2f} GB")

# Analyzing each file
file_info = {}
for name, filepath in [('cicflow', CICFLOW_FILE), ('data', DATA_FILE), ('labels', LABEL_FILE)]:
    if filepath.exists():
        print(f"\nAnalyzing {name.upper()} file...")
        info, error = load_file_info(filepath)
        
        if error:
            print(f"  Error loading {name}: {error}")
            file_info[name] = {'error': error}
        else:
            file_info[name] = info
            print(f"  Rows: {info['total_rows']:,}")
            print(f"  Columns: {info['total_columns']}")
            print(f"  Estimated memory if fully loaded: {info['estimated_full_size_gb']:.2f} GB")
            
            # Checking if safe to load fully
            if info['estimated_full_size_gb'] < ANALYSIS_CONFIG['memory_limit_gb'] * 0.8:
                print(f"  ✓ Safe to load fully (< {ANALYSIS_CONFIG['memory_limit_gb']*0.8:.1f} GB)")
            else:
                print(f"  ⚠ Requires chunked processing (> {ANALYSIS_CONFIG['memory_limit_gb']*0.8:.1f} GB)")
    else:
        print(f"\n{name.upper()} file not found!")
        file_info[name] = {'error': 'File not found'}

print(f"\nFinal memory usage: {get_memory_usage()['rss_gb']:.2f} GB")

=== Memory-Safe File Analysis ===
Initial memory usage: 0.20 GB

Analyzing CICFLOW file...
  Rows: 3,540,241
  Columns: 84
  Estimated memory if fully loaded: 3.29 GB
  ✓ Safe to load fully (< 9.6 GB)

Analyzing DATA file...
  Rows: 447,915
  Columns: 76
  Estimated memory if fully loaded: 0.26 GB
  ✓ Safe to load fully (< 9.6 GB)

Analyzing LABELS file...
  Rows: 447,915
  Columns: 1
  Estimated memory if fully loaded: 0.01 GB
  ✓ Safe to load fully (< 9.6 GB)

Final memory usage: 0.20 GB


In [6]:
# Cell 5: Data Structure Analysis
def analyze_data_structure():
    """Analyze the structure and basic properties of each dataset"""
    
    structure_analysis = {}
    
    for name, filepath in [('cicflow', CICFLOW_FILE), ('data', DATA_FILE), ('labels', LABEL_FILE)]:
        if not filepath.exists():
            structure_analysis[name] = {'error': 'File not found'}
            continue
            
        print(f"\n=== {name.upper()} STRUCTURE ANALYSIS ===")
        
        try:
            # Loading sample for structure analysis
            sample_df = pd.read_csv(filepath, nrows=1000, low_memory=False)
            
            analysis = {
                'shape': (file_info[name]['total_rows'], file_info[name]['total_columns']),
                'columns': list(sample_df.columns),
                'data_types': {},
                'missing_values': {},
                'unique_values': {},
                'memory_usage': {}
            }
            
            # Analyzing each column
            for col in sample_df.columns:
                analysis['data_types'][col] = str(sample_df[col].dtype)
                analysis['missing_values'][col] = sample_df[col].isnull().sum()
                analysis['unique_values'][col] = sample_df[col].nunique()
                analysis['memory_usage'][col] = sample_df[col].memory_usage(deep=True)
            
            # Summary statistics
            analysis['summary'] = {
                'total_missing': sum(analysis['missing_values'].values()),
                'total_memory_mb': sum(analysis['memory_usage'].values()) / (1024**2),
                'categorical_columns': [col for col, dtype in analysis['data_types'].items() 
                                      if dtype == 'object'],
                'numerical_columns': [col for col, dtype in analysis['data_types'].items() 
                                    if dtype in ['int64', 'float64', 'int32', 'float32']],
                'high_cardinality_columns': [col for col, unique_count in analysis['unique_values'].items()
                                           if unique_count > len(sample_df) * 0.8]
            }
            
            structure_analysis[name] = analysis
            
            # Displaying key information
            print(f"Shape: {analysis['shape']}")
            print(f"Data types distribution:")
            dtype_counts = pd.Series(list(analysis['data_types'].values())).value_counts()
            for dtype, count in dtype_counts.items():
                print(f"  {dtype}: {count} columns")
            
            print(f"Missing values: {analysis['summary']['total_missing']}")
            print(f"Memory usage: {analysis['summary']['total_memory_mb']:.2f} MB")
            
            if analysis['summary']['categorical_columns']:
                print(f"Categorical columns: {len(analysis['summary']['categorical_columns'])}")
                print(f"  {analysis['summary']['categorical_columns'][:5]}{'...' if len(analysis['summary']['categorical_columns']) > 5 else ''}")
            
            print(f"Numerical columns: {len(analysis['summary']['numerical_columns'])}")
            
        except Exception as e:
            print(f"Error analyzing {name}: {str(e)}")
            structure_analysis[name] = {'error': str(e)}
    
    return structure_analysis

# Performing structure analysis
structure_analysis = analyze_data_structure()


=== CICFLOW STRUCTURE ANALYSIS ===
Shape: (3540241, 84)
Data types distribution:
  float64: 45 columns
  int64: 34 columns
  object: 5 columns
Missing values: 0
Memory usage: 0.93 MB
Categorical columns: 5
  ['Flow ID', 'Src IP', 'Dst IP', 'Timestamp', 'Label']
Numerical columns: 79

=== DATA STRUCTURE ANALYSIS ===
Shape: (447915, 76)
Data types distribution:
  float64: 45 columns
  int64: 31 columns
Missing values: 0
Memory usage: 0.59 MB
Numerical columns: 76

=== LABELS STRUCTURE ANALYSIS ===
Shape: (447915, 1)
Data types distribution:
  int64: 1 columns
Missing values: 0
Memory usage: 0.01 MB
Numerical columns: 1


In [7]:
# Cell 6: Column Name Analysis and Comparison
def analyze_column_relationships():
    """Analyze relationships between datasets based on column names"""
    
    print("\n=== COLUMN RELATIONSHIP ANALYSIS ===")
    
    # Extracting column names from each dataset
    columns = {}
    for name in ['cicflow', 'data', 'labels']:
        if name in structure_analysis and 'columns' in structure_analysis[name]:
            columns[name] = set(structure_analysis[name]['columns'])
        else:
            columns[name] = set()
    
    # Comparing column overlaps
    if columns['cicflow'] and columns['data']:
        common_cicflow_data = columns['cicflow'].intersection(columns['data'])
        print(f"\nCommon columns between CICFlow and Data: {len(common_cicflow_data)}")
        if common_cicflow_data:
            print(f"  Sample: {list(common_cicflow_data)[:10]}")
        
        cicflow_only = columns['cicflow'] - columns['data']
        data_only = columns['data'] - columns['cicflow']
        
        print(f"\nCICFlow unique columns: {len(cicflow_only)}")
        if cicflow_only:
            print(f"  Sample: {list(cicflow_only)[:10]}")
        
        print(f"\nData unique columns: {len(data_only)}")
        if data_only:
            print(f"  Sample: {list(data_only)[:10]}")
    
    # Analyzing label columns
    if columns['labels']:
        print(f"\nLabel file columns: {list(columns['labels'])}")
        
        # Checking if labels contain identifiers that might link to other files
        label_columns = list(columns['labels'])
        potential_ids = [col for col in label_columns if any(id_term in col.lower() 
                        for id_term in ['id', 'index', 'flow', 'record'])]
        if potential_ids:
            print(f"Potential identifier columns in labels: {potential_ids}")
    
    return columns

# Analyzing column relationships
column_relationships = analyze_column_relationships()


=== COLUMN RELATIONSHIP ANALYSIS ===

Common columns between CICFlow and Data: 76
  Sample: ['Fwd Act Data Pkts', 'Fwd IAT Std', 'Bwd URG Flags', 'Fwd Packet Length Min', 'SYN Flag Count', 'FWD Init Win Bytes', 'Packet Length Variance', 'Flow IAT Max', 'Fwd IAT Total', 'Total Length of Bwd Packet']

CICFlow unique columns: 8
  Sample: ['Flow ID', 'Timestamp', 'Src Port', 'Dst IP', 'Label', 'Src IP', 'Dst Port', 'Protocol']

Data unique columns: 0

Label file columns: ['Label']


In [8]:
# Cell 7: Sample Data Inspection
def inspect_sample_data():
    """Inspect sample data from each file to understand content and format"""
    
    print("\n=== SAMPLE DATA INSPECTION ===")
    
    sample_data = {}
    
    for name, filepath in [('cicflow', CICFLOW_FILE), ('data', DATA_FILE), ('labels', LABEL_FILE)]:
        if not filepath.exists():
            continue
            
        print(f"\n--- {name.upper()} SAMPLE DATA ---")
        
        try:
            # Loading small sample
            df_sample = pd.read_csv(filepath, nrows=5, low_memory=False)
            sample_data[name] = df_sample
            
            print(f"First 3 rows:")
            print(df_sample.head(3).to_string())
            
            print(f"\nColumn info:")
            print(f"  Total columns: {len(df_sample.columns)}")
            print(f"  Column names (first 10): {list(df_sample.columns)[:10]}")
            
            # Checking for potential label/target columns
            potential_labels = [col for col in df_sample.columns 
                              if any(term in col.lower() for term in ['label', 'attack', 'class', 'target'])]
            if potential_labels:
                print(f"  Potential label columns: {potential_labels}")
                for label_col in potential_labels:
                    unique_vals = df_sample[label_col].unique()
                    print(f"    {label_col}: {unique_vals}")
            
            # Checking for missing values in sample
            missing_in_sample = df_sample.isnull().sum()
            if missing_in_sample.sum() > 0:
                print(f"  Missing values in sample: {missing_in_sample[missing_in_sample > 0].to_dict()}")
            
        except Exception as e:
            print(f"Error loading sample from {name}: {str(e)}")
            sample_data[name] = None
    
    return sample_data

# Inspecting sample data
sample_data = inspect_sample_data()


=== SAMPLE DATA INSPECTION ===

--- CICFLOW SAMPLE DATA ---
First 3 rows:
                                    Flow ID        Src IP  Src Port          Dst IP  Dst Port  Protocol               Timestamp  Flow Duration  Total Fwd Packet  Total Bwd packets  Total Length of Fwd Packet  Total Length of Bwd Packet  Fwd Packet Length Max  Fwd Packet Length Min  Fwd Packet Length Mean  Fwd Packet Length Std  Bwd Packet Length Max  Bwd Packet Length Min  Bwd Packet Length Mean  Bwd Packet Length Std   Flow Bytes/s  Flow Packets/s  Flow IAT Mean   Flow IAT Std  Flow IAT Max  Flow IAT Min  Fwd IAT Total   Fwd IAT Mean    Fwd IAT Std  Fwd IAT Max  Fwd IAT Min  Bwd IAT Total  Bwd IAT Mean   Bwd IAT Std  Bwd IAT Max  Bwd IAT Min  Fwd PSH Flags  Bwd PSH Flags  Fwd URG Flags  Bwd URG Flags  Fwd Header Length  Bwd Header Length  Fwd Packets/s  Bwd Packets/s  Packet Length Min  Packet Length Max  Packet Length Mean  Packet Length Std  Packet Length Variance  FIN Flag Count  SYN Flag Count  RST Flag Cou

In [9]:
# Cell 8: Data Consistency Validation
def validate_data_consistency():
    """Validate consistency between the three datasets"""
    
    print("\n=== DATA CONSISTENCY VALIDATION ===")
    
    validation_results = {
        'row_count_analysis': {},
        'potential_relationships': {},
        'data_alignment': {},
        'recommendations': []
    }
    
    # Row count analysis
    print("\n--- ROW COUNT ANALYSIS ---")
    row_counts = {}
    for name in ['cicflow', 'data', 'labels']:
        if name in file_info and 'total_rows' in file_info[name]:
            row_counts[name] = file_info[name]['total_rows']
            print(f"{name.upper()}: {row_counts[name]:,} rows")
    
    validation_results['row_count_analysis'] = row_counts
    
    # Analyzing potential relationships
    print("\n--- RELATIONSHIP ANALYSIS ---")
    if 'data' in row_counts and 'labels' in row_counts:
        if row_counts['data'] == row_counts['labels']:
            print("✓ Data.csv and Label.csv have matching row counts - likely aligned")
            validation_results['potential_relationships']['data_labels'] = 'aligned'
            validation_results['recommendations'].append("Data.csv and Label.csv appear to be aligned by row index")
        else:
            print("⚠ Data.csv and Label.csv have different row counts")
            validation_results['potential_relationships']['data_labels'] = 'misaligned'
            validation_results['recommendations'].append("Investigate row count mismatch between Data.csv and Label.csv")
    
    if 'cicflow' in row_counts:
        cicflow_rows = row_counts['cicflow']
        data_rows = row_counts.get('data', 0)
        
        if cicflow_rows < data_rows:
            ratio = data_rows / cicflow_rows if cicflow_rows > 0 else 0
            print(f"📊 CICFlow has fewer rows than Data ({ratio:.1f}x difference)")
            print("   This suggests CICFlow contains aggregated flow data while Data contains individual records")
            validation_results['potential_relationships']['cicflow_data'] = 'hierarchical'
            validation_results['recommendations'].append("CICFlow appears to contain aggregated flows - investigate packet-to-flow mapping")
        elif cicflow_rows > data_rows:
            print("📊 CICFlow has more rows than Data - unexpected pattern")
            validation_results['potential_relationships']['cicflow_data'] = 'unexpected'
        else:
            print("📊 CICFlow and Data have same row count - might be different feature sets of same data")
            validation_results['potential_relationships']['cicflow_data'] = 'same_level'
    
    return validation_results

# Validating data consistency
validation_results = validate_data_consistency()


=== DATA CONSISTENCY VALIDATION ===

--- ROW COUNT ANALYSIS ---
CICFLOW: 3,540,241 rows
DATA: 447,915 rows
LABELS: 447,915 rows

--- RELATIONSHIP ANALYSIS ---
✓ Data.csv and Label.csv have matching row counts - likely aligned
📊 CICFlow has more rows than Data - unexpected pattern


In [10]:
# Cell 9: Memory Usage and Loading Strategy Analysis
def analyze_memory_strategy():
    """Analyze memory requirements and recommend loading strategies"""
    
    print("\n=== MEMORY USAGE AND LOADING STRATEGY ===")
    
    memory_analysis = {
        'current_usage': get_memory_usage(),
        'file_requirements': {},
        'loading_strategies': {},
        'recommendations': []
    }
    
    available_memory = memory_analysis['current_usage']['available_gb']
    memory_limit = ANALYSIS_CONFIG['memory_limit_gb']
    
    print(f"Available memory: {available_memory:.2f} GB")
    print(f"Memory limit for analysis: {memory_limit:.2f} GB")
    
    # Analyzing each file's memory requirements
    for name in ['cicflow', 'data', 'labels']:
        if name in file_info and 'estimated_full_size_gb' in file_info[name]:
            estimated_size = file_info[name]['estimated_full_size_gb']
            memory_analysis['file_requirements'][name] = estimated_size
            
            print(f"\n{name.upper()}:")
            print(f"  Estimated memory: {estimated_size:.2f} GB")
            
            if estimated_size < memory_limit * 0.6:  # Conservative threshold
                strategy = "full_load"
                print(f"  ✓ Strategy: Full load in memory")
            elif estimated_size < memory_limit * 0.9:
                strategy = "careful_load"
                print(f"  ⚠ Strategy: Careful loading with monitoring")
            else:
                strategy = "chunked_processing"
                print(f"  🔄 Strategy: Chunked processing required")
                chunk_size = max(1000, int(memory_limit * 0.3 * 1024**3 / (estimated_size * 1024**3 / file_info[name]['total_rows'])))
                print(f"    Recommended chunk size: {chunk_size:,} rows")
            
            memory_analysis['loading_strategies'][name] = {
                'strategy': strategy,
                'estimated_size_gb': estimated_size,
                'chunk_size': chunk_size if strategy == 'chunked_processing' else None
            }
    
    # Generating recommendations
    total_estimated = sum(memory_analysis['file_requirements'].values())
    print(f"\nTotal estimated memory for all files: {total_estimated:.2f} GB")
    
    if total_estimated > memory_limit:
        memory_analysis['recommendations'].append("Cannot load all files simultaneously - process sequentially")
        print("⚠ Recommendation: Process files sequentially")
    else:
        memory_analysis['recommendations'].append("Can potentially load multiple files simultaneously")
        print("✓ Recommendation: Can load multiple files if needed")
    
    return memory_analysis

# Analyzing memory strategy
memory_strategy = analyze_memory_strategy()


=== MEMORY USAGE AND LOADING STRATEGY ===
Available memory: 6.75 GB
Memory limit for analysis: 12.00 GB

CICFLOW:
  Estimated memory: 3.29 GB
  ✓ Strategy: Full load in memory

DATA:
  Estimated memory: 0.26 GB
  ✓ Strategy: Full load in memory

LABELS:
  Estimated memory: 0.01 GB
  ✓ Strategy: Full load in memory

Total estimated memory for all files: 3.57 GB
✓ Recommendation: Can load multiple files if needed


In [12]:
# Cell 10: Generate Comprehensive Summary Report
def generate_exploration_summary():
    """Generate a comprehensive summary of the data exploration"""
    
    print("\n" + "="*60)
    print("           COMPREHENSIVE DATA EXPLORATION SUMMARY")
    print("="*60)
    
    summary_report = {
        'analysis_timestamp': datetime.now().isoformat(),
        'dataset_overview': {},
        'technical_specifications': {},
        'data_relationships': {},
        'processing_recommendations': {},
        'next_steps': []
    }
    
    # Dataset Overview
    print("\n📊 DATASET OVERVIEW")
    print("-" * 30)
    
    total_rows = sum(file_info[name].get('total_rows', 0) for name in ['cicflow', 'data', 'labels'] 
                    if name in file_info and 'total_rows' in file_info[name])
    total_features = sum(file_info[name].get('total_columns', 0) for name in ['cicflow', 'data'] 
                        if name in file_info and 'total_columns' in file_info[name])
    
    summary_report['dataset_overview'] = {
        'total_data_points': total_rows,
        'total_features': total_features,
        'files_analyzed': len([name for name in file_info if 'error' not in file_info[name]]),
        'estimated_total_size_gb': sum(file_info[name].get('estimated_full_size_gb', 0) 
                                     for name in file_info if 'estimated_full_size_gb' in file_info[name])
    }
    
    print(f"Total data points across files: {total_rows:,}")
    print(f"Total features: {total_features}")
    print(f"Successfully analyzed files: {summary_report['dataset_overview']['files_analyzed']}/3")
    print(f"Estimated total size: {summary_report['dataset_overview']['estimated_total_size_gb']:.2f} GB")
    
    # Technical Specifications
    print(f"\n⚙️ TECHNICAL SPECIFICATIONS")
    print("-" * 30)
    
    summary_report['technical_specifications'] = {
        'hardware_constraints': {
            'available_memory_gb': memory_strategy['current_usage']['available_gb'],
            'memory_limit_gb': ANALYSIS_CONFIG['memory_limit_gb']
        },
        'processing_requirements': memory_strategy['loading_strategies']
    }
    
    for name, strategy_info in memory_strategy['loading_strategies'].items():
        print(f"{name.upper()}: {strategy_info['strategy']} ({strategy_info['estimated_size_gb']:.2f} GB)")
    
    # Data Relationships
    print(f"\n🔗 DATA RELATIONSHIPS")
    print("-" * 30)
    
    summary_report['data_relationships'] = validation_results['potential_relationships']
    
    for relationship, status in validation_results['potential_relationships'].items():
        print(f"{relationship}: {status}")
    
    # Processing Recommendations
    print(f"\n💡 PROCESSING RECOMMENDATIONS")
    print("-" * 30)
    
    all_recommendations = (validation_results['recommendations'] + 
                         memory_strategy['recommendations'])
    
    summary_report['processing_recommendations'] = all_recommendations
    
    for i, rec in enumerate(all_recommendations, 1):
        print(f"{i}. {rec}")
    
    # Next Steps
    print(f"\n🚀 RECOMMENDED NEXT STEPS")
    print("-" * 30)
    
    next_steps = [
        "Implement chunked data loading for CICFlowMeter.csv",
        "Analyze label distribution and class imbalance",
        "Investigate feature correlations and engineering opportunities",
        "Design multi-modal feature separation strategy",
        "Create graph construction pipeline for GraphVAE",
        "Develop memory-efficient preprocessing pipeline"
    ]
    
    summary_report['next_steps'] = next_steps
    
    for i, step in enumerate(next_steps, 1):
        print(f"{i}. {step}")
    
    # Save summary report
    report_path = RESULTS_PATH / "data_exploration_summary.json"
    with open(report_path, 'w') as f:
        json.dump(summary_report, f, indent=2, default=str)
    
    print(f"\n💾 Summary report saved to: {report_path}")
    
    return summary_report

# Generate comprehensive summary
final_summary = generate_exploration_summary()


           COMPREHENSIVE DATA EXPLORATION SUMMARY

📊 DATASET OVERVIEW
------------------------------
Total data points across files: 4,436,071
Total features: 160
Successfully analyzed files: 3/3
Estimated total size: 3.57 GB

⚙️ TECHNICAL SPECIFICATIONS
------------------------------
CICFLOW: full_load (3.29 GB)
DATA: full_load (0.26 GB)
LABELS: full_load (0.01 GB)

🔗 DATA RELATIONSHIPS
------------------------------
data_labels: aligned
cicflow_data: unexpected

💡 PROCESSING RECOMMENDATIONS
------------------------------
1. Data.csv and Label.csv appear to be aligned by row index
2. Can potentially load multiple files simultaneously

🚀 RECOMMENDED NEXT STEPS
------------------------------
1. Implement chunked data loading for CICFlowMeter.csv
2. Analyze label distribution and class imbalance
3. Investigate feature correlations and engineering opportunities
4. Design multi-modal feature separation strategy
5. Create graph construction pipeline for GraphVAE
6. Develop memory-efficient

In [13]:
# Cell 11: Export Analysis Results
# Saving all analysis results for future reference
print("\n=== EXPORTING ANALYSIS RESULTS ===")

# Creating comprehensive analysis export
analysis_export = {
    'metadata': {
        'analysis_date': datetime.now().isoformat(),
        'analyst': 'Bachelor Student',
        'project': 'NIDS-Bachelor-Project',
        'analysis_version': '1.0'
    },
    'file_system_analysis': fs_analysis,
    'file_information': file_info,
    'structure_analysis': structure_analysis,
    'column_relationships': column_relationships,
    'validation_results': validation_results,
    'memory_strategy': memory_strategy,
    'summary_report': final_summary
}

# Exporting to JSON
export_path = RESULTS_PATH / "complete_data_exploration.json"
with open(export_path, 'w') as f:
    json.dump(analysis_export, f, indent=2, default=str)

print(f"✅ Complete analysis exported to: {export_path}")

# Creating a quick reference guide
quick_ref = f"""
# CIC-UNSW-NB15 Dataset - Quick Reference Guide

## Dataset Files
- CICFlowMeter_out.csv: {file_info.get('cicflow', {}).get('total_rows', 'Unknown'):,} rows, {file_info.get('cicflow', {}).get('total_columns', 'Unknown')} columns
- Data.csv: {file_info.get('data', {}).get('total_rows', 'Unknown'):,} rows, {file_info.get('data', {}).get('total_columns', 'Unknown')} columns  
- Label.csv: {file_info.get('labels', {}).get('total_rows', 'Unknown'):,} rows, {file_info.get('labels', {}).get('total_columns', 'Unknown')} columns

## Processing Strategy
- CICFlowMeter: {memory_strategy['loading_strategies'].get('cicflow', {}).get('strategy', 'Unknown')}
- Data: {memory_strategy['loading_strategies'].get('data', {}).get('strategy', 'Unknown')}
- Labels: {memory_strategy['loading_strategies'].get('labels', {}).get('strategy', 'Unknown')}

## Key Findings
{chr(10).join([f"- {rec}" for rec in validation_results['recommendations']])}

## Next Phase: Feature Analysis (02_feature_analysis.ipynb)
"""

quick_ref_path = RESULTS_PATH / "quick_reference.md"
with open(quick_ref_path, 'w') as f:
    f.write(quick_ref)

print(f"📝 Quick reference guide saved to: {quick_ref_path}")
print(f"\n🎯 Data exploration complete! Ready for feature analysis phase.")
print(f"Total analysis time: {datetime.now()}")


=== EXPORTING ANALYSIS RESULTS ===
✅ Complete analysis exported to: C:\Users\Ashutosh\Documents\Projects\NIDS\results\data_exploration\complete_data_exploration.json
📝 Quick reference guide saved to: C:\Users\Ashutosh\Documents\Projects\NIDS\results\data_exploration\quick_reference.md

🎯 Data exploration complete! Ready for feature analysis phase.
Total analysis time: 2025-09-28 15:44:09.730579
