# Bosch Production Line Performance Analysis

## Competition Overview

This notebook analyzes the Bosch Production Line Performance dataset from the Kaggle competition of the same name.

**Goal**: Predict internal failures using thousands of measurements and tests made for each component along the assembly line.

**Evaluation Metric**: Matthews Correlation Coefficient (MCC)

**Dataset Characteristics**:
- Highly imbalanced dataset (failure rate < 1%)
- Large number of features (thousands)
- Three types of features: numeric, categorical, and date
- Anonymous feature names for confidentiality

## 1. Import Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from zipfile import ZipFile
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import matthews_corrcoef, confusion_matrix, classification_report
import gc

# Suppress every warning category for the rest of the session.
# NOTE(review): this also hides deprecation warnings from the libraries
# imported above — consider narrowing the filter.
warnings.filterwarnings('ignore')
# Plot style; presumably requires a Matplotlib release that ships the
# 'seaborn-v0_8-*' style names — TODO confirm the minimum version.
plt.style.use('seaborn-v0_8-darkgrid')
# IPython magic (not plain Python): only valid when run inside a notebook kernel.
%matplotlib inline

## 2. Data Loading and Extraction

In [None]:
# Folder that holds the competition archives and the extracted CSV files
data_dir = './data/'

# Take one snapshot of the directory and echo it, one entry per line
data_files = os.listdir(data_dir)
print("Available files:", *(f"  - {entry}" for entry in data_files), sep="\n")

In [None]:
# Extract zip files if not already extracted
def extract_zip_files(data_dir):
    """Extract every ``.zip`` archive in *data_dir* that is not extracted yet.

    An archive named ``<name>.zip`` counts as extracted when a path called
    ``<name>`` already exists in *data_dir*. Otherwise all members of the
    archive are extracted into *data_dir*. Progress is printed to stdout.

    Args:
        data_dir: Directory that contains the archives and receives their
            contents.

    Raises:
        OSError: If *data_dir* cannot be listed or an archive cannot be read.
        zipfile.BadZipFile: If a ``.zip`` file is not a valid archive.
    """
    suffix = '.zip'
    # Sorted so the progress log is reproducible (os.listdir order is arbitrary).
    zip_files = sorted(f for f in os.listdir(data_dir) if f.endswith(suffix))

    for zip_file in zip_files:
        file_path = os.path.join(data_dir, zip_file)
        # Strip only the trailing extension: str.replace would also delete a
        # '.zip' that appears earlier in the name and point the existence
        # check at the wrong path.
        csv_file = zip_file[:-len(suffix)]
        csv_path = os.path.join(data_dir, csv_file)

        if os.path.exists(csv_path):
            print(f"{csv_file} already exists")
            continue

        print(f"Extracting {zip_file}...")
        with ZipFile(file_path, 'r') as zip_ref:
            zip_ref.extractall(data_dir)
        print(f"  Extracted to {csv_file}")

# Unpack the archives found in data_dir; already-extracted ones are only reported
extract_zip_files(data_dir)

## 3. Data Exploration - Sample Loading

Due to the large size of the dataset, we'll first load a sample to understand the structure.

In [None]:
# Read only the first 10,000 rows of the numeric training file to inspect its layout
print("Loading sample of train_numeric data...")
sample_path = data_dir + 'train_numeric.csv'
train_numeric_sample = pd.read_csv(sample_path, nrows=10000)
print(f"Sample shape: {train_numeric_sample.shape}")
leading_columns = train_numeric_sample.columns[:10].tolist()
print(f"\nFirst few columns: {leading_columns}")

In [None]:
# Class counts and positive-class share of the target within the sample
response_sample = train_numeric_sample['Response']
print("Target variable (Response) distribution:")
print(response_sample.value_counts())
print(f"\nFailure rate: {response_sample.mean():.4%}")

In [None]:
# Size of the sample and the ten columns with the highest share of missing values
n_rows = len(train_numeric_sample)
n_features = len(train_numeric_sample.columns) - 2  # Id and Response are not features
print("Dataset Info:")
print(f"Number of features: {n_features}")
print(f"Number of samples: {n_rows}")
print(f"\nMissing values percentage:")
null_counts = train_numeric_sample.isnull().sum()
missing_percent = (null_counts / n_rows * 100).sort_values(ascending=False)
print(missing_percent.head(10))

## 4. Feature Analysis

In [None]:
# Analyze feature groups based on naming patterns
def analyze_feature_groups(df):
    """Bucket feature columns by the first two ``_``-separated parts of their name.

    Columns named ``Id`` or ``Response`` are ignored, as are columns whose
    name contains no underscore. The ten largest groups are printed.

    Args:
        df: DataFrame whose column names are inspected.

    Returns:
        dict mapping each group key (``"<part0>_<part1>"``) to the list of
        column names in that group, in column order.
    """
    feature_groups = {}
    candidates = (name for name in df.columns if name not in ('Id', 'Response'))
    for name in candidates:
        tokens = name.split('_')
        if len(tokens) < 2:
            continue
        feature_groups.setdefault('_'.join(tokens[:2]), []).append(name)

    # Largest groups first; ties keep their first-seen order (stable sort)
    ranked = sorted(feature_groups.items(), key=lambda item: len(item[1]), reverse=True)

    print("Feature Groups (Top 10):")
    for group_name, members in ranked[:10]:
        print(f"  {group_name}: {len(members)} features")

    return feature_groups

# Group the sample's columns by name prefix; the helper prints the ten largest groups
feature_groups = analyze_feature_groups(train_numeric_sample)

In [None]:
# Two side-by-side panels describing how sparse the sample's columns are
fig, (ax_hist, ax_sorted) = plt.subplots(1, 2, figsize=(12, 6))

# Share of missing entries per column, in percent
missing_df = pd.DataFrame({
    'column': train_numeric_sample.columns,
    'missing_percent': (train_numeric_sample.isnull().sum() / len(train_numeric_sample) * 100)
})

# Left panel: number of columns falling into each missing-percentage bin
ax_hist.hist(missing_df['missing_percent'], bins=50, edgecolor='black')
ax_hist.set_xlabel('Missing Percentage (%)')
ax_hist.set_ylabel('Number of Features')
ax_hist.set_title('Distribution of Missing Data Across Features')

# Right panel: the per-column percentages sorted in ascending order
sorted_missing = np.sort(missing_df['missing_percent'].values)
ax_sorted.plot(range(len(sorted_missing)), sorted_missing)
ax_sorted.set_xlabel('Feature Index')
ax_sorted.set_ylabel('Missing Percentage (%)')
ax_sorted.set_title('Cumulative Missing Data Pattern')
ax_sorted.grid(True, alpha=0.3)

fig.tight_layout()
plt.show()

## 5. Memory-Efficient Data Processing Strategy

In [None]:
def reduce_memory_usage(df):
    """Reduce memory usage by downcasting numeric columns in place.

    Signed and unsigned integer columns are cast to the narrowest integer type
    of the same signedness that holds their observed min/max (bounds
    inclusive). Float columns are cast to ``float32`` when their range fits.
    ``float16`` is deliberately not used: it keeps only about three
    significant decimal digits, so it would silently alter measurement values.
    Columns of any other dtype (bool, object/string, datetime, categorical,
    pandas extension dtypes) are left untouched.

    Memory usage before and after is printed to stdout.

    Args:
        df: DataFrame to optimize. Its columns are reassigned in place.

    Returns:
        The same DataFrame object, for call chaining.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print(f'Memory usage of dataframe: {start_mem:.2f} MB')

    for col in df.columns:
        col_type = df[col].dtype

        # Only plain NumPy int/uint/float columns are safe to range-check and
        # cast; anything else (including bool) keeps its dtype.
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'iuf':
            continue

        if col_type.kind == 'f':
            candidates, limits = (np.float32,), np.finfo
        elif col_type.kind == 'i':
            candidates, limits = (np.int8, np.int16, np.int32), np.iinfo
        else:
            candidates, limits = (np.uint8, np.uint16, np.uint32), np.iinfo

        # NaN for empty or all-NaN columns: every comparison below is then
        # False and the column keeps its dtype.
        c_min = df[col].min()
        c_max = df[col].max()

        for candidate in candidates:
            # Candidates are ordered narrowest first; never widen a column.
            if np.dtype(candidate).itemsize >= col_type.itemsize:
                break
            info = limits(candidate)
            if info.min <= c_min and c_max <= info.max:
                df[col] = df[col].astype(candidate)
                break

    end_mem = df.memory_usage().sum() / 1024**2
    print(f'Memory usage after optimization: {end_mem:.2f} MB')
    if start_mem > 0:
        print(f'Decreased by {100 * (start_mem - end_mem) / start_mem:.1f}%')

    return df

# Apply memory reduction to the sample; the helper prints the footprint before
# and after, and its return value is rebound to the same name
train_numeric_sample = reduce_memory_usage(train_numeric_sample)

## 6. Feature Engineering Ideas

In [None]:
def create_basic_features(df):
    """Build one row of aggregate features per input row.

    Counts (non-null, zeros) are taken over every column except ``Id`` and
    ``Response``; the statistics and the missing share are taken over the
    numeric columns only.

    Args:
        df: DataFrame that contains an ``Id`` column and, optionally, a
            ``Response`` column alongside the measurement columns.

    Returns:
        DataFrame with columns ``Id``, ``count_non_null``, ``count_zeros``,
        ``mean``, ``std``, ``min``, ``max``, ``median``, ``missing_percent``
        and, when present in *df*, ``Response``. Note that
        ``missing_percent`` is a fraction (missing count divided by the number
        of numeric columns), not a value scaled to 0-100.
    """
    measurements = df.drop(['Id', 'Response'], axis=1, errors='ignore')
    numeric_cols = df.select_dtypes(include=[np.number]).columns.drop(['Id', 'Response'], errors='ignore')
    numeric_part = df[numeric_cols]

    feature_df = pd.DataFrame()
    feature_df['Id'] = df['Id']

    # Row-wise counts over all measurement columns
    feature_df['count_non_null'] = measurements.count(axis=1)
    feature_df['count_zeros'] = (measurements == 0).sum(axis=1)

    # Row-wise summary statistics over the numeric columns
    for stat_name in ('mean', 'std', 'min', 'max', 'median'):
        feature_df[stat_name] = getattr(numeric_part, stat_name)(axis=1)

    feature_df['missing_percent'] = numeric_part.isnull().sum(axis=1) / len(numeric_cols)

    if 'Response' in df.columns:
        feature_df['Response'] = df['Response']

    return feature_df

# Create row-wise aggregate features for the sample, then report the shape and
# column names of the resulting frame
engineered_features = create_basic_features(train_numeric_sample)
print("Engineered features shape:", engineered_features.shape)
print("\nFeature columns:", list(engineered_features.columns))

In [None]:
# Correlation of each engineered feature with the target, strongest positive first
if 'Response' in engineered_features.columns:
    target = engineered_features['Response']
    predictors = engineered_features.drop(columns=['Id', 'Response'])
    correlations = predictors.corrwith(target).sort_values(ascending=False)

    plt.figure(figsize=(10, 6))
    correlations.plot.bar()
    plt.title('Correlation of Engineered Features with Target')
    plt.xlabel('Features')
    plt.ylabel('Correlation')
    plt.xticks(rotation=45)
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show()

    print("Feature Correlations with Target:")
    print(correlations)

## 7. Processing Strategy for Full Dataset

In [None]:
def process_chunk(chunk, feature_list=None):
    """Turn one chunk of raw data into engineered features.

    Args:
        chunk: DataFrame holding one chunk of the raw data. Must contain an
            ``Id`` column (required by ``create_basic_features``); a
            ``Response`` column is optional, so unlabeled data is accepted.
        feature_list: Optional iterable of column names to keep. Names that
            are not in *chunk* are skipped. ``Id`` and ``Response`` are added
            automatically when present, without duplicating them if they are
            already listed. When ``None``, all columns are used and *chunk*
            itself is downcast in place by ``reduce_memory_usage``.

    Returns:
        DataFrame of engineered features produced by ``create_basic_features``.
    """
    if feature_list is not None:
        present = set(chunk.columns)
        wanted = list(feature_list) + ['Id', 'Response']
        # dict.fromkeys drops duplicates while keeping first-seen order, so
        # listing 'Id'/'Response' in feature_list no longer yields duplicate
        # columns, and columns absent from the chunk are skipped instead of
        # raising KeyError.
        selected = [col for col in dict.fromkeys(wanted) if col in present]
        # Explicit copy: the selection is modified in place below and must
        # not alias the caller's frame.
        chunk = chunk[selected].copy()

    # Reduce memory
    chunk = reduce_memory_usage(chunk)

    # Create engineered features
    return create_basic_features(chunk)

# Example of processing in chunks (commented out for sample notebook).
# Each chunk is reduced to its engineered features before the next one is
# read, and the per-chunk results are concatenated at the end.
# chunk_size = 50000
# chunks = []
# for chunk in pd.read_csv(data_dir + 'train_numeric.csv', chunksize=chunk_size):
#     processed_chunk = process_chunk(chunk)
#     chunks.append(processed_chunk)
#     gc.collect()
# 
# full_features = pd.concat(chunks, ignore_index=True)

# Status message only: no chunked processing is executed in this cell
print("Chunk processing strategy defined for handling large dataset")

## 8. Feature Selection Strategy

In [None]:
def select_important_features(df, target_col='Response', threshold=0.95, variance_threshold=0.01):
    """Select columns by missing-value share and variance.

    A column survives when its fraction of missing values is strictly below
    *threshold* and its variance is strictly above *variance_threshold*.
    Variance is only computed for numeric columns, so non-numeric columns are
    dropped by the second step. ``Id`` and *target_col* are always kept when
    they exist in *df*, even if a filter would have removed them; they are
    never added when *df* does not contain them.

    Args:
        df: DataFrame to analyze.
        target_col: Name of the target column to preserve.
        threshold: Upper bound (exclusive) on the fraction of missing values.
        variance_threshold: Lower bound (exclusive) on the column variance.

    Returns:
        List of column names to keep. Columns that passed the filters come
        first in their original order; a preserved ``Id``/target that failed a
        filter is appended at the end.
    """
    # Remove features with too many missing values
    missing_fraction = df.isnull().sum() / len(df)
    keep_cols = missing_fraction[missing_fraction < threshold].index.tolist()

    print(f"Features after missing value filter: {len(keep_cols)} / {len(df.columns)}")

    # Remove features with zero or very low variance
    numeric_cols = df[keep_cols].select_dtypes(include=[np.number]).columns
    variances = df[numeric_cols].var()
    keep_cols = variances[variances > variance_threshold].index.tolist()

    # Always keep Id and the target, but only if the frame really has them:
    # returning a name that is not a column would break df[keep_cols].
    for required in ('Id', target_col):
        if required in df.columns and required not in keep_cols:
            keep_cols.append(required)

    print(f"Features after variance filter: {len(keep_cols)}")

    return keep_cols

# Apply feature selection to the sample; the result is a list of column names,
# reported here against the sample's total column count
important_features = select_important_features(train_numeric_sample)
print(f"\nSelected {len(important_features)} features from {len(train_numeric_sample.columns)}")

## 9. Model Training Strategy

In [None]:
# Import modeling libraries
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.model_selection import StratifiedKFold
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

print("Model libraries imported successfully")

In [None]:
def prepare_data_for_modeling(df, target_col='Response'):
    """Split off the target, impute missing values and standardize the features.

    Args:
        df: DataFrame of features; ``Id`` and *target_col* are excluded from
            the feature matrix when present.
        target_col: Name of the target column.

    Returns:
        Tuple ``(X_scaled, y, scaler)``: the scaled feature matrix returned by
        ``StandardScaler``, the target column (``None`` when *df* has no
        *target_col*), and the scaler fitted on this data.
    """
    if target_col in df.columns:
        y = df[target_col]
    else:
        y = None

    features = df.drop(columns=['Id', target_col], errors='ignore')

    # Fill each column's gaps with that column's own median
    column_medians = features.median()
    imputed = features.fillna(column_medians)

    # Standardize with a scaler fitted on exactly this data
    scaler = StandardScaler()
    X_scaled = scaler.fit(imputed).transform(imputed)

    return X_scaled, y, scaler

# Prepare sample data from the engineered features; y_sample is None when the
# frame has no 'Response' column, which the shape report below accounts for
X_sample, y_sample, scaler = prepare_data_for_modeling(engineered_features)
print(f"Prepared data shape: X={X_sample.shape}, y={y_sample.shape if y_sample is not None else 'None'}")

In [None]:
def handle_imbalance(X, y, strategy='undersample', ratio=0.1):
    """Resample ``(X, y)`` to soften the class imbalance.

    Args:
        X: Feature matrix.
        y: Target labels.
        strategy: ``'oversample'`` uses ``SMOTE``, ``'undersample'`` uses
            ``RandomUnderSampler``; any other value returns the inputs
            unchanged.
        ratio: Passed to the sampler as ``sampling_strategy``.

    Returns:
        Tuple ``(X_resampled, y_resampled)``, or ``(X, y)`` untouched for an
        unrecognized *strategy*.
    """
    # Guard clause: anything but the two known strategies is a pass-through
    if strategy != 'oversample' and strategy != 'undersample':
        return X, y

    if strategy == 'oversample':
        sampler = SMOTE(sampling_strategy=ratio, random_state=42)
    else:
        sampler = RandomUnderSampler(sampling_strategy=ratio, random_state=42)

    X_resampled, y_resampled = sampler.fit_resample(X, y)

    print(f"Resampled data: {len(y_resampled)} samples")
    print(f"Class distribution: {pd.Series(y_resampled).value_counts()}")

    return X_resampled, y_resampled

# Example of handling imbalance (commented for sample)
# X_balanced, y_balanced = handle_imbalance(X_sample, y_sample, strategy='undersample')

## 10. Next Steps and Recommendations

### Key Findings:
1. **Extreme Class Imbalance**: The failure rate is less than 1%, requiring special handling
2. **High Dimensionality**: Thousands of features with high sparsity
3. **Missing Data**: Many features have significant missing values

### Recommended Approach:

#### 1. Data Processing:
- Process data in chunks due to size
- Implement aggressive feature selection
- Create aggregate features per production line/station

#### 2. Feature Engineering:
- Count-based features (non-nulls, zeros)
- Statistical aggregations
- Time-based features from date files
- Interaction features between stations

#### 3. Modeling Strategy:
- Use Matthews Correlation Coefficient (MCC) for evaluation
- Try ensemble methods (XGBoost, LightGBM)
- Implement cross-validation with stratification
- Consider anomaly detection approaches

#### 4. Class Imbalance Handling:
- Adjust class weights
- Try SMOTE for oversampling
- Use ensemble methods with balanced subsampling
- Optimize threshold for prediction

### Code Template for Full Processing:

In [None]:
# Template for full dataset processing
def full_pipeline():
    """
    Complete pipeline for Bosch dataset

    Extracts the archives in ``data_dir``, builds aggregate features from
    ``train_numeric.csv`` chunk by chunk, reports a stratified 5-fold
    Matthews correlation coefficient, and finally fits the classifier on all
    rows. Categorical and date features are still placeholders (steps 4-5).

    Returns:
        An ``XGBClassifier`` fitted on the full engineered training set.

    Raises:
        ValueError: If the training data has no ``Response`` column.
    """
    def build_model():
        # One fresh, identically configured estimator per fit so that
        # cross-validation folds never share state with the final model.
        return XGBClassifier(
            n_estimators=100,
            max_depth=5,
            learning_rate=0.1,
            scale_pos_weight=100,  # Handle imbalance
            random_state=42
        )

    # 1. Extract all files
    extract_zip_files(data_dir)

    # 2. Process numeric data in chunks
    chunk_size = 50000
    numeric_features = []
    train_path = os.path.join(data_dir, 'train_numeric.csv')

    for chunk in pd.read_csv(train_path, chunksize=chunk_size):
        chunk = reduce_memory_usage(chunk)
        numeric_features.append(create_basic_features(chunk))
        gc.collect()

    # 3. Combine features
    full_features = pd.concat(numeric_features, ignore_index=True)

    # 4. Add categorical features (simplified)
    # Process categorical data similarly

    # 5. Add date features
    # Process date data for time-based features

    # 6. Prepare the model inputs (the fitted scaler is not needed here)
    X, y, _ = prepare_data_for_modeling(full_features)
    if y is None:
        raise ValueError("Training data has no 'Response' column")
    y = np.asarray(y)

    # 7. Cross-validation: stratified so every fold keeps the rare positives
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    fold_scores = []
    for fold, (train_idx, valid_idx) in enumerate(skf.split(X, y), start=1):
        fold_model = build_model()
        fold_model.fit(X[train_idx], y[train_idx])
        fold_pred = fold_model.predict(X[valid_idx])
        score = matthews_corrcoef(y[valid_idx], fold_pred)
        fold_scores.append(score)
        print(f"Fold {fold} MCC: {score:.4f}")
    print(f"Mean CV MCC: {np.mean(fold_scores):.4f}")

    # 8. Train XGBoost on all rows; the original returned an unfitted model,
    # which fails as soon as predict/predict_proba is called on it.
    model = build_model()
    model.fit(X, y)

    # 9. Prediction is left to the caller (see create_submission)
    return model

# Status messages only: full_pipeline is defined above but deliberately not
# executed in this cell
print("Full pipeline template defined")
print("\nTo run the full pipeline, uncomment and execute:")
print("# model = full_pipeline()")

## 11. Submission Preparation

In [None]:
# Read the sample submission and show its shape, columns and leading rows
submission_path = data_dir + 'sample_submission.csv'
submission = pd.read_csv(submission_path)
print("Submission template shape:", submission.shape)
print("\nSubmission columns:", list(submission.columns))
print("\nFirst few rows:")
submission.head()

In [None]:
def create_submission(model, test_features, submission_template, threshold=0.5, output_path='submission.csv'):
    """
    Create submission file

    The competition metric (Matthews correlation coefficient) is defined on
    binary labels, so the positive-class probabilities are converted to 0/1
    with *threshold* before being written.

    Args:
        model: Fitted classifier exposing ``predict_proba``.
        test_features: Feature matrix, in the same row order as
            *submission_template*.
        submission_template: DataFrame to copy; its ``Response`` column is
            overwritten. The template itself is not modified.
        threshold: Probability at or above which a row is labeled 1. Pass
            ``None`` to write the raw probabilities instead.
        output_path: Destination CSV path.

    Returns:
        The submission DataFrame that was written to *output_path*.
    """
    # Probability of the positive class (second column of predict_proba)
    positive_proba = model.predict_proba(test_features)[:, 1]

    if threshold is None:
        predictions = positive_proba
    else:
        predictions = (np.asarray(positive_proba) >= threshold).astype(int)

    # Create submission dataframe
    submission = submission_template.copy()
    submission['Response'] = predictions

    # Save submission
    submission.to_csv(output_path, index=False)
    print(f"Submission saved to {output_path}")

    return submission

# Status message only: create_submission is defined above but not called here
print("Submission function ready")

## Summary

This notebook provides a comprehensive framework for analyzing the Bosch Production Line Performance dataset:

1. **Data Understanding**: Explored the structure and characteristics of the dataset
2. **Memory Management**: Implemented techniques to handle large files efficiently
3. **Feature Engineering**: Created aggregate features to capture patterns
4. **Imbalance Handling**: Prepared strategies for the extreme class imbalance
5. **Pipeline Template**: Provided a template for the complete processing pipeline (categorical and date features are still to be added)

To proceed with full analysis:
1. Run the extraction to get all CSV files
2. Implement the chunk processing for the full dataset
3. Experiment with different feature engineering approaches
4. Train and validate models using the MCC metric
5. Optimize prediction threshold for best performance