# Machine Learning Model: Logistic Regression for Classification
    
Generated on 2025-03-11 09:19:26

This notebook demonstrates an end-to-end machine learning workflow for classification using a logistic regression model.

- **Dataset:** ba6aa228-bd5e-457f-9b99-bee9d2a908a5
- **Target Variable:** Survived
- **Task Type:** Classification


In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Configure visualizations
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette('viridis')
%matplotlib inline

# Display settings
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)


## Data Loading and Overview

In [None]:
# Read the dataset into a DataFrame.
# NOTE: 'data.csv' is a placeholder path; the generator's own comment says it
# is meant to be replaced with the location of the actual uploaded data.
df = pd.read_csv('data.csv')

# Overall dimensions and a peek at the leading rows
print(f"Dataset shape: {df.shape}")
print("\nFirst few rows:")
display(df.head())

# Descriptive statistics for every column, transposed so each column is a row
print("\nSummary statistics:")
summary_stats = df.describe(include='all').T
display(summary_stats)

# Number of null entries in each column
print("\nMissing values per column:")
missing_per_column = df.isnull().sum()
display(missing_per_column)


## Exploratory Data Analysis

In [None]:
# --- Target distribution: one bar per class of 'Survived' ---
target_counts = df['Survived'].value_counts()
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=target_counts.index, y=target_counts.values, ax=ax)
ax.set_title('Distribution of Target Classes')
ax.set_xlabel('Survived')
ax.set_ylabel('Count')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# --- Pairwise correlations between numeric columns ---
numeric_df = df.select_dtypes(include=['number'])
if numeric_df.shape[1] > 1:  # a correlation matrix needs at least two columns
    corr = numeric_df.corr()
    # Hide the upper triangle (diagonal included): it mirrors the lower one
    mask = np.triu(np.ones_like(corr, dtype=bool))
    fig, ax = plt.subplots(figsize=(12, 10))
    sns.heatmap(corr, mask=mask, cmap='coolwarm', annot=True, fmt='.2f', square=True, ax=ax)
    ax.set_title('Correlation Matrix')
    plt.tight_layout()
    plt.show()

# --- Box plots of numeric features split by class ---
# Only the first three numeric columns (in frame order, target excluded) are drawn.
numerical_features = [
    column
    for column in df.select_dtypes(include=['number']).columns.tolist()
    if column != 'Survived'
][:3]

if numerical_features:
    n_panels = len(numerical_features)
    # squeeze=False always yields a 2-D axes array, so a single panel needs no special case
    fig, axes = plt.subplots(n_panels, 1, figsize=(12, 4 * n_panels), squeeze=False)

    for panel, feature in zip(axes[:, 0], numerical_features):
        sns.boxplot(x='Survived', y=feature, data=df, ax=panel)
        panel.set_title(f'{feature} by Survived')

    plt.tight_layout()
    plt.show()


In [None]:
# Import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import *
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer


## Data Preprocessing and Split

In [None]:
# Split data into features and target
X = df.drop(columns=['Survived'])
y = df['Survived']

# Identify numerical and categorical features.
# These lists are handed to the ColumnTransformer below, so every name in them
# must still be a column of X when the pipeline is fitted.
numerical_features = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_features = X.select_dtypes(include=['object', 'category']).columns.tolist()

print(f"Numerical features: {numerical_features}")
print(f"Categorical features: {categorical_features}")

# Categorical columns that hold dicts/lists cannot be one-hot encoded as-is.
# Convert such columns to their string representation, checking every value
# (not just the first row) so the check also works on an empty frame.
for col in categorical_features:
    is_complex = np.array([isinstance(value, (dict, list)) for value in X[col]], dtype=bool)
    if is_complex.any():
        sample = X.loc[is_complex, col].iloc[0]
        print(f"Warning: Column {col} contains complex objects which may not work with standard transformations.")
        print(f"Sample value: {sample}")
        print("Converting to string representation first")
        # Stringify present values only; missing entries stay NaN for the imputer
        X[col] = X[col].astype(str).where(X[col].notna())

# NOTE: categorical columns are deliberately NOT expanded with pd.get_dummies
# here. Doing so removes the original columns from X while
# `categorical_features` still names them, which makes the ColumnTransformer
# fail at fit time. The OneHotEncoder inside the pipeline performs the encoding
# instead, fitted on the training split only.

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print(f"Training set shape: {X_train.shape}")
print(f"Testing set shape: {X_test.shape}")

# Numerical columns: fill gaps with the median, then standardise
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode; categories unseen during fit are ignored at transform time
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine preprocessing pipelines
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ]
)


## Model Training

In [None]:
# Logistic regression classifier with a fixed seed and an iteration cap of 1000
model = LogisticRegression(max_iter=1000, random_state=42)

# Chain the preprocessing stage and the classifier into a single estimator
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', model),
])

# Fit preprocessing and classifier together on the training split
pipeline.fit(X_train, y_train)
print("Model training complete!")


## Model Evaluation

In [None]:
# Make predictions on the test set
y_pred = pipeline.predict(X_test)

# Evaluate classification metrics
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score, roc_curve

# Every label that occurs in the ground truth or in the predictions. Passing
# this explicitly to confusion_matrix and to the heatmap keeps the tick labels
# aligned with the matrix rows/columns even when the model predicts a class
# that is absent from y_test.
class_labels = np.unique(np.concatenate([np.asarray(y_test), np.asarray(y_pred)]))

# Calculate metrics
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred, labels=class_labels)

print(f"Accuracy: {accuracy:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

print("\nConfusion Matrix:")
print(conf_matrix)

# Plot confusion matrix
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_labels,
            yticklabels=class_labels)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.tight_layout()
plt.show()

# ROC curve and AUC (binary classification only).
# Column 1 of predict_proba corresponds to pipeline.classes_[1], so that class
# is passed to roc_curve as the positive label; this also makes the curve work
# for label encodings other than {0, 1} / {-1, 1}.
if len(pipeline.classes_) == 2 and len(np.unique(y_test)) == 2:
    try:
        positive_class = pipeline.classes_[1]
        y_prob = pipeline.predict_proba(X_test)[:, 1]
        fpr, tpr, _ = roc_curve(y_test, y_prob, pos_label=positive_class)
        # Named roc_auc (not `auc`) to avoid shadowing sklearn.metrics.auc,
        # which the notebook's star import brings into scope.
        roc_auc = roc_auc_score(y_test, y_prob)

        plt.figure(figsize=(8, 6))
        plt.plot(fpr, tpr, label=f'AUC = {roc_auc:.3f}')
        plt.plot([0, 1], [0, 1], 'k--')
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('ROC Curve')
        plt.legend(loc='lower right')
        plt.tight_layout()
        plt.show()

        print(f"\nAUC: {roc_auc:.4f}")
    except Exception as e:
        # Best-effort section: keep the notebook running, but report the cause
        print(f"Could not calculate ROC curve and AUC: {e}")


## Feature Importance

In [None]:
# Rank the model's input features by importance.
# Tree-style estimators expose `feature_importances_`; linear models such as
# the LogisticRegression trained above expose `coef_` instead, so coefficient
# magnitudes are used as the importance score in that case.
try:
    fitted_model = pipeline['model']

    if hasattr(fitted_model, 'feature_importances_'):
        importances = np.asarray(fitted_model.feature_importances_)
    elif hasattr(fitted_model, 'coef_'):
        # coef_ has one row per class (a single row for binary problems);
        # average the absolute values across rows to get one score per feature.
        importances = np.abs(np.atleast_2d(fitted_model.coef_)).mean(axis=0)
    else:
        importances = None

    if importances is None:
        print("Feature importances not available for this model.")
    else:
        # Ask the fitted preprocessor for the names of the columns it emits, so
        # the names line up with the model inputs (one-hot columns included and
        # dropped columns excluded).
        feature_names = list(pipeline['preprocessor'].get_feature_names_out())

        if len(feature_names) == len(importances):
            # Create a dataframe with feature importances
            feature_importance_df = pd.DataFrame({
                'Feature': feature_names,
                'Importance': importances
            }).sort_values(by='Importance', ascending=False)

            # Plot the 15 highest-ranked features
            plt.figure(figsize=(10, 8))
            sns.barplot(x='Importance', y='Feature', data=feature_importance_df.head(15))
            plt.title('Feature Importance')
            plt.tight_layout()
            plt.show()

            print(feature_importance_df.head(15))
        else:
            # Report the mismatch instead of silently skipping the plot
            print(
                f"Could not compute feature importances: got {len(feature_names)} "
                f"feature names for {len(importances)} importance values"
            )
except Exception as e:
    print(f"Could not compute feature importances: {e}")


## Summary and Next Steps

In [None]:
# Recap of the run: split sizes, task/model identifiers and feature count
n_train, n_test = len(X_train), len(X_test)

print("Model Training Summary:")
print(f"Dataset: {n_train + n_test} samples ({n_train} train, {n_test} test)")
print("Task Type: TaskType.CLASSIFICATION")
print("Model Type: ModelType.LOGISTIC_REGRESSION")
print(f"Features Used: {len(X.columns)}")

# Suggested follow-up work, printed one item per line
next_steps = [
    "1. Try different feature transformations to improve performance",
    "2. Experiment with hyperparameter tuning to find optimal model settings",
    "3. Consider feature selection to focus on the most important variables",
    "4. For deployment, save the model using joblib or pickle",
    "5. Monitor model performance over time and retrain as needed",
]
print("\nNext Steps:")
for step in next_steps:
    print(step)


## SHAP Values for Model Explainability

In [None]:
# Try to compute SHAP values for model explainability.
# This whole cell is best-effort: any failure is reported and the notebook continues.
try:
    import shap

    # Sample a subset of the test data for SHAP analysis (for performance)
    n_samples = min(100, X_test.shape[0])
    X_sample = X_test.iloc[:n_samples]

    # Run the preprocessing once and reuse the result for the explainer, the
    # SHAP values and the plot.
    X_transformed = pipeline['preprocessor'].transform(X_sample)
    # The one-hot encoding step can make the transformed output a scipy sparse
    # matrix; densify the (at most 100-row) sample so SHAP receives a plain array.
    if hasattr(X_transformed, 'toarray'):
        X_transformed = X_transformed.toarray()

    # Column names of the transformed data, so the plot shows real feature
    # names; fall back to SHAP's generic labels if they cannot be derived.
    try:
        feature_names = list(pipeline['preprocessor'].get_feature_names_out())
    except Exception:
        feature_names = None

    # Create a SHAP explainer
    try:
        # For sklearn models
        explainer = shap.Explainer(pipeline['model'], X_transformed)
        shap_values = explainer(X_transformed)

        # Summary plot. show=False defers rendering so tight_layout() and the
        # final plt.show() below act on this figure.
        plt.figure(figsize=(10, 8))
        shap.summary_plot(shap_values, X_transformed, feature_names=feature_names, show=False)
        plt.tight_layout()
    except Exception as shap_error:
        print(f"First SHAP approach failed: {shap_error}")

        # Try alternative approach for tree-based models
        if hasattr(pipeline['model'], 'feature_importances_'):
            explainer = shap.TreeExplainer(pipeline['model'])
            shap_values = explainer.shap_values(X_transformed)

            plt.figure(figsize=(10, 8))
            if isinstance(shap_values, list):  # For multi-class classification
                shap.summary_plot(shap_values[0], X_transformed, feature_names=feature_names, show=False)
            else:  # For regression or binary classification
                shap.summary_plot(shap_values, X_transformed, feature_names=feature_names, show=False)
            plt.tight_layout()
        else:
            print("Model type not supported for detailed SHAP analysis")

    plt.show()
except Exception as e:
    print(f"Could not compute SHAP values: {e}")
    print("Note: SHAP analysis requires additional setup in some environments.")
    print("If you want to use SHAP, try installing it separately with: pip install shap")


## Model Export

In [None]:
# Persist the fitted pipeline (preprocessing + classifier) to disk with joblib
import joblib

try:
    joblib.dump(pipeline, 'trained_model_pipeline.joblib')
except Exception as e:
    print(f"Error saving model: {e}")
else:
    print("Model pipeline saved successfully!")

    # Show the reader how to reload the saved pipeline elsewhere
    usage_lines = (
        "\nTo load and use this model in another script:",
        "import joblib",
        "loaded_pipeline = joblib.load('trained_model_pipeline.joblib')",
        "predictions = loaded_pipeline.predict(new_data)",
    )
    for line in usage_lines:
        print(line)


## Conclusion

This notebook demonstrated how to:

1. Load and explore the dataset
2. Preprocess the data for machine learning
3. Train a logistic regression model for classification
4. Evaluate the model's performance 
5. Understand feature importance and model explainability
6. Export the model for deployment

The model can be improved by:

* Feature engineering and selection
* Hyperparameter tuning
* Trying different algorithms
* Collecting more data
* Addressing class imbalance (if present)
