In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.feature_selection import SelectFromModel
import matplotlib.pyplot as plt
import seaborn as sns

def train_risk_appetite_model(data_path):
    # Load dataset
    df = pd.read_csv(data_path)
    
    # 1. Data Exploration & Preprocessing
    # print(f"Dataset shape: {df.shape}")
    # print(df.info())
    # print(df.describe())
    
    # Check for missing values
    missing_values = df.isnull().sum()
    # print(f"Missing values:\n{missing_values}")
    
    # Handle missing values if any
    df = df.fillna(df.median(numeric_only=True))
    
    # 2. Feature Engineering
    df = df.drop(['client_id'], axis=1)
    # Identify categorical and numerical features
    categorical_features = df.select_dtypes(include=['object', 'category']).columns.tolist()
    numerical_features = df.select_dtypes(include=['int64', 'float64']).columns.tolist()
    
    print(f"Categorical features: {categorical_features}")
    # print(f"Numerical features: {numerical_features}")

    # # Extract target variable
    # X = df.drop([ 'risk_appetite_label'], axis=1)
    # y = df['risk_appetite_label']
    
    # # 3. Data Splitting with 70:15:15 ratio
    # # First split: 70% train, 30% temp
    # X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)
    
    # # Second split: Split the temp into validation and test (50% each, which is 15% of original)
    # X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp)
    
    # print(f"Train set: {X_train.shape[0]} samples ({100*X_train.shape[0]/X.shape[0]:.1f}%)")
    # print(f"Validation set: {X_val.shape[0]} samples ({100*X_val.shape[0]/X.shape[0]:.1f}%)")
    # print(f"Test set: {X_test.shape[0]} samples ({100*X_test.shape[0]/X.shape[0]:.1f}%)")

    # 4. Feature Preprocessing
    # Create preprocessing for numerical and categorical features
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), numerical_features),
            ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
        ])
    print(X_train)
    print(y_train)
    # 5. Model Selection & Training
    models = {
        'Random Forest': RandomForestClassifier(random_state=42),
        'Gradient Boosting': GradientBoostingClassifier(random_state=42),
        'XGBoost': XGBClassifier(random_state=42),
        'LightGBM': LGBMClassifier(random_state=42)
    }
    
    best_model = None
    best_accuracy = 0
    
    for name, model in models.items():
        print(f"\nTraining {name}...")
        # Create pipeline with preprocessing and model
        pipeline = Pipeline(steps=[
            ('preprocessor', preprocessor),
            ('model', model)
        ])

        # Extract target variable
        X = df.drop([ 'risk_appetite_label'], axis=1)
        y = df['risk_appetite_label']
        
        # 3. Data Splitting with 70:15:15 ratio
        # First split: 70% train, 30% temp
        X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)
        
        # Second split: Split the temp into validation and test (50% each, which is 15% of original)
        X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp)
        
        # Train model
        pipeline.fit(X_train, y_train)
        
        # Validate model
        y_val_pred = pipeline.predict(X_val)
        accuracy = accuracy_score(y_val, y_val_pred)
        print(f"{name} Validation Accuracy: {accuracy:.4f}")
        
        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_model = pipeline
            best_model_name = name
    
    print(f"\nBest model is {best_model_name} with validation accuracy: {best_accuracy:.4f}")
    
    # 6. Hyperparameter Tuning for the best model
    print("\nPerforming hyperparameter tuning...")
    
    if best_model_name == 'Random Forest':
        param_grid = {
            'model__n_estimators': [100, 200, 300],
            'model__max_depth': [None, 10, 20, 30],
            'model__min_samples_split': [2, 5, 10],
            'model__min_samples_leaf': [1, 2, 4]
        }
    elif best_model_name == 'Gradient Boosting':
        param_grid = {
            'model__n_estimators': [100, 200, 300],
            'model__learning_rate': [0.01, 0.05, 0.1],
            'model__max_depth': [3, 5, 7]
        }
    elif best_model_name == 'XGBoost':
        param_grid = {
            'model__n_estimators': [100, 200, 300],
            'model__learning_rate': [0.01, 0.05, 0.1],
            'model__max_depth': [3, 5, 7],
            'model__subsample': [0.8, 0.9, 1.0]
        }
    else:  # LightGBM
        param_grid = {
            'model__n_estimators': [100, 200, 300],
            'model__learning_rate': [0.01, 0.05, 0.1],
            'model__max_depth': [3, 5, 7],
            'model__num_leaves': [31, 63, 127]
        }
    
    # Grid search for hyperparameter tuning
    grid_search = GridSearchCV(best_model, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
    grid_search.fit(X_train, y_train)
    
    print(f"Best parameters: {grid_search.best_params_}")
    print(f"Best cross-validation accuracy: {grid_search.best_score_:.4f}")
    
    # 7. Feature Importance Analysis
    final_model = grid_search.best_estimator_
    
    # Evaluate on validation set again
    y_val_pred = final_model.predict(X_val)
    val_accuracy = accuracy_score(y_val, y_val_pred)
    print(f"\nFinal model validation accuracy: {val_accuracy:.4f}")
    
    # 8. Feature Selection based on importance
    # Get feature importances if model supports it
    if hasattr(final_model.named_steps['model'], 'feature_importances_'):
        # Get feature names after preprocessing
        feature_names = []
        for name, transformer, columns in preprocessor.transformers_:
            if hasattr(transformer, 'get_feature_names_out'):
                if name == 'cat':
                    feature_names.extend(transformer.get_feature_names_out(columns))
                else:
                    feature_names.extend(columns)
            else:
                feature_names.extend(columns)
        
        importances = final_model.named_steps['model'].feature_importances_
        if len(feature_names) == len(importances):
            feature_importance = pd.DataFrame({'feature': feature_names, 'importance': importances})
            feature_importance = feature_importance.sort_values('importance', ascending=False)
            
            print("\nTop 10 important features:")
            print(feature_importance.head(10))
            
            # Plot feature importances
            plt.figure(figsize=(12, 8))
            sns.barplot(x='importance', y='feature', data=feature_importance.head(15))
            plt.title('Feature Importance')
            plt.tight_layout()
            plt.savefig('feature_importance.png')
            
            # Use top features for a simpler model
            sfm = SelectFromModel(final_model.named_steps['model'], threshold='median')
            sfm.fit(preprocessor.transform(X_train), y_train)
            
            # Create a pipeline with selected features
            selected_features_model = Pipeline([
                ('preprocessor', preprocessor),
                ('feature_selection', sfm),
                ('model', final_model.named_steps['model'])
            ])
            
            selected_features_model.fit(X_train, y_train)
            y_val_pred_selected = selected_features_model.predict(X_val)
            selected_accuracy = accuracy_score(y_val, y_val_pred_selected)
            
            print(f"\nModel with selected features validation accuracy: {selected_accuracy:.4f}")
            
            if selected_accuracy >= val_accuracy:
                final_model = selected_features_model
                print("Using model with selected features as it performs better or equally well.")
    
    # 9. Final Evaluation on Test Set
    y_test_pred = final_model.predict(X_test)
    test_accuracy = accuracy_score(y_test, y_test_pred)
    print(f"\nFinal model test accuracy: {test_accuracy:.4f}")
    
    # Print classification report for detailed metrics
    print("\nClassification Report:")
    print(classification_report(y_test, y_test_pred))
    
    # Create confusion matrix
    cm = confusion_matrix(y_test, y_test_pred)
    plt.figure(figsize=(10, 8))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.title('Confusion Matrix')
    plt.savefig('confusion_matrix.png')
    
    # 10. Save the final model
    import joblib
    joblib.dump(final_model, 'risk_appetite_model.pkl')
    print("\nModel saved as 'risk_appetite_model.pkl'")
    
    return final_model, test_accuracy

# Usage example
if __name__ == "__main__":
    model, accuracy = train_risk_appetite_model('client_portfolio_data.csv')
    print(f"Model training complete with test accuracy: {accuracy:.4f}")

Categorical features: ['income_bracket', 'employment_status', 'education_level', 'risk_appetite_label', 'holdings']


UnboundLocalError: local variable 'X_train' referenced before assignment

In [None]:
train_risk_appetite_model(client_portfolio_data.csv)

In [2]:
pip install xgboost

Collecting xgboost
  Downloading xgboost-3.0.3-py3-none-manylinux2014_x86_64.whl.metadata (2.0 kB)
Downloading xgboost-3.0.3-py3-none-manylinux2014_x86_64.whl (4.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.9/4.9 MB[0m [31m16.8 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: xgboost
Successfully installed xgboost-3.0.3
Note: you may need to restart the kernel to use updated packages.


In [3]:
pip install lightgbm

Collecting lightgbm
  Downloading lightgbm-4.6.0.tar.gz (1.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m96.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Installing backend dependencies ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Building wheels for collected packages: lightgbm
  Building wheel for lightgbm (pyproject.toml) ... [?25ldone
[?25h  Created wheel for lightgbm: filename=lightgbm-4.6.0-py3-none-linux_x86_64.whl size=2737778 sha256=548676bd080a312841ff78db3a2ed24424ffb9ee5352f4cf691dd1376422bbcf
  Stored in directory: /home/ec2-user/.cache/pip/wheels/bb/db/6d/7814aed03437129dc284a055c084f201b765deb54b6908efab
Successfully built lightgbm
Installing collected packages: lightgbm
Successfully installed lightgbm-4.6.0
Note: you may need to restart the kernel to use updated packages.


In [1]:
pip install XGBClassifier


[31mERROR: Could not find a version that satisfies the requirement XGBClassifier (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for XGBClassifier[0m[31m
[0mNote: you may need to restart the kernel to use updated packages.


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt
import seaborn as sns

# Load the dataset
df = pd.read_csv('client_portfolio_data.csv')

# Examine basic information
print(f"Dataset shape: {df.shape}")
print(df['risk_appetite_label'].value_counts())

# Data preprocessing
# Identify categorical and numerical columns
categorical_cols = ['client_id', 'income_bracket', 'employment_status', 'education_level', 'holdings']
numerical_cols = [col for col in df.columns if col not in categorical_cols + ['risk_appetite_label']]

# Handle missing values
for col in numerical_cols:
    df[col].fillna(df[col].median(), inplace=True)
    
for col in categorical_cols:
    df[col].fillna(df[col].mode()[0], inplace=True)

# Drop client_id as it's an identifier
if 'client_id' in df.columns:
    categorical_cols.remove('client_id')
    df = df.drop('client_id', axis=1)

# Define preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
    ])

# Define features and target
X = df.drop('risk_appetite_label', axis=1)
y = df['risk_appetite_label']

# Split the data into training, validation and testing sets
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp)

print(f"Training set size: {X_train.shape[0]}")
print(f"Validation set size: {X_val.shape[0]}")
print(f"Test set size: {X_test.shape[0]}")

# Create a pipeline with preprocessing and model
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))
])

# Define hyperparameters for tuning
param_grid = {
    'classifier__n_estimators': [100, 200, 300],
    'classifier__max_depth': [None, 10, 20, 30],
    'classifier__min_samples_split': [2, 5, 10]
}

# GridSearch with cross-validation
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Best model
best_model = grid_search.best_estimator_

# Evaluation on validation set
y_val_pred = best_model.predict(X_val)
print("\nValidation Set Performance:")
print(f"Accuracy: {accuracy_score(y_val, y_val_pred):.4f}")
print("\nClassification Report:")
print(classification_report(y_val, y_val_pred))

# Confusion Matrix for validation set
plt.figure(figsize=(10, 8))
conf_matrix = confusion_matrix(y_val, y_val_pred)
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', 
            xticklabels=best_model.classes_, 
            yticklabels=best_model.classes_)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix (Validation Set)')
plt.savefig('confusion_matrix_validation.png')
plt.close()

# Evaluation on test set
y_test_pred = best_model.predict(X_test)
print("\nTest Set Performance:")
print(f"Accuracy: {accuracy_score(y_test, y_test_pred):.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_test_pred))

# Confusion Matrix for test set
plt.figure(figsize=(10, 8))
conf_matrix = confusion_matrix(y_test, y_test_pred)
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', 
            xticklabels=best_model.classes_, 
            yticklabels=best_model.classes_)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix (Test Set)')
plt.savefig('confusion_matrix_test.png')
plt.close()

# Feature importance (for the Random Forest component)
if hasattr(best_model.named_steps['classifier'], 'feature_importances_'):
    # Get feature names after preprocessing
    feature_names = []
    for name, trans, cols in preprocessor.transformers_:
        if name == 'cat':
            # Get one-hot encoded feature names for categorical variables
            for i, col in enumerate(cols):
                categories = trans.categories_[i]
                for cat in categories:
                    feature_names.append(f"{col}_{cat}")
        else:
            # Add numerical feature names as is
            feature_names.extend(cols)
    
    # Extract feature importances
    importances = best_model.named_steps['classifier'].feature_importances_
    
    # Sort feature importances in descending order
    indices = np.argsort(importances)[::-1]
    
    # Plot feature importances
    plt.figure(figsize=(12, 8))
    plt.title('Feature Importances for Risk Appetite Prediction')
    plt.bar(range(len(indices)), importances[indices], align='center')
    plt.xticks(range(len(indices)), [feature_names[i] for i in indices], rotation=90)
    plt.tight_layout()
    plt.savefig('feature_importances.png')
    plt.close()
    
    # Print top 15 features
    print("\nTop 15 important features:")
    for i in indices[:15]:
        print(f"{feature_names[i]}: {importances[i]:.4f}")

# Save the model
import joblib
joblib.dump(best_model, 'risk_appetite_model.pkl')
print("\nModel saved as 'risk_appetite_model.pkl'")

# Function for predicting risk appetite for new clients
def predict_risk_appetite(client_data):
    """
    Predict risk appetite for a new client
    
    Parameters:
    client_data (dict): Dictionary containing client features
    
    Returns:
    str: Predicted risk appetite label
    """
    # Convert dictionary to DataFrame
    client_df = pd.DataFrame([client_data])
    
    # Make prediction
    prediction = best_model.predict(client_df)[0]
    
    # Get probability scores
    probabilities = best_model.predict_proba(client_df)[0]
    prob_dict = {best_model.classes_[i]: prob for i, prob in enumerate(probabilities)}
    
    return prediction, prob_dict

print("\nExample of using the prediction function:")
print("predict_risk_appetite({'age': 35, 'income_bracket': 'High', ...})")