In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt
import seaborn as sns

# Load the dataset
df = pd.read_csv('client_portfolio_data.csv')

# Examine basic information
print(f"Dataset shape: {df.shape}")
print(df['risk_appetite_label'].value_counts())

# Data preprocessing
# Identify categorical and numerical columns
categorical_cols = ['client_id', 'income_bracket', 'employment_status', 'education_level', 'holdings']
numerical_cols = [col for col in df.columns if col not in categorical_cols + ['risk_appetite_label']]

# Handle missing values
for col in numerical_cols:
    df[col].fillna(df[col].median(), inplace=True)
    
for col in categorical_cols:
    df[col].fillna(df[col].mode()[0], inplace=True)

# Drop client_id as it's an identifier
if 'client_id' in df.columns:
    categorical_cols.remove('client_id')
    df = df.drop('client_id', axis=1)

# Define preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
    ])

# Define features and target
X = df.drop('risk_appetite_label', axis=1)
y = df['risk_appetite_label']

# Split the data into training, validation and testing sets
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp)

print(f"Training set size: {X_train.shape[0]}")
print(f"Validation set size: {X_val.shape[0]}")
print(f"Test set size: {X_test.shape[0]}")

# Create a pipeline with preprocessing and model
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))
])

# Define hyperparameters for tuning
param_grid = {
    'classifier__n_estimators': [100, 200, 300],
    'classifier__max_depth': [None, 10, 20, 30],
    'classifier__min_samples_split': [2, 5, 10]
}

# GridSearch with cross-validation
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Best model
best_model = grid_search.best_estimator_

# Evaluation on validation set
y_val_pred = best_model.predict(X_val)
print("\nValidation Set Performance:")
print(f"Accuracy: {accuracy_score(y_val, y_val_pred):.4f}")
print("\nClassification Report:")
print(classification_report(y_val, y_val_pred))

# Confusion Matrix for validation set
plt.figure(figsize=(10, 8))
conf_matrix = confusion_matrix(y_val, y_val_pred)
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', 
            xticklabels=best_model.classes_, 
            yticklabels=best_model.classes_)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix (Validation Set)')
plt.savefig('confusion_matrix_validation.png')
plt.close()

# Evaluation on test set
y_test_pred = best_model.predict(X_test)
print("\nTest Set Performance:")
print(f"Accuracy: {accuracy_score(y_test, y_test_pred):.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_test_pred))

  from pandas.core.computation.check import NUMEXPR_INSTALLED
Matplotlib is building the font cache; this may take a moment.


Dataset shape: (2000, 29)
risk_appetite_label
Moderate                   622
Moderately Conservative    521
Conservative               385
Moderately Aggressive      298
Aggressive                 174
Name: count, dtype: int64
Training set size: 1400
Validation set size: 300
Test set size: 300


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mode()[0], inplace=True)



Validation Set Performance:
Accuracy: 0.9567

Classification Report:
                         precision    recall  f1-score   support

             Aggressive       1.00      0.81      0.89        26
           Conservative       0.98      1.00      0.99        58
               Moderate       0.94      0.99      0.96        93
  Moderately Aggressive       0.87      0.87      0.87        45
Moderately Conservative       1.00      0.99      0.99        78

               accuracy                           0.96       300
              macro avg       0.96      0.93      0.94       300
           weighted avg       0.96      0.96      0.96       300


Test Set Performance:
Accuracy: 0.9300

Classification Report:
                         precision    recall  f1-score   support

             Aggressive       1.00      0.81      0.89        26
           Conservative       0.98      0.97      0.97        58
               Moderate       0.89      0.99      0.93        94
  Moderately Aggr

In [2]:
# Confusion Matrix for test set
plt.figure(figsize=(10, 8))
conf_matrix = confusion_matrix(y_test, y_test_pred)
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', 
            xticklabels=best_model.classes_, 
            yticklabels=best_model.classes_)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix (Test Set)')
plt.savefig('confusion_matrix_test.png')
plt.close()

# Feature importance (for the Random Forest component)
if hasattr(best_model.named_steps['classifier'], 'feature_importances_'):
    # Get feature names after preprocessing
    feature_names = []
    for name, trans, cols in preprocessor.transformers:
        if name == 'cat':
            # Get one-hot encoded feature names for categorical variables
            for i, col in enumerate(cols):
                categories = trans.categories[i]
                for cat in categories:
                    feature_names.append(f"{col}_{cat}")
        else:
            # Add numerical feature names as is
            feature_names.extend(cols)
    
    # Extract feature importances
    importances = best_model.named_steps['classifier'].feature_importances_
    
    # Sort feature importances in descending order
    indices = np.argsort(importances)[::-1]
    
    # Plot feature importances
    plt.figure(figsize=(12, 8))
    plt.title('Feature Importances for Risk Appetite Prediction')
    plt.bar(range(len(indices)), importances[indices], align='center')
    # plt.xticks(range(len(indices)), [feature_names[i] for i in indices], rotation=90)
    plt.tight_layout()
    plt.savefig('feature_importances.png')
    plt.close()
    
    # Print top 15 features
    print("\nTop 15 important features:")
    for i in indices[:15]:
        print(f"{feature_names[i]}: {importances[i]:.4f}")

# Save the model
import joblib
joblib.dump(best_model, 'random_forest_risk_appetite_model.pkl')
print("\nModel saved as 'random_forest_risk_appetite_model.pkl'")


Top 15 important features:
equities_percentage: 0.1204
bonds_percentage: 0.1159
international_exposure_percentage: 0.0868
max_drawdown: 0.0781
portfolio_volatility: 0.0771
sharpe_ratio: 0.0697
cash_percentage: 0.0396
international_ratio: 0.0294
real_estate_percentage: 0.0260
portfolio_concentration: 0.0229
alternatives_percentage: 0.0206
avg_annual_return: 0.0202
total_portfolio_value: 0.0155
risk_capacity_score: 0.0151
net_worth: 0.0149

Model saved as 'random_forest_risk_appetite_model.pkl'


In [13]:
# Function for predicting risk appetite for new clients
#def predict_risk_appetite(client_data):
   # """
    #Predict risk appetite for a new client
    
    #Parameters:
    #client_data (dict): Dictionary containing client features
    
    #Returns:
    #str: Predicted risk appetite label
    #"""
    # Convert dictionary to DataFrame
  #  client_df = client_data
    
    # Make prediction
  #  prediction = best_model.predict(client_df)[0]
    
    # Get probability scores
  #  probabilities = best_model.predict_proba(client_df)[0]
  #  prob_dict = {best_model.classes_[i]: prob for i, prob in enumerate(probabilities)}
    
  #  return prediction, prob_dict



In [2]:
print("\nExample of using the prediction function:")
import pandas as pd
import numpy as np
import pickle
import joblib
with open('random_forest_risk_appetite_model.pkl', 'rb') as file:
    loaded_model =  joblib.load(file)
df = pd.read_csv('client_portfolio_data.csv')
last = df.drop('risk_appetite_label', axis=1).tail(1)
predictions = loaded_model.predict(last)
print("Prediction")
print(predictions)


Example of using the prediction function:
Prediction
['Conservative']


In [4]:
df.tail(1)

Unnamed: 0,client_id,age,income_bracket,net_worth,investment_horizon,investment_experience_years,financial_dependents,employment_status,education_level,risk_appetite_label,...,alternatives_percentage,real_estate_percentage,commodities_percentage,international_exposure_percentage,holdings,portfolio_concentration,num_holdings,avg_annual_return,international_ratio,risk_capacity_score
1999,CL101999,26,\$30K-\$60K,21000.0,35,8,0,Unemployed,Professional Degree,Conservative,...,0.0,0.0315,0.0481,0.0,"[{'category': 'Equities', 'type': 'Mutual Fund...",0.107314,17,0.07835,0.0,87.412587


{'client_id': 'CL101999',
 'age': 26,
 'income_bracket': '\\$30K-\\$60K',
 'net_worth': 21000.0,
 'investment_horizon': 35,
 'investment_experience_years': 8,
 'financial_dependents': 0,
 'employment_status': 'Unemployed',
 'education_level': 'Professional Degree',
 'risk_appetite_label': 'Conservative',
 'total_portfolio_value': 10518.6999043021,
 'portfolio_volatility': 0.0705,
 'sharpe_ratio': 0.77,
 'max_drawdown': -0.06,
 'years_to_retirement': 39,
 'liquidity_needs_score': 10,
 'equities_percentage': 0.3115,
 'bonds_percentage': 0.5357,
 'cash_percentage': 0.0732,
 'alternatives_percentage': 0.0,
 'real_estate_percentage': 0.0315,
 'commodities_percentage': 0.0481,
 'international_exposure_percentage': 0.0,
 'holdings': "[{'category': 'Equities', 'type': 'Mutual Fund', 'name': 'SWPPX', 'sector': 'Broad Market', 'value': 815.43, 'purchase_date': '2021-04-06', 'annual_return': 0.1224, 'is_international': False}, {'category': 'Equities', 'type': 'Mutual Fund', 'name': 'FXAIX', 'sect

AttributeError: 'str' object has no attribute 'dtype'