In [2]:
import tensorflow as tf
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import TensorBoard, EarlyStopping, ModelCheckpoint
from tensorflow.keras.losses import MeanSquaredError
import joblib

In [3]:
# Load the investor dataset from a CSV file in the working directory and
# preview the first five rows.
data = pd.read_csv("investor_risk_return_data.csv")
data.head()

Unnamed: 0,age,monthly_income,investment_knowledge,investment_experience,num_assets,time_horizon,goal_type,investment_frequency,dependents,market_sensitivity,net_worth,preferred_asset,education_level,employment_type,risk_profile,expected_return
0,56,10930,7,16,2,10,Retirement,Monthly,0,5,749613,Real Estate,High School,Salaried,Speculative,2.68
1,69,4285,4,16,4,10,Retirement,Quarterly,4,2,1609455,Crypto,High School,Retired,Speculative,2.0
2,46,4095,5,0,4,21,Education,Monthly,1,2,762546,Mixed,Masters,Unemployed,Speculative,3.7
3,32,17704,9,0,2,18,Wealth Accumulation,Yearly,3,5,137805,Crypto,Masters,Salaried,Moderate,2.0
4,60,19705,7,8,9,21,Wealth Accumulation,Yearly,3,9,1879055,Bonds,Bachelors,Retired,Speculative,3.82


In [4]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
data.describe()

Unnamed: 0,age,monthly_income,investment_knowledge,investment_experience,num_assets,time_horizon,dependents,market_sensitivity,net_worth,expected_return
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,43.5394,10569.4314,5.4756,9.5819,5.0265,15.0235,2.002,4.9786,1003934.0,3.366692
std,14.911636,5510.965004,2.85724,5.713734,2.571198,8.392091,1.413717,3.161794,570364.0,0.958894
min,18.0,1002.0,1.0,0.0,1.0,1.0,0.0,0.0,5055.0,2.0
25%,31.0,5782.0,3.0,5.0,3.0,8.0,1.0,2.0,523482.0,2.61
50%,43.0,10586.5,5.0,10.0,5.0,15.0,2.0,5.0,1002044.0,3.31
75%,56.0,15431.25,8.0,15.0,7.0,22.0,3.0,8.0,1488106.0,4.03
max,69.0,19998.0,10.0,19.0,9.0,29.0,4.0,10.0,1999981.0,6.91


In [5]:
# Count missing values per column.
data.isnull().sum()

age                      0
monthly_income           0
investment_knowledge     0
investment_experience    0
num_assets               0
time_horizon             0
goal_type                0
investment_frequency     0
dependents               0
market_sensitivity       0
net_worth                0
preferred_asset          0
education_level          0
employment_type          0
risk_profile             0
expected_return          0
dtype: int64

In [6]:
# scikit-learn LabelEncoder: maps class labels to integer codes.
le = LabelEncoder()

In [7]:
# Encode every categorical (string) column as integer codes.
#
# Each column gets its OWN fitted LabelEncoder, stored in `encoders`, so the
# label <-> code mapping of every column stays available afterwards, e.g.
#   encoders['goal_type'].transform(['Retirement'])      # raw label -> code
#   encoders['risk_profile'].inverse_transform([3])      # code -> raw label
# Re-fitting one shared encoder would keep only the mapping of the last column.
#
# NOTE(review): label encoding imposes an arbitrary order on these categories;
# consider one-hot encoding if the columns are nominal.
categorical_cols = [
    'goal_type',
    'investment_frequency',
    'preferred_asset',
    'education_level',
    'employment_type',
    'risk_profile',
]

encoders = {}
for col in categorical_cols:
    encoders[col] = LabelEncoder()
    data[col] = encoders[col].fit_transform(data[col])

# Backward compatibility: `le` ends up fitted on 'risk_profile', the last
# column encoded, exactly as it did when a single encoder was reused.
le = encoders['risk_profile']

In [8]:
# Preview the frame again after the categorical columns were replaced by integer codes.
data.head()

Unnamed: 0,age,monthly_income,investment_knowledge,investment_experience,num_assets,time_horizon,goal_type,investment_frequency,dependents,market_sensitivity,net_worth,preferred_asset,education_level,employment_type,risk_profile,expected_return
0,56,10930,7,16,2,10,2,0,0,5,749613,3,1,2,3,2.68
1,69,4285,4,16,4,10,2,1,4,2,1609455,1,1,1,3,2.0
2,46,4095,5,0,4,21,0,0,1,2,762546,2,2,3,3,3.7
3,32,17704,9,0,2,18,3,2,3,5,137805,1,2,2,2,2.0
4,60,19705,7,8,9,21,3,2,3,9,1879055,0,0,1,3,3.82


In [9]:
# Feature matrix: every column except the regression target.
X = data.drop('expected_return', axis=1)
X

Unnamed: 0,age,monthly_income,investment_knowledge,investment_experience,num_assets,time_horizon,goal_type,investment_frequency,dependents,market_sensitivity,net_worth,preferred_asset,education_level,employment_type,risk_profile
0,56,10930,7,16,2,10,2,0,0,5,749613,3,1,2,3
1,69,4285,4,16,4,10,2,1,4,2,1609455,1,1,1,3
2,46,4095,5,0,4,21,0,0,1,2,762546,2,2,3,3
3,32,17704,9,0,2,18,3,2,3,5,137805,1,2,2,2
4,60,19705,7,8,9,21,3,2,3,9,1879055,0,0,1,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,55,9456,5,19,6,20,2,2,4,7,1846674,1,2,1,3
9996,51,7331,8,10,6,29,3,2,2,6,758730,0,2,0,3
9997,57,14107,9,10,8,24,0,1,1,4,1645259,3,1,2,3
9998,64,10494,9,7,7,23,3,0,0,9,720632,0,3,3,3


In [10]:
# Regression target: the expected_return column as a Series.
y = data.loc[:, 'expected_return']
y

0       2.68
1       2.00
2       3.70
3       2.00
4       3.82
        ... 
9995    3.42
9996    4.15
9997    2.85
9998    5.80
9999    2.75
Name: expected_return, Length: 10000, dtype: float64

In [11]:
# Frequency of each distinct target value.
y.value_counts()

expected_return
2.00    1096
2.97      52
3.49      49
3.74      48
3.53      47
        ... 
6.34       1
6.28       1
6.32       1
6.58       1
5.99       1
Name: count, Length: 429, dtype: int64

In [12]:
# Hold out 20% of the rows for testing; random_state fixes the shuffle so the
# split is the same on every run.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [13]:
# (rows, features) of the training split.
x_train.shape

(8000, 15)

In [14]:
# Standardise the features. The scaler is fitted on the training split only
# and then applied to both splits, so no test-set statistics reach training.
scaler = StandardScaler().fit(x_train)
x_train_scaled = scaler.transform(x_train)
x_test_scaled = scaler.transform(x_test)

In [15]:
# Sanity check: shapes of the scaled training and test arrays.
x_train_scaled.shape,x_test_scaled.shape

((8000, 15), (2000, 15))

In [16]:
# Seed the Python, NumPy and TensorFlow RNGs so weight initialisation, dropout
# masks and batch shuffling are repeatable under "Restart Kernel -> Run All".
tf.keras.utils.set_random_seed(42)

n_features = x_train_scaled.shape[1]

# Small fully connected regressor: 64 -> dropout -> 32 -> 1 (linear output).
# The input shape is declared with an explicit Input object instead of
# `input_shape=` on the first Dense layer, which Keras 3 warns about.
# (Sequential, Dense, Dropout and MeanSquaredError come from the import cell
# at the top of the notebook; no re-import is needed here.)
model = Sequential([
    tf.keras.Input(shape=(n_features,)),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(32, activation='relu'),
    Dense(1),
])

# Mean squared error is the training loss; mean absolute error is reported as
# an additional metric.
model.compile(
    optimizer='adam',
    loss=MeanSquaredError(),
    metrics=['mae'],
)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [17]:
# Print the layer-by-layer architecture and parameter counts.
model.summary()

In [18]:
# Write TensorBoard event files for the training run into ./tensorflowlogs.
log_dir = "tensorflowlogs"
tensorboard_cb = TensorBoard(log_dir=log_dir)

In [19]:
# Early-stopping callback: watches val_loss with a patience of 5 epochs and
# logs a message when it triggers.
early_stop_cb = EarlyStopping(
    monitor='val_loss',
    patience=5,
    verbose=1,
)

In [20]:
# Keep the weights of the epoch with the lowest validation loss.
#
# save_freq must be 'epoch' here: an integer save_freq means "every N batches",
# and val_loss is only computed at the end of an epoch, so a batch-level check
# never has the monitored metric and save_best_only cannot pick a best model.
checkpoint_cb = ModelCheckpoint(
    filepath='best_model.h5',
    monitor='val_loss',
    save_best_only=True,
    save_freq='epoch',
)

In [21]:
# Train for 100 epochs with 10% of the training data held out for validation.
# Only the TensorBoard and checkpoint callbacks are attached; early_stop_cb is
# defined above but is not passed, so training is not cut short.
hist = model.fit(
    x=x_train_scaled,
    y=y_train,
    validation_split=0.1,
    epochs=100,
    callbacks=[tensorboard_cb, checkpoint_cb],
)

Epoch 1/100
[1m 53/225[0m [32m━━━━[0m[37m━━━━━━━━━━━━━━━━[0m [1m0s[0m 5ms/step - loss: 9.0109 - mae: 2.6896

  self._save_model(epoch=self._current_epoch, batch=batch, logs=logs)


[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 8ms/step - loss: 4.8225 - mae: 1.7674 - val_loss: 1.0694 - val_mae: 0.8331
Epoch 2/100
[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - loss: 1.4821 - mae: 0.9742 - val_loss: 0.9969 - val_mae: 0.8026
Epoch 3/100
[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - loss: 1.2095 - mae: 0.8805 - val_loss: 1.0132 - val_mae: 0.8046
Epoch 4/100
[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - loss: 1.0879 - mae: 0.8396 - val_loss: 0.9680 - val_mae: 0.7885
Epoch 5/100
[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - loss: 1.0056 - mae: 0.8056 - val_loss: 0.9452 - val_mae: 0.7808
Epoch 6/100
[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - loss: 0.9701 - mae: 0.8009 - val_loss: 0.9211 - val_mae: 0.7734
Epoch 7/100
[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - loss:

In [22]:
# Persist the trained model and the fitted scaler for later inference.
# NOTE(review): 'best_model.h5' is also the filepath given to the
# ModelCheckpoint callback, so this call replaces whatever the callback wrote
# there with the model's current (end-of-training) state. Confirm that is
# intended, or save the final model under a different name.
model.save('best_model.h5')
joblib.dump(scaler, 'scaler.joblib')



['scaler.joblib']

In [23]:
# Inference on a single example, using the artefacts persisted to disk.
# (load_model, joblib, numpy and pandas are already imported in the first cell.)

# Reload both saved artefacts so this cell exercises exactly what a separate
# inference process would see, not objects that happen to be in kernel memory.
# joblib.load is pickle-based: only load files you created or trust.
scaler = joblib.load('scaler.joblib')
model = load_model('best_model.h5')

# Example user input: one row whose values follow the training column order.
# Categorical features are given as their LabelEncoder integer codes.
# NOTE(review): the last value (risk_profile=4) is larger than any risk_profile
# code visible in the encoded previews (max 3) - confirm it is a valid code.
user_input = np.array([[25, 5000, 8, 6, 5, 10, 0, 2, 0, 5, 100000, 1, 2, 1,4]])

# The scaler was fitted on a DataFrame, so give it the same column names;
# this keeps the feature order explicit and avoids scikit-learn's
# "X does not have valid feature names" warning.
user_frame = pd.DataFrame(user_input, columns=scaler.feature_names_in_)
user_input_scaled = scaler.transform(user_frame)

# Predict the expected return; predict() returns an array of shape (1, 1) here.
predicted_return = model.predict(user_input_scaled)
print(f"Predicted expected return: {predicted_return[0][0]}")



[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 201ms/step
Predicted expected return: 4.360814571380615
