In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
import sklearn
import matplotlib.pyplot as plt

In [None]:
# Load the historical investment results.
# NOTE(review): the file name looks like a typo for 'history_...' - confirm
# against the file on disk before renaming it.
df = pd.read_csv('histoy_investment_results.csv')

#Remove duplicated data
# `.duplicated().shape` only gave the total row count and its value was
# discarded; count the duplicate rows and report them before dropping.
n_duplicates = int(df.duplicated().sum())
print(f"Duplicate rows removed: {n_duplicates}")
df = df.drop_duplicates()

In [None]:
# Split the frame into the target ('result') and the predictors (everything else)
investment_labels = df['result'].copy()
investment = df.drop(columns=['result'])

### Preprocessing

In [None]:
# Median imputation for the numerical variables
from sklearn.impute import SimpleImputer

# Keep only the numeric columns of the predictors
investment_num = investment.select_dtypes(include="number")

# Fit the imputer on the numeric columns
imputer = SimpleImputer(strategy="median")
imputer.fit(investment_num)

### Handling categorical attributes 

In [None]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, OrdinalEncoder

# NOTE(review): the column names below read like template placeholders -
# presumably they are meant to be replaced with the real column names.
investment_cat = investment[['categorical data']]

# Separate the categorical columns by the encoding they get: one-hot vs ordinal (0/1)
investment_cat_ordinal = investment_cat[['ordinal categorical data']]
investment_cat_oneHot = investment_cat[['1 hot categorical data']]

# One-hot encoding (dense output, unknown categories ignored at transform time)
cat_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
investment_cat_1hot = cat_encoder.fit_transform(investment_cat_oneHot)

# Ordinal encoding
ordinal = OrdinalEncoder()
investment_ordinal_transform = ordinal.fit_transform(investment_cat_ordinal)

### Feature Scaling

In [None]:
from sklearn.preprocessing import Normalizer, StandardScaler

# Standardize the numeric columns: fit the scaler, then transform the same data
std_scaler = StandardScaler()
investment_scaled = std_scaler.fit(investment_num).transform(investment_num)

### Transformation Pipeline  

In [None]:
# Imports for this cell (ColumnTransformer was previously imported twice).
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE

#1st - Imputation & Standardization
num_pipeline = Pipeline([
    ("impute", SimpleImputer(strategy='median')),
    ('standardize', StandardScaler())
])

investment_prepared = num_pipeline.fit_transform(investment_num)

#Classifying the variables
# NOTE(review): these lists read like template placeholders - presumably they
# are meant to hold the real column names of each feature group.
num_features = ['list out all the numerical features']
cat_features_1hot = ['list out all the 1 hot features']
cat_features_ordinal = ['list out all the ordinal features']

#2nd - most-frequent imputation, then one-hot encoding
OneHot_pipeline = Pipeline([
    ("impute", SimpleImputer(strategy='most_frequent')),
    ('1hot_encoder', OneHotEncoder(sparse_output=False, handle_unknown='ignore'))
])

#3rd - most-frequent imputation, then ordinal encoding
Ordinal_pipeline = Pipeline([
    ("impute", SimpleImputer(strategy='most_frequent')),
    ('ordinal_encoder', OrdinalEncoder())
])

# Route each feature group through its own pipeline; columns not listed in any
# group are passed through untouched.
preprocessing = ColumnTransformer([
    ("num", num_pipeline, num_features),
    ("1hot", OneHot_pipeline, cat_features_1hot),
    ("ordinal", Ordinal_pipeline, cat_features_ordinal),
    ], remainder='passthrough')

# Seeded like every other SMOTE instance in this notebook so that resampling
# is reproducible across runs.
smote = SMOTE(sampling_strategy="minority", random_state=0)

## Model 

### Logistic Regression 

In [None]:
from sklearn.linear_model import LogisticRegression
from imblearn.pipeline import Pipeline as imbpipeline

# Logistic-regression pipeline: preprocessing -> SMOTE oversampling -> classifier
logReg = imbpipeline([
    ["preprocessing", preprocessing],
    ["SMOTE", SMOTE(sampling_strategy="minority", random_state=0)],
    ["logistic", LogisticRegression(penalty="l2", solver="lbfgs", random_state=0)],
])

In [None]:
# train_test_split is otherwise first imported further down the notebook, so
# this cell raised NameError on a fresh kernel; import it where it is first used.
from sklearn.model_selection import train_test_split

# Stratified 80/20 split of predictors and labels
X = investment
Y = investment_labels
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, stratify=Y, random_state=0)

In [None]:
# classification_report is needed here but was only imported in a later cell.
from sklearn.metrics import classification_report, confusion_matrix, roc_curve
from sklearn.metrics import ConfusionMatrixDisplay

logReg.fit(X_train, Y_train)
y_pred = logReg.predict(X_test)

# Use real probabilities: decision_function returns signed scores whose
# natural cut-off is 0, so comparing them with 0.5 was not a probability
# threshold. Column 1 of predict_proba is the probability of logReg.classes_[1].
# NOTE(review): assumes a binary target - confirm.
y_proba = logReg.predict_proba(X_test)[:, 1]
phat = y_proba
decision_boundary = X_test[y_proba >= 0.5]

print("\n Logistic Regression Evaluation:\n")
# The split above produced Y_test (capital Y); y_test does not exist yet at
# this point of a top-to-bottom run.
print(classification_report(Y_test, y_pred))

#display the confusion matrix to visualize the result
print("\n Confusion matrix:\n")
cm = confusion_matrix(Y_test, y_pred)
print(cm)

### Random Search to fine tune the Logistic Regression 

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from imblearn.pipeline import Pipeline as imbpipeline
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split

# Pipeline hyper-parameters are addressed as '<step name>__<parameter>'.
# The previous 'GBM__...' keys matched no step of logReg (its steps are
# 'preprocessing', 'SMOTE' and 'logistic'), so the search failed with an
# invalid-parameter error. Tune the logistic-regression step instead.
param_grid={
    'logistic__C': [0.001, 0.01, 0.1, 1, 10, 100],
    'logistic__class_weight': [None, 'balanced']
}

# n_iter is kept below the 12 possible combinations so the search really samples.
# NOTE(review): scoring='f1' presumably expects a binary target whose positive
# class is labelled 1 - confirm against the 'result' column.
random_search = RandomizedSearchCV(estimator = logReg, param_distributions = param_grid, n_iter = 10, cv = 5, scoring = 'f1', random_state = 0)

# The logistic-regression split produced Y_train (capital Y); y_train is not
# defined yet at this point of a top-to-bottom run.
random_search.fit(X_train, Y_train)

best_params_LR = random_search.best_params_

print(best_params_LR)

### Support Vector Machine (SVM)

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix, roc_curve
from sklearn.svm import SVC
from imblearn.pipeline import Pipeline as imbpipeline

# SVM pipeline: preprocessing -> SMOTE oversampling -> linear-kernel SVC
svm = imbpipeline([
    ["preprocessing", preprocessing],
    ["SMOTE", SMOTE(sampling_strategy="minority", random_state=0)],
    ["SVM", SVC(kernel="linear")],
])

In [None]:
X = investment
y = investment_labels

# Stratify on the labels, as the other two splits in this notebook do, so the
# class ratio of the imbalanced target is kept in both the train and test parts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=0)


In [None]:
# Fit on the training split, then predict the held-out split
y_pred = svm.fit(X_train, y_train).predict(X_test)

# Per-class precision / recall / F1
print("\n SVM Evaluation:\n")
svm_report = classification_report(y_test, y_pred)
print(svm_report)

# Confusion matrix of true vs predicted labels
print("\n Confusion matrix:\n")
cm = confusion_matrix(y_test, y_pred)
print(cm)

### Random Search to fine tune the SVM 

In [None]:
# Pipeline hyper-parameters must be addressed as '<step name>__<parameter>';
# the bare 'C' / 'gamma' / 'kernel' keys were rejected by the pipeline.
# 'gamma' was also listed twice, so the numeric list was silently replaced by
# ['scale', 'auto']; both lists are merged here.
# gamma has no effect on a linear kernel, so 'rbf' is added to make the gamma
# candidates meaningful.
# NOTE(review): drop 'rbf' and the gamma entry if only a linear SVM is wanted.
param_grid_SVM = {'SVM__C': [0.1, 1, 10, 100],
              'SVM__gamma': ['scale', 'auto', 1, 0.1, 0.01, 0.001, 0.0001],
              'SVM__kernel': ['linear', 'rbf']}

random_search = RandomizedSearchCV(estimator = svm, param_distributions = param_grid_SVM, n_iter = 50, cv = 5, scoring = 'f1', random_state = 0)

random_search.fit(X_train, y_train)

best_params_SVM = random_search.best_params_

print(best_params_SVM)


### Neural Network Model 

In [None]:
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from imblearn.pipeline import Pipeline as imbpipeline

# Neural-network (MLP) pipeline: preprocessing -> SMOTE oversampling -> classifier
NNM = imbpipeline([
    ["preprocessing", preprocessing],
    ["SMOTE", SMOTE(sampling_strategy="minority", random_state=0)],
    ["NNM", MLPClassifier(hidden_layer_sizes=(10,), alpha=0.01, solver="lbfgs", random_state=1)],
])

# Stratified 80/20 split of predictors and labels
X = investment
Y = investment_labels
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=0, stratify=Y)

# Fit on the training split, then predict the held-out split
y_pred = NNM.fit(X_train, y_train).predict(X_test)

# Per-class precision / recall / F1
print("\n NNM Evaluation:\n")
nnm_report = classification_report(y_test, y_pred)
print(nnm_report)

# Confusion matrix of true vs predicted labels
print("\n Confusion matrix:\n")
cm = confusion_matrix(y_test, y_pred)
print(cm)

### Random Search to fine tune the NNM 

In [None]:
# Candidate values, keyed as '<pipeline step>__<parameter>'
param_distributions = dict(
    NNM__alpha=[1e-5, 1e-4, 1e-3, 1e-2],
    NNM__hidden_layer_sizes=[(5,), (10,), (5, 2), (10, 5)],
)

# 5-fold randomized search over the NNM pipeline, scored by F1
random_search = RandomizedSearchCV(
    NNM,
    param_distributions,
    n_iter=50,
    cv=5,
    scoring='f1',
    random_state=0,
)
random_search.fit(X_train, y_train)

# Keep and show the best parameter combination
best_params_NNM = random_search.best_params_
print(best_params_NNM)


### Select the best model with its best parameters to predict the result of new investments 

In [None]:
from sklearn.base import clone

# Initialize the best model
# The former last step, ["Best_model",], carried no estimator, so it was not a
# valid (name, estimator) pair and the pipeline failed validation. Build the
# final model from an unfitted copy of a tuned pipeline instead.
# NOTE(review): the neural-network pipeline is assumed to be the selected
# model; if the evaluations favour another one, swap in `logReg` with
# `best_params_LR` or `svm` with `best_params_SVM`.
Best_model = clone(NNM).set_params(**best_params_NNM)

# Refit on all labelled data before scoring new investments
Best_model.fit(X, Y)

# NOTE(review): the path has no file extension - confirm the actual file name.
df2 = pd.read_csv('new_investment_data')

#Determining Dependent Variable
# New, unlabelled investments may not carry a 'result' column; drop it only
# when it is present instead of raising KeyError.
new_investment = df2.drop(columns='result', errors='ignore')

y_pred_new = Best_model.predict(new_investment)

print("Predicted outcomes for new investments suggestion:")
print(y_pred_new)

#Visualize the result in a table
new_investment['result'] = y_pred_new
new_investment