In [2]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error, r2_score

In [None]:
# # Load the dataset
# file_path = 'BankRecords.csv'
# data = pd.read_csv(file_path)

# # Display the first few rows of the dataframe to understand its structure
# data.head()

Unnamed: 0,ID,Age,Experience(Years),Income(Thousands's),Sort Code,Family,Credit Score,Education,Mortgage(Thousands's),Personal Loan,Securities Account,CD Account,Online Banking,CreditCard
0,1,25,1,49,91107,4,1.6,Diploma,0,No,Yes,No,No,No
1,2,45,19,34,90089,3,1.5,Diploma,0,No,Yes,No,No,No
2,3,39,15,11,94720,1,1.0,Diploma,0,No,No,No,No,No
3,4,35,9,100,94112,1,2.7,Degree,0,No,No,No,No,No
4,5,35,8,45,91330,4,1.0,Degree,0,No,No,No,No,Yes


In [3]:
from google.colab import files
import pandas as pd

# Upload the file
uploaded = files.upload()

# Get the filename
filename = list(uploaded.keys())[0]

# Read the CSV file into a DataFrame
data = pd.read_csv(filename)

# Display the first few rows of the DataFrame
print(data.head())

Saving BankRecords (1).csv to BankRecords (1).csv
   ID  Age  Experience(Years)  Income(Thousands's)  Sort Code  Family  \
0   1   25                  1                   49      91107       4   
1   2   45                 19                   34      90089       3   
2   3   39                 15                   11      94720       1   
3   4   35                  9                  100      94112       1   
4   5   35                  8                   45      91330       4   

   Credit Score Education  Mortgage(Thousands's) Personal Loan  \
0           1.6   Diploma                      0            No   
1           1.5   Diploma                      0            No   
2           1.0   Diploma                      0            No   
3           2.7    Degree                      0            No   
4           1.0    Degree                      0            No   

  Securities Account CD Account Online Banking CreditCard  
0                Yes         No             No        

In [4]:
# Checking for missing values
missing_values = data.isnull().sum()
missing_values


ID                       0
Age                      0
Experience(Years)        0
Income(Thousands's)      0
Sort Code                0
Family                   0
Credit Score             0
Education                0
Mortgage(Thousands's)    0
Personal Loan            0
Securities Account       0
CD Account               0
Online Banking           0
CreditCard               0
dtype: int64

In [5]:

# List of categorical and numerical features
categorical_features = ['Education', 'Personal Loan', 'Securities Account', 'CD Account', 'Online Banking', 'CreditCard']
numerical_features = ['Age', 'Experience(Years)', 'Sort Code', 'Family', 'Credit Score', 'Mortgage(Thousands\'s)']

# Creating transformers for numerical and categorical data
numerical_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(drop='first')  # Use drop='first' to avoid multicollinearity

# Combine transformers into a preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Apply the transformations
data_preprocessed = preprocessor.fit_transform(data)

# Convert the preprocessed data to a DataFrame for better readability
# Getting feature names for the new columns created by OneHotEncoder
encoded_columns = preprocessor.named_transformers_['cat'].get_feature_names_out(categorical_features)
all_columns = numerical_features + list(encoded_columns)

# Create a DataFrame
data_preprocessed_df = pd.DataFrame(data_preprocessed, columns=all_columns)

# Adding the target variable back to the DataFrame
data_preprocessed_df['Income(Thousands\'s)'] = data['Income(Thousands\'s)']


data_preprocessed_df.head()

Unnamed: 0,Age,Experience(Years),Sort Code,Family,Credit Score,Mortgage(Thousands's),Education_Diploma,Education_Masters,Personal Loan_Yes,Securities Account_Yes,CD Account_Yes,Online Banking_Yes,CreditCard_Yes,Income(Thousands's)
0,-1.774417,-1.666078,-0.964114,1.397414,-0.193371,-0.555524,1.0,0.0,0.0,1.0,0.0,0.0,0.0,49
1,-0.029524,-0.09633,-1.443932,0.525991,-0.250595,-0.555524,1.0,0.0,0.0,1.0,0.0,0.0,0.0,34
2,-0.552992,-0.445163,0.738814,-1.216855,-0.53672,-0.555524,1.0,0.0,0.0,0.0,0.0,0.0,0.0,11
3,-0.90197,-0.968413,0.452243,-1.216855,0.436103,-0.555524,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100
4,-0.90197,-1.055621,-0.859007,1.397414,-0.53672,-0.555524,0.0,0.0,0.0,0.0,0.0,0.0,1.0,45


In [6]:
# Define features and target
X = data_preprocessed_df.drop('Income(Thousands\'s)', axis=1)
y = data_preprocessed_df['Income(Thousands\'s)']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a Linear Regression model
linear_model = LinearRegression()
linear_model.fit(X_train, y_train)

# Train an MLP Regressor
mlp_model = MLPRegressor(
    solver='adam',
    learning_rate='constant',
    hidden_layer_sizes=(100, 50),
    alpha=0.0001,
    activation='logistic',
    max_iter=1000,
    random_state=42
)
mlp_model.fit(X_train, y_train)

# Predictions
y_pred_linear = linear_model.predict(X_test)
y_pred_mlp = mlp_model.predict(X_test)

# Evaluate the models
mse_linear = mean_squared_error(y_test, y_pred_linear)
r2_linear = r2_score(y_test, y_pred_linear)

mse_mlp = mean_squared_error(y_test, y_pred_mlp)
r2_mlp = r2_score(y_test, y_pred_mlp)

(mse_linear, r2_linear), (mse_mlp, r2_mlp)




((925.2997153649975, 0.5636450907451657),
 (681.5915362872177, 0.6785735389012282))

In [None]:
# New customer details
new_customer = {
    'Age': 30,
    'Experience(Years)': 5,
    'Sort Code': 92011,
    'Family': 2,
    'Credit Score': 1.2,
    'Mortgage(Thousands\'s)': 20,
    'Education': 'Degree',
    'Personal Loan': 'No',
    'Securities Account': 'Yes',
    'CD Account': 'No',
    'Online Banking': 'Yes',
    'CreditCard': 'No'
}

# Convert to DataFrame
new_customer_df = pd.DataFrame([new_customer])

# Apply the same preprocessing to the new customer data
new_customer_preprocessed = preprocessor.transform(new_customer_df)

# Predict income using the MLP model
new_customer_income_prediction = mlp_model.predict(new_customer_preprocessed)

new_customer_income_prediction[0]




37.50119494620124

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPRegressor

# Define the parameter grid
param_grid = {
    'hidden_layer_sizes': [(50,), (100,), (100, 50), (50, 25, 10)],
    'activation': ['identity','logistic', 'relu', 'tanh'],
    'solver': ['adam', 'sgd','lbfgs'],
    'alpha': [0.0001, 0.001, 0.01],
    'learning_rate': ['constant', 'adaptive','invscaling']
}

# Initialize the MLPRegressor
mlp = MLPRegressor(max_iter=1000, random_state=42)

# Initialize GridSearchCV with verbose output
grid_search = GridSearchCV(estimator=mlp, param_grid=param_grid, cv=3, n_jobs=-1, scoring='neg_mean_squared_error', verbose=2)

# Fit the grid search to the training data
grid_search.fit(X_train, y_train)

# Retrieve the best parameters and best model
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

print("Best parameters found: ", best_params)

# Evaluate the best model on the test set
y_pred_best = best_model.predict(X_test)
mse_best = mean_squared_error(y_test, y_pred_best)
r2_best = r2_score(y_test, y_pred_best)

print(f"Best MLP Regressor MSE: {mse_best}, R²: {r2_best}")



Fitting 3 folds for each of 432 candidates, totalling 1296 fits


37 fits failed out of a total of 1296.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
37 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\35383\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\35383\anaconda3\Lib\site-packages\sklearn\neural_network\_multilayer_perceptron.py", line 749, in fit
    return self._fit(X, y, incremental=False)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\35383\anaconda3\Lib\site-packages\sklearn\neural_network\_multilayer_perceptron.py", line 491, in _fit
    raise ValueError(
ValueError: Solver produced non-finite parameter weights. The

Best parameters found:  {'activation': 'logistic', 'alpha': 0.0001, 'hidden_layer_sizes': (100, 50), 'learning_rate': 'constant', 'solver': 'adam'}
Best MLP Regressor MSE: 681.5915362872175, R²: 0.6785735389012283




In [None]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.neural_network import MLPRegressor
from scipy.stats import uniform

# Define the parameter distribution
param_dist = {
    'hidden_layer_sizes': [(50,), (100,), (100, 50), (50, 25, 10)],
    'activation': ['identity','logistic', 'relu', 'tanh'],
    'solver': ['adam', 'sgd','lbfgs'],
    'alpha': [0.0001, 0.001, 0.01],
    'learning_rate': ['constant', 'adaptive','invscaling']
}
# Initialize the MLPRegressor
mlp = MLPRegressor(max_iter=1000, random_state=42)

# Initialize RandomizedSearchCV with verbose output
random_search = RandomizedSearchCV(estimator=mlp, param_distributions=param_dist, n_iter=50, cv=3, n_jobs=-1, scoring='neg_mean_squared_error', verbose=3, random_state=42)

# Fit the random search to the training data
random_search.fit(X_train, y_train)

# Retrieve the best parameters and best model
best_params = random_search.best_params_
best_model = random_search.best_estimator_

print("Best parameters found: ", best_params)

# Evaluate the best model on the test set
y_pred_best = best_model.predict(X_test)
mse_best = mean_squared_error(y_test, y_pred_best)
r2_best = r2_score(y_test, y_pred_best)

print(f"Best MLP Regressor MSE: {mse_best}, R²: {r2_best}")


Fitting 3 folds for each of 50 candidates, totalling 150 fits


5 fits failed out of a total of 150.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\35383\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\35383\anaconda3\Lib\site-packages\sklearn\neural_network\_multilayer_perceptron.py", line 749, in fit
    return self._fit(X, y, incremental=False)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\35383\anaconda3\Lib\site-packages\sklearn\neural_network\_multilayer_perceptron.py", line 491, in _fit
    raise ValueError(
ValueError: Solver produced non-finite parameter weights. The in

Best parameters found:  {'solver': 'adam', 'learning_rate': 'constant', 'hidden_layer_sizes': (100, 50), 'alpha': 0.0001, 'activation': 'logistic'}
Best MLP Regressor MSE: 681.5915362872175, R²: 0.6785735389012283




In [None]:
Best parameters found:  {'solver': 'adam', 'learning_rate': 'constant', 'hidden_layer_sizes': (100, 50), 'alpha': 0.0001, 'activation': 'logistic'}
Best parameters found:  {'activation': 'logistic', 'alpha': 0.0001, 'hidden_layer_sizes': (100, 50), 'learning_rate': 'constant', 'solver': 'adam'}


SyntaxError: invalid syntax (3770892985.py, line 1)