In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import svm
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.preprocessing import StandardScaler

In [None]:
# Load the train/validation workbook and the held-out test workbook.
# header=0: the first spreadsheet row supplies the column names.
train_data, test_data = (
    pd.read_excel(workbook, header=0)
    for workbook in (
        "./hearing/dataset/hearing_data_train_val.xlsx",
        "./hearing/dataset/hearing_data_test.xlsx",
    )
)

### Classification

In [None]:
# Binary classification target.
label_col = '术后PTA-BTA>20'
# Both outcome columns are removed from the feature matrix. '术后PTA-BTA' is the
# continuous outcome used as the regression target later in this notebook;
# presumably the binary label is derived from it, so keeping it as a feature
# would leak the answer to the classifiers -- TODO confirm with the data owner.
outcome_cols = ['术后PTA-BTA>20', '术后PTA-BTA']

# Extract the label
labels_train = train_data[label_col]
labels_test = test_data[label_col]

# Drop every outcome column from the DataFrame to obtain the features
features_train = train_data.drop(columns=outcome_cols)
features_test = test_data.drop(columns=outcome_cols)

DecisionTreeClassifier

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import KFold, GridSearchCV, cross_val_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import pandas as pd
import numpy as np

# Shuffled 5-fold splitter with a fixed seed, shared by the grid search and the
# follow-up metric evaluation so both see the same folds.
num_folds = 5
kf = KFold(n_splits=num_folds, shuffle=True, random_state=0)

# Search space for the decision tree.
param_grid = dict(
    criterion=['gini', 'entropy'],
    splitter=['best', 'random'],
    max_depth=[2, 3, 4, 5],
    min_samples_split=[2, 3, 4],
    min_samples_leaf=[1, 2, 3],
    max_features=['sqrt', 'log2', None],
)

# Exhaustive search, selecting on cross-validated accuracy.
dtc = DecisionTreeClassifier(random_state=0)
grid_search = GridSearchCV(estimator=dtc, param_grid=param_grid, cv=kf, scoring='accuracy')
grid_search.fit(features_train, labels_train)

print('Best parameters:', grid_search.best_params_)
print('Best accuracy score:', grid_search.best_score_)

# Cross-validated precision / recall / F1 / ROC-AUC of the selected tree.
metrics = ['precision', 'recall', 'f1', 'roc_auc']
for metric in metrics:
    scores = cross_val_score(
        grid_search.best_estimator_, features_train, labels_train, cv=kf, scoring=metric
    )
    mean_score = scores.mean()
    print(f'Mean {metric}: {mean_score}')


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# Score the refit best decision tree from the grid search on the held-out test set.
best_estimator = grid_search.best_estimator_
labels_pred = best_estimator.predict(features_test)

# ROC-AUC measures ranking quality, so it needs a continuous score rather than
# the thresholded 0/1 predictions. Column 1 of predict_proba belongs to the
# larger class label, which roc_auc_score treats as the positive class. This
# keeps the test-set figure comparable with the 'roc_auc' scorer used in CV.
labels_score = best_estimator.predict_proba(features_test)[:, 1]

# Evaluate the test dataset
accuracy = accuracy_score(labels_test, labels_pred)
precision = precision_score(labels_test, labels_pred)
recall = recall_score(labels_test, labels_pred)
f1 = f1_score(labels_test, labels_pred)
roc_auc = roc_auc_score(labels_test, labels_score)

print(f'Test accuracy: {accuracy}')
print(f'Test precision: {precision}')
print(f'Test recall: {recall}')
print(f'Test f1-score: {f1}')
print(f'Test ROC-AUC: {roc_auc}')

# Raw-count confusion matrix (rows = actual class, columns = predicted class).
cm = confusion_matrix(labels_test, labels_pred)
# Row-normalised copy (each row divided by its total); computed for inspection
# only -- the heat map below shows the raw counts.
cm_normalized = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]

# Plot the heat map for the confusion matrix with raw numbers
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, cmap='viridis', fmt='d', xticklabels=['Negative', 'Positive'], yticklabels=['Negative', 'Positive'])
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix Heatmap')
plt.show()



In [None]:
# Rebuild a decision tree from the best grid-search parameters and fit it on
# the full training set.
model = DecisionTreeClassifier(**grid_search.best_params_, random_state=0)
model.fit(features_train, labels_train)

# Hard class predictions for the threshold-based metrics ...
labels_pred = model.predict(features_test)
# ... and the probability of the larger class label (column 1) for ROC-AUC,
# which is a ranking metric and should not be fed thresholded predictions.
labels_score = model.predict_proba(features_test)[:, 1]

# Compute accuracy, precision, recall, F1 and ROC-AUC on the test dataset
acc = accuracy_score(labels_test, labels_pred)
precision = precision_score(labels_test, labels_pred)
recall = recall_score(labels_test, labels_pred)
f1 = f1_score(labels_test, labels_pred)
roc_auc = roc_auc_score(labels_test, labels_score)

# Print the scores
print('Accuracy score on test set:', acc)
print(f'Precision score: {precision}')
print(f'Recall score: {recall}')
print(f'F1 score: {f1}')
print(f'ROC-AUC score: {roc_auc}')


SVM

In [None]:
from sklearn.svm import SVC
from sklearn.model_selection import KFold, GridSearchCV, cross_val_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import pandas as pd
import numpy as np

# Shuffled 5-fold splitter, seeded so the search is repeatable.
num_folds = 5
kf = KFold(n_splits=num_folds, shuffle=True, random_state=0)

# Search space for the support vector classifier.
param_grid = dict(
    C=[0.1, 1, 10],
    kernel=['linear', 'poly', 'rbf', 'sigmoid'],
    degree=[2, 3, 4],
    gamma=['scale', 'auto'],
)

# Exhaustive search, selecting on cross-validated accuracy.
svc = SVC(random_state=0)
grid_search = GridSearchCV(estimator=svc, param_grid=param_grid, cv=kf, scoring='accuracy')
grid_search.fit(features_train, labels_train)

print('Best parameters:', grid_search.best_params_)
print('Best accuracy score:', grid_search.best_score_)

# Cross-validated precision / recall / F1 / ROC-AUC of the selected SVC.
metrics = ['precision', 'recall', 'f1', 'roc_auc']
for metric in metrics:
    scores = cross_val_score(
        grid_search.best_estimator_, features_train, labels_train, cv=kf, scoring=metric
    )
    mean_score = scores.mean()
    print(f'Mean {metric}: {mean_score}')

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# Score the refit best SVC from the grid search on the held-out test set.
best_estimator = grid_search.best_estimator_
labels_pred = best_estimator.predict(features_test)

# ROC-AUC measures ranking quality, so it needs a continuous score rather than
# the thresholded predictions. The SVC is built without probability=True, so
# the signed decision_function value is used as the score.
labels_score = best_estimator.decision_function(features_test)

# Evaluate the test dataset
accuracy = accuracy_score(labels_test, labels_pred)
precision = precision_score(labels_test, labels_pred)
recall = recall_score(labels_test, labels_pred)
f1 = f1_score(labels_test, labels_pred)
roc_auc = roc_auc_score(labels_test, labels_score)

print(f'Test accuracy: {accuracy}')
print(f'Test precision: {precision}')
print(f'Test recall: {recall}')
print(f'Test f1-score: {f1}')
print(f'Test ROC-AUC: {roc_auc}')

# Raw-count confusion matrix (rows = actual class, columns = predicted class).
cm = confusion_matrix(labels_test, labels_pred)
# Row-normalised copy (each row divided by its total); computed for inspection
# only -- the heat map below shows the raw counts.
cm_normalized = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]

# Plot the heat map for the confusion matrix with raw numbers
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, cmap='viridis', fmt='d', xticklabels=['Negative', 'Positive'], yticklabels=['Negative', 'Positive'])
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix Heatmap')
plt.show()



In [None]:
# Rebuild an SVC from the best grid-search parameters and fit it on the full
# training set.
model = SVC(**grid_search.best_params_, random_state=0)
model.fit(features_train, labels_train)

# Hard class predictions for the threshold-based metrics ...
labels_pred = model.predict(features_test)
# ... and the signed decision_function value for ROC-AUC, which is a ranking
# metric and should not be fed thresholded predictions.
labels_score = model.decision_function(features_test)

# Compute accuracy, precision, recall, F1 and ROC-AUC on the test dataset
acc = accuracy_score(labels_test, labels_pred)
precision = precision_score(labels_test, labels_pred)
recall = recall_score(labels_test, labels_pred)
f1 = f1_score(labels_test, labels_pred)
roc_auc = roc_auc_score(labels_test, labels_score)

# Print the scores
print('Accuracy score on test set:', acc)
print(f'Precision score: {precision}')
print(f'Recall score: {recall}')
print(f'F1 score: {f1}')
print(f'ROC-AUC score: {roc_auc}')


RandomForestClassifier

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold, GridSearchCV, cross_val_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import pandas as pd
import numpy as np

# Shuffled 5-fold splitter, seeded so the search is repeatable.
num_folds = 5
kf = KFold(n_splits=num_folds, shuffle=True, random_state=0)

# Search space for the random forest.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    max_features=['sqrt', 'log2'],
)

# Exhaustive search, selecting on cross-validated accuracy.
rfc = RandomForestClassifier(random_state=0)
grid_search = GridSearchCV(estimator=rfc, param_grid=param_grid, cv=kf, scoring='accuracy')
grid_search.fit(features_train, labels_train)

print('Best parameters:', grid_search.best_params_)
print('Best accuracy score:', grid_search.best_score_)

# Cross-validated precision / recall / F1 / ROC-AUC of the selected forest.
metrics = ['precision', 'recall', 'f1', 'roc_auc']
for metric in metrics:
    scores = cross_val_score(
        grid_search.best_estimator_, features_train, labels_train, cv=kf, scoring=metric
    )
    mean_score = scores.mean()
    print(f'Mean {metric}: {mean_score}')

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# Score the refit best random forest from the grid search on the held-out test set.
best_estimator = grid_search.best_estimator_
labels_pred = best_estimator.predict(features_test)

# ROC-AUC measures ranking quality, so it needs a continuous score rather than
# the thresholded 0/1 predictions. Column 1 of predict_proba belongs to the
# larger class label, which roc_auc_score treats as the positive class. This
# keeps the test-set figure comparable with the 'roc_auc' scorer used in CV.
labels_score = best_estimator.predict_proba(features_test)[:, 1]

# Evaluate the test dataset
accuracy = accuracy_score(labels_test, labels_pred)
precision = precision_score(labels_test, labels_pred)
recall = recall_score(labels_test, labels_pred)
f1 = f1_score(labels_test, labels_pred)
roc_auc = roc_auc_score(labels_test, labels_score)

print(f'Test accuracy: {accuracy}')
print(f'Test precision: {precision}')
print(f'Test recall: {recall}')
print(f'Test f1-score: {f1}')
print(f'Test ROC-AUC: {roc_auc}')

# Raw-count confusion matrix (rows = actual class, columns = predicted class).
cm = confusion_matrix(labels_test, labels_pred)
# Row-normalised copy (each row divided by its total); computed for inspection
# only -- the heat map below shows the raw counts.
cm_normalized = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]

# Plot the heat map for the confusion matrix with raw numbers
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, cmap='viridis', fmt='d', xticklabels=['Negative', 'Positive'], yticklabels=['Negative', 'Positive'])
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix Heatmap')
plt.show()



In [None]:
# Rebuild a random forest from the best grid-search parameters and fit it on
# the full training set.
model = RandomForestClassifier(**grid_search.best_params_, random_state=0)
model.fit(features_train, labels_train)

# Hard class predictions for the threshold-based metrics ...
labels_pred = model.predict(features_test)
# ... and the probability of the larger class label (column 1) for ROC-AUC,
# which is a ranking metric and should not be fed thresholded predictions.
labels_score = model.predict_proba(features_test)[:, 1]

# Compute accuracy, precision, recall, F1 and ROC-AUC on the test dataset
acc = accuracy_score(labels_test, labels_pred)
precision = precision_score(labels_test, labels_pred)
recall = recall_score(labels_test, labels_pred)
f1 = f1_score(labels_test, labels_pred)
roc_auc = roc_auc_score(labels_test, labels_score)

# Print the scores
print('Accuracy score on test set:', acc)
print(f'Precision score: {precision}')
print(f'Recall score: {recall}')
print(f'F1 score: {f1}')
print(f'ROC-AUC score: {roc_auc}')


Neural Network

In [None]:
# NOTE(review): this cell is out of execution order. `Sequential`, `Dense` and
# `grid_result` are not defined in this cell or in any cell above it; in this
# notebook they first appear in the grid-search cell that follows. On a fresh
# kernel with Run All this cell raises NameError -- consider moving it below
# the grid search or removing it (a later cell repeats the same evaluation
# with explicit epochs / batch_size).
def create_model(activation='relu', optimizer='adam', num_hidden_nodes=64, num_hidden_layers=1):
    """Build and compile a dense binary classifier.

    The network always has one hidden Dense layer; `num_hidden_layers` is the
    number of *additional* hidden layers stacked on top of it, so the total
    hidden-layer count is ``1 + num_hidden_layers``.

    Args:
        activation: activation used by every hidden layer.
        optimizer: optimizer passed to ``model.compile``.
        num_hidden_nodes: number of units in each hidden layer.
        num_hidden_layers: number of extra hidden layers after the first one.

    Returns:
        A compiled ``Sequential`` model with a single sigmoid output unit,
        binary cross-entropy loss and an accuracy metric.
    """
    model = Sequential()
    model.add(Dense(num_hidden_nodes, activation=activation))
    # model.add(Dense(hidden_nodes, activation=activation, input_dim=X_train.shape[1]))
    for i in range(num_hidden_layers):
        model.add(Dense(num_hidden_nodes, activation=activation))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    return model

# Create a new model with hard-coded architecture settings.
# NOTE(review): these values are typed in by hand, while epochs / batch_size
# below are read from grid_result -- confirm they describe the same search result.
best_model = create_model(activation='relu', optimizer='sgd', num_hidden_nodes=128, num_hidden_layers=1)

# Train the model on the training dataset
best_model.fit(features_train, labels_train, epochs=grid_result.best_params_['epochs'], batch_size=grid_result.best_params_['batch_size'], verbose=0)

# Predict the target variable for the test dataset; the sigmoid outputs are
# rounded to the nearest integer to obtain 0/1 class predictions.
# labels_pred = best_model.predict(features_test)
labels_pred = np.round(best_model.predict(features_test))

# Compute the precision, recall, f1, and ROC-AUC scores on the predicted values for the test dataset
# NOTE(review): roc_auc_score receives the rounded predictions here, not the
# raw sigmoid outputs, so the value is not a ranking-based AUC.
acc = accuracy_score(labels_test, labels_pred)
precision = precision_score(labels_test, labels_pred)
recall = recall_score(labels_test, labels_pred)
f1 = f1_score(labels_test, labels_pred)
roc_auc = roc_auc_score(labels_test, labels_pred)

# Print the scores
print('Accuracy score on test set:', acc)
print(f'Precision score: {precision}')
print(f'Recall score: {recall}')
print(f'F1 score: {f1}')
print(f'ROC-AUC score: {roc_auc}')


In [None]:
import numpy as np
np.random.seed(0)
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import GridSearchCV
from keras.models import Sequential
from keras.layers import Dense
from sklearn.model_selection import KFold, GridSearchCV, cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix
import tensorflow as tf

# Model factory handed to the KerasClassifier wrapper.
def create_model(activation='relu', optimizer='adam', num_hidden_nodes=64, num_hidden_layers=1):
    """Build and compile a dense binary classifier.

    One hidden Dense layer is always present; `num_hidden_layers` counts the
    extra hidden layers added after it. The output is a single sigmoid unit
    and the model is compiled with binary cross-entropy and an accuracy metric.
    """
    net = Sequential()
    # Mandatory first hidden layer.
    net.add(Dense(num_hidden_nodes, activation=activation))
    # Optional additional hidden layers of the same width.
    for _ in range(num_hidden_layers):
        net.add(Dense(num_hidden_nodes, activation=activation))
    net.add(Dense(1, activation='sigmoid'))
    net.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    return net

# Seed TensorFlow in addition to NumPy so weight initialisation is repeatable
# between runs, as the regression search later in this notebook already does.
# (GPU kernels may still introduce small run-to-run differences.)
tf.random.set_seed(0)

# Define the hyperparameters to be tuned. 'epochs' and 'batch_size' are
# consumed by the wrapper's fit(); the remaining keys are forwarded to
# create_model().
param_grid = {
    'activation': ['relu', 'sigmoid'],
    'optimizer': ['adam', 'sgd'],
    'epochs': [100, 200, 300, 350, 400],
    'batch_size': [64, 128],
    'num_hidden_nodes': [32, 64, 128, 256],
    'num_hidden_layers': [0, 1]
}

# Create a KerasClassifier wrapper for the model function
model = KerasClassifier(build_fn=create_model, verbose=0)

# Perform grid search with 5-fold cross-validation on the first GPU
with tf.device('/GPU:0'):
    grid = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, verbose=2)
    grid_result = grid.fit(features_train, labels_train)

    # Print the best hyperparameters and accuracy score
    print('Best hyperparameters:', grid_result.best_params_)
    print('Accuracy score:', grid_result.best_score_)

    # Cross-validated precision / recall / F1 / ROC-AUC of the selected network
    metrics = ['precision', 'recall', 'f1', 'roc_auc']
    for metric in metrics:
        scores = cross_val_score(grid.best_estimator_, features_train, labels_train, cv=5, scoring=metric)
        mean_score = np.mean(scores)
        print(f'Mean {metric}: {mean_score}')

# Get the underlying Keras model of the refit best estimator
best_model = grid_result.best_estimator_.model

# Save the weights of the best model to a file
best_model.save_weights('best_model_weights_classification.h5')


In [None]:
#     # Get the best model from the grid search result
# best_model = grid_result.best_estimator_.model

# # Save the weights of the best model to a file
# best_model.save_weights('best_model_weights_classification.h5')

In [None]:
import numpy as np
np.random.seed(42)
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import GridSearchCV
from keras.models import Sequential
from keras.layers import Dense
from sklearn.model_selection import KFold, GridSearchCV, cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

def create_model(activation='relu', optimizer='adam', num_hidden_nodes=64, num_hidden_layers=1):
    """Build and compile a dense binary classifier.

    One hidden Dense layer is always present; `num_hidden_layers` counts the
    extra hidden layers added after it. The output is a single sigmoid unit
    and the model is compiled with binary cross-entropy and an accuracy metric.
    """
    net = Sequential()
    # Mandatory first hidden layer.
    net.add(Dense(num_hidden_nodes, activation=activation))
    # Optional additional hidden layers of the same width.
    for _ in range(num_hidden_layers):
        net.add(Dense(num_hidden_nodes, activation=activation))
    net.add(Dense(1, activation='sigmoid'))
    net.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    return net

# Create a new model with explicitly chosen hyperparameters
# (presumably copied from a grid-search run -- TODO confirm they are current).
best_model = create_model(activation='relu', optimizer='sgd', num_hidden_nodes=32, num_hidden_layers=0)

# Train the model on the training dataset
best_model.fit(features_train, labels_train, epochs=400, batch_size=128, verbose=0)

# The network emits one sigmoid value per sample as an (n, 1) column; flatten
# it so the metrics receive ordinary 1-D arrays.
labels_prob = best_model.predict(features_test).ravel()
# Round the sigmoid outputs to the nearest integer to obtain 0/1 predictions.
labels_pred = np.round(labels_prob)

# Compute accuracy, precision, recall and F1 on the hard predictions.
acc = accuracy_score(labels_test, labels_pred)
precision = precision_score(labels_test, labels_pred)
recall = recall_score(labels_test, labels_pred)
f1 = f1_score(labels_test, labels_pred)
# ROC-AUC is a ranking metric, so score it on the raw sigmoid outputs rather
# than on the rounded predictions.
roc_auc = roc_auc_score(labels_test, labels_prob)

# Print the scores
print('Accuracy score on test set:', acc)
print(f'Precision score: {precision}')
print(f'Recall score: {recall}')
print(f'F1 score: {f1}')
print(f'ROC-AUC score: {roc_auc}')


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Confusion matrix of the network's test predictions (rows = actual class,
# columns = predicted class), drawn as an annotated heat map of raw counts.
cm = confusion_matrix(labels_test, labels_pred)
class_names = ['Negative', 'Positive']

plt.figure(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='viridis',
    xticklabels=class_names,
    yticklabels=class_names,
)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix Heatmap')
plt.show()


### Regression

In [None]:
# Continuous regression target.
label_col = '术后PTA-BTA'
# Both outcome columns are removed from the feature matrix. '术后PTA-BTA>20' is
# the binary label used in the classification section above; presumably it is
# derived from the regression target, so keeping it as a feature would leak
# information about the answer -- TODO confirm with the data owner.
outcome_cols = ['术后PTA-BTA', '术后PTA-BTA>20']

# Extract the label
labels_train = train_data[label_col]
labels_test = test_data[label_col]

# Drop every outcome column from the DataFrame to obtain the features
features_train = train_data.drop(columns=outcome_cols)
features_test = test_data.drop(columns=outcome_cols)

Neural Network

In [None]:
import numpy as np
np.random.seed(0)
from keras.wrappers.scikit_learn import KerasRegressor
from sklearn.model_selection import GridSearchCV
from keras.models import Sequential
from keras.layers import Dense
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error
import tensorflow as tf
from sklearn.model_selection import cross_val_score

# Fix TensorFlow's global seed before any model is built.
tf.random.set_seed(0)


# Model factory handed to the KerasRegressor wrapper.
def create_model(activation='relu', optimizer='adam', num_hidden_nodes=64, num_hidden_layers=1):
    """Build and compile a dense regression network.

    One hidden Dense layer is always present; `num_hidden_layers` counts the
    extra hidden layers added after it. The output is a single linear unit;
    the model is compiled with mean-squared-error loss and reports mean
    absolute error as a metric.
    """
    net = Sequential()
    # Mandatory first hidden layer.
    net.add(Dense(num_hidden_nodes, activation=activation))
    # Optional additional hidden layers of the same width.
    for _ in range(num_hidden_layers):
        net.add(Dense(num_hidden_nodes, activation=activation))
    net.add(Dense(1, activation='linear'))
    net.compile(loss='mean_squared_error', optimizer=optimizer, metrics=['mean_absolute_error'])
    return net

# Define the hyperparameters to be tuned. 'epochs' and 'batch_size' are
# consumed by the wrapper's fit(); the remaining keys are forwarded to
# create_model().
param_grid = {
    'activation': ['relu', 'sigmoid'],
    'optimizer': ['adam', 'sgd'],
    'epochs': [100, 200, 300, 350, 400],
    'batch_size': [64, 128],
    'num_hidden_nodes': [32, 64, 128, 256],
    'num_hidden_layers': [0, 1]
}

# Create a KerasRegressor wrapper for the model function
model = KerasRegressor(build_fn=create_model, verbose=0)

# Perform grid search with 5-fold cross-validation on the first GPU
with tf.device('/GPU:0'):
    # Select on negative MAE explicitly. Without `scoring`, GridSearchCV falls
    # back to the wrapper's own score(), which is based on the compiled loss
    # (mean squared error here), so the value printed below as "MAE score"
    # would not actually be an MAE. This also matches the scoring used by the
    # other regression grid searches in this notebook.
    grid = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, verbose=2,
                        scoring='neg_mean_absolute_error')
    grid_result = grid.fit(features_train, labels_train)

    # Print the best hyperparameters and the cross-validated MAE
    # (best_score_ is the negated MAE, hence the sign flip).
    print('Best hyperparameters:', grid_result.best_params_)
    print('MAE score:', -grid_result.best_score_)

    # Cross-validated MAE of the refit best estimator
    metrics = ['neg_mean_absolute_error']
    for metric in metrics:
        scores = cross_val_score(grid_result.best_estimator_, features_train, labels_train, cv=5, scoring=metric)
        mean_score = -np.mean(scores)
        print(f'Mean {metric}: {mean_score}')



In [None]:
# Re-run the cross-validated scoring of the selected regressor for every
# scorer listed in `metrics`; scores are negated so errors print as positive.
for metric in metrics:
    scores = cross_val_score(
        grid_result.best_estimator_, features_train, labels_train, cv=5, scoring=metric
    )
    mean_score = -scores.mean()
    print(f'Mean {metric}: {mean_score}')

In [None]:
def create_model(activation='relu', optimizer='adam', num_hidden_nodes=64, num_hidden_layers=1):
    """Build and compile a dense regression network.

    One hidden Dense layer is always present; `num_hidden_layers` counts the
    extra hidden layers added after it. The output is a single linear unit;
    the model is compiled with mean-squared-error loss and reports mean
    absolute error as a metric.
    """
    net = Sequential()
    # Mandatory first hidden layer.
    net.add(Dense(num_hidden_nodes, activation=activation))
    # Optional additional hidden layers of the same width.
    for _ in range(num_hidden_layers):
        net.add(Dense(num_hidden_nodes, activation=activation))
    net.add(Dense(1, activation='linear'))
    net.compile(loss='mean_squared_error', optimizer=optimizer, metrics=['mean_absolute_error'])
    return net

# Rebuild the regression network from the hyperparameters selected by the grid
# search. Every value is taken from best_params_ so the trained model is the
# exact combination the search picked, rather than hand-typed architecture
# settings mixed with searched epochs / batch_size.
# Assumes grid_result is the regression search run earlier in this section.
best_params = grid_result.best_params_
fit_keys = ('epochs', 'batch_size')  # consumed by fit(), not by create_model()
build_params = {name: value for name, value in best_params.items() if name not in fit_keys}
best_model = create_model(**build_params)

# Train the model on the training dataset
best_model.fit(features_train, labels_train, epochs=best_params['epochs'], batch_size=best_params['batch_size'], verbose=0)

# Predict the target variable for the test dataset
preds = best_model.predict(features_test)

# Calculate the MAE
mae = mean_absolute_error(labels_test, preds)
print('MAE:', mae)


In [None]:
# best_model.save_weights('best_model_weights.h5')

In [None]:
import matplotlib.pyplot as plt

# Actual (x) versus predicted (y) values for the test set.
fig, ax = plt.subplots()
ax.scatter(labels_test, preds)

ax.set_title('Scatter Plot of Actual vs Predicted Values')
ax.set_xlabel('Actual Values')
ax.set_ylabel('Predicted Values')
ax.grid(True)

# Dashed y = x reference line from the origin to the largest value seen in
# either the actual or the predicted values.
max_value = max(labels_test.max(), preds.max())
ax.plot([0, max_value], [0, max_value], linestyle='--', color='red')

# Write the figure to disk before showing it.
fig.savefig('scatter_plot.tif', format='tif', dpi=300)

plt.show()


Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error

# Ordinary least-squares baseline: fit on the training split (fit returns the
# estimator itself), then report MAE on the test split.
linear_reg = LinearRegression().fit(features_train, labels_train)

linear_preds = linear_reg.predict(features_test)
linear_mae = mean_absolute_error(labels_test, linear_preds)

print('Linear Regression MAE:', linear_mae)


Random Forest Regression

In [None]:
# No parameter tuning
from sklearn.ensemble import RandomForestRegressor

# Untuned random-forest baseline: 100 trees, fixed seed.
rf_reg = RandomForestRegressor(n_estimators=100, random_state=0).fit(features_train, labels_train)

# MAE on the test split.
rf_preds = rf_reg.predict(features_test)
rf_mae = mean_absolute_error(labels_test, rf_preds)

print('Random Forest Regression MAE:', rf_mae)


In [None]:
# parameter tuning
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

# Search space for the random-forest regressor.
rf_param_grid = dict(
    n_estimators=[50, 100, 150, 200],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

rf_reg = RandomForestRegressor(random_state=0)

# 5-fold grid search selecting on (negated) mean absolute error.
rf_grid_search = GridSearchCV(
    estimator=rf_reg,
    param_grid=rf_param_grid,
    cv=5,
    verbose=2,
    scoring='neg_mean_absolute_error',
)
rf_grid_search.fit(features_train, labels_train)

# best_score_ is the negated MAE, so flip the sign for display.
print('Best hyperparameters for Random Forest:', rf_grid_search.best_params_)
print('Best MAE score for Random Forest:', -rf_grid_search.best_score_)

# Test-set MAE of the refit best estimator.
rf_best = rf_grid_search.best_estimator_
rf_preds = rf_best.predict(features_test)
rf_mae = mean_absolute_error(labels_test, rf_preds)
print('MAE with best Random Forest model:', rf_mae)


Support Vector Regression (SVR)

In [None]:
# no parameter tuning
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

# Untuned SVR baseline; the features are standardised inside the pipeline
# before reaching the regressor.
svr_reg = make_pipeline(
    StandardScaler(),
    SVR(C=1.0, epsilon=0.2),
)
svr_reg.fit(features_train, labels_train)

# MAE on the test split.
svr_preds = svr_reg.predict(features_test)
svr_mae = mean_absolute_error(labels_test, svr_preds)

print('Support Vector Regression MAE:', svr_mae)


In [None]:
# parameter tuning
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Scaling + SVR pipeline; the step names are what the 'svr__*' grid keys refer to.
pipe = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('svr', SVR()),
])

# Search space for the SVR step.
svr_param_grid = {
    'svr__C': [0.1, 1, 10, 100],
    'svr__epsilon': [0.1, 0.2, 0.3],
    'svr__kernel': ['linear', 'rbf', 'poly'],
}

# 5-fold grid search selecting on (negated) mean absolute error.
svr_grid_search = GridSearchCV(
    estimator=pipe,
    param_grid=svr_param_grid,
    cv=5,
    verbose=2,
    scoring='neg_mean_absolute_error',
)
svr_grid_search.fit(features_train, labels_train)

# best_score_ is the negated MAE, so flip the sign for display.
print('Best hyperparameters for SVR:', svr_grid_search.best_params_)
print('Best MAE score for SVR:', -svr_grid_search.best_score_)

# Test-set MAE of the refit best pipeline.
svr_best = svr_grid_search.best_estimator_
svr_preds = svr_best.predict(features_test)
svr_mae = mean_absolute_error(labels_test, svr_preds)
print('MAE with best SVR model:', svr_mae)
