# ITD105 CASE STUDY #1
### Comparing Machine Learning Algorithms
Name: Allan Raymart C. Paraiso <br>
Dataset Name: Insurance Cost <br>
Dataset Type: Regression <br>

### SET A
- Resampling Technique: Train/Test Split (80:20)
- Regression Metrics: Mean Absolute Error (MAE)

#### 1. CART (Classification and Regression Trees)

In [66]:
from pandas import read_csv
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error

# Loading the dataset
filename = 'insurance.csv'
dataframe = read_csv(filename)

# Selecting features and target variable
X = dataframe[['age', 'sex', 'bmi', 'children', 'smoker','region']]  # Features
Y = dataframe['charges']  # Target variable

# Splitting the dataset into a training set and a testing set (80:20 split)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=50)

# Decision Tree Regressor
model = DecisionTreeRegressor(random_state=50)

# Training the model on the training data
model.fit(X_train, Y_train)

# Making predictions on the test data
Y_pred = model.predict(X_test)

# Calculating the Mean Absolute Error (MAE)
mae = mean_absolute_error(Y_test, Y_pred)
print(f"Mean Absolute Error (MAE): {mae:.3f}")


Mean Absolute Error (MAE): 2651.093


#### 2. Elastic Net

In [67]:
from pandas import read_csv
from sklearn.model_selection import train_test_split
from sklearn.linear_model import ElasticNet
from sklearn.metrics import mean_absolute_error

# Loading the dataset
filename = 'insurance.csv'
dataframe = read_csv(filename)

# Selecting features and target variable
X = dataframe[['age', 'sex', 'bmi', 'children', 'smoker','region']]  # Features
Y = dataframe['charges']  # Target variable


# Splitting the data into a training set and a testing set (80:20 split)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.20, random_state=50)

# Elastic Net model
model = ElasticNet(alpha=1.0, l1_ratio=0.5, random_state=50)  
model.fit(X_train, Y_train)

# Making predictions on the test data
Y_pred = model.predict(X_test)

# Calculating the Mean Absolute Error (MAE)
mae = mean_absolute_error(Y_test, Y_pred)
print(f"Mean Absolute Error (MAE): {mae:.3f}")

Mean Absolute Error (MAE): 7423.916


#### 3. Gradient Boosting Machines (AdaBoost)

In [68]:
from pandas import read_csv
from sklearn.model_selection import train_test_split
from sklearn.ensemble import AdaBoostRegressor
from sklearn.metrics import mean_absolute_error

# Loading the dataset
filename = 'insurance.csv'
dataframe = read_csv(filename)

# Selecting features and target variable
X = dataframe[['age', 'sex', 'bmi', 'children', 'smoker','region']]  # Features
Y = dataframe['charges']  # Target variable


# Splitting the data into a training set and a testing set (80:20 split)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.20, random_state=50)

# AdaBoost Regressor model
model = AdaBoostRegressor(n_estimators=50, random_state=50)  
model.fit(X_train, Y_train)

# Making predictions on the test data
Y_pred = model.predict(X_test)

# Calculating the Mean Absolute Error (MAE)
mae = mean_absolute_error(Y_test, Y_pred)
print(f"Mean Absolute Error (MAE): {mae:.3f}")

Mean Absolute Error (MAE): 3922.616


#### 4. K-Nearest Neighbors (K-NN)

In [69]:
from pandas import read_csv
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_absolute_error

# Loading the dataset
filename = 'insurance.csv'
dataframe = read_csv(filename)

# Selecting features and target variable
X = dataframe[['age', 'sex', 'bmi', 'children', 'smoker','region']]  # Features
Y = dataframe['charges']  # Target variable


# Spliting the data into a training set and a testing set (80:20 split)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.20, random_state=50)

# K-Nearest Neighbors (K-NN) Regressor model
model = KNeighborsRegressor(n_neighbors=5)  
model.fit(X_train, Y_train)

# Making predictions on the test data
Y_pred = model.predict(X_test)

# Calculating the Mean Absolute Error (MAE)
mae = mean_absolute_error(Y_test, Y_pred)
print(f"Mean Absolute Error (MAE): {mae:.3f}")

Mean Absolute Error (MAE): 7872.695


#### 5. Lasso Regression

In [70]:
from pandas import read_csv
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_absolute_error

# Loading the dataset
filename = 'insurance.csv'
dataframe = read_csv(filename)

# Selecting features and target variable
X = dataframe[['age', 'sex', 'bmi', 'children', 'smoker','region']]  # Features
Y = dataframe['charges']  # Target variable


# Splitting the data into a training set and a testing set (80:20 split)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.20, random_state=50)

# Lasso Regression model
model = Lasso(alpha=1.0)  
model.fit(X_train, Y_train)

# Making predictions on the test data
Y_pred = model.predict(X_test)

# Calculating the Mean Absolute Error (MAE)
mae = mean_absolute_error(Y_test, Y_pred)
print(f"Mean Absolute Error (MAE): {mae:.3f}")

Mean Absolute Error (MAE): 3972.271


#### 6. Ridge Regression

In [71]:
from pandas import read_csv
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_absolute_error

# Loading the dataset
filename = 'insurance.csv'
dataframe = read_csv(filename)

# Selecting features and target variable
X = dataframe[['age', 'sex', 'bmi', 'children', 'smoker','region']]  # Features
Y = dataframe['charges']  # Target variable


# Split the data into a training set and a testing set (80:20 split)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.20, random_state=50)

# Ridge Regression model
model = Ridge(alpha=1.0)  

# Training the model on the training data
model.fit(X_train, Y_train)

# Making predictions on the test data
Y_pred = model.predict(X_test)

# Calculating the Mean Absolute Error (MAE)
mae = mean_absolute_error(Y_test, Y_pred)
print(f"Mean Absolute Error (MAE): {mae:.3f}")

Mean Absolute Error (MAE): 3984.913


#### 7. Linear Regression

In [72]:
from pandas import read_csv
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error

# Loading the dataset
filename = 'insurance.csv'
dataframe = read_csv(filename)

# Selecting features and target variable
X = dataframe[['age', 'sex', 'bmi', 'children', 'smoker','region']]  # Features
Y = dataframe['charges']  # Target variable


# Splitting the data into a training set and a testing set (80:20 split)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.20, random_state=50)

# Linear Regression model
model = LinearRegression()

# Training the model on the training data
model.fit(X_train, Y_train)

# Making predictions on the test data
Y_pred = model.predict(X_test)

# Calculating the Mean Absolute Error (MAE)
mae = mean_absolute_error(Y_test, Y_pred)
print(f"Mean Absolute Error (MAE): {mae:.3f}")

Mean Absolute Error (MAE): 3971.629


#### 8. Multi-Layer Perceptron (MLP)

In [73]:
from pandas import read_csv
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_absolute_error

# Loading the dataset
filename = 'insurance.csv'
dataframe = read_csv(filename)

# Selecting features and target variable
X = dataframe[['age', 'sex', 'bmi', 'children', 'smoker','region']]  # Features
Y = dataframe['charges']  # Target variable


# Splitting the data into a training set and a testing set (80:20 split)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.20, random_state=42)

# Multi-Layer Perceptron (MLP) Regressor model
model = MLPRegressor(hidden_layer_sizes=(100, 50), activation='relu', solver='adam', max_iter=1000, random_state=42)

# Training the model on the training data
model.fit(X_train, Y_train)

# Making predictions on the test data
Y_pred = model.predict(X_test)

# Calculating the Mean Absolute Error (MAE)
mae = mean_absolute_error(Y_test, Y_pred)
print(f"Mean Absolute Error (MAE): {mae:.3f}")

Mean Absolute Error (MAE): 3970.469




#### 9. Random Forest

In [74]:
from pandas import read_csv
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

# Loading the dataset
filename = 'insurance.csv'
dataframe = read_csv(filename)

# Selecting features and target variable
X = dataframe[['age', 'sex', 'bmi', 'children', 'smoker','region']]  # Features
Y = dataframe['charges']  # Target variable


# Splitting the data into a training set and a testing set (80:20 split)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.20, random_state=50)

# Random Forest Regressor model
model = RandomForestRegressor(n_estimators=100, random_state=50)

# Training the model on the training data
model.fit(X_train, Y_train)

# Making predictions on the test data
Y_pred = model.predict(X_test)

# Calculating the Mean Absolute Error (MAE)
mae = mean_absolute_error(Y_test, Y_pred)
print(f"Mean Absolute Error (MAE): {mae:.3f}")

Mean Absolute Error (MAE): 2575.167


### SET B
- Resampling Technique: K-Fold Cross-Validation
- Regression Metrics: Mean Squared Error (MSE)

#### 1. CART (Classification and Regression Trees)

In [75]:
from pandas import read_csv
from sklearn.model_selection import KFold
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
import numpy as np

# Loading the dataset
filename = 'insurance.csv'
dataframe = read_csv(filename)
array = dataframe.values
X = array[:, 0:5]  # Features
Y = array[:, 6]  # Target

# Number of folds for K-Fold Cross Validation
num_folds = 10
mse_scores = []

# CART (Decision Tree Regressor) model
model = DecisionTreeRegressor()

# Initializing the K-Fold cross-validator
kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)

# Performing K-Fold Cross Validation
for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    Y_train, Y_test = Y[train_index], Y[test_index]

    # Training the model on the training data
    model.fit(X_train, Y_train)

    # Making predictions on the test data
    Y_pred = model.predict(X_test)

    # Calculating the Mean Squared Error (MSE) for this fold
    mse = mean_squared_error(Y_test, Y_pred)
    mse_scores.append(mse)

# Calculating the mean MSE across all folds
mean_mse = np.mean(mse_scores)

# Displaying the MSE for each fold
for fold, mse in enumerate(mse_scores, start=1):
    print(f"Fold {fold} - Mean Squared Error (MSE): {mse:.3f}")

# Displaying the mean MSE across all folds
print(f"Mean MSE across all folds: {mean_mse:.3f}")

Fold 1 - Mean Squared Error (MSE): 46384985.612
Fold 2 - Mean Squared Error (MSE): 31937414.029
Fold 3 - Mean Squared Error (MSE): 43545523.251
Fold 4 - Mean Squared Error (MSE): 54299403.863
Fold 5 - Mean Squared Error (MSE): 48009707.537
Fold 6 - Mean Squared Error (MSE): 44944175.901
Fold 7 - Mean Squared Error (MSE): 37748030.552
Fold 8 - Mean Squared Error (MSE): 41642946.183
Fold 9 - Mean Squared Error (MSE): 45994270.061
Fold 10 - Mean Squared Error (MSE): 47927535.007
Mean MSE across all folds: 44243399.200


#### 2. Elastic Net

In [76]:
from pandas import read_csv
from sklearn.model_selection import KFold
from sklearn.linear_model import ElasticNet
from sklearn.metrics import mean_squared_error
import numpy as np

# Loading the dataset
filename = 'insurance.csv'
dataframe = read_csv(filename)
array = dataframe.values
X = array[:, 0:5]  # Features
Y = array[:, 6]  # Target

# Number of folds for K-Fold Cross Validation
num_folds = 10
mse_scores = []

# Elastic Net model
model = ElasticNet(alpha=1.0, l1_ratio=0.5)

# Initializing the K-Fold cross-validator
kf = KFold(n_splits=num_folds, shuffle=True, random_state=50)

# Performing K-Fold Cross Validation
for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    Y_train, Y_test = Y[train_index], Y[test_index]

    # Train the model on the training data
    model.fit(X_train, Y_train)

    # Make predictions on the test data
    Y_pred = model.predict(X_test)

    # Calculate the Mean Squared Error (MSE) for this fold
    mse = mean_squared_error(Y_test, Y_pred)
    mse_scores.append(mse)

# Calculate the mean MSE across all folds
mean_mse = np.mean(mse_scores)

# Display the MSE for each fold
for fold, mse in enumerate(mse_scores, start=1):
    print(f"Fold {fold} - Mean Squared Error (MSE): {mse:.3f}")

# Display the mean MSE across all folds
print(f"Mean MSE across all folds: {mean_mse:.3f}")


Fold 1 - Mean Squared Error (MSE): 80625515.187
Fold 2 - Mean Squared Error (MSE): 109085227.723
Fold 3 - Mean Squared Error (MSE): 103484457.802
Fold 4 - Mean Squared Error (MSE): 74671879.653
Fold 5 - Mean Squared Error (MSE): 85598839.611
Fold 6 - Mean Squared Error (MSE): 91482166.743
Fold 7 - Mean Squared Error (MSE): 88585041.027
Fold 8 - Mean Squared Error (MSE): 113077994.736
Fold 9 - Mean Squared Error (MSE): 81208205.421
Fold 10 - Mean Squared Error (MSE): 70071423.439
Mean MSE across all folds: 89789075.134


#### 3. Gradient Boosting Machines (AdaBoost)

In [77]:
from pandas import read_csv
from sklearn.model_selection import KFold
from sklearn.ensemble import AdaBoostRegressor
from sklearn.metrics import mean_squared_error
import numpy as np

# Loading the dataset
filename = 'insurance.csv'
dataframe = read_csv(filename)
array = dataframe.values
X = array[:, 0:5]  # Features
Y = array[:, 6]  # Target

# Number of folds for K-Fold Cross Validation
num_folds = 10
mse_scores = []

# AdaBoost Regressor model
model = AdaBoostRegressor(n_estimators=50, random_state=42)

# Initializing the K-Fold cross-validator
kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)

# Performing K-Fold Cross Validation
for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    Y_train, Y_test = Y[train_index], Y[test_index]

    # Training the model on the training data
    model.fit(X_train, Y_train)

    # Making predictions on the test data
    Y_pred = model.predict(X_test)

    # Calculating the Mean Squared Error (MSE) for this fold
    mse = mean_squared_error(Y_test, Y_pred)
    mse_scores.append(mse)

# Calculating the mean MSE across all folds
mean_mse = np.mean(mse_scores)

# Displaying the MSE for each fold
for fold, mse in enumerate(mse_scores, start=1):
    print(f"Fold {fold} - Mean Squared Error (MSE): {mse:.3f}")

# Display the mean MSE across all folds
print(f"Mean MSE across all folds: {mean_mse:.3f}")

Fold 1 - Mean Squared Error (MSE): 25352470.148
Fold 2 - Mean Squared Error (MSE): 25028263.948
Fold 3 - Mean Squared Error (MSE): 23599945.191
Fold 4 - Mean Squared Error (MSE): 26549615.885
Fold 5 - Mean Squared Error (MSE): 23058028.890
Fold 6 - Mean Squared Error (MSE): 25241173.348
Fold 7 - Mean Squared Error (MSE): 20097805.926
Fold 8 - Mean Squared Error (MSE): 26967477.739
Fold 9 - Mean Squared Error (MSE): 33560961.683
Fold 10 - Mean Squared Error (MSE): 22506811.207
Mean MSE across all folds: 25196255.396


#### 4. K-Nearest Neighbors (K-NN)

In [78]:
from pandas import read_csv
from sklearn.model_selection import KFold
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error
import numpy as np

# Loading the dataset
filename = 'insurance.csv'
dataframe = read_csv(filename)
array = dataframe.values
X = array[:, 0:5]  # Features
Y = array[:, 6]  # Target

# Number of folds for K-Fold Cross Validation
num_folds = 10
mse_scores = []

# Create a K-Nearest Neighbors (K-NN) Regressor model
model = KNeighborsRegressor(n_neighbors=5)  

# Initialize the K-Fold cross-validator
kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)

# Perform K-Fold Cross Validation
for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    Y_train, Y_test = Y[train_index], Y[test_index]

    # Train the model on the training data
    model.fit(X_train, Y_train)

    # Make predictions on the test data
    Y_pred = model.predict(X_test)

    # Calculate the Mean Squared Error (MSE) for this fold
    mse = mean_squared_error(Y_test, Y_pred)
    mse_scores.append(mse)

# Calculate the mean MSE across all folds
mean_mse = np.mean(mse_scores)

# Display the MSE for each fold
for fold, mse in enumerate(mse_scores, start=1):
    print(f"Fold {fold} - Mean Squared Error (MSE): {mse:.3f}")

# Display the mean MSE across all folds
print(f"Mean MSE across all folds: {mean_mse:.3f}")

Fold 1 - Mean Squared Error (MSE): 99950405.272
Fold 2 - Mean Squared Error (MSE): 150531410.352
Fold 3 - Mean Squared Error (MSE): 107432288.160
Fold 4 - Mean Squared Error (MSE): 138514281.647
Fold 5 - Mean Squared Error (MSE): 142959734.435
Fold 6 - Mean Squared Error (MSE): 115358753.500
Fold 7 - Mean Squared Error (MSE): 104743715.588
Fold 8 - Mean Squared Error (MSE): 91142269.190
Fold 9 - Mean Squared Error (MSE): 157227758.715
Fold 10 - Mean Squared Error (MSE): 106363626.404
Mean MSE across all folds: 121422424.326


#### 5. Lasso Regression

In [79]:
from pandas import read_csv
from sklearn.model_selection import KFold
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error
import numpy as np

# Loading the dataset
filename = 'insurance.csv'
dataframe = read_csv(filename)
array = dataframe.values
X = array[:, 0:5]  # Features
Y = array[:, 6]  # Target

# Set the number of folds for K-Fold Cross Validation
num_folds = 10
mse_scores = []

# Create a Lasso Regression model
model = Lasso(alpha=1.0)  # You can adjust the regularization strength (alpha) as needed

# Initialize the K-Fold cross-validator
kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)

# Perform K-Fold Cross Validation
for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    Y_train, Y_test = Y[train_index], Y[test_index]

    # Train the model on the training data
    model.fit(X_train, Y_train)

    # Make predictions on the test data
    Y_pred = model.predict(X_test)

    # Calculate the Mean Squared Error (MSE) for this fold
    mse = mean_squared_error(Y_test, Y_pred)
    mse_scores.append(mse)

# Calculate the mean MSE across all folds
mean_mse = np.mean(mse_scores)

# Display the MSE for each fold
for fold, mse in enumerate(mse_scores, start=1):
    print(f"Fold {fold} - Mean Squared Error (MSE): {mse:.3f}")

# Display the mean MSE across all folds
print(f"Mean MSE across all folds: {mean_mse:.3f}")


Fold 1 - Mean Squared Error (MSE): 32799214.424
Fold 2 - Mean Squared Error (MSE): 35708802.602
Fold 3 - Mean Squared Error (MSE): 33784824.638
Fold 4 - Mean Squared Error (MSE): 40351588.540
Fold 5 - Mean Squared Error (MSE): 33557643.741
Fold 6 - Mean Squared Error (MSE): 33535741.391
Fold 7 - Mean Squared Error (MSE): 39926983.212
Fold 8 - Mean Squared Error (MSE): 42321565.451
Fold 9 - Mean Squared Error (MSE): 47622845.733
Fold 10 - Mean Squared Error (MSE): 30426005.788
Mean MSE across all folds: 37003521.552


#### 6. Ridge Regression

In [80]:
from pandas import read_csv
from sklearn.model_selection import KFold
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
import numpy as np

# Loading the dataset
filename = 'insurance.csv'
dataframe = read_csv(filename)
array = dataframe.values
X = array[:, 0:5]  # Features
Y = array[:, 6]  # Target

# Set the number of folds for K-Fold Cross Validation
num_folds = 10
mse_scores = []

# Create a Ridge Regression model
model = Ridge(alpha=1.0)  # You can adjust the regularization strength (alpha) as needed

# Initialize the K-Fold cross-validator
kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)

# Perform K-Fold Cross Validation
for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    Y_train, Y_test = Y[train_index], Y[test_index]

    # Train the model on the training data
    model.fit(X_train, Y_train)

    # Make predictions on the test data
    Y_pred = model.predict(X_test)

    # Calculate the Mean Squared Error (MSE) for this fold
    mse = mean_squared_error(Y_test, Y_pred)
    mse_scores.append(mse)

# Calculate the mean MSE across all folds
mean_mse = np.mean(mse_scores)

# Display the MSE for each fold
for fold, mse in enumerate(mse_scores, start=1):
    print(f"Fold {fold} - Mean Squared Error (MSE): {mse:.3f}")

# Display the mean MSE across all folds
print(f"Mean MSE across all folds: {mean_mse:.3f}")

Fold 1 - Mean Squared Error (MSE): 32768511.621
Fold 2 - Mean Squared Error (MSE): 35804236.747
Fold 3 - Mean Squared Error (MSE): 33791956.101
Fold 4 - Mean Squared Error (MSE): 40394878.761
Fold 5 - Mean Squared Error (MSE): 33564900.348
Fold 6 - Mean Squared Error (MSE): 33583726.090
Fold 7 - Mean Squared Error (MSE): 39813453.694
Fold 8 - Mean Squared Error (MSE): 42197582.789
Fold 9 - Mean Squared Error (MSE): 47656964.075
Fold 10 - Mean Squared Error (MSE): 30477287.168
Mean MSE across all folds: 37005349.740


#### 7. Linear Regression

In [81]:
from pandas import read_csv
from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import numpy as np

# Loading the dataset
filename = 'insurance.csv'
dataframe = read_csv(filename)
array = dataframe.values
X = array[:, 0:5]  # Features
Y = array[:, 6]  # Target

# Set the number of folds for K-Fold Cross Validation
num_folds = 10
mse_scores = []

# Create a Linear Regression model
model = LinearRegression()

# Initialize the K-Fold cross-validator
kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)

# Perform K-Fold Cross Validation
for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    Y_train, Y_test = Y[train_index], Y[test_index]

    # Train the model on the training data
    model.fit(X_train, Y_train)

    # Make predictions on the test data
    Y_pred = model.predict(X_test)

    # Calculate the Mean Squared Error (MSE) for this fold
    mse = mean_squared_error(Y_test, Y_pred)
    mse_scores.append(mse)

# Calculate the mean MSE across all folds
mean_mse = np.mean(mse_scores)

# Display the MSE for each fold
for fold, mse in enumerate(mse_scores, start=1):
    print(f"Fold {fold} - Mean Squared Error (MSE): {mse:.3f}")

# Display the mean MSE across all folds
print(f"Mean MSE across all folds: {mean_mse:.3f}")

Fold 1 - Mean Squared Error (MSE): 32802945.334
Fold 2 - Mean Squared Error (MSE): 35708661.083
Fold 3 - Mean Squared Error (MSE): 33782750.293
Fold 4 - Mean Squared Error (MSE): 40347979.377
Fold 5 - Mean Squared Error (MSE): 33559606.243
Fold 6 - Mean Squared Error (MSE): 33536207.389
Fold 7 - Mean Squared Error (MSE): 39937477.978
Fold 8 - Mean Squared Error (MSE): 42326309.135
Fold 9 - Mean Squared Error (MSE): 47619600.070
Fold 10 - Mean Squared Error (MSE): 30423432.982
Mean MSE across all folds: 37004496.989


#### 8. Multi-Layer Perceptron (MLP)

In [82]:
from pandas import read_csv
from sklearn.model_selection import KFold
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error
import numpy as np

# Loading the dataset
filename = 'insurance.csv'
dataframe = read_csv(filename)
array = dataframe.values
X = array[:, 0:5]  # Features
Y = array[:, 6]  # Target

# Set the number of folds for K-Fold Cross Validation
num_folds = 10
mse_scores = []

# Create an MLP Regression model
model = MLPRegressor(hidden_layer_sizes=(100, 50), activation='relu', solver='adam', max_iter=250, random_state=42)

# Initialize the K-Fold cross-validator
kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)

# Perform K-Fold Cross Validation
for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    Y_train, Y_test = Y[train_index], Y[test_index]

    # Train the model on the training data
    model.fit(X_train, Y_train)

    # Make predictions on the test data
    Y_pred = model.predict(X_test)

    # Calculate the Mean Squared Error (MSE) for this fold
    mse = mean_squared_error(Y_test, Y_pred)
    mse_scores.append(mse)

# Calculate the mean MSE across all folds
mean_mse = np.mean(mse_scores)

# Display the MSE for each fold
for fold, mse in enumerate(mse_scores, start=1):
    print(f"Fold {fold} - Mean Squared Error (MSE): {mse:.3f}")

# Display the mean MSE across all folds
print(f"Mean MSE across all folds: {mean_mse:.3f}")



Fold 1 - Mean Squared Error (MSE): 111746319.115
Fold 2 - Mean Squared Error (MSE): 139540362.846
Fold 3 - Mean Squared Error (MSE): 112574235.655
Fold 4 - Mean Squared Error (MSE): 132931811.297
Fold 5 - Mean Squared Error (MSE): 153662552.248
Fold 6 - Mean Squared Error (MSE): 118763327.135
Fold 7 - Mean Squared Error (MSE): 108979365.109
Fold 8 - Mean Squared Error (MSE): 93082704.690
Fold 9 - Mean Squared Error (MSE): 150889797.245
Fold 10 - Mean Squared Error (MSE): 112218737.074
Mean MSE across all folds: 123438921.241




#### 9. Random Forest

In [83]:
from pandas import read_csv
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import numpy as np
from joblib import dump

# Loading the dataset
filename = 'insurance.csv'
dataframe = read_csv(filename)
array = dataframe.values
X = array[:, 0:5]  # Features
Y = array[:, 6]  # Target

# Set the number of folds for K-Fold Cross Validation
num_folds = 10
mse_scores = []

# Create a Random Forest Regression model
model = RandomForestRegressor(n_estimators=100, random_state=42)

# Initialize the K-Fold cross-validator
kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)

# Perform K-Fold Cross Validation
for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    Y_train, Y_test = Y[train_index], Y[test_index]

    # Train the model on the training data
    model.fit(X_train, Y_train)

    # Make predictions on the test data
    Y_pred = model.predict(X_test)

    # Calculate the Mean Squared Error (MSE) for this fold
    mse = mean_squared_error(Y_test, Y_pred)
    mse_scores.append(mse)

# Calculate the mean MSE across all folds
mean_mse = np.mean(mse_scores)

# Display the MSE for each fold
for fold, mse in enumerate(mse_scores, start=1):
    print(f"Fold {fold} - Mean Squared Error (MSE): {mse:.3f}")

# Display the mean MSE across all folds
print(f"Mean MSE across all folds: {mean_mse:.3f}")

dump(model, '../CS1/model.joblib')

Fold 1 - Mean Squared Error (MSE): 23314202.527
Fold 2 - Mean Squared Error (MSE): 21269279.504
Fold 3 - Mean Squared Error (MSE): 22062739.474
Fold 4 - Mean Squared Error (MSE): 25857905.063
Fold 5 - Mean Squared Error (MSE): 20979305.574
Fold 6 - Mean Squared Error (MSE): 27159647.911
Fold 7 - Mean Squared Error (MSE): 25115276.427
Fold 8 - Mean Squared Error (MSE): 29768690.514
Fold 9 - Mean Squared Error (MSE): 35647547.959
Fold 10 - Mean Squared Error (MSE): 23769485.053
Mean MSE across all folds: 25494408.001


['../CS1/model.joblib']

#### Set A Tuning
- ML Algorithm: Random Forest
- Sampling Technique: Train/Test Split (80:20) 
- Regression Metrics: Mean Absolute Error (MAE)

In [91]:
from pandas import read_csv
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from sklearn.metrics import mean_absolute_error

# Loading the dataset
filename = 'insurance.csv'
dataframe = read_csv(filename)
array = dataframe.values
X = array[:, 0:5]  # Features
Y = array[:, 6]  # Target

# Splitting the data into a training set and a testing set (80:20 split)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.20, random_state=50)

# Support Vector Machine (SVM) model
epsilon = 0.1  # Set your epsilon value
kernel = 'linear'  # You can also use 'rbf' (radial basis function) or other kernels
C = 1.0  # Set your C value

model = SVR(epsilon=epsilon, kernel=kernel, C=C)

# Training the model on the training data
model.fit(X_train, Y_train)

# Making predictions on the test data
Y_pred = model.predict(X_test)

# Calculating the Mean Absolute Error (MAE)
mae = mean_absolute_error(Y_test, Y_pred)
print(f"Mean Absolute Error (MAE): {mae:.3f}")


Mean Absolute Error (MAE): 6912.019


In [85]:
from pandas import read_csv
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

# Loading the dataset
filename = 'insurance.csv'
dataframe = read_csv(filename)
array = dataframe.values
X = array[:, 0:5]  # Features
Y = array[:, 6]  # Target

# Splitting the data into a training set and a testing set (80:20 split)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.20, random_state=50)

# Random Forest Regressor model
model = RandomForestRegressor(n_estimators=150, random_state=25, max_depth=None)

# Training the model on the training data
model.fit(X_train, Y_train)

# Making predictions on the test data
Y_pred = model.predict(X_test)

# Calculating the Mean Absolute Error (MAE)
mae = mean_absolute_error(Y_test, Y_pred)
print(f"Mean Absolute Error (MAE): {mae:.3f}")

Mean Absolute Error (MAE): 2669.855


In [86]:
from pandas import read_csv
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

# Loading the dataset
filename = 'insurance.csv'
dataframe = read_csv(filename)
array = dataframe.values
X = array[:, 0:5]  # Features
Y = array[:, 6]  # Target

# Splitting the data into a training set and a testing set (80:20 split)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.20, random_state=50)

# Random Forest Regressor model
model = RandomForestRegressor(n_estimators=200, random_state=0, max_depth=None)

# Training the model on the training data
model.fit(X_train, Y_train)

# Making predictions on the test data
Y_pred = model.predict(X_test)

# Calculating the Mean Absolute Error (MAE)
mae = mean_absolute_error(Y_test, Y_pred)
print(f"Mean Absolute Error (MAE): {mae:.3f}")

Mean Absolute Error (MAE): 2691.393


#### Set B Tuning
- ML Algorithm: Support Vector Machines (SVM)
- Sampling Technique: K-Fold Cross Validation
- Regression Metrics: Mean Squared Error (MSE)

In [93]:
from pandas import read_csv
from sklearn.model_selection import KFold
from sklearn.svm import SVR
from sklearn.metrics import mean_absolute_error
import numpy as np

# Loading the dataset
filename = 'insurance.csv'
dataframe = read_csv(filename)
array = dataframe.values
X = array[:, 0:5]  # Features
Y = array[:, 6]  # Target

# Set the number of folds for K-Fold Cross Validation
num_folds = 10
mae_scores = []

# Create a Support Vector Machine (SVM) Regression model
epsilon = 0.2  # Set your epsilon value
kernel = 'linear'  # You can also use 'rbf' (radial basis function) or other kernels
C = 1.25  # Set your C value

model = SVR(epsilon=epsilon, kernel=kernel, C=C)

# Initialize the K-Fold cross-validator
kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)

# Perform K-Fold Cross Validation
for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    Y_train, Y_test = Y[train_index], Y[test_index]

    # Train the model on the training data
    model.fit(X_train, Y_train)

    # Make predictions on the test data
    Y_pred = model.predict(X_test)

    # Calculate the Mean Absolute Error (MAE) for this fold
    mae = mean_absolute_error(Y_test, Y_pred)
    mae_scores.append(mae)

# Calculate the mean MAE across all folds
mean_mae = np.mean(mae_scores)

# Display the MAE for each fold
for fold, mae in enumerate(mae_scores, start=1):
    print(f"Fold {fold} - Mean Absolute Error (MAE): {mae:.3f}")

# Display the mean MAE across all folds
print(f"Mean MAE across all folds: {mean_mae:.3f}")


Fold 1 - Mean Absolute Error (MAE): 6001.009
Fold 2 - Mean Absolute Error (MAE): 6853.711
Fold 3 - Mean Absolute Error (MAE): 6103.746
Fold 4 - Mean Absolute Error (MAE): 7217.863
Fold 5 - Mean Absolute Error (MAE): 8282.548
Fold 6 - Mean Absolute Error (MAE): 6178.653
Fold 7 - Mean Absolute Error (MAE): 6176.869
Fold 8 - Mean Absolute Error (MAE): 5621.439
Fold 9 - Mean Absolute Error (MAE): 7469.451
Fold 10 - Mean Absolute Error (MAE): 5640.639
Mean MAE across all folds: 6554.593


In [94]:
from pandas import read_csv
from sklearn.model_selection import KFold
from sklearn.svm import SVR
from sklearn.metrics import mean_absolute_error
import numpy as np

# Loading the dataset
filename = 'insurance.csv'
dataframe = read_csv(filename)
array = dataframe.values
X = array[:, 0:5]  # Features
Y = array[:, 6]  # Target

# Set the number of folds for K-Fold Cross Validation
num_folds = 10
mae_scores = []

# Create a Support Vector Machine (SVM) Regression model with a polynomial kernel
epsilon = 0.2  # Set your epsilon value
kernel = 'poly'  # Use a polynomial kernel
C = 1.0  # Set your C value

model = SVR(epsilon=epsilon, kernel=kernel, C=C)

# Initialize the K-Fold cross-validator
kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)

# Perform K-Fold Cross Validation
for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    Y_train, Y_test = Y[train_index], Y[test_index]

    # Train the model on the training data
    model.fit(X_train, Y_train)

    # Make predictions on the test data
    Y_pred = model.predict(X_test)

    # Calculate the Mean Absolute Error (MAE) for this fold
    mae = mean_absolute_error(Y_test, Y_pred)
    mae_scores.append(mae)

# Calculate the mean MAE across all folds
mean_mae = np.mean(mae_scores)

# Display the MAE for each fold
for fold, mae in enumerate(mae_scores, start=1):
    print(f"Fold {fold} - Mean Absolute Error (MAE): {mae:.3f}")

# Display the mean MAE across all folds
print(f"Mean MAE across all folds: {mean_mae:.3f}")


Fold 1 - Mean Absolute Error (MAE): 7460.851
Fold 2 - Mean Absolute Error (MAE): 8196.171
Fold 3 - Mean Absolute Error (MAE): 6947.435
Fold 4 - Mean Absolute Error (MAE): 8272.296
Fold 5 - Mean Absolute Error (MAE): 9085.898
Fold 6 - Mean Absolute Error (MAE): 7341.842
Fold 7 - Mean Absolute Error (MAE): 6989.161
Fold 8 - Mean Absolute Error (MAE): 6737.231
Fold 9 - Mean Absolute Error (MAE): 8635.806
Fold 10 - Mean Absolute Error (MAE): 6887.753
Mean MAE across all folds: 7655.444


In [89]:
from pandas import read_csv
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import numpy as np

# Loading the dataset
filename = 'insurance.csv'
dataframe = read_csv(filename)
array = dataframe.values
X = array[:, 0:5]  # Features
Y = array[:, 6]  # Target
# Set the number of folds for K-Fold Cross Validation
num_folds = 10
mse_scores = []

# Create a Random Forest Regression model
model = RandomForestRegressor(n_estimators=200, random_state=22, max_depth=None)

# Initialize the K-Fold cross-validator
kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)

# Perform K-Fold Cross Validation
for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    Y_train, Y_test = Y[train_index], Y[test_index]

    # Train the model on the training data
    model.fit(X_train, Y_train)

    # Make predictions on the test data
    Y_pred = model.predict(X_test)

    # Calculate the Mean Squared Error (MSE) for this fold
    mse = mean_squared_error(Y_test, Y_pred)
    mse_scores.append(mse)

# Calculate the mean MSE across all folds
mean_mse = np.mean(mse_scores)

# Display the MSE for each fold
for fold, mse in enumerate(mse_scores, start=1):
    print(f"Fold {fold} - Mean Squared Error (MSE): {mse:.3f}")

# Display the mean MSE across all folds
print(f"Mean MSE across all folds: {mean_mse:.3f}")

Fold 1 - Mean Squared Error (MSE): 23456021.190
Fold 2 - Mean Squared Error (MSE): 20989722.103
Fold 3 - Mean Squared Error (MSE): 22363489.711
Fold 4 - Mean Squared Error (MSE): 27287909.223
Fold 5 - Mean Squared Error (MSE): 20095021.109
Fold 6 - Mean Squared Error (MSE): 27765124.138
Fold 7 - Mean Squared Error (MSE): 24124779.664
Fold 8 - Mean Squared Error (MSE): 29298183.313
Fold 9 - Mean Squared Error (MSE): 35949867.546
Fold 10 - Mean Squared Error (MSE): 23690426.517
Mean MSE across all folds: 25502054.451
