In [1]:
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import warnings
from scipy.stats import boxcox

In [2]:
# Load the sonar dataset into a DataFrame.
# The CSV has no header row, so header=None makes pandas label the columns
# 0, 1, 2, ... instead of consuming the first record as column names.
# The path is specific to the author's machine; point it at your own copy.
df = pd.read_csv(
    filepath_or_buffer="C:/Users/ABHISHEK DEORE/OneDrive/Desktop/Projects/Rock VS Mine Prediction/Data/SonarData.csv",
    header=None,
)

In [3]:
# Lift pandas' display limits (column count, total width, cell width) so wide
# frames are rendered in full instead of being truncated.
for option_name in ('display.max_columns', 'display.width', 'display.max_colwidth'):
    pd.set_option(option_name, None)

In [4]:
# Peek at the first five records to confirm the file was parsed as expected.
df.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60
0,0.02,0.0371,0.0428,0.0207,0.0954,0.0986,0.1539,0.1601,0.3109,0.2111,0.1609,0.1582,0.2238,0.0645,0.066,0.2273,0.31,0.2999,0.5078,0.4797,0.5783,0.5071,0.4328,0.555,0.6711,0.6415,0.7104,0.808,0.6791,0.3857,0.1307,0.2604,0.5121,0.7547,0.8537,0.8507,0.6692,0.6097,0.4943,0.2744,0.051,0.2834,0.2825,0.4256,0.2641,0.1386,0.1051,0.1343,0.0383,0.0324,0.0232,0.0027,0.0065,0.0159,0.0072,0.0167,0.018,0.0084,0.009,0.0032,R
1,0.0453,0.0523,0.0843,0.0689,0.1183,0.2583,0.2156,0.3481,0.3337,0.2872,0.4918,0.6552,0.6919,0.7797,0.7464,0.9444,1.0,0.8874,0.8024,0.7818,0.5212,0.4052,0.3957,0.3914,0.325,0.32,0.3271,0.2767,0.4423,0.2028,0.3788,0.2947,0.1984,0.2341,0.1306,0.4182,0.3835,0.1057,0.184,0.197,0.1674,0.0583,0.1401,0.1628,0.0621,0.0203,0.053,0.0742,0.0409,0.0061,0.0125,0.0084,0.0089,0.0048,0.0094,0.0191,0.014,0.0049,0.0052,0.0044,R
2,0.0262,0.0582,0.1099,0.1083,0.0974,0.228,0.2431,0.3771,0.5598,0.6194,0.6333,0.706,0.5544,0.532,0.6479,0.6931,0.6759,0.7551,0.8929,0.8619,0.7974,0.6737,0.4293,0.3648,0.5331,0.2413,0.507,0.8533,0.6036,0.8514,0.8512,0.5045,0.1862,0.2709,0.4232,0.3043,0.6116,0.6756,0.5375,0.4719,0.4647,0.2587,0.2129,0.2222,0.2111,0.0176,0.1348,0.0744,0.013,0.0106,0.0033,0.0232,0.0166,0.0095,0.018,0.0244,0.0316,0.0164,0.0095,0.0078,R
3,0.01,0.0171,0.0623,0.0205,0.0205,0.0368,0.1098,0.1276,0.0598,0.1264,0.0881,0.1992,0.0184,0.2261,0.1729,0.2131,0.0693,0.2281,0.406,0.3973,0.2741,0.369,0.5556,0.4846,0.314,0.5334,0.5256,0.252,0.209,0.3559,0.626,0.734,0.612,0.3497,0.3953,0.3012,0.5408,0.8814,0.9857,0.9167,0.6121,0.5006,0.321,0.3202,0.4295,0.3654,0.2655,0.1576,0.0681,0.0294,0.0241,0.0121,0.0036,0.015,0.0085,0.0073,0.005,0.0044,0.004,0.0117,R
4,0.0762,0.0666,0.0481,0.0394,0.059,0.0649,0.1209,0.2467,0.3564,0.4459,0.4152,0.3952,0.4256,0.4135,0.4528,0.5326,0.7306,0.6193,0.2032,0.4636,0.4148,0.4292,0.573,0.5399,0.3161,0.2285,0.6995,1.0,0.7262,0.4724,0.5103,0.5459,0.2881,0.0981,0.1951,0.4181,0.4604,0.3217,0.2828,0.243,0.1979,0.2444,0.1847,0.0841,0.0692,0.0528,0.0357,0.0085,0.023,0.0046,0.0156,0.0031,0.0054,0.0105,0.011,0.0015,0.0072,0.0048,0.0107,0.0094,R


In [5]:
# Report the DataFrame's dimensions as (number_of_rows, number_of_columns)
# to confirm the whole file was loaded.
df.shape

(208, 61)

In [6]:
# Summary statistics for the numeric columns: count, mean, standard deviation,
# min, max and the 25th/50th/75th percentiles.
df.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59
count,208.0,208.0,208.0,208.0,208.0,208.0,208.0,208.0,208.0,208.0,208.0,208.0,208.0,208.0,208.0,208.0,208.0,208.0,208.0,208.0,208.0,208.0,208.0,208.0,208.0,208.0,208.0,208.0,208.0,208.0,208.0,208.0,208.0,208.0,208.0,208.0,208.0,208.0,208.0,208.0,208.0,208.0,208.0,208.0,208.0,208.0,208.0,208.0,208.0,208.0,208.0,208.0,208.0,208.0,208.0,208.0,208.0,208.0,208.0,208.0
mean,0.029164,0.038437,0.043832,0.053892,0.075202,0.10457,0.121747,0.134799,0.178003,0.208259,0.236013,0.250221,0.273305,0.296568,0.320201,0.378487,0.415983,0.452318,0.504812,0.563047,0.60906,0.624275,0.646975,0.672654,0.675424,0.699866,0.702155,0.694024,0.642074,0.580928,0.504475,0.43904,0.41722,0.403233,0.392571,0.384848,0.363807,0.339657,0.3258,0.311207,0.289252,0.278293,0.246542,0.214075,0.197232,0.160631,0.122453,0.091424,0.051929,0.020424,0.016069,0.01342,0.010709,0.010941,0.00929,0.008222,0.00782,0.007949,0.007941,0.006507
std,0.022991,0.03296,0.038428,0.046528,0.055552,0.059105,0.061788,0.085152,0.118387,0.134416,0.132705,0.140072,0.140962,0.164474,0.205427,0.23265,0.263677,0.261529,0.257988,0.262653,0.257818,0.255883,0.250175,0.239116,0.244926,0.237228,0.245657,0.237189,0.24025,0.220749,0.213992,0.213237,0.206513,0.231242,0.259132,0.264121,0.239912,0.212973,0.199075,0.178662,0.171111,0.168728,0.138993,0.133291,0.151628,0.133938,0.086953,0.062417,0.035954,0.013665,0.012008,0.009634,0.00706,0.007301,0.007088,0.005736,0.005785,0.00647,0.006181,0.005031
min,0.0015,0.0006,0.0015,0.0058,0.0067,0.0102,0.0033,0.0055,0.0075,0.0113,0.0289,0.0236,0.0184,0.0273,0.0031,0.0162,0.0349,0.0375,0.0494,0.0656,0.0512,0.0219,0.0563,0.0239,0.024,0.0921,0.0481,0.0284,0.0144,0.0613,0.0482,0.0404,0.0477,0.0212,0.0223,0.008,0.0351,0.0383,0.0371,0.0117,0.036,0.0056,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0008,0.0005,0.001,0.0006,0.0004,0.0003,0.0003,0.0001,0.0006
25%,0.01335,0.01645,0.01895,0.024375,0.03805,0.067025,0.0809,0.080425,0.097025,0.111275,0.12925,0.133475,0.166125,0.175175,0.164625,0.1963,0.20585,0.242075,0.299075,0.350625,0.399725,0.406925,0.450225,0.540725,0.5258,0.544175,0.5319,0.534775,0.4637,0.4114,0.34555,0.2814,0.257875,0.217575,0.179375,0.15435,0.1601,0.174275,0.173975,0.18645,0.1631,0.1589,0.1552,0.126875,0.094475,0.06855,0.06425,0.045125,0.02635,0.01155,0.008425,0.007275,0.005075,0.005375,0.00415,0.0044,0.0037,0.0036,0.003675,0.0031
50%,0.0228,0.0308,0.0343,0.04405,0.0625,0.09215,0.10695,0.1121,0.15225,0.1824,0.2248,0.24905,0.26395,0.2811,0.2817,0.3047,0.3084,0.3683,0.43495,0.5425,0.6177,0.6649,0.6997,0.6985,0.7211,0.7545,0.7456,0.7319,0.6808,0.60715,0.49035,0.4296,0.3912,0.35105,0.31275,0.32115,0.3063,0.3127,0.2835,0.27805,0.2595,0.2451,0.22255,0.1777,0.148,0.12135,0.10165,0.0781,0.0447,0.0179,0.0139,0.0114,0.00955,0.0093,0.0075,0.00685,0.00595,0.0058,0.0064,0.0053
75%,0.03555,0.04795,0.05795,0.0645,0.100275,0.134125,0.154,0.1696,0.233425,0.2687,0.30165,0.33125,0.35125,0.386175,0.452925,0.535725,0.659425,0.67905,0.7314,0.809325,0.816975,0.831975,0.848575,0.872175,0.873725,0.8938,0.9171,0.900275,0.852125,0.735175,0.64195,0.5803,0.556125,0.596125,0.59335,0.556525,0.5189,0.44055,0.4349,0.42435,0.387525,0.38425,0.324525,0.27175,0.23155,0.200375,0.154425,0.1201,0.068525,0.025275,0.020825,0.016725,0.0149,0.0145,0.0121,0.010575,0.010425,0.01035,0.010325,0.008525
max,0.1371,0.2339,0.3059,0.4264,0.401,0.3823,0.3729,0.459,0.6828,0.7106,0.7342,0.706,0.7131,0.997,1.0,0.9988,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.9657,0.9306,1.0,0.9647,1.0,1.0,0.9497,1.0,0.9857,0.9297,0.8995,0.8246,0.7733,0.7762,0.7034,0.7292,0.5522,0.3339,0.1981,0.0825,0.1004,0.0709,0.039,0.0352,0.0447,0.0394,0.0355,0.044,0.0364,0.0439


In [7]:
# Structural summary of the DataFrame: per-column non-null counts and dtypes,
# plus overall memory usage. Useful for spotting missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 208 entries, 0 to 207
Data columns (total 61 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   0       208 non-null    float64
 1   1       208 non-null    float64
 2   2       208 non-null    float64
 3   3       208 non-null    float64
 4   4       208 non-null    float64
 5   5       208 non-null    float64
 6   6       208 non-null    float64
 7   7       208 non-null    float64
 8   8       208 non-null    float64
 9   9       208 non-null    float64
 10  10      208 non-null    float64
 11  11      208 non-null    float64
 12  12      208 non-null    float64
 13  13      208 non-null    float64
 14  14      208 non-null    float64
 15  15      208 non-null    float64
 16  16      208 non-null    float64
 17  17      208 non-null    float64
 18  18      208 non-null    float64
 19  19      208 non-null    float64
 20  20      208 non-null    float64
 21  21      208 non-null    float64
 22  22

In [8]:
# Class balance check: count how often each label occurs in column 60,
# the column this notebook uses as the target.
df.loc[:, 60].value_counts()

60
M    111
R     97
Name: count, dtype: int64

In [9]:
# Split the frame into the label vector and the feature matrix.
# Y: column 60, the classification target.
Y = df[60]

# X: every remaining column, i.e. the features used for prediction.
X = df.drop(columns=[60])

# Print both pieces to verify the separation.
print(X)  # Displays the feature set
print(Y)  # Displays the target variable (classification labels)

         0       1       2       3       4       5       6       7       8   \
0    0.0200  0.0371  0.0428  0.0207  0.0954  0.0986  0.1539  0.1601  0.3109   
1    0.0453  0.0523  0.0843  0.0689  0.1183  0.2583  0.2156  0.3481  0.3337   
2    0.0262  0.0582  0.1099  0.1083  0.0974  0.2280  0.2431  0.3771  0.5598   
3    0.0100  0.0171  0.0623  0.0205  0.0205  0.0368  0.1098  0.1276  0.0598   
4    0.0762  0.0666  0.0481  0.0394  0.0590  0.0649  0.1209  0.2467  0.3564   
..      ...     ...     ...     ...     ...     ...     ...     ...     ...   
203  0.0187  0.0346  0.0168  0.0177  0.0393  0.1630  0.2028  0.1694  0.2328   
204  0.0323  0.0101  0.0298  0.0564  0.0760  0.0958  0.0990  0.1018  0.1030   
205  0.0522  0.0437  0.0180  0.0292  0.0351  0.1171  0.1257  0.1178  0.1258   
206  0.0303  0.0353  0.0490  0.0608  0.0167  0.1354  0.1465  0.1123  0.1945   
207  0.0260  0.0363  0.0136  0.0272  0.0214  0.0338  0.0655  0.1400  0.1843   

         9       10      11      12      13      14

In [None]:
def plot_all_histograms(df, plots_per_row=5):
    """
    Plots histograms for all columns in the DataFrame except the last one.

    Parameters:
    df (pandas.DataFrame): The DataFrame containing the dataset. The last column
                           is skipped; every other column gets its own subplot.
    plots_per_row (int): Number of histograms per grid row. Defaults to 5, which
                         reproduces the original fixed layout.

    Functionality:
    - Creates a histogram with a Kernel Density Estimate (KDE) overlay per column.
    - Computes the number of grid rows from the number of plotted columns.
    - Removes unused subplot slots in the final row.

    Raises:
    ValueError: If plots_per_row is smaller than 1, or if df has no column other
                than the last one (there would be nothing to plot).
    """
    if plots_per_row < 1:
        raise ValueError("plots_per_row must be at least 1")

    # Exclude the last column from plotting
    columns_to_plot = df.columns[:-1]
    n_columns = len(columns_to_plot)

    # Fail with a clear message instead of asking matplotlib for a zero-row grid.
    if n_columns == 0:
        raise ValueError("df needs at least one column besides the last one to plot")

    # Calculate the number of rows needed for the grid
    n_rows = math.ceil(n_columns / plots_per_row)

    # squeeze=False always yields a 2-D axes array, so flatten() works even for
    # a 1x1 grid. Each subplot gets roughly a 4x4 inch cell.
    fig, axes = plt.subplots(
        n_rows,
        plots_per_row,
        figsize=(4 * plots_per_row, 4 * n_rows),
        squeeze=False,
    )
    fig.suptitle('Histograms of all columns (except the last)', fontsize=16)

    # Flatten the axes array for easier iteration
    axes = axes.flatten()

    # One histogram (with KDE) per plotted column
    for ax, column in zip(axes, columns_to_plot):
        sns.histplot(data=df, x=column, ax=ax, kde=True)
        ax.set_title(f'Column {column}')
        ax.set_xlabel('')  # Drop x-axis labels to keep the grid compact

    # Remove any unused subplots left over in the last row
    for unused_ax in axes[n_columns:]:
        fig.delaxes(unused_ax)

    # Adjust layout to prevent overlap of subplots
    plt.tight_layout()

    # Show the histograms
    plt.show()

# Draw the histogram grid for every column of 'df' except the last one.
plot_all_histograms(df)


In [None]:
# Hold out part of the data for evaluation.
# - test_size=0.1 keeps 10% of the rows for testing and 90% for training.
# - stratify=Y preserves the class proportions of Y in both subsets.
# - random_state=1 fixes the shuffle so the split is reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.1,
    stratify=Y,
    random_state=1,
)

# Sanity check: shapes of the full feature set, the training set and the test set.
print(X.shape, X_train.shape, X_test.shape)


In [None]:
# Create the classifier: a logistic regression (linear model for binary
# classification) with scikit-learn's default settings.
# It is fitted on (X_train, Y_train) and evaluated on (X_test, Y_test) in the following cells.
model = LogisticRegression()

In [None]:
# Fit the logistic regression on the training split: fit() learns the weights
# that map the input features (X_train) to the target labels (Y_train).
model.fit(X_train, Y_train)

In [None]:
# Predict labels for the rows the model was trained on.
X_train_prediction = model.predict(X_train)

# Proportion of training rows classified correctly.
# accuracy_score's signature is (y_true, y_pred), so the true labels go first.
# Accuracy is symmetric, so the value is unaffected; the order matches the
# documented API and the later cells in this notebook.
training_data_accuracy = accuracy_score(Y_train, X_train_prediction)

# Printing the accuracy of the model on the training data.
print("The accuracy on training data:", training_data_accuracy)

In [None]:
# Predict labels for the held-out test rows.
X_test_prediction = model.predict(X_test)

# Proportion of test rows classified correctly (performance on unseen data).
# accuracy_score's signature is (y_true, y_pred), so the true labels go first.
# Accuracy is symmetric, so the value is unaffected; the order matches the
# documented API and the later cells in this notebook.
test_data_accuracy = accuracy_score(Y_test, X_test_prediction)

# Printing the accuracy of the model on the test data.
print("The accuracy on test data:", test_data_accuracy)


In [None]:
def normalize_data(df, plot=True):
    """
    Applies various normalization techniques to the feature columns of a DataFrame.

    Parameters:
    df (pandas.DataFrame): The input DataFrame where the last column is the target,
                           and the rest are features to be normalized.
    plot (bool): When True (the default), a histogram figure is drawn for the
                 original features and for each normalized version. Pass False
                 to compute the transformed frames without plotting.

    Normalization Techniques Used:
    - StandardScaler (Z-score normalization)
    - MinMaxScaler
    - Log Transformation (log1p, i.e. log(1 + x))
    - Square Root Transformation
    - Box-Cox Transformation (applied per column to x + 1)

    Note: the scalers and Box-Cox parameters are fitted on all rows of df, so
    these frames are meant for exploration/visualization, not for evaluating a
    model on a held-out split.

    Returns:
    - A dictionary containing DataFrames for the original and each normalized version of the features.
    - The target column (Y) as a separate series.
    """

    # Separate features (X) and target (y)
    X = df.iloc[:, :-1]  # All columns except the last
    y = df.iloc[:, -1]   # Last column (target)

    # StandardScaler (Z-score normalization): zero mean, unit variance per column
    X_standard = pd.DataFrame(
        StandardScaler().fit_transform(X), index=X.index, columns=X.columns
    )

    # MinMaxScaler: rescales each column to the scaler's default range [0, 1]
    X_minmax = pd.DataFrame(
        MinMaxScaler().fit_transform(X), index=X.index, columns=X.columns
    )

    # Log transformation: log1p computes log(1 + x), which is defined at zero
    X_log = np.log1p(X)

    # Square root transformation: reduces the impact of large values
    X_sqrt = np.sqrt(X)

    # Box-Cox transformation, column by column. Box-Cox needs strictly positive
    # input, hence the +1; only the transformed values are kept, not the lambda.
    X_boxcox = pd.DataFrame(
        {column: boxcox(X[column] + 1)[0] for column in X.columns},
        index=X.index,
    )

    # Inner function to plot histograms for each normalization technique
    def plot_histograms(data, title):
        """
        Plots histograms for each feature in the DataFrame to visualize data distribution.

        Parameters:
        data (pandas.DataFrame): The DataFrame to plot.
        title (str): Title for the plot.
        """
        # Keep the original 8x8 layout for up to 64 features; add rows (and
        # proportional figure height) beyond that so the subplot index stays valid.
        grid_columns = 8
        grid_rows = max(8, math.ceil(data.shape[1] / grid_columns))

        plt.figure(figsize=(20, 15 * grid_rows / 8))
        for i, col in enumerate(data.columns):
            plt.subplot(grid_rows, grid_columns, i + 1)
            sns.histplot(data[col], kde=True)
            plt.title(col)
        plt.tight_layout()
        plt.suptitle(title, fontsize=16)  # Main title
        plt.subplots_adjust(top=0.95)  # Adjusting space for title
        plt.show()

    # Plot histograms for the original and normalized datasets
    if plot:
        plot_histograms(X, "Original Data")
        plot_histograms(X_standard, "StandardScaler Normalization")
        plot_histograms(X_minmax, "MinMaxScaler Normalization")
        plot_histograms(X_log, "Log Transformation")
        plot_histograms(X_sqrt, "Square Root Transformation")
        plot_histograms(X_boxcox, "Box-Cox Transformation")

    # Return a dictionary containing all the normalized DataFrames and the target variable
    return {
        'original': X,
        'standard': X_standard,
        'minmax': X_minmax,
        'log': X_log,
        'sqrt': X_sqrt,
        'boxcox': X_boxcox
    }, y

# Build every normalized variant of df's feature columns and keep the target separately.
normalized_data, target = normalize_data(df)


In [None]:
def train_and_evaluate_model(df):
    """
    Trains and evaluates a logistic regression model using the provided DataFrame.

    Parameters:
    df (pandas.DataFrame): The input DataFrame where the last column is the target variable
                           and the remaining columns are the features.

    Functionality:
    - Separates the features (X) and target (y).
    - Splits the dataset into training (80%) and testing (20%) sets.
    - Fits a StandardScaler on the training features only and applies it to both
      splits, so no statistics from the test rows leak into preprocessing.
    - Trains a Logistic Regression model on the standardized training data.
    - Evaluates the model by calculating accuracy on both the training and testing sets.

    Returns:
    - model (LogisticRegression): The trained logistic regression model. It was
      fitted on standardized features; the scaler itself is not returned.
    - train_accuracy (float): The accuracy score on the training data.
    - test_accuracy (float): The accuracy score on the testing data.
    """

    # Separate features (X) and target (y)
    X = df.iloc[:, :-1]  # All columns except the last one
    y = df.iloc[:, -1]   # Last column is the target variable

    # Split first (80% training, 20% testing) so the scaler never sees test rows.
    # random_state=42 ensures the split is reproducible
    X_train_raw, X_test_raw, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # Learn mean/variance from the training rows, then apply the same
    # transformation to the test rows.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train_raw)
    X_test = scaler.transform(X_test_raw)

    # Initialize and train the logistic regression model
    model = LogisticRegression(random_state=42)
    model.fit(X_train, y_train)

    # Make predictions on the training and testing data
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    # Calculate accuracy on the training data
    train_accuracy = accuracy_score(y_train, y_train_pred)

    # Calculate accuracy on the testing data
    test_accuracy = accuracy_score(y_test, y_test_pred)

    # Print the accuracy scores for training and testing sets
    print(f"Training Accuracy: {train_accuracy:.4f}")
    print(f"Testing Accuracy: {test_accuracy:.4f}")

    # Return the trained model along with the training and testing accuracy scores
    return model, train_accuracy, test_accuracy

# Run the StandardScaler + logistic regression experiment on 'df'.
# Note: this rebinds 'model', replacing the classifier fitted in the earlier cells.
model, train_accuracy, test_accuracy = train_and_evaluate_model(df)


In [None]:
def train_and_evaluate_model(df):
    """
    Trains and evaluates a logistic regression model after applying the Box-Cox transformation
    to the feature columns of the input DataFrame.

    NOTE: this function has the same name as the StandardScaler variant defined
    earlier in this notebook; running this cell replaces that definition.

    Parameters:
    df (pandas.DataFrame): The input DataFrame where the last column is the target variable
                           and the remaining columns are the features. It is not modified.

    Functionality:
    - Separates the features (X) and target (y).
    - Splits the dataset into training (80%) and testing (20%) sets.
    - For each feature, estimates the positivity shift and the Box-Cox lambda on
      the training rows only, then applies the same shift and lambda to the test rows.
    - Trains a Logistic Regression model on the transformed training data.
    - Evaluates the model by calculating accuracy on both the training and testing sets.

    Returns:
    - model (LogisticRegression): The trained logistic regression model.
    - train_accuracy (float): The accuracy score on the training data.
    - test_accuracy (float): The accuracy score on the testing data.
    """

    # Separate features (X) and target (y)
    X = df.iloc[:, :-1]  # All columns except the last
    y = df.iloc[:, -1]   # Last column is the target variable

    # Split first so the transformation parameters are learned without test rows.
    X_train_raw, X_test_raw, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # Box-Cox requires strictly positive input; this is the smallest value allowed.
    positive_floor = 1e-5

    train_columns = {}
    test_columns = {}
    for column in X.columns:
        # Work on fresh float arrays so the caller's DataFrame is never written to.
        train_values = X_train_raw[column].to_numpy(dtype=float)
        test_values = X_test_raw[column].to_numpy(dtype=float)

        # Shift the column so its training minimum sits at the floor when it
        # contains zeros or negatives; otherwise leave it unshifted.
        train_min = train_values.min()
        shift = positive_floor - train_min if train_min <= 0 else 0.0
        train_values = train_values + shift

        # Test rows may fall below the training minimum; clip them to the floor
        # so the transform stays defined.
        test_values = np.maximum(test_values + shift, positive_floor)

        # Fit lambda on the training rows (warnings from the optimizer are suppressed)
        with warnings.catch_warnings():
            warnings.filterwarnings('ignore')
            train_columns[column], fitted_lambda = boxcox(train_values)

        # Reuse the fitted lambda for the test rows; with lmbda given, boxcox
        # returns only the transformed array.
        test_columns[column] = boxcox(test_values, lmbda=fitted_lambda)

    X_train = pd.DataFrame(train_columns, index=X_train_raw.index)
    X_test = pd.DataFrame(test_columns, index=X_test_raw.index)

    # Initialize and train the logistic regression model
    model = LogisticRegression(random_state=42)
    model.fit(X_train, y_train)

    # Make predictions on the training and testing data
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    # Calculate accuracy on the training data
    train_accuracy = accuracy_score(y_train, y_train_pred)

    # Calculate accuracy on the testing data
    test_accuracy = accuracy_score(y_test, y_test_pred)

    # Print the accuracy scores for training and testing sets
    print(f"Training Accuracy: {train_accuracy:.4f}")
    print(f"Testing Accuracy: {test_accuracy:.4f}")

    # Return the trained model along with the training and testing accuracy scores
    return model, train_accuracy, test_accuracy

# Run the Box-Cox + logistic regression experiment on 'df'.
# Note: this rebinds 'model', 'train_accuracy' and 'test_accuracy' from the previous run.
model, train_accuracy, test_accuracy = train_and_evaluate_model(df)
