# SONAR Rock vs Mine Dataset

The **SONAR Rock vs Mine** dataset is used for binary classification, distinguishing between sonar signals reflected from rocks and underwater mines. It contains:

- **60 Features**: Continuous numerical values representing energy levels of sonar signals at different frequencies.
- **Target Variable**: 
  - `R` → Rock (Sonar signal from a rock)
  - `M` → Mine (Sonar signal from an underwater mine)

## Key Considerations:
- The dataset consists of **208 samples**, roughly balanced between the two classes (111 mines and 97 rocks).
- **Classification models** like Logistic Regression, SVM, Random Forest, or Neural Networks can be applied.
- Evaluation metrics: **Accuracy, Precision, Recall, and F1-Score**.



# Import necessary libraries

In [None]:
import numpy as np 
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import zscore
from sklearn.preprocessing import LabelEncoder,RobustScaler
from sklearn.model_selection import train_test_split, cross_val_score
from scipy.stats.mstats import winsorize

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier

# Metrics
from sklearn.metrics import (
    accuracy_score, 
    precision_score, 
    recall_score, 
    f1_score, 
    roc_auc_score, 
    confusion_matrix,
    roc_curve
)

In [None]:
# Load the sonar CSV from the Kaggle input directory. header=None tells pandas
# the file has no header row, so the columns get integer labels.
data=pd.read_csv("/kaggle/input/rock-vs-mine-prediction/Copy of sonar data.csv",header=None)
# Preview the first five rows.
data.head()

# Understanding the Data

In [None]:
# Dimensions of the loaded frame as (rows, columns).
data.shape

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns.
data.describe()

In [None]:
# Number of missing values in each column.
data.isna().sum()

In [None]:
# Number of infinite values in each column; the last column is skipped
# because np.isinf only applies to numeric data.
np.isinf(data.iloc[:,:-1]).sum()

In [None]:
# Column dtypes, non-null counts and memory usage.
data.info()

In [None]:
# Per-column overview: label, number of distinct values, the distinct values
# themselves, and the dtype.
col_name = list(data.columns)
n_unique = [data[label].nunique() for label in col_name]
unique_values = [data[label].unique() for label in col_name]
col_dtype = [data[label].dtype for label in col_name]

CHECH_DICT = {
    'Column': col_name,
    'Number of Unique Values': n_unique,
    'Unique Values': unique_values,
    'Data Type': col_dtype,
}
pd.DataFrame(CHECH_DICT)

# Data visualization

## Explore Categorical Variables

In [None]:
# Class balance of the last column: print the counts, then draw them as bars.
target_column = data.iloc[:, -1]
print(target_column.value_counts())
sns.countplot(data=data, x=target_column)
plt.show()

In [None]:
# Mean and median of every other column, computed separately for each value
# of column 60.
data.groupby(60).agg(['mean','median'])

## Explore Numerical Variables

In [None]:
# One histogram per numeric column, drawn on a single 20x25 figure.
data.hist(figsize=(20,25))
plt.show()

In [None]:
numerical_cols = data.select_dtypes(include=["number"])

# Two boxplots per grid row; an odd column count needs one extra row.
n_numeric = len(numerical_cols.columns)
rows = n_numeric // 2 + n_numeric % 2

plt.figure(figsize=(10, rows * 4))

# Subplot positions are 1-based, hence start=1.
for position, col in enumerate(numerical_cols.columns, start=1):
    axis = plt.subplot(rows, 2, position)
    sns.boxplot(y=data[col], ax=axis)
    axis.set_ylabel(col)
    axis.set_title(f"Boxplot of column number{col}")

plt.tight_layout()
plt.show()

## Detecting Outliers
### Using Interquartile Range (IQR)

In [None]:
# IQR rule on every column except the last: a value is an outlier when it
# lies more than 1.5 * IQR below Q1 or above Q3 of its own column.
feature_values = data.iloc[:, :-1]
Q1 = feature_values.quantile(.25)
Q3 = feature_values.quantile(.75)
IQR = Q3 - Q1

lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR
outliers = ((feature_values < lower_fence) | (feature_values > upper_fence))

# any(axis=1) gives a boolean Series that is True for rows holding at least
# one flagged value; use it to show those rows.
display(data[outliers.any(axis=1)])
# Outlier count per column, largest first.
outliers.sum().sort_values(ascending=False)

### Using Z-Score

In [None]:
# Column-wise z-scores for every column except the last.
z_score = data.iloc[:, :-1].apply(zscore)
# Values more than 3 standard deviations from the column mean count as outliers.
z_outliers = (z_score.abs() > 3)
rows_with_z_outliers = z_outliers.any(axis=1)
display(data[rows_with_z_outliers])
# Outlier count per column, largest first.
print(z_outliers.sum().sort_values(ascending=False))

## **Best Approach for Sonar Dataset?**
- Since the Sonar dataset is **small (208 samples)**, **removing outliers is not ideal**.
- **Best options:**
  1. **Use RobustScaler** to normalize without removing data.
  2. **Apply Winsorization** if extreme values exist.
  3. **Check ML model performance** before and after handling outliers.

In [None]:
# Clip the lowest and highest 5% of each feature column to the 5th / 95th
# percentile values instead of dropping rows.
data_winsorized = data.iloc[:, :-1].apply(lambda x: winsorize(x, limits=[0.05, 0.05]))
# Re-attach the label column. It must be selected as a column (iloc[:, -1]);
# iloc[-1] would pick the last ROW and fill the labels with misaligned values
# and NaN.
data_winsorized[data.columns[-1]] = data.iloc[:, -1]
data_winsorized

In [None]:
# Same two-per-row boxplot grid as before, drawn from the winsorized frame.
plt.figure(figsize=(10, rows * 4))

# Subplot positions are 1-based, hence start=1.
for position, col in enumerate(numerical_cols.columns, start=1):
    axis = plt.subplot(rows, 2, position)
    sns.boxplot(y=data_winsorized[col], ax=axis)
    axis.set_ylabel(col)
    axis.set_title(f"Boxplot of column number{col}")

plt.tight_layout()
plt.show()

## Checking Feature Correlation with Target

In [None]:
# Work on a copy so the raw frame keeps its original labels.
encoder = LabelEncoder()
encoded_data = data.copy()
# Replace the labels in column 60 with integer codes.
encoded_data[60] = encoder.fit_transform(encoded_data[60])

# Absolute correlation of every column with the encoded target, strongest first.
target_correlation = encoded_data.corr()[60]
correlation = target_correlation.abs().sort_values(ascending=False)
correlation

In [None]:
# Columns whose absolute correlation with the encoded target is below 0.05.
correlation[correlation<.05]

# Data Preprocessing

### Split the data into separate training and test sets

In [None]:
# Feature matrix: every column except the last.
X=encoded_data.iloc[:,:-1]
# Target vector: the last column (the label-encoded column 60).
y=encoded_data.iloc[:,-1]

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Report the shape of each part in the same order as before.
for split_part in (X_train, X_test, y_train, y_test):
    print(split_part.shape)

### Feature Scaling

In [None]:
# Scale with median/IQR statistics, which are less sensitive to outliers.
scaler = RobustScaler()

# X holds only features (the target was already split off into y), so EVERY
# column is scaled. Slicing with iloc[:, :-1] here would leave the last
# feature unscaled.
# The scaler is fitted on the training split only and then reused on the test
# split so no test-set statistics leak into training.
# New frames are built instead of assigning into slices, which keeps the
# column labels and row index and avoids in-place writes to the split frames.
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns, index=X_test.index)

In [None]:
# Column-oriented store for the comparison table: one list per metric, and
# one entry per evaluated model in each list.
results = {
    "Model": [],
    "Accuracy": [],
    "Precision": [],
    "Recall": [],
    "F1-Score": [],
    "ROC-AUC Score": [],
    "Cross-Validation Score": [],
}


def append_results(model_name, accuracy, precision, recall, f1_score, roc_auc_score, cross_score):
    """Record one model's scores as a new entry in every ``results`` column."""
    row = {
        'Model': model_name,
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1-Score': f1_score,
        'ROC-AUC Score': roc_auc_score,
        'Cross-Validation Score': cross_score,
    }
    for column, value in row.items():
        results[column].append(value)

In [None]:
def model_evaluation_classification(y_pred, y_test, model, X_train, y_train, y_pred_proba=None):
    """Score a binary classifier on the test split and cross-validate it.

    Prints every metric, displays predictions next to the true labels, and
    returns the metrics.

    Args:
        y_pred: Predicted labels for the test set.
        y_test: True labels for the test set.
        model: Estimator handed to ``cross_val_score``.
        X_train: Training features used for 5-fold cross-validation.
        y_train: Training labels used for 5-fold cross-validation.
        y_pred_proba: Optional scores for the test set. A 2-D array is read
            as per-class probabilities and column 1 is used; a 1-D array is
            used as the positive-class score directly. ``None`` skips ROC-AUC.

    Returns:
        Tuple ``(accuracy, precision, recall, f1, roc_auc, cv_mean)``.
        ``roc_auc`` is ``nan`` when no probabilities were provided, which
        keeps the results column numeric.
    """
    # Evaluation metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    # ROC-AUC needs scores, not hard labels, so it is optional.
    if y_pred_proba is not None:
        scores = np.asarray(y_pred_proba)
        positive_scores = scores[:, 1] if scores.ndim == 2 else scores
        roc_auc = roc_auc_score(y_test, positive_scores)
        roc_auc_text = f"{roc_auc:.4f}"
    else:
        # Formatting a placeholder string with ".4f" raises ValueError, so the
        # printed text is kept separate from the returned value.
        roc_auc = float("nan")
        roc_auc_text = "N/A (no probabilities provided)"

    # Cross-validation score on the training set
    cross_score = cross_val_score(model, X_train, y_train, cv=5)
    cross_mean = cross_score.mean()

    # Print evaluation metrics
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1-Score: {f1:.4f}")
    print(f"ROC-AUC Score: {roc_auc_text}")
    print(f"Cross-Validation Score: {cross_mean:.4f}")

    # Display predictions and actual values
    display(pd.DataFrame(np.c_[y_pred, y_test], columns=["Prediction", "Actual"]))

    # Return metrics
    return accuracy, precision, recall, f1, roc_auc, cross_mean

# Models

## Logistic Regression


In [None]:
# Fit a default logistic regression and collect test-set labels and probabilities.
lg = LogisticRegression()
lg.fit(X_train, y_train)
y_pred_lg = lg.predict(X_test)
y_pred_proba_lg = lg.predict_proba(X_test)

# Evaluate once, keep the individual metric names, and log the same tuple.
lg_metrics = model_evaluation_classification(
    y_pred_lg, y_test, lg, X_train, y_train, y_pred_proba_lg
)
accuracy_lg, precision_lg, recall_lg, f1_lg, roc_auc_lg, cross_score_lg = lg_metrics
append_results("Logistic Regression", *lg_metrics)

In [None]:
# Confusion matrix for logistic regression, drawn as an annotated heatmap.
cm = confusion_matrix(y_test, y_pred_lg)
plt.figure(figsize=(8, 6))
heatmap_ax = sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
heatmap_ax.set_xlabel('Predicted')
heatmap_ax.set_ylabel('Actual')
heatmap_ax.set_title('Confusion Matrix')
plt.show()

## K-Nearest Neighbour (KNN)

In [None]:
# Sweep k = 1..19 and record each model's accuracy.
# NOTE(review): the score is measured on the test split, so choosing k from
# this curve tunes a hyperparameter on test data. Consider cross-validating
# on the training split instead.
k_values = range(1, 20)
knn_score = []
for i in k_values:
    knn = KNeighborsClassifier(n_neighbors=i)
    knn.fit(X_train, y_train)
    knn_score.append(knn.score(X_test, y_test))

plt.plot(k_values, knn_score)
plt.xlabel("K value")
# Put ticks exactly on the evaluated k values. Ticks from 2 to 20 would skip
# k=1 and add a tick at k=20, which was never evaluated.
plt.xticks(list(k_values))
plt.ylabel("Score")
plt.show()

In [None]:
# Highest score from the sweep and the k that produced it. The sweep started
# at k=1, so the list index is shifted by one.
max_accc = max(knn_score)
k = np.argmax(knn_score) + 1
print("KNN best accuracy: {:.2f}% with n={}".format(max_accc * 100, k))

In [None]:
# Fit KNN with 5 neighbours and collect test-set labels and probabilities.
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)
y_pred_knn = knn.predict(X_test)
y_pred_proba_knn = knn.predict_proba(X_test)

# Evaluate once, keep the individual metric names, and log the same tuple.
knn_metrics = model_evaluation_classification(
    y_pred_knn, y_test, knn, X_train, y_train, y_pred_proba_knn
)
accuracy_knn, precision_knn, recall_knn, f1_knn, roc_auc_knn, cross_score_knn = knn_metrics
append_results("KNN", *knn_metrics)

In [None]:
# Confusion matrix for KNN, drawn as an annotated heatmap.
conf_matrix = confusion_matrix(y_test, y_pred_knn)

plt.figure(figsize=(8, 6))
heatmap_ax = sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues')
heatmap_ax.set_xlabel('Predicted')
heatmap_ax.set_ylabel('Actual')
heatmap_ax.set_title('Confusion Matrix')
plt.show()

## Support Vector Machine (SVM)

In [None]:
# probability=True is required for predict_proba; the seed keeps the
# probability calibration repeatable.
svc = SVC(probability=True, random_state=42)
svc.fit(X_train, y_train)
y_pred_svc = svc.predict(X_test)
y_pred_proba_svc = svc.predict_proba(X_test)

# Evaluate once, keep the individual metric names, and log the same tuple.
svc_metrics = model_evaluation_classification(
    y_pred_svc, y_test, svc, X_train, y_train, y_pred_proba_svc
)
accuracy_svc, precision_svc, recall_svc, f1_svc, roc_auc_svc, cross_score_svc = svc_metrics
append_results("SVC", *svc_metrics)

In [None]:
# Confusion matrix for the SVC, drawn as an annotated heatmap.
conf_matrix = confusion_matrix(y_test, y_pred_svc)

plt.figure(figsize=(8, 6))
heatmap_ax = sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues')
heatmap_ax.set_xlabel('Predicted')
heatmap_ax.set_ylabel('Actual')
heatmap_ax.set_title('Confusion Matrix')
plt.show()

## Decision Tree

In [None]:
# Seed the tree like the other seeded models (SVC, Random Forest) so its
# split tie-breaking, and therefore its scores, are the same on every run.
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train, y_train)
y_pred_dt = dt.predict(X_test)
y_pred_proba_dt = dt.predict_proba(X_test)
# Evaluate the Decision Tree model
accuracy_dt, precision_dt, recall_dt, f1_dt, roc_auc_dt, cross_score_dt = model_evaluation_classification(
    y_pred_dt, y_test, dt, X_train, y_train, y_pred_proba_dt)

# Append results for Decision Tree
append_results("Decision Tree", accuracy_dt, precision_dt, recall_dt, f1_dt, roc_auc_dt, cross_score_dt)

In [None]:
# Confusion matrix for the decision tree, drawn as an annotated heatmap.
conf_matrix = confusion_matrix(y_test, y_pred_dt)

plt.figure(figsize=(8, 6))
heatmap_ax = sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues')
heatmap_ax.set_xlabel('Predicted')
heatmap_ax.set_ylabel('Actual')
heatmap_ax.set_title('Confusion Matrix')
plt.show()

## Random Forest

In [None]:
# Seeded random forest; collect test-set labels and probabilities.
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)
y_pred_proba_rf = rf.predict_proba(X_test)

# Evaluate once, keep the individual metric names, and log the same tuple.
rf_metrics = model_evaluation_classification(
    y_pred_rf, y_test, rf, X_train, y_train, y_pred_proba_rf
)
accuracy_rf, precision_rf, recall_rf, f1_rf, roc_auc_rf, cross_score_rf = rf_metrics
append_results("Random Forest", *rf_metrics)

In [None]:
# Confusion matrix for the random forest, drawn as an annotated heatmap.
conf_matrix = confusion_matrix(y_test, y_pred_rf)

plt.figure(figsize=(8, 6))
heatmap_ax = sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues')
heatmap_ax.set_xlabel('Predicted')
heatmap_ax.set_ylabel('Actual')
heatmap_ax.set_title('Confusion Matrix')
plt.show()

# Ensemble Learning

In [None]:
# Soft voting averages the predicted class probabilities of the five base
# models, so every member must support predict_proba.
soft_voting_estimators = [
    ('lg', lg),
    ('knn', knn),
    ('svc', svc),
    ('dt', dt),
    ('rf', rf),
]
voting_clf = VotingClassifier(estimators=soft_voting_estimators, voting='soft')
voting_clf.fit(X_train, y_train)
y_pred_voting = voting_clf.predict(X_test)
y_pred_voting_proba = voting_clf.predict_proba(X_test)

# Evaluate once, keep the individual metric names, and log the same tuple.
soft_voting_metrics = model_evaluation_classification(
    y_pred_voting, y_test, voting_clf, X_train, y_train, y_pred_voting_proba
)
accuracy_voting, precision_voting, recall_voting, f1_voting, roc_auc_voting, cross_score_voting = soft_voting_metrics
append_results("Soft-Voting Classifier", *soft_voting_metrics)

In [None]:
# Confusion matrix for the soft-voting ensemble, drawn as an annotated heatmap.
conf_matrix = confusion_matrix(y_test, y_pred_voting)

plt.figure(figsize=(8, 6))
heatmap_ax = sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues')
heatmap_ax.set_xlabel('Predicted')
heatmap_ax.set_ylabel('Actual')
heatmap_ax.set_title('Confusion Matrix')
plt.show()

In [None]:
# Hard voting: each base model casts one vote and the majority label wins.
voting_clf = VotingClassifier(
    estimators=[
        ('lg', lg),
        ('knn', knn),
        ('svc', svc),
        ('dt', dt),
        ('rf', rf)
    ],
    voting='hard'
)
voting_clf.fit(X_train, y_train)
y_pred_voting = voting_clf.predict(X_test)

# A hard-voting ensemble has no predict_proba. The evaluation helper is still
# given the probabilities left over from the SOFT-voting cell, exactly as
# before, so the ROC-AUC it computes and prints describes the soft ensemble.
# That value is discarded here and NaN is recorded for hard voting instead.
accuracy_voting, precision_voting, recall_voting, f1_voting, stale_roc_auc, cross_score_voting = model_evaluation_classification(
    y_pred_voting, y_test, voting_clf, X_train, y_train, y_pred_voting_proba)
roc_auc_voting = np.nan
print("Note: the ROC-AUC printed above comes from the soft-voting probabilities; it is not defined for hard voting.")

# Append results for Voting Classifier
append_results("Hard-Voting Classifier", accuracy_voting, precision_voting, recall_voting, f1_voting, roc_auc_voting, cross_score_voting)

In [None]:
# Confusion matrix for the hard-voting ensemble, drawn as an annotated heatmap.
conf_matrix = confusion_matrix(y_test, y_pred_voting)

plt.figure(figsize=(8, 6))
heatmap_ax = sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues')
heatmap_ax.set_xlabel('Predicted')
heatmap_ax.set_ylabel('Actual')
heatmap_ax.set_title('Confusion Matrix')
plt.show()

In [None]:
# Comparison table of all evaluated models, best accuracy first.
Models = pd.DataFrame(data=results)
Models.sort_values('Accuracy', ascending=False)