In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline


# **Data Collection**

In [3]:
# Load the sonar dataset; header=None makes pandas label the columns 0, 1, 2, ...
df = pd.read_csv("sonar_data.csv", header=None)

# **Data Preprocessing**

In [4]:
# Count missing values in every column
df.isna().sum()

0     0
1     0
2     0
3     0
4     0
     ..
56    0
57    0
58    0
59    0
60    0
Length: 61, dtype: int64

In [5]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns
df.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,50,51,52,53,54,55,56,57,58,59
count,208.0,208.0,208.0,208.0,208.0,208.0,208.0,208.0,208.0,208.0,...,208.0,208.0,208.0,208.0,208.0,208.0,208.0,208.0,208.0,208.0
mean,0.029164,0.038437,0.043832,0.053892,0.075202,0.10457,0.121747,0.134799,0.178003,0.208259,...,0.016069,0.01342,0.010709,0.010941,0.00929,0.008222,0.00782,0.007949,0.007941,0.006507
std,0.022991,0.03296,0.038428,0.046528,0.055552,0.059105,0.061788,0.085152,0.118387,0.134416,...,0.012008,0.009634,0.00706,0.007301,0.007088,0.005736,0.005785,0.00647,0.006181,0.005031
min,0.0015,0.0006,0.0015,0.0058,0.0067,0.0102,0.0033,0.0055,0.0075,0.0113,...,0.0,0.0008,0.0005,0.001,0.0006,0.0004,0.0003,0.0003,0.0001,0.0006
25%,0.01335,0.01645,0.01895,0.024375,0.03805,0.067025,0.0809,0.080425,0.097025,0.111275,...,0.008425,0.007275,0.005075,0.005375,0.00415,0.0044,0.0037,0.0036,0.003675,0.0031
50%,0.0228,0.0308,0.0343,0.04405,0.0625,0.09215,0.10695,0.1121,0.15225,0.1824,...,0.0139,0.0114,0.00955,0.0093,0.0075,0.00685,0.00595,0.0058,0.0064,0.0053
75%,0.03555,0.04795,0.05795,0.0645,0.100275,0.134125,0.154,0.1696,0.233425,0.2687,...,0.020825,0.016725,0.0149,0.0145,0.0121,0.010575,0.010425,0.01035,0.010325,0.008525
max,0.1371,0.2339,0.3059,0.4264,0.401,0.3823,0.3729,0.459,0.6828,0.7106,...,0.1004,0.0709,0.039,0.0352,0.0447,0.0394,0.0355,0.044,0.0364,0.0439


In [6]:
# Column dtypes and non-null counts
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 208 entries, 0 to 207
Data columns (total 61 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   0       208 non-null    float64
 1   1       208 non-null    float64
 2   2       208 non-null    float64
 3   3       208 non-null    float64
 4   4       208 non-null    float64
 5   5       208 non-null    float64
 6   6       208 non-null    float64
 7   7       208 non-null    float64
 8   8       208 non-null    float64
 9   9       208 non-null    float64
 10  10      208 non-null    float64
 11  11      208 non-null    float64
 12  12      208 non-null    float64
 13  13      208 non-null    float64
 14  14      208 non-null    float64
 15  15      208 non-null    float64
 16  16      208 non-null    float64
 17  17      208 non-null    float64
 18  18      208 non-null    float64
 19  19      208 non-null    float64
 20  20      208 non-null    float64
 21  21      208 non-null    float64
 22  22

**Check Outliers**

In [7]:
# Detect outliers with the 1.5 * IQR rule and cap them (winsorization).
# NOTE(review): the quartiles are computed over all rows of df, i.e. before
# the train/test split made further down, so the caps are influenced by rows
# that later land in the test set.
numeric_columns = df.select_dtypes(include=['float64', 'int64']).columns

df_capped = df.copy()
outlier_count = {}
outlier_mask = pd.DataFrame(False, index=df.index, columns=df.columns)

for col in numeric_columns:
    series = df[col]
    q1 = series.quantile(0.25)
    q3 = series.quantile(0.75)
    spread = q3 - q1

    # Fences: anything further than 1.5 * IQR beyond a quartile is flagged
    fence_low = q1 - 1.5 * spread
    fence_high = q3 + 1.5 * spread

    is_outlier = series.lt(fence_low) | series.gt(fence_high)
    outlier_mask[col] = is_outlier
    outlier_count[col] = is_outlier.sum()

    # Pull out-of-fence values back to the nearest fence
    df_capped[col] = df_capped[col].clip(lower=fence_low, upper=fence_high)

# Original (uncapped) rows that contain at least one flagged value
df_outlier = df[outlier_mask.any(axis=1)]

In [8]:
# Number of flagged values per column (values outside the 1.5 * IQR bounds)
outlier_count

{0: 15,
 1: 13,
 2: 11,
 3: 13,
 4: 5,
 5: 8,
 6: 5,
 7: 11,
 8: 8,
 9: 10,
 10: 5,
 11: 2,
 12: 4,
 13: 5,
 14: 2,
 15: 0,
 16: 0,
 17: 0,
 18: 0,
 19: 0,
 20: 0,
 21: 0,
 22: 0,
 23: 2,
 24: 0,
 25: 0,
 26: 0,
 27: 0,
 28: 0,
 29: 0,
 30: 0,
 31: 0,
 32: 0,
 33: 0,
 34: 0,
 35: 0,
 36: 0,
 37: 8,
 38: 5,
 39: 4,
 40: 4,
 41: 4,
 42: 5,
 43: 11,
 44: 24,
 45: 16,
 46: 13,
 47: 9,
 48: 8,
 49: 14,
 50: 6,
 51: 13,
 52: 3,
 53: 7,
 54: 9,
 55: 6,
 56: 8,
 57: 9,
 58: 12,
 59: 8}

In [9]:
# Capped (winsorized) copy of the data; df itself is left unmodified
df_capped

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,51,52,53,54,55,56,57,58,59,60
0,0.02000,0.0371,0.0428,0.0207,0.0954,0.098600,0.1539,0.160100,0.310900,0.211100,...,0.0027,0.0065,0.0159,0.0072,0.016700,0.018000,0.0084,0.0090,0.0032,R
1,0.04530,0.0523,0.0843,0.0689,0.1183,0.234775,0.2156,0.303362,0.333700,0.287200,...,0.0084,0.0089,0.0048,0.0094,0.019100,0.014000,0.0049,0.0052,0.0044,R
2,0.02620,0.0582,0.1099,0.1083,0.0974,0.228000,0.2431,0.303362,0.438025,0.504838,...,0.0232,0.0166,0.0095,0.0180,0.019838,0.020512,0.0164,0.0095,0.0078,R
3,0.01000,0.0171,0.0623,0.0205,0.0205,0.036800,0.1098,0.127600,0.059800,0.126400,...,0.0121,0.0036,0.0150,0.0085,0.007300,0.005000,0.0044,0.0040,0.0117,R
4,0.06885,0.0666,0.0481,0.0394,0.0590,0.064900,0.1209,0.246700,0.356400,0.445900,...,0.0031,0.0054,0.0105,0.0110,0.001500,0.007200,0.0048,0.0107,0.0094,R
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
203,0.01870,0.0346,0.0168,0.0177,0.0393,0.163000,0.2028,0.169400,0.232800,0.268400,...,0.0116,0.0098,0.0199,0.0033,0.010100,0.006500,0.0115,0.0193,0.0157,M
204,0.03230,0.0101,0.0298,0.0564,0.0760,0.095800,0.0990,0.101800,0.103000,0.215400,...,0.0061,0.0093,0.0135,0.0063,0.006300,0.003400,0.0032,0.0062,0.0067,M
205,0.05220,0.0437,0.0180,0.0292,0.0351,0.117100,0.1257,0.117800,0.125800,0.252900,...,0.0160,0.0029,0.0051,0.0062,0.008900,0.014000,0.0138,0.0077,0.0031,M
206,0.03030,0.0353,0.0490,0.0608,0.0167,0.135400,0.1465,0.112300,0.194500,0.235400,...,0.0086,0.0046,0.0126,0.0036,0.003500,0.003400,0.0079,0.0036,0.0048,M


In [10]:
# Class balance of the label column (column 60)
df_capped[60].value_counts()

60
M    111
R     97
Name: count, dtype: int64

**M → Mine**

**R → Rock**



# **Data Splitting**

In [11]:
# Separate the label (column 60) from the feature columns
Y = df_capped[60]
X = df_capped.drop(columns=[60])

In [12]:
# Feature matrix: every column of df_capped except column 60
X

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,50,51,52,53,54,55,56,57,58,59
0,0.02000,0.0371,0.0428,0.0207,0.0954,0.098600,0.1539,0.160100,0.310900,0.211100,...,0.0232,0.0027,0.0065,0.0159,0.0072,0.016700,0.018000,0.0084,0.0090,0.0032
1,0.04530,0.0523,0.0843,0.0689,0.1183,0.234775,0.2156,0.303362,0.333700,0.287200,...,0.0125,0.0084,0.0089,0.0048,0.0094,0.019100,0.014000,0.0049,0.0052,0.0044
2,0.02620,0.0582,0.1099,0.1083,0.0974,0.228000,0.2431,0.303362,0.438025,0.504838,...,0.0033,0.0232,0.0166,0.0095,0.0180,0.019838,0.020512,0.0164,0.0095,0.0078
3,0.01000,0.0171,0.0623,0.0205,0.0205,0.036800,0.1098,0.127600,0.059800,0.126400,...,0.0241,0.0121,0.0036,0.0150,0.0085,0.007300,0.005000,0.0044,0.0040,0.0117
4,0.06885,0.0666,0.0481,0.0394,0.0590,0.064900,0.1209,0.246700,0.356400,0.445900,...,0.0156,0.0031,0.0054,0.0105,0.0110,0.001500,0.007200,0.0048,0.0107,0.0094
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
203,0.01870,0.0346,0.0168,0.0177,0.0393,0.163000,0.2028,0.169400,0.232800,0.268400,...,0.0203,0.0116,0.0098,0.0199,0.0033,0.010100,0.006500,0.0115,0.0193,0.0157
204,0.03230,0.0101,0.0298,0.0564,0.0760,0.095800,0.0990,0.101800,0.103000,0.215400,...,0.0051,0.0061,0.0093,0.0135,0.0063,0.006300,0.003400,0.0032,0.0062,0.0067
205,0.05220,0.0437,0.0180,0.0292,0.0351,0.117100,0.1257,0.117800,0.125800,0.252900,...,0.0155,0.0160,0.0029,0.0051,0.0062,0.008900,0.014000,0.0138,0.0077,0.0031
206,0.03030,0.0353,0.0490,0.0608,0.0167,0.135400,0.1465,0.112300,0.194500,0.235400,...,0.0042,0.0086,0.0046,0.0126,0.0036,0.003500,0.003400,0.0079,0.0036,0.0048


In [13]:
# Target labels taken from column 60
Y

0      R
1      R
2      R
3      R
4      R
      ..
203    M
204    M
205    M
206    M
207    M
Name: 60, Length: 208, dtype: object

**Training and Test Data**

In [14]:
# Hold out 10% of the rows for testing; stratify on Y and fix the seed
# so the split is repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.1,
    stratify=Y,
    random_state=2,
)

# **Logistic Regression**

In [15]:
# Baseline classifier with scikit-learn's default settings
model = LogisticRegression()

In [16]:
# Fit the baseline on the training split.
# NOTE(review): `model` is not referenced by any later cell visible here;
# the evaluation below uses the estimator selected by the grid search.
model.fit(x_train,y_train)

Hyperparameter Tuning

In [17]:
# Pipeline so that scaling is applied automatically before the classifier
pipe = Pipeline([
    ("scaler", StandardScaler()),
    # random_state pins the data shuffling done by the 'liblinear' and 'saga'
    # solvers, so a Restart & Run All reproduces the same search result.
    ("ctf", LogisticRegression(max_iter=1000, random_state=2)),
])
# Parameter grid; keys follow the "<step name>__<parameter>" convention
param_grid = {
    "ctf__C": [0.01, 0.1, 1, 10, 100],
    "ctf__penalty": ["l1", "l2"],
    "ctf__solver": ["liblinear", "saga"],
}
# Tune with 5-fold cross-validation, using the training split only
grid = GridSearchCV(pipe, param_grid, cv=5, scoring="accuracy")
grid.fit(x_train, y_train)



In [18]:
# Report the winning parameter combination and its mean cross-validated accuracy
print("Best Parameters:", grid.best_params_)
print("Best CV Score:", grid.best_score_)

Best Parameters: {'ctf__C': 0.1, 'ctf__penalty': 'l2', 'ctf__solver': 'saga'}
Best CV Score: 0.7698435277382646


In [19]:
# Take the estimator selected by the grid search and predict the held-out rows
best_model = grid.best_estimator_
y_pred = best_model.predict(x_test)

Model Evaluation

In [24]:
# Accuracy of the tuned model on the held-out test split
print("Test Accuracy:", accuracy_score(y_test, y_pred))

Test Accuracy: 0.9047619047619048


In [28]:
# Confusion matrix of true vs. predicted labels; with no explicit `labels`
# argument scikit-learn orders rows/columns by the sorted label values.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)

# Precision and recall are reported with "M" as the positive class
precision = precision_score(y_true=y_test, y_pred=y_pred, pos_label="M")
recall = recall_score(y_true=y_test, y_pred=y_pred, pos_label="M")

In [29]:
# Show the confusion matrix and the precision/recall computed for class "M"
print("Confusion Matrix:\n", cm)
print("Precision:", precision)
print("Recall:", recall)

Confusion Matrix:
 [[10  1]
 [ 1  9]]
Precision: 0.9090909090909091
Recall: 0.9090909090909091
