# **KNN Classifier**

**Step 1: Load and Prepare Data**

In [22]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Create the dataset
data = {
    'pH': [6.5, 7, 8, 6.8, 7.5, 5, 9, 6.2, 7.2, 8.5],
    'Salinity': [30, 35, 28, 33, 36, 40, 25, 32, 34, 29],
    'Turbidity': [5, 10, 8, 6, 4, 20, 2, 7, 9, 3],
    'Dissolved Oxygen': [6.8, 7.5, 5, 6, 8.2, 4.5, 9, 6.2, 7, 8],
    'Nutrient Concentration': [1.2, 0.8, 1.5, 1, 0.6, 2, 0.5, 1.1, 0.9, 0.7],
    'Category': ['Clean', 'Clean', 'Moderately Polluted', 'Clean', 'Clean',
                 'Polluted', 'Clean', 'Moderately Polluted', 'Clean', 'Clean']
}

df = pd.DataFrame(data)

# Encode the categorical labels
le = LabelEncoder()
df['Category'] = le.fit_transform(df['Category'])  # Clean=0, Moderately Polluted=1, Polluted=2

# Features and target
X = df.drop('Category', axis=1)
y = df['Category']

# Split data (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

**Step 2: Train the KNN Classifier**

In [23]:
# Initialize and train
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, y_train)

**Step 3: Make Predictions and Evaluate**

In [25]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# All label values in encoded form: [0, 1, 2]
all_labels = [0, 1, 2]  # Clean=0, Moderately Polluted=1, Polluted=2

# Evaluation
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred, labels=all_labels))
print("\nClassification Report:\n", classification_report(y_test, y_pred, labels=all_labels, target_names=le.classes_))
print("Accuracy Score:", accuracy_score(y_test, y_pred))

Confusion Matrix:
 [[2 0 0]
 [0 0 0]
 [0 0 0]]

Classification Report:
                      precision    recall  f1-score   support

              Clean       1.00      1.00      1.00         2
Moderately Polluted       0.00      0.00      0.00         0
           Polluted       0.00      0.00      0.00         0

           accuracy                           1.00         2
          macro avg       0.33      0.33      0.33         2
       weighted avg       1.00      1.00      1.00         2

Accuracy Score: 1.0


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


le.classes_ stores the original class names (strings) in the order they were encoded into numbers.

# **KNN Regressor**

In [26]:
import pandas as pd
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Define the dataset
data = {
    'pH': [7.2, 8.0, 7.8, 7.4, 7.5],
    'Salinity': [30, 25, 28, 35, 32],
    'Turbidity': [10, 12, 15, 20, 18],
    'Nutrients': [5, 8, 10, 6, 7],
    'DO': [6.0, 6.5, 5.8, 5.5, 6.2]
}

df = pd.DataFrame(data)

# Features and target
X = df.drop('DO', axis=1)
y = df['DO']

# Initialize and train KNN Regressor
knn = KNeighborsRegressor(n_neighbors=3)
knn.fit(X, y)

# Predict on the training data (since dataset is small)
y_pred = knn.predict(X)

# Evaluation
print("Predictions:", y_pred)
print("MSE:", mean_squared_error(y, y_pred))
print("R² Score:", r2_score(y, y_pred))

Predictions: [6.1        6.1        6.16666667 5.83333333 5.83333333]
MSE: 0.11000000000000006
R² Score: 0.05172413793103414


# **K-mean Clustering**

In [27]:
import numpy as np
from sklearn.cluster import KMeans
import pandas as pd

# Provided data
data = {
    'ID': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
    'SST (°C)': [28.1, 28.5, 27.9, 28.3, 28.0, 22.0, 21.8, 22.2, 21.9, 22.1],
    'SSS (psu)': [34.8, 35.2, 34.6, 35.1, 34.9, 33.0, 32.9, 33.2, 33.1, 33.0]
}

df = pd.DataFrame(data)

# Features for clustering (SST and SSS)
X = df[['SST (°C)', 'SSS (psu)']].values

# Determine the number of clusters (K)
# From visual inspection of the data, it's clear there are two distinct groups.
# For more complex data, you might use methods like the Elbow Method or Silhouette Score.
n_clusters = 2

# Initialize and fit the KMeans model
# random_state is set for reproducibility.
kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10) # n_init is set to 10 for robustness
kmeans.fit(X)

# Get the cluster assignments for each data point
df['Cluster'] = kmeans.labels_

# Get the final centroids of the clusters
centroids = kmeans.cluster_centers_

print("K-Means Clustering Results:")
print(df)
print("\nCluster Centroids:")
print(pd.DataFrame(centroids, columns=['SST (°C)', 'SSS (psu)']))

# You can further analyze the clusters
for cluster_id in range(n_clusters):
    cluster_data = df[df['Cluster'] == cluster_id]
    print(f"\nDetails for Cluster {cluster_id}:")
    print(cluster_data)
    print(f"Mean SST: {cluster_data['SST (°C)'].mean():.2f}°C")
    print(f"Mean SSS: {cluster_data['SSS (psu)'].mean():.2f} psu")

K-Means Clustering Results:
   ID  SST (°C)  SSS (psu)  Cluster
0   1      28.1       34.8        0
1   2      28.5       35.2        0
2   3      27.9       34.6        0
3   4      28.3       35.1        0
4   5      28.0       34.9        0
5   6      22.0       33.0        1
6   7      21.8       32.9        1
7   8      22.2       33.2        1
8   9      21.9       33.1        1
9  10      22.1       33.0        1

Cluster Centroids:
   SST (°C)  SSS (psu)
0     28.16      34.92
1     22.00      33.04

Details for Cluster 0:
   ID  SST (°C)  SSS (psu)  Cluster
0   1      28.1       34.8        0
1   2      28.5       35.2        0
2   3      27.9       34.6        0
3   4      28.3       35.1        0
4   5      28.0       34.9        0
Mean SST: 28.16°C
Mean SSS: 34.92 psu

Details for Cluster 1:
   ID  SST (°C)  SSS (psu)  Cluster
5   6      22.0       33.0        1
6   7      21.8       32.9        1
7   8      22.2       33.2        1
8   9      21.9       33.1        1
9  10