In [36]:
import os
os.environ['OMP_NUM_THREADS'] = '3' # for windows threading issue

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB

In [37]:
# You are provided with a dataset minerals.csv containing data about the height, width, density,
# hardness, and color_intensity of various mineral stones. The first column indicates the class of
# each stone (labeled as X, Y, Z, W, or V).

In [38]:
# Load the raw minerals table; fields in the CSV are comma-separated.
df = pd.read_csv('data/minerals.csv', sep=',')

In [39]:
# 1. Data Handling:
# Ensure that the dataset is preprocessed (handle any missing values, normalize the features
# if required).

# Discard rows with missing values and exact duplicate rows, then renumber the rows from 0
# so the label Series lines up by index with the scaled frame built below.
df = df.dropna().drop_duplicates().reset_index(drop=True)

# Separate the target column from the feature columns.
y = df['Class']
X = df.drop(columns='Class')

# Standardize every feature column. The fitted scaler is kept so the same transform
# can be applied to new samples later.
# NOTE(review): the scaler is fitted on all rows, before the train/test splits performed in
# the following cells, so test-set statistics influence the scaling (mild data leakage).
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Rebuild a labelled frame: 'Class' first, then the scaled features under their original names.
df_scaled = pd.DataFrame(X_scaled, columns=X.columns)
df_scaled.insert(0, 'Class', y)

In [40]:
# 2. Decision Tree Classification:
# a. Randomly split the dataset into 70% training and 30% test sets.
# b. Build a classification model using Decision Trees.
# c. Generate the confusion matrix and classification report for the test set.

# 70/30 split with a fixed seed so the same rows land in each partition on every run.
X_train, X_test, y_train, y_test = train_test_split(
    df_scaled.drop('Class', axis=1),
    df_scaled['Class'],
    test_size=0.3,
    random_state=42,
)

# The tree builder permutes features when searching for splits, so without a seed the
# fitted tree (and every number reported below) can change from run to run even though
# the split itself is seeded. Fix random_state to make the cell reproducible.
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train, y_train)

y_pred = dt.predict(X_test)

# Rows of the confusion matrix are true classes, columns are predicted classes.
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[14 15 11  8  6]
 [15 12 10  7 13]
 [14 15 13 15 11]
 [14 12 13 13 13]
 [ 9 18 11  8 10]]
              precision    recall  f1-score   support

           V       0.21      0.26      0.23        54
           W       0.17      0.21      0.19        57
           X       0.22      0.19      0.21        68
           Y       0.25      0.20      0.22        65
           Z       0.19      0.18      0.18        56

    accuracy                           0.21       300
   macro avg       0.21      0.21      0.21       300
weighted avg       0.21      0.21      0.21       300



In [41]:
# 3. K-Nearest Neighbors Classification:
# a. Randomly split the dataset into 70% training and 30% test sets.
# b. Build a classification model using KNN with K=7.
# c. Generate the confusion matrix and classification report for the test set.

# Seeded 70/30 split of the scaled features against the class labels.
X_train, X_test, y_train, y_test = train_test_split(
    df_scaled.drop(columns='Class'),
    df_scaled['Class'],
    test_size=0.3,
    random_state=42,
)

# fit() returns the estimator itself, so construction and training can be chained.
knn = KNeighborsClassifier(n_neighbors=7).fit(X_train, y_train)
y_pred = knn.predict(X_test)

# Confusion matrix first, then the per-class precision/recall/F1 report.
print(
    confusion_matrix(y_test, y_pred),
    classification_report(y_test, y_pred),
    sep='\n',
)

[[18 12 13  8  3]
 [22 14 10  6  5]
 [18 18 11 13  8]
 [23 15 15  7  5]
 [15 17 12  8  4]]
              precision    recall  f1-score   support

           V       0.19      0.33      0.24        54
           W       0.18      0.25      0.21        57
           X       0.18      0.16      0.17        68
           Y       0.17      0.11      0.13        65
           Z       0.16      0.07      0.10        56

    accuracy                           0.18       300
   macro avg       0.18      0.18      0.17       300
weighted avg       0.18      0.18      0.17       300



In [42]:
# 4. Support Vector Machine (SVM) Classification:
# a. Randomly split the dataset into 70% training and 30% test sets.
# b. Build a classification model using SVM with a radial basis function (RBF) kernel.
# c. Generate the confusion matrix and classification report for the test set.

# Seeded 70/30 split of the scaled features against the class labels.
X_train, X_test, y_train, y_test = train_test_split(
    df_scaled.drop(columns='Class'),
    df_scaled['Class'],
    test_size=0.3,
    random_state=42,
)

# Support vector classifier with the RBF kernel requested by the task.
svm = SVC(kernel='rbf').fit(X_train, y_train)
y_pred = svm.predict(X_test)

# Confusion matrix first, then the per-class precision/recall/F1 report.
print(
    confusion_matrix(y_test, y_pred),
    classification_report(y_test, y_pred),
    sep='\n',
)

[[13 15 11  6  9]
 [10 20 13  3 11]
 [12 19 12 11 14]
 [15 12 21  8  9]
 [ 7 18 12  8 11]]
              precision    recall  f1-score   support

           V       0.23      0.24      0.23        54
           W       0.24      0.35      0.28        57
           X       0.17      0.18      0.18        68
           Y       0.22      0.12      0.16        65
           Z       0.20      0.20      0.20        56

    accuracy                           0.21       300
   macro avg       0.21      0.22      0.21       300
weighted avg       0.21      0.21      0.21       300



In [43]:
# 5. Naïve Bayes Classification:
# a. Randomly split the dataset into 70% training and 30% test sets.
# b. Build a classification model using Gaussian Naïve Bayes method.
# c. Generate the confusion matrix and classification report for the test set.

# Seeded 70/30 split of the scaled features against the class labels.
X_train, X_test, y_train, y_test = train_test_split(
    df_scaled.drop(columns='Class'),
    df_scaled['Class'],
    test_size=0.3,
    random_state=42,
)

# Gaussian Naive Bayes with its default settings.
nb = GaussianNB().fit(X_train, y_train)
y_pred = nb.predict(X_test)

# Confusion matrix first, then the per-class precision/recall/F1 report.
print(
    confusion_matrix(y_test, y_pred),
    classification_report(y_test, y_pred),
    sep='\n',
)

[[14 14  9  7 10]
 [ 9 16 12  5 15]
 [15 16 11 13 13]
 [12 21 18  4 10]
 [14 18  9  3 12]]
              precision    recall  f1-score   support

           V       0.22      0.26      0.24        54
           W       0.19      0.28      0.23        57
           X       0.19      0.16      0.17        68
           Y       0.12      0.06      0.08        65
           Z       0.20      0.21      0.21        56

    accuracy                           0.19       300
   macro avg       0.18      0.20      0.19       300
weighted avg       0.18      0.19      0.18       300



In [46]:
# 6. Classification of New Data:
# Use all four models trained in the tasks above to classify the following new entry: height: 5.8,
# width: 3.6, density: 6.9, hardness: 7.4, color_intensity: 8.2. Compare the predicted class for this
# entry from all four models and briefly discuss any differences.

# Single-row frame holding the new sample, using the same column names as the training features.
new_entry = pd.DataFrame({'Height': [5.8], 'Width': [3.6], 'Density': [6.9], 'Hardness': [7.4], 'Color_Intensity': [8.2]})

# Apply the scaling that was fitted on the training data. scaler.transform returns a bare
# NumPy array, but the four models were fitted on DataFrames with named columns; wrapping the
# result back into a DataFrame keeps the feature names so predict() can validate them instead
# of emitting a "X does not have valid feature names" warning.
new_entry_scaled = pd.DataFrame(scaler.transform(new_entry), columns=new_entry.columns)

# Each predict() call returns a one-element array containing the predicted class label.
dt_pred = dt.predict(new_entry_scaled)
knn_pred = knn.predict(new_entry_scaled)
svm_pred = svm.predict(new_entry_scaled)
nb_pred = nb.predict(new_entry_scaled)

print(f'Decision Tree Prediction: {dt_pred}')
print(f'KNN Prediction: {knn_pred}')
print(f'SVM Prediction: {svm_pred}')
print(f'Naive Bayes Prediction: {nb_pred}')

# Discussion: in the recorded run the four models reached only about 0.18-0.21 test accuracy,
# which is roughly the 1-in-5 chance level for five classes. None of them has learned a
# reliable mapping from these features to the class, so disagreement between their predictions
# for a single sample is expected and no individual prediction should be trusted.

Decision Tree Prediction: ['Z']
KNN Prediction: ['V']
SVM Prediction: ['W']
Naive Bayes Prediction: ['W']


