# 1. (A) KNN Implementation Using KNeighborsClassifier from sklearn.neighbors

In [1]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, f1_score

# Load the Wine dataset
wine_data = load_wine()
X = wine_data.data
y = wine_data.target

# Combine features and target into a single DataFrame so that duplicate
# removal drops a row's label together with its features
data = pd.DataFrame(X, columns=wine_data.feature_names)
data['target'] = y

# Remove duplicates. Surviving rows keep their original index labels, so the
# index of `data` may contain gaps after this step.
data = data.drop_duplicates()

# Standardization of the data
# NOTE(review): the scaler and PCA are fitted on the full dataset before the
# train/test split below, so test rows influence the transform. Fit them on
# the training split only if a leakage-free estimate is required.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(data.drop('target', axis=1))

# Apply PCA to reduce dimensions to 5
pca = PCA(n_components=5)
X_pca = pca.fit_transform(X_scaled)

# Save the preprocessed data including target
preprocessed_data = pd.DataFrame(
    X_pca, columns=[f'PC{i+1}' for i in range(X_pca.shape[1])]
)
# Assign the labels by position, not by index label: `preprocessed_data` has a
# fresh 0..n-1 index while `data` keeps its pre-deduplication index, so a
# Series assignment would align on labels and misplace (or NaN-fill) targets
# whenever a duplicate row had been dropped.
preprocessed_data['target'] = data['target'].to_numpy()
preprocessed_data.to_csv('wine_knn_pca_data.csv', index=False)

# Load the preprocessed data back from disk (the KDTree section reads the same file)
preprocessed_data = pd.read_csv('wine_knn_pca_data.csv')
X = preprocessed_data.drop('target', axis=1)
y = preprocessed_data['target']

# Split the dataset into training and testing sets (70/30 split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Initialize the KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=5)

# Fit the classifier to the training data
knn.fit(X_train, y_train)

# Predictions and evaluation
y_pred = knn.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
# Weighted F1 averages the per-class F1 scores weighted by class support
f1 = f1_score(y_test, y_pred, average='weighted')

print(f'Model Accuracy: {accuracy:.2f}')
print(f'F1 Score: {f1:.2f}')


Model Accuracy: 0.96
F1 Score: 0.96


# 1B. KNN Implementation Using scipy KDTree

In [2]:
from scipy.spatial import KDTree
from collections import Counter

# Reload the PCA-reduced features and labels from the CSV as plain NumPy arrays
preprocessed_data = pd.read_csv('wine_knn_pca_data.csv')
y = preprocessed_data['target'].values
X = preprocessed_data.drop('target', axis=1).values

# 70/30 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Build the KD-tree on the training points and look up the k nearest
# training neighbours of every test point
k = 5
tree = KDTree(X_train)
distances, indices = tree.query(X_test, k=k)

# Majority vote over each test point's neighbour labels; on a tied count,
# Counter.most_common returns the label it encountered first
predicted_labels = [
    Counter(y_train[neighbor_idx]).most_common(1)[0][0]
    for neighbor_idx in indices
]

# Score the hand-rolled classifier with accuracy and weighted F1
accuracy = accuracy_score(y_test, predicted_labels)
f1 = f1_score(y_test, predicted_labels, average='weighted')

print(f'Model Accuracy: {accuracy:.2f}')
print(f'F1 Score: {f1:.2f}')

Model Accuracy: 0.96
F1 Score: 0.96


# 2. (A) Implementation 1: Using DecisionTreeClassifier from scikit-learn

In [3]:
import pandas as pd
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import make_scorer, classification_report, accuracy_score, f1_score
from sklearn.tree import DecisionTreeClassifier
import numpy as np

# Load the Wine dataset
wine = load_wine()
X = pd.DataFrame(wine.data, columns=wine.feature_names)
y = pd.Series(wine.target, name='target')

# Drop duplicate feature rows, if any, and keep only the labels of the rows
# that survive. Both X and y keep their original index labels here.
X = X.drop_duplicates()
y = y.loc[X.index]

# Normalize the data using StandardScaler
# NOTE(review): the scaler is fitted on the full dataset before the split below,
# so test rows contribute to the scaling statistics.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Save preprocessed data to CSV
preprocessed_data = pd.DataFrame(X_scaled, columns=wine.feature_names)
# Assign labels by position: `preprocessed_data` has a fresh 0..n-1 index while
# `y` carries the pre-deduplication index, so assigning the Series directly
# would align on labels and misplace (or NaN-fill) targets if any row had
# been dropped.
preprocessed_data['target'] = y.to_numpy()
preprocessed_data.to_csv('preprocessed_wine_data.csv', index=False)

# Load preprocessed data (the XGBoost section reads the same file)
preprocessed_data = pd.read_csv('preprocessed_wine_data.csv')
X = preprocessed_data.drop(columns=['target'])
y = preprocessed_data['target']

# Split data into training and testing sets (70/30 split), stratified on the label
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# DecisionTreeClassifier Implementation
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_train, y_train)

# Predict and evaluate
y_pred_clf = clf.predict(X_test)
f1_clf = f1_score(y_test, y_pred_clf, average='weighted')
accuracy_clf = accuracy_score(y_test, y_pred_clf)

print("\n sklearn Decision Tree Classifier Results:")
print(f"F1 Score: {f1_clf:.2f}")
print(f"Accuracy: {accuracy_clf:.2f}")



 sklearn Decision Tree Classifier Results:
F1 Score: 0.96
Accuracy: 0.96


# 2B. Implementation Using XGBoost (Gradient-Boosted Decision Trees)

In [4]:
import xgboost as xgb

# Read back the standardized wine features and labels from the CSV
preprocessed_data = pd.read_csv('preprocessed_wine_data.csv')
y = preprocessed_data['target']
X = preprocessed_data.drop(columns=['target'])

# Stratified 70/30 split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Build and train the XGBoost classifier in one step (fit returns the estimator).
# NOTE(review): `use_label_encoder` is reported as deprecated/ignored by recent
# XGBoost releases and may emit a warning; confirm against the installed version.
xgb_clf = xgb.XGBClassifier(
    random_state=42,
    use_label_encoder=False,
    eval_metric='mlogloss',
).fit(X_train, y_train)

# Predict the held-out set and compute weighted F1 and accuracy
y_pred_xgb = xgb_clf.predict(X_test)
accuracy_xgb = accuracy_score(y_test, y_pred_xgb)
f1_score_xgb = f1_score(y_test, y_pred_xgb, average='weighted')

# Report the evaluation metrics
print("\nXGBoost Classifier Results:")
print(f"F1 Score: {f1_score_xgb:.2f}")
print(f"Accuracy: {accuracy_xgb:.2f}")



XGBoost Classifier Results:
F1 Score: 1.00
Accuracy: 1.00
