In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score, classification_report, silhouette_score
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.mixture import GaussianMixture
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Make seaborn's 'whitegrid' style the plotting default for the figures below.
sns.set(style='whitegrid')


In [None]:
# Read the three raw CSV inputs (paths are relative to the working directory)
# and bind them, in order, to the wine / chickwts / us_arrests frames.
wine, chickwts, us_arrests = (
    pd.read_csv(csv_name)
    for csv_name in ('Wine Data.csv', 'Chick Weight Data.csv', 'Arrests Data.csv')
)


In [None]:
# Print a structural summary (dtypes, non-null counts) and descriptive
# statistics for each raw dataset.
#
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() only appended a stray "None" line.
for title, frame in (
    ("Wine", wine),
    ("Chickwts", chickwts),
    ("USArrests", us_arrests),
):
    print(f"{title} Dataset Info:")
    frame.info()
    print(frame.describe(), "\n")


In [None]:
# Wine Dataset
# Drop incomplete rows into a new frame rather than mutating `wine` in place,
# so the raw frame summarized earlier stays intact and this cell has no hidden
# side effects when re-run.
wine_clean = wine.dropna()
wine_features = wine_clean.drop(columns='target')
# Encode the class labels as integer codes for the classifier.
wine_target = LabelEncoder().fit_transform(wine_clean['target'])
# Standardize every feature column (StandardScaler: zero mean, unit variance).
wine_scaled = StandardScaler().fit_transform(wine_features)

# Chickwts Dataset
# Remove the feed label column, then keep only rows with a strictly positive
# weight. The boolean-mask filter preserves the original row index labels.
chickwts_numeric = chickwts.drop(columns=['feed_name'])
chickwts_clean = chickwts_numeric[chickwts_numeric['weight'] > 0]
chickwts_scaled = StandardScaler().fit_transform(chickwts_clean)

# USArrests Dataset
# NOTE(review): 'Unnamed: 0' is presumably the row-label column written by the
# CSV export (e.g. state names) - confirm against the file.
us_arrests_clean = us_arrests.drop(columns=['Unnamed: 0'])
us_arrests_scaled = StandardScaler().fit_transform(us_arrests_clean)


In [None]:
# Wine PCA
# A float n_components asks PCA to keep the smallest number of components
# whose cumulative explained variance reaches that fraction (here 95%).
pca_wine = PCA(n_components=0.95)
wine_pca = pca_wine.fit_transform(wine_scaled)

# Chickwts PCA
# Project onto the single leading principal component.
pca_chick = PCA(n_components=1)
chick_pca = pca_chick.fit_transform(chickwts_scaled)

# USArrests PCA
# Rank features by the variance of the *unscaled* columns. After StandardScaler
# every column has variance ~1, so ranking the scaled matrix would pick the
# "top 3" from floating-point noise rather than from the data.
feature_variances = np.var(us_arrests_clean.to_numpy(dtype=float), axis=0)
top3_idx = np.argsort(feature_variances)[-3:]  # indices of the 3 largest variances
# Column order is shared between us_arrests_clean and us_arrests_scaled, so the
# indices computed on the raw frame select the matching standardized columns.
us_selected = us_arrests_scaled[:, top3_idx]
us_pca = PCA(n_components=2).fit_transform(us_selected)


In [None]:
# Tune a KNN classifier on the wine PCA features and report out-of-sample
# performance.
#
# The model is tuned on a training split only and scored on a stratified
# hold-out split; scoring on the same rows used for fitting reports in-sample
# (optimistic) metrics.
# NOTE: the scaler and PCA upstream were fit on the full dataset, so a small
# amount of preprocessing leakage into the hold-out split remains.
X_train, X_test, y_train, y_test = train_test_split(
    wine_pca,
    wine_target,
    test_size=0.2,
    stratify=wine_target,
    random_state=42,
)

param_grid = {
    'n_neighbors': list(range(1, 21)),
    # 'minkowski' with the default p=2 is the same metric as 'euclidean', so it
    # is omitted to avoid fitting every candidate twice.
    'metric': ['euclidean', 'manhattan'],
}
knn = KNeighborsClassifier()
grid_search = GridSearchCV(knn, param_grid, cv=5)
grid_search.fit(X_train, y_train)

print("Best params:", grid_search.best_params_)
print("Best CV accuracy:", grid_search.best_score_)

# predict() uses the best estimator refit on the whole training split.
wine_pred = grid_search.predict(X_test)
print(classification_report(y_test, wine_pred))
print("Accuracy:", accuracy_score(y_test, wine_pred))


In [None]:
# Rank every chick observation by cosine similarity to a reference row in the
# PCA space.
# NOTE(review): chick_pca has a single component, and the cosine similarity of
# two 1-D vectors is just the product of their signs (+1 / -1 / 0), so this
# ranking only separates rows by which side of zero they fall on. Consider a
# distance-based similarity or more components - confirm the intent.
similarity_matrix = cosine_similarity(chick_pca)
chick_pca_df = pd.DataFrame(chick_pca, columns=['Component_1'])
# chick_pca was computed from chickwts_clean (rows with weight > 0), so the feed
# labels must come from those same rows; taking them from the unfiltered frame
# raises a length mismatch as soon as any row is filtered out.
chick_pca_df['feed_name'] = chickwts.loc[chickwts_clean.index, 'feed_name'].values
ref_index = 0  # positional index of the reference observation
chick_pca_df['similarity'] = similarity_matrix[ref_index]
chick_pca_df.sort_values(by='similarity', ascending=False)


In [None]:
# Sweep the number of K-Means clusters on the USArrests PCA projection and
# record inertia (for the elbow plot) and silhouette score for each K.
k_values = range(2, 10)  # silhouette score needs at least 2 clusters
inertias = []
silhouette_scores_kmeans = []
for k in k_values:
    # n_init is pinned explicitly: the library default differs between
    # scikit-learn releases, which would make these curves version-dependent.
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
    labels = kmeans.fit_predict(us_pca)
    inertias.append(kmeans.inertia_)
    silhouette_scores_kmeans.append(silhouette_score(us_pca, labels))

# Plot Elbow Method
fig, ax = plt.subplots(figsize=(6, 4))
ax.plot(k_values, inertias, marker='o')
ax.set_title('Elbow Method (KMeans)')
ax.set_xlabel('K')
ax.set_ylabel('Inertia')
fig.tight_layout()
plt.show()

# Plot Silhouette Scores
fig, ax = plt.subplots(figsize=(6, 4))
ax.plot(k_values, silhouette_scores_kmeans, marker='s')
ax.set_title('Silhouette Score (KMeans)')
ax.set_xlabel('K')
ax.set_ylabel('Score')
fig.tight_layout()
plt.show()


In [None]:
# Fit Gaussian mixtures with 1..9 components on the USArrests PCA projection
# and collect the BIC and AIC of each fit.
component_counts = range(1, 10)
bic_scores, aic_scores = [], []
for k in component_counts:
    gmm = GaussianMixture(n_components=k, random_state=42).fit(us_pca)
    bic_scores += [gmm.bic(us_pca)]
    aic_scores += [gmm.aic(us_pca)]

# Plot BIC and AIC on shared axes, one curve per criterion.
fig, ax = plt.subplots(figsize=(6, 4))
for criterion_name, criterion_values, point_marker in (
    ('BIC', bic_scores, 'x'),
    ('AIC', aic_scores, 'o'),
):
    ax.plot(component_counts, criterion_values, label=criterion_name, marker=point_marker)
ax.set_title('BIC and AIC Scores (GMM)')
ax.set_xlabel('Number of Clusters')
ax.set_ylabel('Score')
ax.legend()
fig.tight_layout()
plt.show()


In [None]:
# Fit the final 3-cluster K-Means and 3-component GMM on the USArrests PCA
# projection and plot their cluster assignments side by side.
# n_init is pinned explicitly because the KMeans default differs between
# scikit-learn releases, which would make the assignments version-dependent.
kmeans_final = KMeans(n_clusters=3, n_init=10, random_state=42).fit(us_pca)
gmm_final = GaussianMixture(n_components=3, random_state=42).fit(us_pca)
kmeans_labels = kmeans_final.predict(us_pca)
gmm_labels = gmm_final.predict(us_pca)

fig, axes = plt.subplots(1, 2, figsize=(12, 5))
sns.scatterplot(x=us_pca[:, 0], y=us_pca[:, 1], hue=kmeans_labels, ax=axes[0], palette='Set2')
axes[0].set_title('K-Means Clustering')
sns.scatterplot(x=us_pca[:, 0], y=us_pca[:, 1], hue=gmm_labels, ax=axes[1], palette='Set3')
axes[1].set_title('GMM Clustering')
# Label both panels so the figure stands alone: the two plotted columns are the
# first and second principal components of us_pca.
for ax in axes:
    ax.set_xlabel('PC1')
    ax.set_ylabel('PC2')
plt.tight_layout()
plt.show()
