# Coursework final

- Andy Yuan
- Aidan Dignam
- Amelia Walker
- Owen Stevenson


# Question 1

In [None]:
# Imports

import numpy as np
import polars as pl
import pandas as pd

# Clustering
from sklearn.cluster import AgglomerativeClustering, KMeans, SpectralClustering

# Dimensionality reduction
from sklearn.decomposition import PCA, SparsePCA, TruncatedSVD
from sklearn.manifold import TSNE

# Tree-based models
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor, export_graphviz
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, BaggingRegressor, RandomForestRegressor

# Model selection and evaluation
from sklearn.model_selection import (
    GridSearchCV, train_test_split, StratifiedKFold, cross_val_score, 
    cross_validate, RepeatedKFold
)

from sklearn.metrics import (
    silhouette_samples, silhouette_score, confusion_matrix, 
    roc_curve, roc_auc_score, classification_report, accuracy_score, r2_score
)

# Preprocessing and feature selection
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SequentialFeatureSelector

# Linear models
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn import linear_model

# XGBoost
from xgboost import XGBRegressor, XGBClassifier

# Shap
import shap
shap.initjs() # Import Java engine.

# Plotting
import matplotlib.cm as cm
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns
%config InlineBackend.figure_format = 'retina'
%matplotlib inline

# Statistical and optimization tools
from scipy.sparse import csr_matrix
from scipy.stats import zscore
from scipy.special import factorial
import scipy.optimize as so

# Text processing
import sklearn.feature_extraction.text as sktext
import re

# Dimensionality reduction
import umap

# Miscellaneous
import os
from itertools import chain, combinations
from yellowbrick.cluster import KElbowVisualizer, SilhouetteVisualizer
from yellowbrick.cluster.elbow import kelbow_visualizer
import heapq

In [None]:
# Load the household data set into a polars DataFrame and show its
# summary statistics as the cell output.
household_csv = "../Coursework Data/Household data.csv"
data = pl.read_csv(household_csv)
data.describe()

## a. Data cleaning


In [None]:
# Standardise every column with StandardScaler and keep the result as a
# pandas DataFrame that carries the original column names.
scaler = StandardScaler()
data_pd = data.to_pandas()
scaled_values = scaler.fit_transform(data_pd)
data_scaled = pd.DataFrame(scaled_values, columns=data.columns)

## b. K-means

### Clusterer and elbow

In [None]:
# k-means clusterer
# The n_clusters value given here is only a placeholder: the elbow
# visualizer below refits the estimator for every k it evaluates.
KClusterer = KMeans(n_clusters=3,
                    verbose=0,
                    random_state=2025)  # fixed seed for reproducible runs

# Elbow visualizer
visualizer = KElbowVisualizer(KClusterer,
                              k=(2, 20),
                              locate_elbow=True,
                              timings=False)

visualizer.fit(data_scaled)
# show() must be *called*; the bare attribute `visualizer.show` only
# evaluates to the bound method and never finalises/renders the figure.
visualizer.show()

### Silhouette

In [None]:
# Silhouette visualizer
# One silhouette plot per candidate k, covering the same k values as the
# loop has always used (2..19).
k_values = range(2, 20)
n_cols = 3
n_rows = -(-len(k_values) // n_cols)  # ceiling division: 18 plots -> 6 rows

# Size the grid to the number of plots instead of a fixed 15x3 grid, and
# give each row enough height for the silhouettes to be readable.
fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 4 * n_rows))
axes = axes.flatten()
sil_scores = {}

# Pair each k with the next free axis so the first panel is not left blank.
for ax, i in zip(axes, k_values):
    Kmeansclusterer = KMeans(n_clusters=i,
                             verbose=0,
                             random_state=2025)
    cluster_labels = Kmeansclusterer.fit_predict(data_scaled)
    sil_avg = silhouette_score(data_scaled, cluster_labels)
    sil_scores[f'{i} clusters'] = sil_avg

    visualizer = SilhouetteVisualizer(Kmeansclusterer, colors='yellowbrick', ax=ax)
    visualizer.fit(data_scaled)  # Fit only the scaled data, not cluster labels
    ax.set_title(f"{i} Clusters - Silhouette: {sil_avg:.2f}")

# Hide any panels left over when the grid has more cells than plots.
for unused_ax in axes[len(k_values):]:
    unused_ax.set_visible(False)

plt.tight_layout()
plt.show()

print("Silhouette Scores:", sil_scores)
# Rank (label, score) pairs together so that the reported keys and scores
# are in the same (descending) order and always correspond to each other.
top_three = heapq.nlargest(3, sil_scores.items(), key=lambda item: item[1])
max_keys = [key for key, _ in top_three]
max_values = [value for _, value in top_three]
print("The greatest silhouette scores are from ", max_keys, "with scores of", max_values)

### Applying k-means with the best number of clusters (unscaled data)

In [None]:
# Fit a 3-cluster k-means on the raw (unscaled) values so that the
# per-cluster averages printed below are in the original units.
KClusterer = KMeans(
    n_clusters=3,  # put in optimal cluster amount
    verbose=0,
    random_state=2025,
)

data_np = data.to_numpy()  # not the scaled version
cluster_labels = KClusterer.fit_predict(data_np)

# Attach the assignment to every row as a new "cluster_label" column.
label_column = pl.Series("cluster_label", cluster_labels)
data_cluster_table = data.with_columns(label_column)

# Build one "<column>_mean" expression per column other than the label.
feature_columns = [
    column for column in data_cluster_table.columns if column != "cluster_label"
]
mean_exprs = [
    pl.col(column).mean().alias(f"{column}_mean") for column in feature_columns
]
cluster_avg = data_cluster_table.group_by("cluster_label").agg(mean_exprs)
print(cluster_avg.sort('cluster_label'))

## c. Linear dimensionality reductions

### i. Applying PCA

In [None]:
## tf-idf transformer
TfIDFTransformer = sktext.TfidfVectorizer(
    strip_accents='unicode',  # Eliminate accents and special characters
    stop_words='english',     # Eliminates stop words.
    min_df=0.01,              # Eliminate words that appear in fewer than 1% of texts
    max_df=0.95,              # Eliminate words that appear in more than 95% of texts
    sublinear_tf=True,        # Use sublinear term frequency: 1 + log(tf)
)
TfIDFdata = TfIDFTransformer.fit_transform(data["text columns"])
word_index = TfIDFTransformer.get_feature_names_out()
# A bare `len(word_index)` in the middle of a cell displays nothing, so
# print the vocabulary size explicitly.
print('Vocabulary size after filtering: %i terms' % len(word_index))

## PCA
# PCA needs a dense array; densify once and reuse it for fitting.
tfidf_dense = np.asarray(TfIDFdata.todense())

# n_components may not exceed min(n_samples, n_features); cap the requested
# 100 components so a small corpus or vocabulary does not raise.
n_pca_components = min(100, *tfidf_dense.shape)
nPCA = PCA(n_components=n_pca_components, random_state=2025)
nPCA.fit(tfidf_dense)

# explained_variance_ratio_ holds each component's *share* of the total
# variance, so its sum times 100 is a percentage. (explained_variance_ holds
# absolute variances and does not yield a percentage.)
total_variance = np.sum(nPCA.explained_variance_ratio_) * 100
print('The total explained variance of the first %i components is %.3f percent' % (nPCA.n_components_, total_variance))

### ii. Plot the data of the first two PCs in a scatterplot (scaled data)

In [None]:
# Project the TF-IDF matrix onto the principal axes fitted by nPCA and
# pull out the first, second and third principal components.
pca_components = nPCA.transform(np.asarray(TfIDFdata.todense()))
pc1, pc2, pc3 = (pca_components[:, k] for k in range(3))

# Re-fit KClusterer on the standardised data; these labels colour the plot.
cluster_labels_scaled = KClusterer.fit_predict(data_scaled)

plt.figure(figsize=(10, 7))
scatter_style = dict(
    cmap='viridis',   # any matplotlib colormap name works here
    alpha=0.6,        # slightly transparent points
    edgecolors='w',   # white edges for better visibility
    s=50,             # point size
)
scatter = plt.scatter(pc1, pc2, c=cluster_labels_scaled, **scatter_style)

plt.title('PCA Scatterplot Colored by Cluster', fontsize=14)
plt.xlabel('Principal Component 1 (PC1)', fontsize=12)
plt.ylabel('Principal Component 2 (PC2)', fontsize=12)

# Colour bar mapping point colours back to cluster labels.
cbar = plt.colorbar(scatter)
cbar.set_label('Cluster Label', fontsize=12)

plt.show()

### iii. Calculate the average value of the first three components per cluster and tabulate them

In [None]:
# Collect the first three principal components together with the cluster
# assignment (from KClusterer.fit_predict(data_scaled)) in one table.
pca_columns = {
    'PC1': pc1,
    'PC2': pc2,
    'PC3': pc3,
    'Cluster': cluster_labels_scaled,
}
pca_df = pd.DataFrame(pca_columns)

# Mean of each component per cluster, with 'Cluster' kept as a regular column.
cluster_means = pca_df.groupby('Cluster', as_index=False).mean()

def name_cluster(row):
    """Label a cluster by its dominant principal component.

    The dominant component is the one among PC1, PC2 and PC3 with the
    largest absolute value in ``row`` (the earliest component wins ties).
    The label is the "High" variant when that value is strictly positive
    and the "Low" variant otherwise.

    Args:
        row: Mapping-like object indexable by 'PC1', 'PC2' and 'PC3'.

    Returns:
        The descriptive label string for the cluster.
    """
    # (label if value > 0, label otherwise) for each component, in priority order.
    labels = {
        'PC1': ("High PC1 (Topic A)", "Low PC1 (Topic B)"),
        'PC2': ("High PC2 (Topic C)", "Low PC2 (Topic D)"),
        'PC3': ("High PC3 (Topic E)", "Low PC3 (Topic F)"),
    }
    dominant = max(labels, key=lambda pc: abs(row[pc]))
    high_label, low_label = labels[dominant]
    if row[dominant] > 0:
        return high_label
    return low_label

# Name every cluster row-wise with name_cluster, then print the summary
# table with the name next to the cluster id.
cluster_means['Cluster_Name'] = cluster_means.apply(name_cluster, axis=1)
summary_columns = ['Cluster', 'Cluster_Name', 'PC1', 'PC2', 'PC3']
print(cluster_means[summary_columns])

## d. UMAP

### Choice of parameters

In [None]:
# Select the UMAP hyper-parameters whose 2-D embedding best separates the
# k-means clusters, measured by the silhouette score of the embedding.

# Define parameter grid
param_grid = {
    'n_neighbors': [5, 15, 30, 50],
    'min_dist': [0.01, 0.1, 0.5],
    'metric': ['euclidean', 'cosine']
}

# Densify the TF-IDF matrix once; it does not change between grid points,
# so converting it inside the loop would repeat the same work 24 times.
umap_input = np.asarray(TfIDFdata.todense())

# Start below any attainable silhouette score (range [-1, 1]) so the first
# evaluated configuration is always recorded.
best_score = -np.inf
best_params = {}
best_embedding = None

# Grid search (small-scale for demonstration)
for n in param_grid['n_neighbors']:
    for d in param_grid['min_dist']:
        for m in param_grid['metric']:
            reducer = umap.UMAP(n_neighbors=n, min_dist=d, metric=m, random_state=2025)
            embedding = reducer.fit_transform(umap_input)
            # Score against the labels from clustering the *scaled* data:
            # these are the labels used to colour the PCA and UMAP plots.
            score = silhouette_score(embedding, cluster_labels_scaled)
            if score > best_score:
                best_score = score
                best_params = {'n_neighbors': n, 'min_dist': d, 'metric': m}
                best_embedding = embedding

print(f"Best parameters: {best_params}, Silhouette score: {best_score:.3f}")

### Plot UMAP

In [None]:
# Final UMAP embedding. Start from the previous hand-set values and let the
# grid-search winner (best_params) override them, so the plot really uses
# the selected configuration instead of hard-coded numbers.
umap_params = {'n_neighbors': 10, 'min_dist': 0.1, 'metric': 'cosine'}
umap_params.update(best_params)

reducer = umap.UMAP(n_components=2,     # Number of components. UMAP is robust to larger values
                    n_epochs=1000,      # Iterations. Set to convergence. None implies either 200 or 500.
                    spread=1.0,         # Scale to combine with min_dist
                    low_memory=False,   # False trades extra memory for speed
                    random_state=2025,  # Same seed as the grid search, for a reproducible figure
                    n_jobs=1,           # A fixed seed requires single-threaded execution
                    verbose=0,          # Verbosity
                    **umap_params       # n_neighbors, min_dist, metric
                    )
UMAP_embedding = reducer.fit_transform(TfIDFdata)

sns.scatterplot(x=UMAP_embedding[:, 0], y=UMAP_embedding[:, 1], hue=cluster_labels_scaled)
plt.xlabel("Component 1")
plt.ylabel("Component 2")
plt.title("UMAP of data")
plt.show()

# Question 2


## a. Regularized elastic-net linear regression from data