# Coursework final

- Andy Yuan
- Aidan Dignam
- Amelia Walker
- Owen Stevenson


In [None]:
# Imports

import numpy as np
import polars as pl
import pandas as pd

# Clustering
from sklearn.cluster import AgglomerativeClustering, KMeans, SpectralClustering

# Dimensionality reduction
from sklearn.decomposition import PCA, SparsePCA, TruncatedSVD
from sklearn.manifold import TSNE

# Tree-based models
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor, export_graphviz
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, BaggingRegressor, RandomForestRegressor

# Model selection and evaluation
from sklearn.model_selection import (
    GridSearchCV, train_test_split, StratifiedKFold, cross_val_score, 
    cross_validate, RepeatedKFold
)

from sklearn.metrics import (
    silhouette_samples, silhouette_score, confusion_matrix, 
    roc_curve, roc_auc_score, classification_report, accuracy_score, r2_score
)

# Preprocessing and feature selection
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SequentialFeatureSelector

# Linear models
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn import linear_model

# XGBoost
from xgboost import XGBRegressor, XGBClassifier

# Shap
import shap
shap.initjs() # Load the JavaScript library used for SHAP's interactive plots.

# Plotting
import matplotlib.cm as cm
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns
%config InlineBackend.figure_format = 'retina'
%matplotlib inline

# Statistical and optimization tools
from scipy.sparse import csr_matrix
from scipy.stats import zscore
from scipy.special import factorial
import scipy.optimize as so

# Text processing
import sklearn.feature_extraction.text as sktext
import re

# Dimensionality reduction
import umap

# Miscellaneous
import os
from itertools import chain, combinations
from yellowbrick.cluster import KElbowVisualizer, SilhouetteVisualizer
from yellowbrick.cluster.elbow import kelbow_visualizer

# Data import

In [None]:
# Load the household data and show summary statistics. Only the last
# expression of a notebook cell is displayed automatically, so the summary
# is printed explicitly.
data = pl.read_csv("Household data.csv")
print(data.describe())

# Standardise the features (zero mean, unit variance per column).
# StandardScaler cannot handle string columns, and this frame is also read
# as a text source further down (data["text column"]), so only numeric and
# boolean columns are scaled. When every column is already numeric the
# result is the same as scaling the whole frame.
numeric_data = data.to_pandas().select_dtypes(include=["number", "bool"])

scaler = StandardScaler()
data_scaled = pd.DataFrame(
    scaler.fit_transform(numeric_data),
    columns=numeric_data.columns,
)

In [None]:
# K-means clustering (k = 3) on the standardised data

# Fixed seed so the cluster assignment is reproducible between runs.
KClusterer = KMeans(n_clusters=3, random_state=2025, verbose=0)

# Work on a copy of the scaled data that also carries each row's cluster label,
# leaving data_scaled itself untouched.
mall2 = data_scaled.assign(cluster_label=KClusterer.fit_predict(data_scaled))

# Pairwise scatter plots of every scaled feature, coloured and marked by cluster.
sns.pairplot(
    mall2,
    vars=data_scaled.columns,   # Plot the original features only, not the label column
    hue='cluster_label',        # Colour points by cluster label
    markers=['X','o','^'],      # One marker shape per cluster
)

plt.show()

In [None]:
# PCA on TF-IDF text features

## TF-IDF transformer
TfIDFTransformer = sktext.TfidfVectorizer(
    strip_accents='unicode',  # Strip accents using Unicode normalisation
    stop_words='english',     # Eliminate English stop words
    min_df=0.01,              # Eliminate words that appear in fewer than 1% of texts
    max_df=0.95,              # Eliminate words that appear in more than 95% of texts
    sublinear_tf=True,        # Sublinear term-frequency scaling: tf -> 1 + log(tf)
)
TfIDFdata = TfIDFTransformer.fit_transform(data["text column"])
word_index = TfIDFTransformer.get_feature_names_out()
# Vocabulary size after filtering; printed because a bare expression in the
# middle of a cell is not displayed.
print(len(word_index))

## PCA
nPCA = PCA(n_components=100)
# PCA needs a dense array, so the sparse TF-IDF matrix is densified first.
nPCA.fit(TfIDFdata.toarray())
# explained_variance_ratio_ holds each component's share of the total variance,
# so its sum (times 100) is the percentage explained by the retained components.
total_variance = np.sum(nPCA.explained_variance_ratio_) * 100
print('The total explained variance of the first %i components is %.3f percent' % (nPCA.n_components_, total_variance))

## part 1 c i and ii

In [None]:
# UMAP: two-dimensional embedding of the TF-IDF matrix

reducer = umap.UMAP(
    n_components=2,     # Dimensionality of the embedding
    n_neighbors=10,     # Number of nearest neighbours used to build the graph
    metric='cosine',    # Distance metric between rows of the input
    min_dist=0.1,       # Minimum distance between embedded points: smaller clumps, larger spreads out
    spread=1.0,         # Scale of the embedded points; used together with min_dist
    n_epochs=1000,      # Optimisation iterations (None lets UMAP choose 200 or 500)
    low_memory=False,   # False favours speed over memory; True is slower but uses less memory
    n_jobs=-1,          # Use all available cores
    verbose=0,          # Silent
)
UMAP_embedding = reducer.fit_transform(TfIDFdata)

# regression
