# Imports

In [None]:
%pip install umap-learn
%pip install scikit-learn-extra
%pip install country_converter
%pip install geonamescache

In [1]:
import pandas as pd
import numpy as np
import ast
import time
import pickle
import random
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import umap.umap_ as umap
import plotly.graph_objects as go
import sklearn
from sklearn.cluster import KMeans
from sklearn_extra.cluster import KMedoids
from sklearn.cluster import DBSCAN
from sklearn.metrics import confusion_matrix
from scipy import sparse
import country_converter as coco
import geonamescache
from sklearn.metrics.cluster import homogeneity_score
from collections import Counter, OrderedDict
from scipy.stats import entropy
import plotly.express as px
import matplotlib.pyplot as plt
import plotly.graph_objects as go
from sklearn.preprocessing import LabelEncoder

from fpgrowth import FP_Growth
from candidate_gen import CandidateGen

In [2]:
def to_pkl(data, filename):
    file = open(filename, 'wb')
    pickle.dump(data, file)

def from_pkl(filename):
    file = open(filename, 'rb') 
    return pickle.load(file)

def max_len(frequent_itemsets):
    return max([len(itemset[0]) for itemset in frequent_itemsets])

def get_binary_matrix(transactions, frequent_itemsets, use_support=False):
    matrix = np.ones((len(transactions), len(frequent_itemsets)))
    if use_support:
        for i, itemset in enumerate(frequent_itemsets):
            matrix[:,i] =  itemset[1]
    frequent_itemsets = [itemset[0] for itemset in frequent_itemsets]
    for i, transaction in enumerate(transactions):
        for j, itemset in enumerate(frequent_itemsets):
            for item in itemset:
                if item not in transaction:
                    matrix[i][j] = 0
                    break
    return matrix

def apply_pca(matrix, n_components=10, random_state=0):
    random.seed(random_state)
    return PCA(n_components=n_components, random_state=random_state).fit_transform(matrix)

def apply_umap(matrix, n_components=10, random_state=0, metric='jaccard'):
    random.seed(random_state)
    matrix = sparse.csr_matrix(matrix)
    return umap.UMAP(n_components=n_components, random_state=random_state, metric=metric).fit_transform(matrix)

def visualise_clusters(matrix, target=None, method='pca', dimensions=2, random_state=0, save_name=None):
    """
    method must be any of 'pca' or 'umap'
    dimensions must be 2 or 3
    """
    if target is None: target = np.ones(matrix.shape[0])
    # class_idx = {}
    # for i in target.unique():
    #     class_idx[i] = np.where(target==i)
        
    # class_colors = {0: 'blue', 1: 'red'}
    # point_colors = [class_colors[val] for val in target]

    reduced_features = None
    if method=='pca':
        reduced_features = apply_pca(matrix, n_components=dimensions, random_state=random_state)
    elif method=='umap':
        reduced_features = apply_umap(matrix, n_components=dimensions, random_state=random_state)

    fig = go.Figure()
    if dimensions==3:
        fig.add_trace(go.Scatter3d(x=reduced_features[:,0], y=reduced_features[:,1], z=reduced_features[:,2], mode='markers', marker=dict(size=3,color=target, opacity=0.8)))
    elif dimensions==2:
        fig.add_trace(go.Scatter(x=reduced_features[:,0], y=reduced_features[:,1], mode='markers', marker=dict(size=3,color=target, opacity=0.8)))
    fig.show()

    if save_name is not None:
        fig.write_html('plots/'+save_name+'.html')

def get_optimal_pca_components(matrix, min_explained_var=0.95):
    pca = PCA().fit(matrix)
    cum_sum_explained_var = np.cumsum(pca.explained_variance_ratio_)
    min_num_components = np.where(cum_sum_explained_var>=min_explained_var)[0][0] + 1
    return min_num_components

In [62]:
df = pd.read_csv('03_winemag-data-reduced.csv', index_col='index')
df.words = df.words.map(ast.literal_eval)
df = df[~df.country.isna()].reset_index(drop=True)

  df = pd.read_csv('03_winemag-data-reduced.csv', index_col='index')


In [4]:
transactions = df.words.values.tolist()
to_pkl(transactions, 'transactions.pkl')

In [5]:
frequent_itemsets = from_pkl('frequent_itemsets.pkl')

# Run Apriori

In [58]:
# candidate_gen = CandidateGen(transactions, min_support=0.025)
# start = time.time()
# frequent_itemsets = candidate_gen.run()
# end = time.time()
# print(end-start)
# print(len(frequent_itemsets))
# print(max_len(frequent_itemsets))

46.477370738983154
236
3


In [62]:
# fp_growth = FP_Growth(transactions, min_support=0.025)
# start = time.time()
# frequent_itemsets = fp_growth.run()
# end = time.time()
# print(end-start)
# print(len(frequent_itemsets))
# print(max_len(frequent_itemsets))

785.4035103321075
236
3


In [78]:
# Get binary matrix for clustering
# matrix = get_binary_matrix(transactions, frequent_itemsets)
# to_pkl(matrix, 'matrix.pkl')

matrix = from_pkl('matrix.pkl')
ss = StandardScaler()
matrix = ss.fit_transform(matrix)

# Clustering

In [33]:
get_optimal_pca_components(matrix, min_explained_var=0.95)

778

In [67]:
reduced_matrix = from_pkl('matrix_pca_700.pkl')

In [81]:
def get_clusters(matrix, n_clusters=6, method="KMeans", params={'random_state': 0}):
    """
    Specify method parameter to an existing sklearn clustering class name
    Specify params parameter to a dictionary specific to the clustering algorithm
    """
    # matrix[np.isnan(matrix)] = 0
    # ss = StandardScaler()
    # matrix = ss.fit_transform(matrix)
    clusters = None
    if method=='DBScan': 
        dbscan = DBSCAN(eps=10, min_samples=100).fit(matrix)
        clusters = dbscan.labels_
    elif method=='KMedoids':
        kmedoids = KMedoids(n_clusters=n_clusters, **params).fit(matrix)
        clusters = kmedoids.labels_
    else: 
        clf = getattr(sklearn.cluster, method)(n_clusters=n_clusters)
        clusters = clf.fit_predict(matrix)
        
    
    return clusters

In [88]:
clusters = get_clusters(matrix, method='AgglomerativeClustering')

In [89]:
df['cluster'] = clusters
df['cluster'].value_counts()

cluster
0    34083
2     5953
4     4759
5     2600
1     1596
3     1009
Name: count, dtype: int64

In [93]:
def get_distribution(array):
    counts = Counter(array)
    total_count = sum(counts.values())
    distribution = {
        val: count / total_count for val, count in counts.items()
    }
    return OrderedDict(sorted(distribution.items()))

def get_entropy(distribution, qk):
    return entropy(list(distribution.values()), qk=qk)

def eval_entropy(df):
    res = 0
    clusters = list(df.cluster.unique())
    population_distribution = get_distribution(df['country'])
    for cluster in clusters:
        cluster_distribution = get_distribution(df[df.cluster==cluster]['country'])
        for country in population_distribution:
            if country not in cluster_distribution.keys(): cluster_distribution[country] = 0
        res += get_entropy(cluster_distribution, qk=list(population_distribution.values()))
    res /= len(clusters)
    return res

In [94]:
eval_entropy(df)

5.826957389449842

In [95]:
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(df.country)
contingency_table = confusion_matrix(encoded_labels, df.cluster)
contingency_table = contingency_table / contingency_table.sum(axis=1, keepdims=True) 
contingency_table = contingency_table / contingency_table.sum(axis=0, keepdims=True)

unique_labels = label_encoder.classes_  # Textual labels
unique_clusters = np.unique(df.cluster)

fig = go.Figure()

# Add a bar to the figure for each label in each cluster
for i, label in enumerate(unique_labels):
    fig.add_trace(go.Bar(
        x=[f'Cluster {cluster}' for cluster in unique_clusters],
        y=contingency_table[i, :],
        name=label
    ))

# Update layout for clarity
fig.update_layout(
    barmode='group',
    title='Distribution of Labels within Predicted Clusters',
    xaxis_title='Predicted Clusters',
    yaxis_title='Count',
    legend_title='Labels'
)

# Show the figure
fig.show()


invalid value encountered in divide

