# Load imports and data

In [None]:
from sklearn.cluster import KMeans
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline  
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score, silhouette_samples
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn_extra.cluster import KMedoids
from yellowbrick.cluster import KElbowVisualizer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [None]:
# Fixed seed reused below (PCA in calculate_inputs, KMeans in use_model) so runs are repeatable.
random_state = 42
# Load the transformed loan table; every later cell reads from this DataFrame.
data = pd.read_csv('output/loan_transformed.csv')
# Bare expression: displays the DataFrame as the notebook cell output.
data

In [None]:
# Columns of `data` used as clustering features (read by calculate_inputs).
# The '_96' variants are commented out, so only the '_95' columns are used.
input_cols = [
    'owner_male',
    'owner_age',
    'owner_district_no_inhabitants',
    'owner_district_ratio_urban_inhabitants',
    'owner_district_average_salary',
    'owner_district_unemployment_rate_95',
    # 'owner_district_unemployment_rate_96',
    'owner_district_no_enterpreneurs_per_1000_inhabitants',
    'owner_district_no_crimes_95',
    # 'owner_district_no_crimes_96',
]

# Bare expression: displays the selected feature columns as the cell output.
data[input_cols]

# Descriptive problem

## Finding the socio-demographic profile of account owners who took out loans

In [None]:
def calculate_inputs(pca=False):
    """Return the clustering feature matrix built from ``data[input_cols]``.

    The column values are taken as-is; no scaling is applied here.

    Args:
        pca: When truthy, fit a 2-component PCA (seeded with the module-level
            ``random_state``) on the features and return the projected data
            instead of the raw columns.

    Returns:
        A NumPy array with one row per row of ``data``: either the raw
        feature columns or their 2-component PCA projection.
    """
    features = data[input_cols].values
    if not pca:
        return features

    # Keep the flag and the fitted reducer in separate names.
    reducer = PCA(n_components=2, random_state=random_state)
    reducer.fit(features)
    return reducer.transform(features)

def run_model(model, params, metric='euclidean'):
    """Fit a clustering model on the feature matrix and report quality metrics.

    Plots the per-sample silhouette values (grouped by cluster, ascending
    within each cluster) and prints the per-cluster and overall mean
    silhouette, the Calinski-Harabasz score and the Davies-Bouldin score.

    Args:
        model: Estimator class; it is called as ``model(**params)`` and the
            fitted object must expose ``labels_`` and ``n_clusters``.
        params: Keyword arguments passed to the estimator constructor.
        metric: Distance metric forwarded to ``silhouette_samples`` only; it
            is not passed to the estimator itself.

    Returns:
        The fitted estimator.
    """
    inputs = calculate_inputs()
    clf = model(**params)
    clf = clf.fit(inputs)

    labels = clf.labels_

    silhouettes = silhouette_samples(inputs, labels, metric=metric)

    # Order inputs, labels and silhouettes by label, then by silhouette.
    # The permutation must be computed once, from the still-aligned arrays:
    # recomputing it after `labels` has been reordered pairs sorted labels
    # with unsorted silhouettes and assigns samples to the wrong clusters.
    order = np.lexsort((silhouettes, labels))
    inputs = inputs[order]
    labels = labels[order]
    silhouettes = silhouettes[order]

    # plot the silhouette scores for each sample
    x = np.arange(len(inputs))
    plt.figure(figsize=(20, 10))
    plt.bar(x, silhouettes, width=0.5)
    plt.title('Silhouette scores')
    plt.show()

    print(f"Average silhouette score: { {i: silhouettes[labels == i].mean() for i in range(clf.n_clusters)} }")
    print(f"Total average silhouette score: {np.mean(silhouettes)}")

    # The score is defined as ratio of the sum of between-cluster dispersion and of within-cluster dispersion.
    print(f"Variance Ratio Criterion: {calinski_harabasz_score(inputs, labels)}")

    # The score is defined as the average similarity measure of each cluster with its most similar cluster,
    # where similarity is the ratio of within-cluster distances to between-cluster distances.
    # Thus, clusters which are farther apart and less dispersed will result in a better score.
    print(f"Davies-Bouldin score: {davies_bouldin_score(inputs, labels)}")

    return clf

def figure_out_best_k(model, params, metric='euclidean', min_clusters=2, max_clusters=10):
    """Suggest a number of clusters via the elbow and silhouette methods.

    Shows a yellowbrick elbow plot and a plot of the average silhouette width
    for each k in ``range(min_clusters, max_clusters)``. The returned values
    are only suggestions: the final k still has to be chosen manually after
    interpreting the graphs.

    Args:
        model: Estimator class, instantiated as ``model(**params)`` for the
            elbow plot and ``model(**params, n_clusters=k)`` for each k.
        params: Keyword arguments for the estimator constructor.
        metric: Distance metric passed to the elbow visualizer and to
            ``silhouette_score``.
        min_clusters: Smallest k tried.
        max_clusters: Upper bound of the k range (not tried itself).

    Returns:
        Tuple ``(elbow_k, silhouette_k)``: the visualizer's ``elbow_value_``
        and the k with the highest average silhouette.
    """
    inputs = calculate_inputs()

    elbow_plot = KElbowVisualizer(
        model(**params),
        k=(min_clusters, max_clusters),
        distance_metric=metric,
        timings=False,
    )
    elbow_plot.fit(inputs)
    elbow_plot.show()

    def average_silhouette(k):
        # Fit a fresh estimator with k clusters and score its labelling.
        estimator = model(**params, n_clusters=k)
        estimator.fit(inputs)
        return silhouette_score(inputs, estimator.labels_, metric=metric)

    candidate_ks = range(min_clusters, max_clusters)
    silhouettes = [average_silhouette(k) for k in candidate_ks]

    # argmax is an offset into candidate_ks, which starts at min_clusters.
    best_k_silhouette = np.argmax(silhouettes) + min_clusters

    plt.figure(figsize=(4,2))
    plt.plot(candidate_ks, silhouettes, 'bx-')
    plt.xlabel('k')
    plt.ylabel('Average silhouette width')
    plt.title('Silhouette-based Method showing the optimal k')
    plt.show()

    return elbow_plot.elbow_value_, best_k_silhouette

def use_model(model, params=None, metric='euclidean', min_clusters=2, max_clusters=10):
    """Search for a good k, then evaluate the model at each suggested k.

    Calls ``figure_out_best_k`` and then ``run_model`` once with the k from
    the elbow method and once with the k from the silhouette method.

    Args:
        model: Estimator class accepting an ``n_clusters`` keyword.
        params: Extra keyword arguments for the estimator; ``None`` means no
            extra arguments.
        metric: Distance metric forwarded to the search and the evaluation.
        min_clusters: Smallest k considered by the search.
        max_clusters: Upper bound of the k range considered by the search.
    """
    if params is None:
        params = {}

    print("Finding best k...")
    elbow, silhouette = figure_out_best_k(model, params, metric, min_clusters, max_clusters)
    print("=========================================")
    if elbow is None:
        # The elbow visualizer reports None when it detects no elbow point;
        # passing that on as n_clusters would make the estimator fail.
        print("Elbow method did not detect a best k; skipping.")
    else:
        print(f"Using best k from elbow method: {elbow}")
        run_model(model, {**params, 'n_clusters': elbow}, metric)
    print("=========================================")
    print(f"Using best k from silhouette method: {silhouette}")
    run_model(model, {**params, 'n_clusters': silhouette}, metric)


In [None]:
# K-Means with a fixed seed: search for k, then evaluate at the suggested values.
use_model(KMeans, {'random_state': random_state}, metric='euclidean')


In [None]:
# Agglomerative clustering with average linkage: same search-and-evaluate procedure.
use_model(AgglomerativeClustering, {'linkage': 'average'}, metric='euclidean')

In [None]:
# K-Medoids (PAM method, heuristic init, at most 300 iterations): same procedure.
use_model(KMedoids, {'method': 'pam', 'init': 'heuristic', 'max_iter': 300}, metric='euclidean')

# Use and plot best model

In [None]:
# Refit the chosen configuration (average-linkage agglomerative, 2 clusters)
# and store each row's cluster label in a new 'owner_profile' column.
clf = run_model(AgglomerativeClustering, {'linkage': 'average', 'n_clusters': 2}, metric='euclidean')
data['owner_profile'] = clf.labels_
# Bare expression: displays the DataFrame, now including 'owner_profile'.
data

In [None]:
# Urban-inhabitant ratio vs. average salary of the owner's district, coloured by cluster label.
sns.scatterplot(data, x='owner_district_ratio_urban_inhabitants', y='owner_district_average_salary', hue='owner_profile', palette='Set1')

In [None]:
# District unemployment rate ('95) vs. number of crimes ('95), coloured by cluster label.
sns.scatterplot(data, x='owner_district_unemployment_rate_95', y='owner_district_no_crimes_95', hue='owner_profile', palette='Set1')

In [None]:
# District inhabitants vs. number of cities, coloured by cluster label.
# NOTE(review): 'owner_district_no_cities' is not in input_cols, so it was not a clustering feature — confirm the column exists in the CSV.
sns.scatterplot(data, x='owner_district_no_inhabitants', y='owner_district_no_cities', hue='owner_profile', palette='Set1')

# Save data

In [None]:
# Write the full table, including the cluster column, before splitting it.
data.to_csv('output/loan_transformed_with_cluster.csv', index=False)

# Sort by loan date, then split on whether 'Predicted' is missing:
# rows without a value go to the competition file, the rest to the dev file.
df = data.sort_values(by='loan_date')
missing_prediction = df['Predicted'].isna()
competition = df[missing_prediction]
data = df[~missing_prediction]

data.to_csv('output/loan_dev_transformed_with_cluster.csv', index=False)
competition.to_csv('output/loan_comp_transformed_with_cluster.csv', index=False)