# Myopia Clusters

In this notebook I attempt to cluster a myopia dataset into discernible groups using several unsupervised machine learning algorithms.

In [None]:
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import normalize

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

In [None]:
# OMP_NUM_THREADS must be set before KMeans is imported; on Windows machines this
# avoids a known bug (memory leak).

import os
os.environ["OMP_NUM_THREADS"] = "1"

from sklearn.cluster import KMeans

# PART 1: Prepare the Data

In [None]:
def load_dataset(filepath="Resources/myopia.csv"):
    """Load the myopia dataset from a CSV file.

    Parameters
    ----------
    filepath : str or path-like, optional
        Location of the CSV file. Defaults to the copy in the notebook's
        ``Resources`` folder, so existing ``load_dataset()`` calls behave
        exactly as before.

    Returns
    -------
    pandas.DataFrame
        The file contents as parsed by ``pd.read_csv`` (UTF-8 encoded).
    """
    return pd.read_csv(filepath, encoding="utf-8", low_memory=False)

In [None]:
def clean_dataset(a_df):
    """Return a cleaned copy of ``a_df``.

    Duplicate rows are removed first, then any row containing a missing
    value, and finally the index is renumbered from zero. The frame passed
    in is not modified.
    """
    return (
        a_df
        .drop_duplicates()
        .dropna()
        .reset_index(drop=True)
    )

In [None]:
def examine_dataset(a_df):
    """Print a summary of the dataset and draw exploratory plots.

    Parameters
    ----------
    a_df : pandas.DataFrame
        Must contain a ``MYOPIC`` column plus the ``SPHEQ``, ``AL``, ``ACD``,
        ``LT`` and ``VCD`` columns used for the scatter-plot grid.

    Output produced, in order: the ``DataFrame.info()`` report, the value
    counts of ``MYOPIC``, a histogram of every column, and an upper-triangle
    grid of pairwise scatter plots. Nothing is returned.
    """
    # DataFrame.info() writes its report itself and returns None, so wrapping
    # it in print() would append a stray "None" line to the output.
    a_df.info()

    print(f'\n\nMYOPIC VALUE COUNTS: \n {a_df["MYOPIC"].value_counts()}\n\n')

    a_df.hist(figsize=(15, 15))

    # Only the float-valued measurements are scatter-plotted against each other.
    sns.PairGrid(a_df[["SPHEQ", "AL", "ACD", "LT", "VCD"]]).map_upper(plt.scatter)

In [None]:
def scale_dataset(a_df):
    """Separate the target column and build two rescaled versions of the features.

    Returns a 4-tuple:
      1. the feature DataFrame (``a_df`` without ``MYOPIC``),
      2. the features after ``StandardScaler().fit_transform``,
      3. the features after sklearn's ``normalize`` (applied to the raw
         features, not to the standardized ones),
      4. the ``MYOPIC`` column.

    The shape of each rescaled array is printed as it is produced.
    """
    target = a_df["MYOPIC"]
    features = a_df.drop(columns=["MYOPIC"])

    standardized = StandardScaler().fit_transform(features)
    print(f"SHAPE AFTER SCALING: {standardized.shape}")

    row_normalized = normalize(features)
    print(f"SHAPE AFTER NORMALIZING: {row_normalized.shape}")

    return features, standardized, row_normalized, target

In [None]:
# Read the raw myopia CSV into a DataFrame.
loaded_df = load_dataset()

In [None]:
# Drop duplicate rows and rows with missing values, then renumber the index.
clean_df = clean_dataset(loaded_df)

In [None]:
# Print summary info and MYOPIC counts, then draw histograms and pairwise scatter plots.
examine_dataset(clean_df)

In [None]:
# Split off the MYOPIC column and build standardized and normalized versions of the features.
df, scaled_data, normalized_data, labels = scale_dataset(clean_df)

In [None]:
# Preview the feature columns (MYOPIC removed).
df.head()

In [None]:
# Preview the first five rows of the StandardScaler output.
scaled_data[:5]

In [None]:
# Preview the first five rows of the normalize() output.
normalized_data[:5]

In [None]:
# Show the distinct values present in the MYOPIC column.
set(labels)

# PART 2: Apply Dimensionality Reduction

In [None]:
def perform_pca(data):
    """Reduce ``data`` with PCA and return the components as a DataFrame.

    PCA is created with ``n_components=0.90``; per the scikit-learn docs a
    fractional value selects the number of components needed to reach that
    share of explained variance. The shape of the transformed data is
    printed, and the result columns are named ``pc_1``, ``pc_2``, ...
    """
    reducer = PCA(n_components=0.90)
    components = reducer.fit_transform(data)

    n_components = components.shape[1]
    print(f"SHAPE AFTER PCA: {components.shape}")

    column_names = [f"pc_{i}" for i in range(1, n_components + 1)]
    return pd.DataFrame(data=components, columns=column_names)

In [None]:
def perform_tsne(a_df, random_state=0):
    """Embed ``a_df`` with t-SNE and return it with the embedding attached.

    Parameters
    ----------
    a_df : pandas.DataFrame
        Features to embed. It is copied, not modified.
    random_state : int or None, optional
        Seed passed to ``TSNE``. t-SNE is stochastic, so a fixed seed is
        needed for the plots and the conclusions drawn from them to survive
        a kernel restart. Pass ``None`` for an unseeded run.

    Returns
    -------
    pandas.DataFrame
        A copy of ``a_df`` with two extra columns, ``x`` and ``y``, holding
        the first and second t-SNE coordinates. The original columns are
        kept alongside them.
    """
    tsne_df = a_df.copy()

    tsne = TSNE(learning_rate=200, random_state=random_state)

    tsne_features = tsne.fit_transform(tsne_df)

    print(f"SHAPE AFTER TSNE: {tsne_features.shape}")

    tsne_df["x"] = tsne_features[:, 0]
    tsne_df["y"] = tsne_features[:, 1]

    return tsne_df

In [None]:
def plot_tsne(a_df, point_labels=None):
    """Scatter-plot the ``x``/``y`` columns of ``a_df``, colored by label.

    Parameters
    ----------
    a_df : pandas.DataFrame
        Must contain ``x`` and ``y`` columns.
    point_labels : array-like, optional
        One color value per row. When omitted, the notebook-level ``labels``
        variable is used, which keeps existing ``plot_tsne(df)`` calls
        working; pass it explicitly to avoid depending on that global.
    """
    if point_labels is None:
        # Backward-compatible fallback to the global defined earlier in the notebook.
        point_labels = labels

    plt.scatter(a_df["x"], a_df["y"], c=point_labels, cmap="rainbow")
    plt.title("TSNE Plot")

    plt.show()

In [None]:
# Run PCA on the StandardScaler output.
scaled_df = perform_pca(scaled_data)

In [None]:
# Run PCA on the normalize() output.
normalized_df = perform_pca(normalized_data)

In [None]:
# Run t-SNE on the PCA components of the standardized data; adds "x" and "y" columns.
tsne_scaled_df = perform_tsne(scaled_df)

In [None]:
# Run t-SNE on the PCA components of the normalized data; adds "x" and "y" columns.
tsne_normalized_df = perform_tsne(normalized_df)

In [None]:
# Plot the t-SNE embedding of the standardized data, colored by the MYOPIC labels.
plot_tsne(tsne_scaled_df)

In [None]:
# Plot the t-SNE embedding of the normalized data, colored by the MYOPIC labels.
plot_tsne(tsne_normalized_df)

At this point in the analysis, it appears that there are no discernible clusters.

# PART 3: Perform a Cluster Analysis with K-means

In [None]:
def make_elbow_plot(a_df, max_k=None):
    """Plot K-means inertia against the number of clusters (an elbow plot).

    Parameters
    ----------
    a_df : pandas.DataFrame
        Data to cluster; every column is passed to ``KMeans.fit``.
    max_k : int, optional
        Largest number of clusters to try; k runs from 1 to ``max_k``
        inclusive. When omitted it defaults to the number of columns in
        ``a_df``, which is the range the function has always used, but the
        two quantities are unrelated, so callers can now choose the range.

    Each model is built with ``random_state=0``. Nothing is returned; the
    figure is shown with ``plt.show()``.
    """
    if max_k is None:
        max_k = a_df.shape[1]

    k_values = list(range(1, max_k + 1))

    # One fitted model per candidate k; only its inertia is kept.
    inertia = [
        KMeans(n_clusters=k, random_state=0).fit(a_df).inertia_
        for k in k_values
    ]

    plt.plot(k_values, inertia)
    plt.xticks(k_values)
    plt.xlabel("Number of clusters")
    plt.ylabel("Inertia")
    plt.title("Elbow Plot")

    plt.show()

In [None]:
def make_kmeans_plot(a_df, some_k):
    """Fit K-means with ``some_k`` clusters and plot the result on the x/y plane.

    The model (``random_state=0``) is fitted on every column of ``a_df``,
    not only on ``x`` and ``y``. The predicted cluster of each row is
    printed, and the rows are then scatter-plotted at their ``x``/``y``
    coordinates, colored by the fitted model's ``labels_``.
    """
    kmeans = KMeans(n_clusters=some_k, random_state=0)
    kmeans.fit(a_df)

    assigned = kmeans.predict(a_df)
    print(f"Predictions: {assigned}")

    # Local name chosen so it does not shadow the notebook-level `labels`.
    cluster_ids = kmeans.labels_

    plt.scatter(a_df["x"], a_df["y"], c=cluster_ids, cmap="rainbow")
    plt.xlabel("x")
    plt.ylabel("y")
    plt.title(f"K-means Plot with {some_k} Clusters")
    plt.show()

In [None]:
# Elbow plot for the standardized pipeline (PCA components plus t-SNE x/y columns).
make_elbow_plot(tsne_scaled_df)

The elbow of the above chart appears to be at k = 5.

In [None]:
# Elbow plot for the normalized pipeline (PCA components plus t-SNE x/y columns).
make_elbow_plot(tsne_normalized_df)

The elbow of the above chart appears to be at k = 4.

In [None]:
# Cluster the standardized pipeline's data with k = 5, the elbow read from its plot.
make_kmeans_plot(tsne_scaled_df, 5)

In [None]:
# Cluster the normalized pipeline's data with k = 4, the elbow read from its plot.
make_kmeans_plot(tsne_normalized_df, 4)

# PART 4: Make a Recommendation

After clustering the myopia dataset, it appears to this analyst that there are 4-5 discernible clusters. When the features are standardized using StandardScaler(), there are 5 fairly well-separated clusters, resembling a "starfish"; when the features are instead rescaled with normalize(), 4 clusters are detectable, though not as clearly as in the former case.