In [None]:
import pandas as pd

# PART 1: Prepare the Data

In [None]:
# Load the myopia dataset from a CSV file resolved against the working directory.
df = pd.read_csv(filepath_or_buffer="myopia.csv")

In [None]:
# Summarize the columns of the loaded frame (dtypes and non-null counts).
df.info()

In [None]:
# Peek at ten randomly chosen rows (unseeded, so the rows differ between runs).
df.sample(n=10)

In [None]:
# Count how many rows fall into each MYOPIC value.
myopic_column = df["MYOPIC"]
myopic_column.value_counts()

In [None]:
# Count how many rows fall into each AGE value.
age_column = df["AGE"]
age_column.value_counts()

In [None]:
# Number of missing values per column.
missing_mask = df.isna()
missing_mask.sum()

In [None]:
# Number of rows that are exact duplicates of an earlier row.
duplicate_mask = df.duplicated()
duplicate_mask.sum()

In [None]:
# Separate the target column from the features. `df` itself is left
# unchanged: `drop` returns a new frame without the target.
target_column = "MYOPIC"
df_drop = df.drop(columns=target_column)
labels = df[target_column]

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the per-column scaling parameters from the feature frame, then apply
# them to that same frame to obtain the standardized array.
scaler = StandardScaler().fit(df_drop)
scaled_data = scaler.transform(df_drop)

# Display the dimensions of the standardized data.
scaled_data.shape

# PART 2: Apply Dimensionality Reduction

In [None]:
from sklearn.decomposition import PCA

# A fractional n_components asks PCA to keep as many components as are needed
# to explain that share of the variance (here 90%).
explained_variance_target = 0.90
pca = PCA(n_components=explained_variance_target)

# Fit on the standardized features and project them onto the kept components.
myopia_pca = pca.fit_transform(scaled_data)

In [None]:
# Shape of the PCA output; the original note records 14 input features reduced to 10 components.
myopia_pca.shape

In [None]:
 # Transform PCA data to a DataFrame
    
# Wrap the PCA output in a DataFrame. The column names are derived from the
# number of components PCA actually kept, so this cell keeps working if the
# variance threshold (or the data) yields a count other than 10.
pca_columns = [f"pc_{i}" for i in range(1, myopia_pca.shape[1] + 1)]
df_myopia_pca = pd.DataFrame(data=myopia_pca, columns=pca_columns)

df_myopia_pca.head()

In [None]:
# Share of the total variance explained by each retained component.
pca.explained_variance_ratio_

In [None]:
# Singular values associated with each retained component.
pca.singular_values_

In [None]:
from sklearn.manifold import TSNE

# Work on a copy so the plotting columns added later do not end up in the
# PCA frame itself.
tsne_df = df_myopia_pca.copy()

# Initialize the t-SNE model. The embedding is stochastic, so it is seeded
# (same seed as the KMeans cells) to make the plots reproducible across runs.
tsne = TSNE(learning_rate=10, random_state=0)

In [None]:
 # Reduce dimensions
    
# Fit on the PCA frame rather than on `tsne_df`: a later cell adds the "x"/"y"
# embedding columns to `tsne_df`, so re-running this cell against it would feed
# the previous embedding back in as extra features. On a fresh run the two
# frames hold the same values.
tsne_features = tsne.fit_transform(df_myopia_pca)

In [None]:
# Shape of the t-SNE output; the original note records the 10 principal components reduced to 2 dimensions.
tsne_features.shape

In [None]:
import matplotlib.pyplot as plt

# Store the two t-SNE coordinates next to the PCA features for plotting.
tsne_df["x"] = tsne_features[:, 0]  # first embedding dimension
tsne_df["y"] = tsne_features[:, 1]  # second embedding dimension

# Visualize the embedding. Axis labels and a title are set so the figure can
# be read on its own, like the other plots in this notebook.
plt.scatter(tsne_df["x"], tsne_df["y"])
plt.xlabel("t-SNE dimension 1")
plt.ylabel("t-SNE dimension 2")
plt.title("t-SNE embedding of the PCA-reduced data")
plt.show()

In [None]:
# Class balance of the MYOPIC target held in `labels`.
labels.value_counts()

In [None]:
 # Visualize the clusters with color
    
# Color every point by its MYOPIC label. The legend maps each color back to a
# label value; without it the coloring cannot be interpreted.
scatter = plt.scatter(tsne_df["x"], tsne_df["y"], c=labels)
plt.legend(*scatter.legend_elements(), title="MYOPIC")
plt.xlabel("t-SNE dimension 1")
plt.ylabel("t-SNE dimension 2")
plt.title("t-SNE embedding colored by MYOPIC label")
plt.show()

# PART 3: Perform a Cluster Analysis with K-means

In [None]:
 # Do this on Windows machines before importing KMeans to avoid a known bug (memory leak). 
    
# NOTE(review): earlier cells already import from sklearn (preprocessing,
# decomposition, manifold); confirm this variable still takes effect when it is
# set this late -- TODO consider moving the assignment to the first cell.
import os
# Restrict OpenMP to a single thread for the KMeans runs that follow.
os.environ["OMP_NUM_THREADS"] = '1'

In [None]:
from sklearn.cluster import KMeans

# Candidate cluster counts and the inertia measured for each of them.
k = list(range(1, 11))
inertia = []

# Looking for the best k.
# Fit on the PCA-reduced features, not on the raw `df`: `df` is unscaled and
# still contains the MYOPIC target column, which must not drive the clusters.
for i in k:
    km = KMeans(n_clusters=i, random_state=0)
    km.fit(df_myopia_pca)
    inertia.append(km.inertia_)

# Collect the results in a DataFrame and plot the elbow curve with matplotlib.
elbow_data = {"k": k, "inertia": inertia}
df_elbow = pd.DataFrame(elbow_data)

plt.plot(df_elbow["k"], df_elbow["inertia"])
plt.xticks(k)
plt.xlabel("Number of clusters")
plt.ylabel("Inertia")
plt.title("Elbow curve")
plt.show()

In [None]:
# Initializing model with K = 3 
    
model = KMeans(n_clusters=3, random_state=0)

model.fit(df)

In [None]:
 # Get predictions
    
predictions = model.predict(df)

print(predictions)

In [None]:
# Attach each row's cluster id to the original frame.
# Caveat: this mutates `df` in place, so any earlier cell that reads `df`
# (for example the one building `df_drop`) will see the extra "class" column
# if it is re-run without reloading the data first.
cluster_ids = model.labels_
df["class"] = cluster_ids
df.head(n=5)

In [None]:
# Distinct cluster ids assigned by the fitted model.
set(model.labels_)

In [None]:
# Scatter two of the original measurements against each other, colored by the
# cluster id stored in the "class" column.
x_feature, y_feature = "SPHEQ", "ACD"
plt.scatter(df[x_feature], df[y_feature], c=df["class"])
plt.xlabel(x_feature)
plt.ylabel(y_feature)
plt.show()

# PART 4: Make a Recommendation

I'm not finding any discernible clusters with the unsupervised learning algorithms we employed.