Myopia Clusters!

In [None]:
#Imports
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler

In [None]:
# Read the myopia dataset from the Resources folder into a DataFrame.
file_path = "Resources/myopia.csv"
my_df = pd.read_csv(filepath_or_buffer=file_path)

In [None]:
# Preview only the first rows instead of rendering the whole frame.
my_df.head()

In [None]:
# Keep every column except the "MYOPIC" label column.
myopia_df = my_df.drop(columns=["MYOPIC"])
myopia_df

In [None]:
#scale the columns so that they are closer in range to each other
feature_columns = [
    "AGE", "SPHEQ", "AL", "ACD", "LT", "VCD", "SPORTHR",
    "READHR", "COMPHR", "STUDYHR", "TVHR", "DIOPTERHR", "MOMMY", "DADMY",
]
scaler = StandardScaler()
# Learn the scaling parameters first, then apply them to the same columns.
scaler.fit(myopia_df[feature_columns])
scaled_data = scaler.transform(myopia_df[feature_columns])

In [None]:
# Wrap the scaled array in a DataFrame, reusing the column names of myopia_df.
scaled_df = pd.DataFrame(
    data=scaled_data,
    columns=myopia_df.columns,
)
scaled_df

In [None]:
#Add PCA elements
# A fractional n_components asks scikit-learn's PCA to keep as many components
# as are needed to explain that share of the variance (here 90%), so the number
# of output columns is decided by the data.
pca = PCA(n_components=.9)
myopia_PCA = pca.fit_transform(scaled_df)

In [None]:
# Share of the total variance captured by each retained component.
pca.explained_variance_ratio_

In [None]:
# Build the column names from the actual number of components: PCA was asked
# for a variance share, not a fixed count, so a hard-coded list of 10 names
# raises a ValueError as soon as the data yields a different number.
pc_columns = [f"PC_{number}" for number in range(1, myopia_PCA.shape[1] + 1)]
df_myopia_pca = pd.DataFrame(data=myopia_PCA, columns=pc_columns)
df_myopia_pca.head()

In [None]:
#add a t-sne element
# random_state pins the otherwise random embedding, so the scatter plots and
# the clusters derived from them are the same after every Restart & Run All.
tsne = TSNE(learning_rate=50, random_state=0)
tsne_features = tsne.fit_transform(df_myopia_pca)

In [None]:
# Check the dimensions of the t-SNE output.
tsne_features.shape

In [None]:
# Store the first two t-SNE output columns next to the principal components.
for column_name, axis in (("PC_x", 0), ("PC_y", 1)):
    df_myopia_pca[column_name] = tsne_features[:, axis]


In [None]:
#Plot the results from the TSNE
fig, ax = plt.subplots()
ax.scatter(x=df_myopia_pca["PC_x"], y=df_myopia_pca["PC_y"])
plt.show()

Based on the scatter plot of the t-SNE results above, there appear to be roughly 4 distinct clusters.

In [None]:
#add labels
# The "MYOPIC" column of the raw data is used to colour the points.
labels = my_df.loc[:, "MYOPIC"]

In [None]:
# Same t-SNE scatter as before, coloured by the MYOPIC label.
fig, ax = plt.subplots()
ax.scatter(x=df_myopia_pca["PC_x"], y=df_myopia_pca["PC_y"], c=labels)
plt.show()

In [None]:
#Create and run the K-means model
# NOTE(review): 10 clusters is chosen here before the elbow analysis further
# down — confirm it is only an initial guess.
model = KMeans(n_clusters=10, random_state=0)

In [None]:
#fit the model
# NOTE(review): df_myopia_pca also holds the t-SNE columns PC_x / PC_y at this
# point, so they are clustered together with the PCA components — confirm this
# is intended.
model.fit(df_myopia_pca)

In [None]:
#predictions
# Cluster index for every row of the frame the model was fitted on.
predictions = model.predict(df_myopia_pca)

In [None]:
#add a new class column to the dataset
# NOTE(review): "class" is written into df_myopia_pca itself, so a later fit on
# the whole frame would treat this cluster label as a feature.
df_myopia_pca["class"] = model.labels_
df_myopia_pca.head()

In [None]:
#Find the best value for K
# Fit on the features only: a "class" column left in df_myopia_pca by an
# earlier K-means run is a cluster label, not a measurement, and would distort
# the inertia of every candidate k.
# NOTE(review): the t-SNE columns PC_x / PC_y are still part of the features —
# confirm they are meant to be clustered together with the PCA components.
elbow_features = df_myopia_pca.drop(columns=["class"], errors="ignore")

inertia = []
k = list(range(1, 11))

#look for the best K
for i in k:
    km = KMeans(n_clusters=i, random_state=0)
    km.fit(elbow_features)
    inertia.append(km.inertia_)

#define DF to plot Elbow curve
elbow_data = {"k": k, "inertia": inertia}
df_elbow = pd.DataFrame(elbow_data)

plt.plot(df_elbow['k'], df_elbow['inertia'])
# Tick only the k values that were actually tested (1..10).
plt.xticks(k)
plt.xlabel('Number of clusters')
plt.ylabel('Inertia')
plt.show()

In [None]:
# Inertia for each tested k, shown as a table.
df_elbow

In [None]:
# Refit K-means with 3 clusters.
# NOTE(review): random_state is 4 here but 0 for the other KMeans models in
# this notebook, and df_myopia_pca still carries the "class" column written by
# the earlier model — confirm both are intended.
model = KMeans(n_clusters=3, random_state=4)
model.fit(df_myopia_pca)

In [None]:
# Cluster assignments from the current model for every row of df_myopia_pca.
# NOTE(review): the name is spelled "predicition" and is not used again in the
# visible cells.
predicition = model.predict(df_myopia_pca)

In [None]:
def get_clusters(k, data):
    """Cluster ``data`` with K-means and store the labels in a "class" column.

    Parameters
    ----------
    k : int
        Number of clusters passed to ``KMeans(n_clusters=k)``.
    data : pandas.DataFrame
        Feature frame to cluster. It is modified in place: a "class" column
        holding the cluster label of each row is added or overwritten.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, now including the "class" column.
    """
    # Leave out a "class" column from an earlier run so that old cluster labels
    # are not fed back into the model as a feature.
    features = data.drop(columns=["class"], errors="ignore")

    # Initialize and train the K-Means model on the frame that was passed in
    # (not on a notebook-level global).
    model = KMeans(n_clusters=k, random_state=0)
    model.fit(features)

    # labels_ already holds the cluster of every training row, so a separate
    # predict() call on the same data is not needed.
    data["class"] = model.labels_

    return data

In [None]:
# Cluster the PCA/t-SNE frame into 3 groups and display the labelled result.
df_clusters = get_clusters(k=3, data=df_myopia_pca)
df_clusters

In [None]:
def show_clusters(df):
    """Scatter-plot the rows of ``df``, coloured by cluster.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns "PC_x" and "PC_y" (point coordinates) and
        "class" (value used to colour each point).
    """
    # Plot the frame that was passed in rather than a notebook-level global.
    plt.scatter(df["PC_x"], df["PC_y"], c=df["class"])
    # show must be called; a bare `plt.show` only references the function.
    plt.show()
    
# Plot the frame returned by get_clusters.
show_clusters(df_clusters)

After building an unsupervised model for the myopia data, it appears that the most appropriate number of patient clusters is 3.