In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

## Part 1: Prepare the Data

In [None]:
# Load the myopia dataset into a DataFrame and preview the first rows.
csv_path = "../Resources/myopia.csv"
myopia = pd.read_csv(csv_path, low_memory=False)
myopia.head()

In [None]:
# List the column names available in the dataset
myopia.columns

In [None]:
# Summarize dtypes and non-null counts.
# DataFrame.info() writes its report directly and returns None, so it is
# called bare; wrapping it in print() would add a stray "None" line.
myopia.info()

In [None]:
# Count how many rows fall into each "MYOPIC" class
myopia['MYOPIC'].value_counts()

In [None]:
# Separate the "MYOPIC" target from the feature columns, then hold out a
# test set with a fixed seed so the split is reproducible.
y = myopia['MYOPIC']
X = myopia.drop('MYOPIC', axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
X_train.head()

In [None]:
# Create a scaler to standardize the dataset.
# It is only instantiated here; fitting happens in a separate step.
scaler = StandardScaler()

In [None]:
# Train the scaler with X_train data only, so no statistics from the
# held-out test split influence the scaling parameters
scaler.fit(X_train)

In [None]:
# Standardize both splits with the scaler that was fitted on X_train, so the
# test set is transformed using training-set statistics only.
X_train_scaled, X_test_scaled = (
    scaler.transform(split) for split in (X_train, X_test)
)

In [None]:
 # Instantiate KNN model and make predictions
# Fit a 9-nearest-neighbors classifier on the scaled training data
# (fit returns the estimator itself), then label the scaled test set.
knn = KNeighborsClassifier(n_neighbors=9).fit(X_train_scaled, y_train)
y_pred = knn.predict(X_test_scaled)

In [None]:
 # Assess the accuracy score
from sklearn.metrics import accuracy_score

# Fraction of test samples whose predicted label matches the true label.
accuracy_score(y_true=y_test, y_pred=y_pred)

## Part 2: Apply Dimensionality Reduction

In [None]:
# Preview 20 randomly chosen rows; the fixed seed keeps the displayed sample
# identical across kernel restarts and re-runs.
myopia.sample(20, random_state=42)

In [None]:
# Create the inputs for t-SNE: the feature frame without the "MYOPIC"
# target, and the target itself kept aside for coloring the plots.
labels = myopia['MYOPIC']
myopia2 = myopia.drop(columns=['MYOPIC'])

In [None]:
# Initialize the t-SNE model.
# t-SNE is stochastic; fixing random_state makes the embedding (and the plots
# built from it) reproducible on Restart & Run All.
tsne = TSNE(learning_rate=35, random_state=42)

In [None]:
# Reduce dimensions with t-SNE.
# `myopia2` gains "x"/"y" embedding columns in later cells; they are excluded
# here (and ignored if absent) so re-running this cell never feeds a previous
# embedding back into t-SNE as input features.
tsne_features = tsne.fit_transform(
    myopia2.drop(columns=['x', 'y'], errors='ignore')
)

In [None]:
# Check the shape of the embedding: one row per sample and 2 columns
# (TSNE was created without n_components, so its default of 2 applies)
tsne_features.shape

In [None]:
# Prepare to plot the dataset
# Store the first column of the transformed features as "x"
myopia2['x']=tsne_features[:,0]

In [None]:
# Store the second column of the transformed features as "y"
myopia2['y']=tsne_features[:,1]

In [None]:
# Scatter the two t-SNE coordinates to look for visible groupings.
ax = plt.gca()
ax.scatter(myopia2['x'], myopia2['y'])
plt.show()

In [None]:
# Class balance of the target labels used to color the next plot
labels.value_counts()

In [None]:
# Visualize the clusters with color

In [None]:
# Same t-SNE scatter, with each point colored by its "MYOPIC" label.
ax = plt.gca()
ax.scatter(myopia2['x'], myopia2['y'], c=labels)
plt.show()

In [None]:
# Standardize the data with StandardScaler.
# The "MYOPIC" target is dropped first: PCA and K-means below are meant to be
# unsupervised, and scaling the full frame would leak the label into them.
myopia_scaled = StandardScaler().fit_transform(myopia.drop(columns=['MYOPIC']))
print(myopia_scaled[0:15])

In [None]:
# Apply PCA to reduce the scaled data to 2 dimensions
# Initialize the PCA model
pca = PCA(n_components=2)

In [None]:
# Get two principal components for the myopia data
myopia_pca = pca.fit_transform(myopia_scaled)

In [None]:
# Wrap the two principal components in a labeled DataFrame and preview it.
pc_names = ["principal component 1", "principal component 2"]
df_myopia_pca = pd.DataFrame(myopia_pca, columns=pc_names)
df_myopia_pca.head()

In [None]:
# Fetch the explained variance ratio of each principal component
# (fraction of total variance captured per component)
pca.explained_variance_ratio_

## Analysis 
According to the explained variance, the first principal component contains approximately 20% of the variance and the second principal component contains about 15% of the variance. Together, the two components retain only about 35% of the information in the original dataset, so I will see whether increasing the number of principal components to 3 will increase the explained variance.

In [None]:
# Initialize the PCA model for 3 principal components
# (this rebinds `pca`, replacing the 2-component model)
pca = PCA(n_components= 3)

In [None]:
# Get three principal components for the scaled myopia data
myopia_pca = pca.fit_transform(myopia_scaled)

In [None]:
# Wrap the three principal components in a labeled DataFrame and preview it.
pc_names = [
    "principal component 1",
    "principal component 2",
    "principal component 3",
]
df_myopia_pca = pd.DataFrame(myopia_pca, columns=pc_names)
df_myopia_pca.head()

In [None]:
# Fetch the explained variance ratio of each of the three components
pca.explained_variance_ratio_

## Analysis
With 3 principal components, I have 44% of the information in the original dataset.

In [None]:
# Initialize the K-Means model with 3 clusters; random_state fixes the
# centroid initialization so results are repeatable
model = KMeans(n_clusters=3, random_state=5)

In [None]:
# Fit the model on the principal-component columns only.
# A later cell adds a "class" column to this frame; dropping it here (ignored
# if absent) keeps the cell re-runnable without the cluster labels themselves
# becoming an input feature.
model.fit(df_myopia_pca.drop(columns=["class"], errors="ignore"))

In [None]:
# Predict the cluster of every sample.
# Only the principal-component columns are passed: once the "class" column has
# been added to the frame, predicting on the whole frame would no longer match
# the features the model was fitted on.
predictions = model.predict(
    df_myopia_pca.drop(columns=["class"], errors="ignore")
)
print(predictions)

In [None]:
# Add the predicted class column (the K-means label of each row)
# NOTE: this mutates df_myopia_pca in place, so the frame now holds a
# non-feature column alongside the principal components
df_myopia_pca["class"] = model.labels_
df_myopia_pca.head()

In [None]:
import plotly.express as px

# Plot the 3 principal components in 3-D, colored and marked by cluster.
axis_columns = {
    "x": "principal component 3",
    "y": "principal component 2",
    "z": "principal component 1",
}
fig = px.scatter_3d(
    df_myopia_pca,
    color="class",
    symbol="class",
    width=800,
    **axis_columns,
)
# Anchor the legend in the top-left corner of the figure.
fig.update_layout(legend={"x": 0, "y": 1})
fig.show()

In [None]:
# 2-D view of the first two components, colored by the third component.
pc1 = df_myopia_pca["principal component 1"]
pc2 = df_myopia_pca["principal component 2"]
pc3 = df_myopia_pca["principal component 3"]

ax = plt.gca()
ax.scatter(x=pc1, y=pc2, c=pc3)
ax.set_xlabel("principal component 1")
ax.set_ylabel("principal component 2")
plt.show()

In [None]:
# # Initialize PCA model keeping enough components to explain 99% of the variance
# pca = PCA(n_components= 0.99)

In [None]:
# myopia_pca = pca.fit_transform(myopia_scaled)

In [None]:
# # Transform PCA data to a DataFrme
# df_myopia_pca = pd.DataFrame(
#     data=myopia_pca )
# df_myopia_pca.head()

In [None]:
# # Fetch the explained variance
# pca.explained_variance_ratio_

## Part 3: Perform a Cluster Analysis with K-means
### Finding the best value for k using the Elbow Curve

In [None]:
# Candidate cluster counts (1 through 10) and an empty list that will
# collect the inertia measured for each of them.
k = list(range(1, 11))
inertia = []

In [None]:
# Looking for the best k.
# Cluster on the principal-component columns only: the "class" column added
# earlier holds K-means labels and must not be fed back in as a feature.
pca_features = df_myopia_pca.drop(columns=["class"], errors="ignore")

# Start from an empty list so re-running this cell cannot append a second set
# of values (which would no longer line up with `k`).
inertia = []
for i in k:
    km = KMeans(n_clusters=i, random_state=0)
    km.fit(pca_features)
    inertia.append(km.inertia_)

In [None]:
# Define a DataFrame pairing each k with its inertia, used to plot the elbow curve.
elbow_data = dict(k=k, inertia=inertia)
df_elbow = pd.DataFrame(elbow_data)

In [None]:
# Elbow curve: inertia as a function of the number of clusters.
ax = plt.gca()
ax.plot(df_elbow['k'], df_elbow['inertia'])
ax.set_xticks(range(1, 11))
ax.set_xlabel('Number of clusters')
ax.set_ylabel('Inertia')
plt.show()

## Part 4: Make a Recommendation

Based on the analysis comparing 2 PCA components with 3 PCA components, the patients should not be grouped into fewer than 3 clusters if better predictions are desired.