K-means with k-means++ initialization, without PCA

In [None]:
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# Cluster the standardized feature matrix with K-means and turn each cluster
# into a score prediction on a fixed grid (0.5, 1.0, ..., one level per
# cluster), then report the in-sample RMSE against the observed `score` column.
data = pd.read_csv('../Dataset/final_df.csv')

# Everything except the identifier and the target is used as a feature.
features = data.drop(columns=["id", "score"])

# Zero-mean / unit-variance scaling so no single feature dominates the
# Euclidean distances K-means relies on.
scaler = StandardScaler()
features_standardized = scaler.fit_transform(features)

k = 12

# n_init is pinned explicitly: its default differs between scikit-learn
# releases, which would otherwise make the clustering version-dependent.
kmeans = KMeans(n_clusters=k, init='k-means++', n_init=10, random_state=42)
data['cluster'] = kmeans.fit_predict(features_standardized)

min_score = 0.5
step_size = 0.5

# K-means labels are arbitrary identifiers, so label `i` carries no ordinal
# meaning. Rank the clusters by their mean observed score and hand out the
# score grid in that order, so the lowest-scoring cluster gets `min_score`.
# NOTE: the ranking uses the observed scores of the same rows that are then
# evaluated, so the RMSE below is an in-sample figure.
cluster_order = data.groupby('cluster')['score'].mean().sort_values().index
score_mapping = {
    cluster_label: min_score + rank * step_size
    for rank, cluster_label in enumerate(cluster_order)
}
data['predicted_score'] = data['cluster'].map(score_mapping)

actual_scores = data['score']

rmse = np.sqrt(mean_squared_error(actual_scores, data['predicted_score']))

print(f"RMSE: {rmse}")

K-means with k-means++ initialization, with PCA

In [None]:
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# Min-max scale the features, project them onto two principal components,
# cluster the projection with K-means and turn each cluster into a score
# prediction on a fixed grid (0.5, 1.0, ..., one level per cluster), then
# report the in-sample RMSE against the observed `score` column.
data = pd.read_csv('../Dataset/final_df.csv')

# Everything except the identifier and the target is used as a feature.
features = data.drop(columns=["id", "score"])

# NOTE: despite the variable name, this cell rescales with MinMaxScaler
# (not StandardScaler); the name is kept so other cells that may reference
# it keep working.
scaler = MinMaxScaler()
features_standardized = scaler.fit_transform(features)

pca = PCA(n_components=2)
features_pca = pca.fit_transform(features_standardized)
# Fraction of total variance captured by each retained component; not used
# further in this cell, kept for inspection.
explained_variance = pca.explained_variance_ratio_

k = 12

# n_init is pinned explicitly: its default differs between scikit-learn
# releases, which would otherwise make the clustering version-dependent.
kmeans = KMeans(n_clusters=k, init='k-means++', n_init=10, random_state=42)
data['cluster'] = kmeans.fit_predict(features_pca)

min_score = 0.5
step_size = 0.5

# K-means labels are arbitrary identifiers, so label `i` carries no ordinal
# meaning. Rank the clusters by their mean observed score and hand out the
# score grid in that order, so the lowest-scoring cluster gets `min_score`.
# NOTE: the ranking uses the observed scores of the same rows that are then
# evaluated, so the RMSE below is an in-sample figure.
cluster_order = data.groupby('cluster')['score'].mean().sort_values().index
score_mapping = {
    cluster_label: min_score + rank * step_size
    for rank, cluster_label in enumerate(cluster_order)
}
data['predicted_score'] = data['cluster'].map(score_mapping)

actual_scores = data['score']

rmse = np.sqrt(mean_squared_error(actual_scores, data['predicted_score']))
print(f"RMSE: {rmse}")