In [20]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.cluster import KMeans

In [21]:
# Read in the data
train_data = pd.read_excel("./Task1and2/train.xlsx")
test_data = pd.read_excel("./Task1and2/test.xlsx")

In [4]:
train_data.isnull().sum().sum()

0

In [25]:
X_train = train_data.drop(columns=['target'])
y_train = train_data['target']
X_test = test_data

In [26]:
scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [27]:
kmeans = KMeans(n_clusters=160, random_state=10, n_init='auto')
kmeans.fit(X_train_scaled)

In [31]:
train_clusters = kmeans.predict(X_train_scaled)

In [32]:
import numpy as np
from scipy.stats import mode

cluster_to_target = {}

for cluster in np.unique(train_clusters):
    # Get indices of data points in this cluster
    indices = np.where(train_clusters == cluster)
    # Get the most common target value for this cluster
    most_common_target = mode(y_train.iloc[indices]).mode[0]
    cluster_to_target[cluster] = most_common_target

print(cluster_to_target)


{0: 'A9', 1: 'B49', 2: 'A30', 3: 'A59', 4: 'A61', 5: 'B70', 6: 'B23', 7: 'B15', 8: 'A21', 9: 'A43', 10: 'B61', 11: 'A13', 12: 'A24', 13: 'A52', 14: 'B35', 15: 'A64', 16: 'B58', 17: 'A33', 18: 'B40', 19: 'B10', 20: 'A25', 21: 'B76', 22: 'B46', 23: 'B28', 24: 'A3', 25: 'A33', 26: 'B42', 27: 'B75', 28: 'B24', 29: 'A31', 30: 'B14', 31: 'B1', 32: 'A66', 33: 'B60', 34: 'A70', 35: 'B44', 36: 'B11', 37: 'B25', 38: 'B56', 39: 'A22', 40: 'A76', 41: 'A15', 42: 'B33', 43: 'B4', 44: 'A7', 45: 'B9', 46: 'A75', 47: 'B57', 48: 'B79', 49: 'B50', 50: 'A39', 51: 'A77', 52: 'B41', 53: 'B34', 54: 'B51', 55: 'A62', 56: 'B52', 57: 'A47', 58: 'B62', 59: 'A51', 60: 'B5', 61: 'B43', 62: 'A37', 63: 'B77', 64: 'A40', 65: 'A35', 66: 'B23', 67: 'A78', 68: 'A23', 69: 'B18', 70: 'B64', 71: 'B66', 72: 'A28', 73: 'A54', 74: 'A36', 75: 'B37', 76: 'A46', 77: 'A37', 78: 'A38', 79: 'B47', 80: 'A57', 81: 'A8', 82: 'B12', 83: 'B19', 84: 'A50', 85: 'A48', 86: 'B22', 87: 'A2', 88: 'A27', 89: 'A12', 90: 'B16', 91: 'A53', 92: 'B

In [33]:
test_clusters = kmeans.predict(X_test_scaled)


In [36]:
train_clusters = [cluster_to_target[i] for i in train_clusters]


In [37]:
train_clusters

['B37',
 'B61',
 'A19',
 'A22',
 'A33',
 'A75',
 'A41',
 'B14',
 'B80',
 'A33',
 'B64',
 'A14',
 'B21',
 'A70',
 'B57',
 'B9',
 'B50',
 'B31',
 'A21',
 'B30',
 'B37',
 'B37',
 'A14',
 'B50',
 'B40',
 'B6',
 'A64',
 'A33',
 'B40',
 'A25',
 'B18',
 'A66',
 'A33',
 'A53',
 'B73',
 'A75',
 'A50',
 'B29',
 'B57',
 'A21',
 'A14',
 'B61',
 'A35',
 'A3',
 'A33',
 'A46',
 'B45',
 'A36',
 'B5',
 'A79',
 'A32',
 'B5',
 'B23',
 'A7',
 'A33',
 'A62',
 'A53',
 'B3',
 'A71',
 'A57',
 'A9',
 'A50',
 'B63',
 'B51',
 'B8',
 'B61',
 'B36',
 'A14',
 'A15',
 'A36',
 'A21',
 'A24',
 'B38',
 'B21',
 'A3',
 'B4',
 'B4',
 'A33',
 'A12',
 'B41',
 'B63',
 'A8',
 'B14',
 'B49',
 'A14',
 'A3',
 'A15',
 'A22',
 'A3',
 'B8',
 'B71',
 'A14',
 'B59',
 'B39',
 'A19',
 'B80',
 'B29',
 'A76',
 'A29',
 'B58',
 'B21',
 'B28',
 'A77',
 'A67',
 'B24',
 'A31',
 'A34',
 'B63',
 'A57',
 'A9',
 'A32',
 'B71',
 'B51',
 'A7',
 'A34',
 'A24',
 'B34',
 'A11',
 'B16',
 'B57',
 'B20',
 'A59',
 'B25',
 'A14',
 'A7',
 'B51',
 'B61',
 'B

In [39]:
results_df = pd.DataFrame(columns=['train_cluster_predictions', 'actual_given_cluster'])

results_df['train_cluster_predictions'] = train_clusters
results_df['actual_given_cluster'] = y_train

results_df

Unnamed: 0,train_cluster_predictions,actual_given_cluster
0,B37,B37
1,B61,B61
2,A19,A19
3,A22,A22
4,A33,A33
...,...,...
36747,B34,B34
36748,B15,B15
36749,A6,A6
36750,B2,B2


In [44]:
len(test_clusters)

15752

In [45]:
test_predictions = pd.DataFrame(columns=["test_predictions"])
test_predictions["test_predictions"] = test_clusters

test_predictions

Unnamed: 0,test_predictions
0,B74
1,A3
2,B69
3,B20
4,A67
...,...
15747,A71
15748,B11
15749,B69
15750,A38


In [50]:
test_predictions.to_csv("test_predictions.csv", index=False)
results_df.to_csv("train_predictions.csv", index=False)

In [47]:
import pickle as pkl

with open("model.pkl", "wb") as f:
    pkl.dump(kmeans, f)

In [48]:
with open("cluster_to_target.pkl", "wb") as f:
    pkl.dump(cluster_to_target, f)

In [49]:
kmeans.cluster_centers_

array([[ 1.70878786,  0.44664717, -0.75633504, ..., -1.72304729,
        -0.68385357, -1.36917553],
       [-1.29284931, -0.8978037 ,  0.33097395, ...,  0.85721607,
         0.43861301,  0.53300685],
       [ 0.62901178,  0.38976519,  1.4118576 , ..., -0.73386752,
        -1.28547504, -0.08513226],
       ...,
       [-1.0397796 , -0.3486437 , -1.1942862 , ...,  0.68586669,
        -0.02193708,  1.3273319 ],
       [-0.03732781, -0.98145254,  1.07328991, ..., -1.23899298,
        -1.3107266 , -0.78535896],
       [ 0.45640326,  0.65184368,  1.18232257, ..., -0.01337673,
        -0.59024288, -0.49457519]])

In [54]:
with open("scaler.pkl", "wb") as f:
    pkl.dump(scaler, f)