In [1]:
import pandas as pd
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans


In [2]:
# Make simulated feature matrix
features, _ = make_blobs(n_samples = 50,
                        n_features = 2,
                        centers = 3,
                        random_state = 1)


In [3]:
# Create DataFrame
dataframe = pd.DataFrame(features, columns=["feature_1", "feature_2"])

In [4]:
# Making the k-means clusterer.
clusterer = KMeans(3, random_state=0)

# Fit clusterer.
clusterer.fit(features)



In [5]:
# predict the values.
dataframe["group"] = clusterer.predict(features)

In [6]:
# viewing the first few observations.
dataframe.head(5)

Unnamed: 0,feature_1,feature_2,group
0,-9.877554,-3.336145,0
1,-7.28721,-8.353986,2
2,-6.943061,-7.023744,2
3,-7.440167,-8.791959,2
4,-6.641388,-8.075888,2


## Imputing missing values

In [7]:
# Using the KNN.

# Load libraries
import numpy as np
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_blobs
# Make a simulated feature matrix
features, _ = make_blobs(n_samples = 1000,
                        n_features = 2,
                        random_state = 1)
# Standardize the features
scaler = StandardScaler()
standardized_features = scaler.fit_transform(features)

# Replace the first feature's first value with a missing value
true_value = standardized_features[0,0]
standardized_features[0,0] = np.nan

# Predict the missing values in the feature matrix
knn_imputer = KNNImputer(n_neighbors=5)
features_knn_imputed = knn_imputer.fit_transform(standardized_features)

# Compare true and imputed values
print("True Value:", true_value)
print("Imputed Value:", features_knn_imputed[0,0])


True Value: 0.8730186113995938
Imputed Value: 1.0959262913919632


In [8]:
# Using the simple imputer

# Load libraries
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_blobs
# Make a simulated feature matrix
features, _ = make_blobs(n_samples = 1000,
                            n_features = 2,
                            random_state = 1)
# Standardize the features
scaler = StandardScaler()
standardized_features = scaler.fit_transform(features)

# Replace the first feature's first value with a missing value
true_value = standardized_features[0,0]
standardized_features[0,0] = np.nan

# Create imputer using the "mean" strategy
mean_imputer = SimpleImputer(strategy="mean")

# Impute values
features_mean_imputed = mean_imputer.fit_transform(features)

# Compare true and imputed values
print("True Value:", true_value)
print("Imputed Value:", features_mean_imputed[0,0])


True Value: 0.8730186113995938
Imputed Value: -3.058372724614996
