In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the raw table, key it by its 'id' column and keep only the first 30 rows.
sample_df = (
    pd.read_csv('data.csv')
    .set_index('id')
    .head(30)
)

In [47]:
class data_cleaner:
    """Encode a mixed-type DataFrame into an all-numeric one.

    ``fit`` labels every column as ``'numerical'`` or ``'categorical'`` and
    learns an integer code for each distinct value of a categorical column.
    ``transform`` then either expands each categorical column into 0/1
    indicator columns or replaces its values with the learned codes.

    Parameters
    ----------
    force_fummies : bool, default True
        (sic -- the misspelling is kept so existing keyword callers still
        work.) When true, ``transform`` emits one ``<column>_<code>`` indicator
        column per learned category; otherwise each categorical column is
        replaced by its integer codes.
    n_unique_thres : int, default 4
        A numeric column with at most this many distinct values (a missing
        value counts as one) is treated as categorical.

    Attributes
    ----------
    dtype_dict : dict or None
        ``{column: 'numerical' | 'categorical'}``, set by ``check_data_type``.
    data_transform_dict : dict or None
        ``{column: ('categorical', {value: code})}`` or
        ``{column: ('numerical', None)}``, set by ``fit``.
    force_dummies : bool
        Stored value of ``force_fummies``.
    """

    def __init__(self, force_fummies = True, n_unique_thres = 4):
        self.dtype_dict = None
        self.data_transform_dict = None
        self.force_dummies = force_fummies
        self.n_unique_thres = n_unique_thres

    @staticmethod
    def _is_missing(value):
        """Return True when ``value`` is a scalar missing marker (NaN, None, NaT)."""
        flag = pd.isna(value)
        # pd.isna returns an array for list-like cell values; such a value is
        # never treated as "missing" here.
        return isinstance(flag, (bool, np.bool_)) and bool(flag)

    def check_data_type(self, X):
        """Classify every column of ``X`` and store the result in ``self.dtype_dict``.

        A column is ``'numerical'`` when it has a numeric (non-boolean) dtype
        and more than ``n_unique_thres`` distinct values; every other column is
        ``'categorical'``. Returns None.
        """
        dtype_dict = {}
        for col in X.columns:
            column = X[col]
            # Ask pandas whether the dtype is numeric instead of comparing to a
            # fixed list of names, so int32/float32/nullable numerics qualify too.
            is_number = (pd.api.types.is_numeric_dtype(column.dtype)
                         and not pd.api.types.is_bool_dtype(column.dtype))
            if is_number and column.nunique(dropna = False) > self.n_unique_thres:
                dtype_dict[col] = 'numerical'
            else:
                dtype_dict[col] = 'categorical'
        self.dtype_dict = dtype_dict

    def fit(self, df):
        """Learn the column types and the value -> code mapping of ``df``.

        Codes are assigned in order of first appearance. The result is stored
        in ``self.data_transform_dict``. Returns None.
        """
        self.check_data_type(df)
        data_transform_dict = {}
        for col in df.columns:
            if self.dtype_dict[col] == 'categorical':
                categories = df[col].unique()
                mapping_dict = dict(zip(categories, range(len(categories))))
                data_transform_dict[col] = ('categorical', mapping_dict)
            else:
                data_transform_dict[col] = ('numerical', None)
        self.data_transform_dict = data_transform_dict

    def _encode_column(self, values, mapping_dict):
        """Replace every entry of the Series ``values`` with its learned code.

        Raises KeyError for a value that was not seen during ``fit``.
        """
        missing_code = next(
            (code for category, code in mapping_dict.items() if self._is_missing(category)),
            None,
        )

        def encode(value):
            try:
                return mapping_dict[value]
            except KeyError:
                # NaN never compares equal to itself, so a plain dict lookup
                # misses it; route missing values to the code learned for them.
                if missing_code is not None and self._is_missing(value):
                    return missing_code
                raise

        return values.apply(encode)

    def transform(self, df):
        """Return a new DataFrame with the fitted encoding applied to ``df``.

        Numerical columns are copied through unchanged. Categorical columns
        become ``<column>_<code>`` 0/1 indicator columns when
        ``self.force_dummies`` is true, and integer codes otherwise. In
        indicator mode a value unseen during ``fit`` yields all-zero
        indicators; in code mode it raises KeyError.
        """
        new_df = pd.DataFrame(index = df.index)
        for col, (col_dtype, mapping_dict) in self.data_transform_dict.items():
            if col_dtype != 'categorical':
                new_df[col] = df[col]
            elif self.force_dummies:
                for category, code in mapping_dict.items():
                    if self._is_missing(category):
                        # `== NaN` is False everywhere, so test missingness directly.
                        hits = df[col].isna()
                    else:
                        hits = df[col] == category
                    new_df[str(col) + '_' + str(code)] = hits.astype(int)
            else:
                new_df[col] = self._encode_column(df[col], mapping_dict)
        return new_df

    def fit_transform(self, df):
        """Fit on ``df`` and return its transformed copy."""
        self.fit(df)
        return self.transform(df)
# Fit the cleaner on the sample and encode it in one step. With the default
# constructor arguments every categorical column is expanded into 0/1
# indicator columns named `<column>_<code>`.
d = data_cleaner()
df = d.fit_transform(sample_df)

In [93]:
class kmeans_model:
    """K-means clustering of an all-numeric DataFrame under a Minkowski metric.

    Parameters
    ----------
    k : int, default 5
        Number of centers drawn by ``random_init_centers``.
    d_metric_p : float, default 2
        Order of the Minkowski distance: 1 is Manhattan, 2 is Euclidean and
        ``np.inf`` is Chebyshev. Any other positive order is accepted too.
    random_state : int or None, default None
        Seed for the random initialisation. ``None`` draws from NumPy's global
        generator, so ``np.random.seed`` keeps working as before.

    Attributes
    ----------
    centers : DataFrame or None
        One row per cluster; the row label is the cluster id.
    fitted : bool
        Set to True once ``fit`` has finished.
    """

    def __init__(self, k = 5, d_metric_p = 2, random_state = None):
        self.k = k
        self.fitted = False
        self.init = 1  # not read anywhere in this class; kept for compatibility
        self.d_metric_p = d_metric_p
        self.random_state = random_state
        self.centers = None

    def distance(self, arr1, arr2):
        """Return the Minkowski distance of order ``self.d_metric_p`` between two vectors.

        Raises ValueError when the order is not positive.
        """
        gap = np.abs(arr1 - arr2)
        p = self.d_metric_p
        if p == np.inf:
            return np.max(gap)
        if p == 1:
            return np.sum(gap)
        if p == 2:
            return np.sqrt(np.sum(np.square(gap)))
        if p <= 0:
            raise ValueError('d_metric_p must be positive, got %r' % (p,))
        # General case, so an unusual order no longer silently returns None.
        return np.sum(gap ** p) ** (1.0 / p)

    def find_cluster(self, data_row):
        """Return the position of the center closest to ``data_row``."""
        distances = self.centers.apply(
            lambda center_row: self.distance(center_row, data_row), axis = 1)
        return distances.argmin()

    def random_init_centers(self, df):
        """Pick ``self.k`` rows of ``df`` at random as the initial centers.

        Rows are drawn without replacement so the centers are distinct rows;
        only when ``df`` has fewer than ``k`` rows does it fall back to drawing
        with replacement. The result is stored in ``self.centers`` as a
        DataFrame labelled ``0 .. k-1``.
        """
        n_rows = df.shape[0]
        if self.random_state is None:
            rng = np.random
        else:
            rng = np.random.RandomState(self.random_state)
        picks = rng.choice(n_rows, self.k, replace = self.k > n_rows)
        # reset_index makes the row labels equal to the cluster ids used by predict.
        self.centers = df.iloc[picks, :].reset_index(drop = True)

    def _update_centers(self, df, clusters, old_centers):
        """Return the per-cluster means; a cluster with no members keeps its old center."""
        means = df.groupby(clusters).agg('mean')
        empty = old_centers.index.difference(means.index)
        if len(empty) == 0:
            return means
        return pd.concat([means, old_centers.loc[empty]]).sort_index()

    def fit(self, df, max_iter = 10, init_centers = None):
        """Run Lloyd iterations on ``df`` until the centers stop moving.

        Parameters
        ----------
        df : DataFrame
            Numeric data, one observation per row.
        max_iter : int, default 10
            Upper bound on the number of assign/update iterations.
        init_centers : DataFrame or array-like, optional
            Starting centers, one per row, with the same columns as ``df``.
            When omitted the centers are drawn by ``random_init_centers``.

        Prints progress messages and the final centers, sets ``self.centers``
        and ``self.fitted``, and returns None.
        """
        if init_centers is None:
            self.random_init_centers(df)
        else:
            self.centers = pd.DataFrame(
                init_centers, columns = df.columns).reset_index(drop = True)
        t = 0
        # df is expected to be numeric already (e.g. dummy-encoded beforehand).
        while t < max_iter:
            old_centers = self.centers
            clusters = self.predict(df)
            self.centers = self._update_centers(df, clusters, old_centers)
            t += 1

            # Both frames always have one row per center, so a direct
            # comparison is safe and needs no exception handling.
            if np.array_equal(old_centers.to_numpy(), self.centers.to_numpy()):
                print('fitting converged at iteration', t)
                break
        print('fitting stopped')
        self.fitted = True
        print(self.centers)

    def predict(self, df):
        """Return a Series, indexed like ``df``, holding each row's cluster id."""
        return df.apply(lambda row: self.find_cluster(row), axis = 1)

In [94]:
# Cluster the encoded sample with the default settings (k = 5, d_metric_p = 2).
# NOTE(review): the initial centers are drawn at random and no seed is set in
# the visible cells, so the centers printed below can differ between runs.
km = kmeans_model()
km.fit(df)

fitting converged at iteration 6
fitting stopped
   gender_0  gender_1        age  hypertension_0  hypertension_1  \
0  0.500000  0.500000  65.125000        0.750000        0.250000   
1  0.000000  1.000000  66.666667        0.666667        0.333333   
2  0.428571  0.571429  68.571429        1.000000        0.000000   
3  0.500000  0.500000  76.000000        0.500000        0.500000   
4  0.600000  0.400000  68.900000        0.800000        0.200000   

   heart_disease_0  heart_disease_1  ever_married_0  ever_married_1  \
0         0.250000         0.750000        1.000000        0.000000   
1         0.000000         1.000000        0.666667        0.333333   
2         0.285714         0.714286        0.857143        0.142857   
3         0.500000         0.500000        1.000000        0.000000   
4         0.500000         0.500000        0.900000        0.100000   

   work_type_0  ...  work_type_2  Residence_type_0  Residence_type_1  \
0     0.500000  ...     0.125000          0