In [1]:
#        DATA
# ==================== #
import pandas as pd
import numpy as np
from scipy import stats
import time
import random
import math

#      PLOTTING
# ============================== #

from PIL import Image
import matplotlib.pyplot as plt 
import seaborn as sns
sns.set_theme(style="white")

#      CLUSTERING
# ============================== #
from sklearn.cluster import KMeans
from scipy.sparse import hstack

#      PRE PROCESSING
# ============================== #
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import RobustScaler
from sklearn.impute import SimpleImputer

#      METRICS
# ============================== #
from sklearn.metrics import f1_score   # average='micro'
from sklearn.model_selection import cross_val_score , StratifiedShuffleSplit , RepeatedStratifiedKFold

#      MODEL SELECTION
# ============================== #
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import StratifiedGroupKFold, StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import ParameterGrid

#      CLASSIFIERS
# ============================== #
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB

from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from lightgbm import LGBMClassifier

from sklearn.svm import SVC


#      OTHERS
# ============================== #
from sklearn.inspection import permutation_importance
import multiprocessing

import python.functions
from python.KMeansFeaturer import KMeansFeaturizer



#      WARNINGS
# ============================== #
import warnings
warnings.filterwarnings("ignore")
import dill

In [2]:
# Load the training and test tables (in that order) from the data directory.
df, test = (
    pd.read_csv(csv_path)
    for csv_path in ("../data/trainProb.csv", "../data/testProb.csv")
)

In [3]:
# Label column.
target = 'damage_grade'

# Columns treated as numeric features. The nine prob<p>_geo<g> names are
# generated in the order prob1_geo1, prob1_geo2, ..., prob3_geo3.
numeric = (
    ['age_pt', 'area_percentage_pt', 'height_percentage_pt']
    + [f'prob{p}_geo{g}' for p in (1, 2, 3) for g in (1, 2, 3)]
    + [
        'CntFloorAge',
        'CntFloorsArea',
        'CntFloorsHeight',
        'AreaPerAge',
        'HeightPerAge',
        'AreaPerHeight',
        'CntFamFloors',
        'CntFamArea',
        'CntFamHeight',
    ]
)

# Columns grouped under `dummies` (presumably one-hot indicator columns --
# verify against the preprocessing that produced the CSV files).
dummies = [
    'count_families_0',
    'count_families_1',
    'count_families_2',
    'count_floors_pre_eq_1',
    'count_floors_pre_eq_2',
    'foundation_type_1',
    'ground_floor_type_1',
    'land_surface_condition_t',
    'land_surface_condition_n',
    'other_floor_type_j',
    'other_floor_type_q',
    'other_floor_type_x',
    'position_s',
    'position_t',
    'roof_type_n',
    'roof_type_q',
]

# Columns grouped under `binary` (all named has_*).
binary = [
    'has_secondary_use',
    'has_secondary_use_agriculture',
    'has_superstructure_adobe_mud',
    'has_superstructure_cement_mortar_brick',
    'has_superstructure_mud_mortar_brick',
    'has_superstructure_mud_mortar_stone',
    'has_superstructure_timber',
]

In [41]:
# Clustering settings: number of clusters, weight applied to the target
# column, and the seed handed to the featurizer.
k, target_scale, seed = 7, 0.3, 1995

# Feature matrix (the `numeric` columns) and label column of the training set.
X, y = df[numeric], df[target]

In [42]:
# Build the project's k-means featurizer, then fit it on the training
# features and labels; `km` ends up bound to whatever `fit` returns.
km = KMeansFeaturizer(k, target_scale, seed)
km = km.fit(X, y)

In [53]:
# Map every test row (numeric columns only) through the fitted featurizer.
cs = km.transform(test.loc[:, numeric])

# Show the result as the cell output.
cs

array([[1],
       [1],
       [1],
       ...,
       [1],
       [1],
       [1]])

In [56]:
# Distinct values found in the first column of `cs`.
{row[0] for row in cs}

{0, 1, 2, 3, 4, 5, 6}

In [39]:
# Append the scaled target as one extra column next to the features.
# `y` is converted to a NumPy array first: indexing a pandas Series with
# `[:, np.newaxis]` is multi-dimensional indexing, which pandas 2.x rejects.
data_with_target = np.hstack((X, np.asarray(y)[:, np.newaxis] * target_scale))

# Build a pre-training k-means model on data and target.
# Seeded with the notebook-level `seed` so the clustering is reproducible.
km_model_pretrain = KMeans(n_clusters=k,
                           n_init=20,
                           random_state=seed)

km_model_pretrain.fit(data_with_target)

# Run k-means a second time to get the clusters in the original space
# without target info. Initialize using the centroids found in pre-training
# (target coordinate dropped) and refine them for at most `max_iter`
# iterations.

n = X.shape[1]

# With an explicit array of initial centers only one initialization is
# meaningful, so n_init is 1 (scikit-learn would ignore a larger value and
# emit a warning).
km_model = KMeans(n_clusters=k,
                  init=km_model_pretrain.cluster_centers_[:, :n],
                  n_init=1,
                  max_iter=100)
km_model.fit(X)

# Centroids of the final model, in the original feature space.
cluster_centers_ = km_model.cluster_centers_

In [32]:
# Fill missing values in the test features using statistics learned from the
# TRAINING features, so train and test go through the same transformation and
# nothing is estimated from the test set itself.
imp = SimpleImputer()
imp.fit(df[numeric])

# Overwrites the numeric columns of `test` in place; cells that use
# `test[numeric]` afterwards see the imputed values.
test[numeric] = imp.transform(test[numeric])

In [40]:
# Cluster index assigned to each test row by the manually built k-means model.
km_model.predict(test.loc[:, numeric])

array([0, 0, 0, ..., 0, 0, 0])