# Libraries
- numpy==1.23.5
- pandas==2.0.0
- lightgbm==3.3.5
- scikit-learn==1.2.2 (imported as `sklearn`)
- keras==2.12.0 (no GPU support)

In [1]:
import os 
import numpy as np
import pandas as pd

import lightgbm as lgb

from sklearn.metrics import f1_score 
from sklearn.model_selection import KFold
from sklearn import preprocessing

import keras 
from keras.layers import *
from keras.optimizers import *
from keras.models import Model
from keras import backend as K

import warnings
warnings.filterwarnings("ignore")

In [20]:
# Directory that holds the competition CSV files (trailing slash required,
# because paths are built by plain string concatenation).
DIR  = "data/"

# Random seed shared by the K-fold split and the LightGBM parameters.
# An earlier `SEED = 1881` assignment was immediately overwritten by this
# value, so only the effective one is kept.
SEED = 1109

# Trained boosters are written to models/; create the directory if needed.
# exist_ok=True makes the cell safe to re-run.
os.makedirs("models", exist_ok=True)

In [3]:
# Read the four competition CSV files from DIR: training features, training
# labels, test features and the submission template (in that order).
csv_paths = [
    DIR+"train_values.csv",
    DIR+"train_labels.csv",
    DIR+"test_values.csv",
    DIR+"submission_format.csv",
]
train_x, train_y, test_x, sub_csv = [pd.read_csv(csv_path) for csv_path in csv_paths]

# 1 - Data processing

## 1.1 AutoEncoder for Geo location features
Ref: https://github.com/Goodsea/Richter-s-Eye/blob/master/main.ipynb

### 1.1.1 One-hot encode all geo level features

In [4]:
# One-hot encode each geo level over train and test together, so both splits
# share the same columns. Train rows come first in every array, followed by
# the test rows (the pd.concat order).
geo1, geo2, geo3 = (
    np.array(pd.get_dummies(pd.concat([train_x[geo_col], test_x[geo_col]])))
    for geo_col in ("geo_level_1_id", "geo_level_2_id", "geo_level_3_id")
)

### 1.1.2 Construct Autoencoder network

In [5]:
def NET():
    """Build and compile the geo-embedding network.

    The network takes a one-hot geo level 3 vector (width ``geo3.shape[1]``),
    projects it through a 16-unit dense layer named ``"intermediate"`` (no
    activation), and from that layer predicts the one-hot geo level 2 and
    geo level 1 vectors through two sigmoid heads.

    Returns
    -------
    keras.Model
        Model with one input and two outputs ``[geo level 2, geo level 1]``,
        compiled with binary cross-entropy loss and the Adam optimizer.
    """
    geo3_input = Input((geo3.shape[1],))

    # 16-dimensional bottleneck; its output is later used as the embedding.
    bottleneck = Dense(16, name="intermediate")(geo3_input)

    # Both heads read from the same bottleneck.
    geo2_head = Dense(geo2.shape[1], activation='sigmoid')(bottleneck)
    geo1_head = Dense(geo1.shape[1], activation='sigmoid')(bottleneck)

    network = Model(geo3_input, [geo2_head, geo1_head])
    network.compile(loss="binary_crossentropy", optimizer="adam")
    return network

Fit the model on the encoded geo features, or load the weights of a pretrained model.

In [None]:
# File holding the trained GEO-Embed model (HDF5).
GEO_EMBED_PATH = "geo_embed.h5"

# Seed the Python / NumPy / TensorFlow RNGs before the network is built, so a
# from-scratch training run starts from a fixed state.
keras.utils.set_random_seed(SEED)
model = NET()

if os.path.isfile(GEO_EMBED_PATH):
    # Load GEO-Embed pretrained Model
    model.load_weights(GEO_EMBED_PATH)
else:
    # No pretrained file available: train the GEO-Embed model (geo level 3 ->
    # geo level 2 and geo level 1) and save it so later runs take the fast
    # path above. This replaces the former "uncomment to train" workflow.
    # NOTE(review): training on the full one-hot matrix is slow and
    # memory-hungry - confirm the machine can hold it before relying on this.
    print(f"{GEO_EMBED_PATH} not found - training GEO-Embed model from scratch")
    model.fit(geo3, [geo2, geo1], batch_size=128, epochs=10, verbose=2)
    model.save(GEO_EMBED_PATH)

Define a function that returns the output of the intermediate layer, which serves as the new geo embedding.

In [6]:
# Backend function mapping a batch of one-hot geo level 3 rows to the output
# of the 16-unit "intermediate" layer. It takes a one-element list of inputs
# and returns a one-element list of outputs.
encoder_input = model.input
bottleneck_output = model.get_layer("intermediate").output
get_int_layer_output = K.function([encoder_input], [bottleneck_output])

## 1.2 Extract new embedding for geo levels and add to dataframe

In [7]:
# Train data
# geo3 was built from pd.concat([train, test]), so its first len(train_x)
# rows are the training rows. Derive the boundary from the data instead of
# hard-coding the row count.
n_train = len(train_x)
EMBED_BATCH = 1024  # rows sent through the encoder per call

# Run the encoder on batches rather than one row at a time; every call
# returns a (batch, 16) array, and the batches are stacked in row order.
out = np.concatenate(
    [
        get_int_layer_output([geo3[start:min(start + EMBED_BATCH, n_train)]])[0]
        for start in range(0, n_train, EMBED_BATCH)
    ],
    axis=0,
)
assert out.shape[0] == n_train, "one embedding row is expected per training row"

# One geo_feat<k> column per embedding dimension (geo_feat1 .. geo_feat16).
geo_columns = {f"geo_feat{k + 1}": out[:, k] for k in range(out.shape[1])}

# One-hot encode the categorical columns, drop the raw geo ids (replaced by
# the embedding) and append the embedding columns.
train_data = (
    pd.get_dummies(train_x)
    .drop(['geo_level_1_id', 'geo_level_2_id', 'geo_level_3_id'], axis=1)
    .assign(**geo_columns)
)
train_data

Unnamed: 0,building_id,count_floors_pre_eq,age,area_percentage,height_percentage,has_superstructure_adobe_mud,has_superstructure_mud_mortar_stone,has_superstructure_stone_flag,has_superstructure_cement_mortar_stone,has_superstructure_mud_mortar_brick,...,geo_feat7,geo_feat8,geo_feat9,geo_feat10,geo_feat11,geo_feat12,geo_feat13,geo_feat14,geo_feat15,geo_feat16
0,802906,2,30,6,5,1,1,0,0,0,...,1.466794,-2.044550,-1.338502,0.522944,-0.658317,-1.996131,1.364559,1.150429,-1.650674,1.423452
1,28830,2,10,8,7,0,1,0,0,0,...,0.792328,-1.930641,0.244771,0.821282,-1.682540,-0.781844,1.162983,-0.517174,-1.422999,0.489339
2,94947,2,10,5,5,0,1,0,0,0,...,1.379252,-2.364805,-1.095964,-1.032024,-1.567636,-0.170536,2.197870,-1.740250,-1.110470,2.265696
3,590882,2,10,6,5,0,1,0,0,0,...,1.645584,-0.355447,-1.220192,1.386736,-1.921839,0.495985,-0.817264,-1.476800,-0.671453,1.848267
4,201944,3,30,8,9,1,0,0,0,0,...,0.998216,-2.229619,-2.360607,-1.019490,-1.564085,0.837669,1.749065,-1.862957,-0.837888,1.602420
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
260596,688636,1,55,6,3,0,1,0,0,0,...,-0.143475,-1.791613,-1.062033,1.642672,-1.544195,-1.436718,0.894187,-1.064682,0.495309,1.850656
260597,669485,2,0,6,5,0,1,0,0,0,...,-0.676942,-2.418662,-1.692341,1.550785,-0.663093,-1.541144,1.177135,-0.965449,1.004659,0.579021
260598,602512,3,55,6,7,0,1,0,0,0,...,0.829690,-2.057462,-1.013165,2.013322,0.062592,-1.616684,1.965572,1.093879,0.540577,1.351077
260599,151409,2,10,14,6,0,0,0,0,0,...,1.648115,-1.069342,-1.993976,1.468483,0.345005,-1.379133,-0.399066,-2.365860,0.549455,1.766544


In [10]:
# Test data
# geo3 was built from pd.concat([train, test]), so every row after the first
# len(train_x) rows belongs to the test set. Derive the boundary from the data
# instead of hard-coding the row count.
n_train = len(train_x)
n_total = geo3.shape[0]
EMBED_BATCH = 1024  # rows sent through the encoder per call

# Run the encoder on batches rather than one row at a time; every call
# returns a (batch, 16) array, and the batches are stacked in row order.
out = np.concatenate(
    [
        get_int_layer_output([geo3[start:start + EMBED_BATCH]])[0]
        for start in range(n_train, n_total, EMBED_BATCH)
    ],
    axis=0,
)
assert out.shape[0] == len(test_x), "one embedding row is expected per test row"

# One geo_feat<k> column per embedding dimension (geo_feat1 .. geo_feat16).
geo_columns = {f"geo_feat{k + 1}": out[:, k] for k in range(out.shape[1])}

# One-hot encode the categorical columns, drop the raw geo ids (replaced by
# the embedding) and append the embedding columns.
test_data = (
    pd.get_dummies(test_x)
    .drop(['geo_level_1_id', 'geo_level_2_id', 'geo_level_3_id'], axis=1)
    .assign(**geo_columns)
)
test_data

Unnamed: 0,building_id,count_floors_pre_eq,age,area_percentage,height_percentage,has_superstructure_adobe_mud,has_superstructure_mud_mortar_stone,has_superstructure_stone_flag,has_superstructure_cement_mortar_stone,has_superstructure_mud_mortar_brick,...,geo_feat7,geo_feat8,geo_feat9,geo_feat10,geo_feat11,geo_feat12,geo_feat13,geo_feat14,geo_feat15,geo_feat16
0,300051,3,20,7,6,0,1,0,0,0,...,-0.430310,-1.036007,-0.628201,1.396918,-0.646301,-0.401721,1.775716,-0.525752,0.589180,1.922086
1,99355,2,25,13,5,0,1,0,0,0,...,1.198316,-1.658057,-0.896333,0.958980,-0.607773,-1.545761,0.012132,0.605375,-1.019763,0.474312
2,890251,2,5,4,5,0,1,0,0,0,...,0.878997,-0.098311,-0.367600,1.122562,-1.336323,-0.563150,0.142514,-1.042175,0.012609,1.281353
3,745817,1,0,19,3,0,0,0,0,0,...,1.829058,-1.253964,-2.191850,1.562519,0.324142,-1.490117,-0.430267,-2.678957,0.555670,1.977267
4,421793,3,15,8,7,0,1,0,0,0,...,-0.775551,-1.498556,0.408616,1.177538,-1.127435,-0.428089,1.805395,0.195895,-0.223472,1.146803
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
86863,310028,3,70,20,6,0,1,0,0,0,...,0.815776,-1.888190,-2.218592,2.016480,0.878734,-1.145824,1.932422,-1.522319,-1.862096,0.579397
86864,663567,3,25,6,7,1,1,1,0,0,...,1.539498,-1.648497,-1.291400,1.608987,-1.017836,-1.125162,1.162247,0.355431,0.586007,1.033257
86865,1049160,1,50,3,3,0,1,0,0,0,...,1.235654,-0.985354,0.304887,1.557580,-1.386231,-0.497524,0.277491,-0.896868,-0.636278,1.854645
86866,442785,2,5,9,5,1,1,0,0,0,...,1.941297,-2.894349,0.021195,1.277626,-2.116505,-1.672430,-0.155020,1.588272,-1.446333,2.817224


## 1.3 Prepare train and test data

In [None]:
# Train data
# building_id is an identifier, not a feature.
df = train_data.drop(["building_id"], axis=1)
# Request an explicit numeric dtype: a frame mixing bool dummy columns with
# int/float columns would otherwise be converted to an object array.
x = df.to_numpy(dtype=np.float64)

# Shift the labels to start at 0 for the multiclass objective; the final
# prediction cell adds 1 back.
# NOTE(review): assumes train_y rows are in the same order as train_x - confirm.
y = np.array(train_y["damage_grade"])-1
assert len(x) == len(y), "train features and labels must have the same number of rows"

# Test data
# train and test were one-hot encoded separately, so force the test frame onto
# the training columns (same set, same order). A dummy column absent from the
# test set is filled with 0; a column unseen in training is dropped.
df_test = (
    test_data
    .drop(["building_id"], axis=1)
    .reindex(columns=df.columns, fill_value=0)
)
x_test = df_test.to_numpy(dtype=np.float64)

# 2 - Model training

## 2.1 Get labels from prediction probability
Extract the predicted labels by picking, for each sample, the class with the highest probability, and return one one-hot vector per prediction.

In [12]:
def threshold_arr(array):
    """Turn per-class scores into one-hot rows marking each row's top class.

    Parameters
    ----------
    array : array-like of shape (n_samples, n_classes)
        Scores (e.g. predicted probabilities), one row per sample.

    Returns
    -------
    numpy.ndarray
        float64 array with the same shape as the input, holding 1.0 at the
        position of each row's maximum (the first one on ties) and 0.0
        elsewhere. An empty input yields an empty array.

    Raises
    ------
    ValueError
        If the input is non-empty and not two-dimensional.
    """
    scores = np.asarray(array)

    # Nothing to threshold: return an empty array of matching shape.
    if scores.size == 0:
        return np.zeros(scores.shape, dtype=np.float64)

    if scores.ndim != 2:
        raise ValueError(
            f"expected a 2-D array of shape (n_samples, n_classes), got ndim={scores.ndim}"
        )

    # Vectorised replacement for the per-row Python loop: set a single 1.0 per
    # row at the column returned by argmax.
    one_hot = np.zeros(scores.shape, dtype=np.float64)
    one_hot[np.arange(scores.shape[0]), scores.argmax(axis=1)] = 1.0
    return one_hot

## 2.2 Train 8 LightGBM models with 8-fold cross-validation
All models share the same hyperparameters; each one is trained on a different set of 7 folds and validated on the remaining fold.

In [13]:
# The hyperparameters are identical for every fold, so build them once
# instead of on every loop iteration.
lgb_params = {
    "objective" : "multiclass",
    "num_class":3,
    "metric" : "auc_mu",
    "boosting": 'gbdt',
    "max_depth" : -1,
    "num_leaves" : 30,
    "learning_rate" : 0.1,
    "feature_fraction" : 0.5,
    "min_sum_hessian_in_leaf" : 0.1,
    "max_bin":8192,
    "verbosity" : 1,
    "num_threads":6,
    "seed": SEED
}

# 8 shuffled folds; each iteration trains on 7 folds and validates on the 8th.
kf = KFold(n_splits=8, shuffle=True, random_state=SEED)
for ix, (train_index, test_index) in enumerate(kf.split(x)):
    x_train, x_val = x[train_index], x[test_index]
    y_train, y_val = y[train_index], y[test_index]

    train_data_ = lgb.Dataset(x_train, label=y_train)
    val_data_   = lgb.Dataset(x_val, label=y_val)

    # Fixed 3000 boosting rounds, logging the validation metric every 1000
    # rounds. The log_evaluation callback replaces the deprecated
    # `verbose_eval` argument.
    # NOTE(review): there is no early stopping - consider adding it if the
    # validation metric degrades in later rounds.
    lgb_clf = lgb.train(lgb_params,
                        train_data_,
                        3000,
                        valid_sets = [val_data_],
                        callbacks = [lgb.log_evaluation(period=1000)])

    # Micro-averaged F1 on the held-out fold, computed directly from class
    # indices. This gives the same score as comparing one-hot matrices but
    # does not break when a fold happens to miss one of the classes.
    y_pred = lgb_clf.predict(x_val)
    print("F1-MICRO SCORE: ", f1_score(y_val, y_pred.argmax(axis=1), average='micro'))

    # One model file per fold, written to the existing models/ directory.
    lgb_clf.save_model(f'models/model{ix}.txt')

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 120546
[LightGBM] [Info] Number of data points in the train set: 228025, number of used features: 80
[LightGBM] [Info] Start training from score -2.344522
[LightGBM] [Info] Start training from score -0.563314
[LightGBM] [Info] Start training from score -1.094258
[1000]	valid_0's auc_mu: 0.921772
[2000]	valid_0's auc_mu: 0.922028
[3000]	valid_0's auc_mu: 0.921103
F1-MICRO SCORE:  0.747759086444008
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 120800
[LightGBM] [Info] Number of data points in the train set: 228026, number of used features: 80
[LightGBM] [Info] Start training from score -2.338917
[LightGBM] [Info] Start training from score -0.564737
[LightGBM] [Info] Start training from score -1.093451
[1000]	valid_0's auc_mu: 0.92075
[2000]	valid_0's auc_mu: 0.921578
[3000]	valid_0's auc_mu: 0.920848
F1-MICRO SCORE:  0.7510053722179586
You can set `force_col_wis

In [14]:
# Append all models to a list
N_MODELS = 8  # number of saved boosters (models/model0.txt .. model7.txt)

# The one-hot targets do not change between models, so build them once.
y_onehot = np.array(pd.get_dummies(y))

models = []
for i in range(N_MODELS):
    # Named `booster` rather than `model` so the Keras autoencoder stored in
    # `model` is not overwritten; cells that use it stay re-runnable.
    booster = lgb.Booster(model_file=f'models/model{i}.txt')

    # Scored on the full training matrix, which includes the rows this
    # booster was fitted on, so these numbers are higher than the per-fold
    # validation scores printed during training.
    y_pred = booster.predict(x)
    score  = f1_score(y_onehot, threshold_arr(y_pred), average='micro')
    print("F1-MICRO SCORE: ", score)
    models.append(booster)

F1-MICRO SCORE:  0.8427749701651185
F1-MICRO SCORE:  0.8425332212846459
F1-MICRO SCORE:  0.8421034454971393
F1-MICRO SCORE:  0.8423106588232585
F1-MICRO SCORE:  0.8429322988016161
F1-MICRO SCORE:  0.8429591597883355
F1-MICRO SCORE:  0.8426176415286204
F1-MICRO SCORE:  0.8420804217942371


## 2.3 Model ensembling
We simply sum the predicted probabilities of all models and, as before, pick for each sample the label with the highest accumulated probability.

In [15]:
def ensemble(models, x):
    """Combine several models by summing their predicted class scores.

    Parameters
    ----------
    models : sequence
        Non-empty sequence of objects exposing ``predict(x)``; all are
        expected to return arrays of the same shape.
    x : array-like
        Feature matrix passed unchanged to every model.

    Returns
    -------
    numpy.ndarray
        Result of ``threshold_arr`` applied to the summed scores, i.e. one
        one-hot row per sample marking the class with the largest total.
    """
    per_model_scores = [member.predict(x) for member in models]

    # Accumulate in place into the first model's scores, in list order.
    summed_scores = per_model_scores[0]
    for member_scores in per_model_scores[1:]:
        summed_scores += member_scores

    return threshold_arr(summed_scores)

In [16]:
# Ensemble the boosters on the test matrix, turn each one-hot row into its
# class index and shift by +1 to undo the -1 applied to the training labels.
y_pred = ensemble(models, x_test).argmax(axis=1) + 1

## 2.4 Generate prediction file

In [17]:
# Write the ensembled grades into the submission template and save it without
# the DataFrame index.
# NOTE(review): assumes the template rows are in the same order as the test
# features - confirm against the competition files.
submission_path = "submission.csv"
sub_csv["damage_grade"] = y_pred
sub_csv.to_csv(submission_path, index=False)