In [18]:
import functools

import numpy as np
import pandas as pd
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

import xgboost as xgb
import optuna

import matplotlib.pylab as plt
import seaborn as sns

ModuleNotFoundError: No module named 'optuna'

In [3]:
# Read the building feature table and the damage-grade label table from the local data/ folder.
train_data, train_labels = map(
    pd.read_csv,
    ('data/train_values.csv', 'data/train_labels.csv'),
)

In [4]:
# Preview the leading rows of the feature table.
train_data.head(n=5)

Unnamed: 0,building_id,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,land_surface_condition,foundation_type,...,has_secondary_use_agriculture,has_secondary_use_hotel,has_secondary_use_rental,has_secondary_use_institution,has_secondary_use_school,has_secondary_use_industry,has_secondary_use_health_post,has_secondary_use_gov_office,has_secondary_use_use_police,has_secondary_use_other
0,802906,6,487,12198,2,30,6,5,t,r,...,0,0,0,0,0,0,0,0,0,0
1,28830,8,900,2812,2,10,8,7,o,r,...,0,0,0,0,0,0,0,0,0,0
2,94947,21,363,8973,2,10,5,5,t,r,...,0,0,0,0,0,0,0,0,0,0
3,590882,22,418,10694,2,10,6,5,t,r,...,0,0,0,0,0,0,0,0,0,0
4,201944,11,131,1488,3,30,8,9,t,r,...,0,0,0,0,0,0,0,0,0,0


In [5]:
# Preview the leading rows of the label table.
train_labels.head(n=5)

Unnamed: 0,building_id,damage_grade
0,802906,3
1,28830,2
2,94947,3
3,590882,2
4,201944,3


In [12]:
# Attach each building's damage grade to its feature row.
# validate='one_to_one' makes pandas raise MergeError if building_id is duplicated
# on either side, instead of silently multiplying rows.
data = train_data.merge(train_labels, on='building_id', how='left', validate='one_to_one')

# A left join leaves NaN where no label matched; fail here rather than later,
# when the target is shifted and used for a stratified split.
assert data['damage_grade'].notna().all(), 'some buildings have no damage_grade label'

In [13]:
# Sanity check of the merged frame: its (rows, columns) shape, then the leading rows.
print(f"{data.shape}")
data.head(n=5)

(260601, 40)


Unnamed: 0,building_id,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,land_surface_condition,foundation_type,...,has_secondary_use_hotel,has_secondary_use_rental,has_secondary_use_institution,has_secondary_use_school,has_secondary_use_industry,has_secondary_use_health_post,has_secondary_use_gov_office,has_secondary_use_use_police,has_secondary_use_other,damage_grade
0,802906,6,487,12198,2,30,6,5,t,r,...,0,0,0,0,0,0,0,0,0,3
1,28830,8,900,2812,2,10,8,7,o,r,...,0,0,0,0,0,0,0,0,0,2
2,94947,21,363,8973,2,10,5,5,t,r,...,0,0,0,0,0,0,0,0,0,3
3,590882,22,418,10694,2,10,6,5,t,r,...,0,0,0,0,0,0,0,0,0,2
4,201944,11,131,1488,3,30,8,9,t,r,...,0,0,0,0,0,0,0,0,0,3


In [17]:
@functools.lru_cache(maxsize=1)
def _load_tuning_matrices():
    """Load, encode and split the training data once for all Optuna trials.

    The CSV files are read, string-typed columns are integer-encoded, and a
    stratified 80/20 split is taken with a fixed seed so that every trial is
    scored on the same validation rows. The result is cached; call
    ``_load_tuning_matrices.cache_clear()`` if the files on disk change.

    Returns:
        tuple: ``(dtrain, dvalid, valid_y)`` where ``dtrain`` and ``dvalid`` are
        ``xgb.DMatrix`` objects and ``valid_y`` is the validation target Series.
    """
    values_df = pd.read_csv('./data/train_values.csv')
    labels_df = pd.read_csv('./data/train_labels.csv')
    data = pd.merge(values_df, labels_df, on='building_id')

    # Integer-encode every string-typed column (fresh encoder per column).
    for col in data.select_dtypes('object').columns:
        data[col] = LabelEncoder().fit_transform(data[col])

    # Shift the target down by one so classes start at 0, as num_class=3 requires
    # (assumes damage_grade takes the values 1..3 -- TODO confirm).
    y = data['damage_grade'] - 1
    # building_id is only the join key, so it is excluded from the features
    # together with the target itself.
    X = data.drop(columns=['damage_grade', 'building_id'])

    train_x, valid_x, train_y, valid_y = train_test_split(
        X, y, test_size=0.2, stratify=y, random_state=42
    )

    dtrain = xgb.DMatrix(train_x, label=train_y, enable_categorical=True)
    dvalid = xgb.DMatrix(valid_x, label=valid_y, enable_categorical=True)
    return dtrain, dvalid, valid_y


def objective(trial):
    """Optuna objective: train one XGBoost model and return its validation F1.

    Args:
        trial: the ``optuna`` trial object used to sample hyperparameters via
            ``suggest_categorical`` / ``suggest_float`` / ``suggest_int``.

    Returns:
        float: micro-averaged F1 score of the trained booster on the held-out
        validation split (to be maximised by the study).
    """
    # Data loading/encoding/splitting is identical for every trial, so it is
    # done once and reused instead of re-reading the CSV files on each call.
    dtrain, dvalid, valid_y = _load_tuning_matrices()

    param = {
        "verbosity": 0,
        "num_class": 3,
        "objective": "multi:softmax",
        # hist is used because the grow_policy option sampled below is only
        # supported by the hist/approx tree methods.
        "tree_method": "hist",
        # defines booster, gblinear for linear functions.
        "booster": trial.suggest_categorical("booster", ["gbtree", "gblinear", "dart"]),
        # L2 regularization weight.
        "lambda": trial.suggest_float("lambda", 1e-8, 1.0, log=True),
        # L1 regularization weight.
        "alpha": trial.suggest_float("alpha", 1e-8, 1.0, log=True),
        # sampling ratio for training data.
        "subsample": trial.suggest_float("subsample", 0.2, 1.0),
        # sampling according to each tree.
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.2, 1.0),
    }

    if param["booster"] in ["gbtree", "dart"]:
        # maximum depth of the tree, signifies complexity of the tree.
        param["max_depth"] = trial.suggest_int("max_depth", 3, 9, step=2)
        # minimum child weight, larger the term more conservative the tree.
        param["min_child_weight"] = trial.suggest_int("min_child_weight", 2, 10)
        param["eta"] = trial.suggest_float("eta", 1e-8, 1.0, log=True)
        # defines how selective algorithm is.
        param["gamma"] = trial.suggest_float("gamma", 1e-8, 1.0, log=True)
        param["grow_policy"] = trial.suggest_categorical("grow_policy", ["depthwise", "lossguide"])

    if param["booster"] == "dart":
        param["sample_type"] = trial.suggest_categorical("sample_type", ["uniform", "weighted"])
        param["normalize_type"] = trial.suggest_categorical("normalize_type", ["tree", "forest"])
        param["rate_drop"] = trial.suggest_float("rate_drop", 1e-8, 1.0, log=True)
        param["skip_drop"] = trial.suggest_float("skip_drop", 1e-8, 1.0, log=True)

    bst = xgb.train(param, dtrain)
    preds = bst.predict(dvalid)
    # predict() returns a float array; round and cast so the predicted labels
    # are integers like the validation target.
    pred_labels = np.rint(preds).astype(int)
    return f1_score(valid_y, pred_labels, average='micro')


if __name__ == "__main__":
    # Maximise the score returned by objective(); the search is capped at
    # 50 trials and a 600-second timeout.
    study = optuna.create_study(direction="maximize")
    study.optimize(objective, timeout=600, n_trials=50)

    # Report how many trials ran, then the best score and the parameters behind it.
    print("Number of finished trials: ", len(study.trials))
    print("Best trial:")
    trial = study.best_trial

    print(f"  Value: {trial.value}")
    print("  Params: ")
    for key, value in trial.params.items():
        print(f"    {key}: {value}")

NameError: name 'optuna' is not defined

In [16]:
# Integer-encode every string-typed column of `data` in place.
# Re-running is harmless: after the first run no object-typed columns remain.
label_encoder = LabelEncoder()
cat_cols = data.select_dtypes('object').columns
for col in cat_cols:
    data[col] = label_encoder.fit_transform(data[col])

# Target: shift damage_grade down by one so classes start at 0
# (assumes the grades are 1..3 -- TODO confirm).
data['damage_grade_'] = data['damage_grade'] - 1
y = data['damage_grade_']

# Features: drop BOTH target columns so that re-running this cell cannot leave
# 'damage_grade_' inside X (target leakage), and drop building_id, which is
# only the key the tables were merged on.
X = data.drop(columns=['building_id', 'damage_grade', 'damage_grade_'])

# Fixed seed so the stratified split is the same on every run of the notebook.
train_x, valid_x, train_y, valid_y = train_test_split(
    X, y, test_size=0.33, stratify=y, random_state=42
)

dtrain = xgb.DMatrix(train_x, label=train_y, enable_categorical=True)
dvalid = xgb.DMatrix(valid_x, label=valid_y, enable_categorical=True)