# Load Functions

In [5]:
from src import (FeatureEngineering, FeatureSelection, train_test_split_function, 
                           check_numerical_columns, find_best_score, run_and_save, 
                           tune_model, compute_mean_encodings, apply_mean_encodings)

# Load Data & Feature Engineering

In [6]:
# Build the training frame and the test frame with the project helper.
# NOTE(review): the two flags presumably drop non-numeric columns and rows with
# missing values — confirm against src.FeatureEngineering.
df, test_df = FeatureEngineering(
    drop_non_numerical=True,
    drop_empty_rows=True,
)

In [7]:
# Keep the test-set building ids aside; they are handed to run_and_save()
# together with the fitted model in the final cell.
building_id = test_df['building_id']

In [24]:
# Use every column of the training frame (including the 'damage_grade' target,
# which is filtered out again for the test frame further down).
# NOTE(review): df.columns also contains 'building_id' (see df.head()), so the
# row identifier is fed to the model as a feature — confirm this is intended.
# NOTE(review): this cell's execution count is higher than that of the cell that
# consumes selected_features; run Restart & Run All to rule out stale state.
selected_features = df.columns

In [26]:
# Preview the first five rows of the engineered training frame.
df.head(n=5)

Unnamed: 0,building_id,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,has_superstructure_adobe_mud,has_superstructure_mud_mortar_stone,...,has_secondary_use_hotel,has_secondary_use_rental,has_secondary_use_institution,has_secondary_use_school,has_secondary_use_industry,has_secondary_use_health_post,has_secondary_use_gov_office,has_secondary_use_use_police,has_secondary_use_other,damage_grade
0,802906,6,487,12198,2,30,6,5,1,1,...,0,0,0,0,0,0,0,0,0,3
1,28830,8,900,2812,2,10,8,7,0,1,...,0,0,0,0,0,0,0,0,0,2
2,94947,21,363,8973,2,10,5,5,0,1,...,0,0,0,0,0,0,0,0,0,3
3,590882,22,418,10694,2,10,6,5,0,1,...,0,0,0,0,0,0,0,0,0,2
4,201944,11,131,1488,3,30,8,9,1,0,...,0,0,0,0,0,0,0,0,0,3


In [9]:
# Feature list for the test frame: the training feature list without the
# 'damage_grade' target column.
selected_features_test = list(
    filter(lambda name: name != 'damage_grade', selected_features)
)

# Restrict each frame to its own feature list via the project helper.
selected_df = FeatureSelection(df, selected_features)
selected_test_df = FeatureSelection(test_df, selected_features_test)

# Encoding

In [10]:
# Derive encodings of the 'damage_grade' target for the three geo-level id
# columns (mean target encoding, going by the helper's name).
# NOTE(review): the encodings are computed from the full selected_df, before
# train_test_split_function() runs, so rows that later land in the validation
# split contribute to their own encodings. Consider fitting the encodings on
# the training split only to avoid an optimistic validation score.
mean_encodings = compute_mean_encodings(
    dataframe=selected_df,
    target_variable='damage_grade',
    columns_to_encode=[
        'geo_level_1_id',
        'geo_level_2_id',
        'geo_level_3_id',
    ],
)

In [11]:
# Apply the encodings to the training frame.
# NOTE(review): selected_df is rebound to its own encoded version, so re-running
# this cell feeds an already-encoded frame back into the helper; the cell may
# not be idempotent.
selected_df = apply_mean_encodings(
    dataframe=selected_df,
    mean_encodings=mean_encodings,
)

In [12]:
# Apply the same encodings (computed from the training frame) to the test frame.
# NOTE(review): how geo ids that never occur in the training frame are handled
# depends on src.apply_mean_encodings — confirm they do not end up as NaN.
selected_test_df = apply_mean_encodings(
    dataframe=selected_test_df,
    mean_encodings=mean_encodings,
)

In [13]:
# Sanity check on the encoded training frame; on the recorded run the helper
# reported "Yes".
# NOTE(review): only selected_df is checked here, not selected_test_df.
check_numerical_columns(selected_df)

Yes


# Train / Validation Split

In [14]:
# Name of the label column, passed to train_test_split_function() below.
# NOTE(review): earlier cells repeat the literal 'damage_grade' instead of using
# this name; defining it once near the top would keep them in sync.
target_column_name = 'damage_grade'

In [15]:
# Split the encoded training frame into train/validation features and labels.
# NOTE(review): no random seed or split ratio is visible at this call site —
# confirm the helper fixes them so the split is reproducible.
X_train, X_val, y_train, y_val = train_test_split_function(selected_df,target_column_name)

# Finding best parameters

In [16]:
# Hyperparameter search for the XGBoost model, run on the training split.
# NOTE(review): the positional arguments are not self-describing. Judging by the
# winning parameters printed in the next cell, the three integers look like a
# (start, stop, step) range for max_depth and the list like candidate learning
# rates — confirm against src.find_best_score and pass them by keyword.
results = find_best_score(
    X_train,
    y_train,
    2,
    10,
    2,
    [0.1, 0.01],
    model='XGB',
)

In [17]:
# Pick the highest-scoring entry from the search results. Each value in
# `results` is a (params, score) pair; the dictionary keys are not needed.
# Starting from -inf means the first strictly larger score wins, and on ties
# the earliest entry is kept.
best_params, best_score = None, float('-inf')

for candidate_params, candidate_score in results.values():
    if candidate_score > best_score:
        best_params, best_score = candidate_params, candidate_score

print("Best Hyperparameters:", best_params)
print("Best Score:", best_score)

Best Hyperparameters: {'num_class': 3, 'max_depth': 2, 'learning_rate': 0.01, 'num_boost_round': 100}
Best Score: 0.7660145415496353


In [18]:
# NOTE(review): dead assignment — infolist is reset to [] two cells further down
# before tune_model() ever reads it, so this value is never used. Looks like a
# leftover experiment; consider deleting this cell.
infolist = [1,2]

# Cross-Validation and Model-Fit

In [19]:
# Display the hyperparameters chosen above (same dict that was just printed).
best_params

{'num_class': 3, 'max_depth': 2, 'learning_rate': 0.01, 'num_boost_round': 100}

In [20]:
# Empty list handed to tune_model() in the next cell.
# NOTE(review): presumably tune_model() appends run information to it — confirm
# against src.tune_model.
infolist = []

In [21]:
# Fit the XGBoost model with the selected hyperparameters and score it on the
# validation split.
# NOTE(review): the recorded training log contains
#   Parameters: { "num_boost_round" } are not used.
# which suggests the 'num_boost_round' entry of best_params is forwarded inside
# the booster parameter dict instead of being used as the number of boosting
# rounds. Check src.tune_model: the tuned round count may be silently ignored.
fitted_model, accuracy = tune_model(
    X_train,
    y_train,
    X_val,
    y_val,
    best_params,
    infolist,
    model='XGB',
)

[0]	Test-mlogloss:1.09244
[1]	Test-mlogloss:1.08637
[2]	Test-mlogloss:1.08036


Parameters: { "num_boost_round" } are not used.



[3]	Test-mlogloss:1.07448
[4]	Test-mlogloss:1.06869
[5]	Test-mlogloss:1.06296
[6]	Test-mlogloss:1.05735
[7]	Test-mlogloss:1.05183
[8]	Test-mlogloss:1.04637
[9]	Test-mlogloss:1.04101
[10]	Test-mlogloss:1.03571
[11]	Test-mlogloss:1.03051
[12]	Test-mlogloss:1.02537
[13]	Test-mlogloss:1.02033
[14]	Test-mlogloss:1.01540
[15]	Test-mlogloss:1.01048
[16]	Test-mlogloss:1.00569
[17]	Test-mlogloss:1.00091
[18]	Test-mlogloss:0.99625
[19]	Test-mlogloss:0.99161
[20]	Test-mlogloss:0.98709
[21]	Test-mlogloss:0.98258
[22]	Test-mlogloss:0.97812
[23]	Test-mlogloss:0.97379
[24]	Test-mlogloss:0.96947
[25]	Test-mlogloss:0.96525
[26]	Test-mlogloss:0.96105
[27]	Test-mlogloss:0.95693
[28]	Test-mlogloss:0.95289
[29]	Test-mlogloss:0.94886
[30]	Test-mlogloss:0.94488
[31]	Test-mlogloss:0.94101
[32]	Test-mlogloss:0.93714
[33]	Test-mlogloss:0.93337
[34]	Test-mlogloss:0.92962
[35]	Test-mlogloss:0.92595
[36]	Test-mlogloss:0.92229
[37]	Test-mlogloss:0.91869
[38]	Test-mlogloss:0.91518
[39]	Test-mlogloss:0.91167
[40]	Tes

In [22]:
# Display the score returned by tune_model() for the validation split.
# NOTE(review): on the recorded run this is lower than the "Best Score" printed
# by the hyperparameter search; whether both use the same metric is not visible
# here — confirm before comparing them.
accuracy

0.7303965771953723

# Predict on the test set and save CSV

In [23]:
# Hand the fitted model, the encoded test frame and the saved building ids to
# the project helper; on the recorded run it reported writing a timestamped CSV
# under data/output/.
# NOTE(review): selected_test_df was built from df.columns minus the target, so
# it presumably still contains 'building_id' as a column in addition to the
# separate building_id argument — confirm this matches what the model was
# trained on.
run_and_save(fitted_model,selected_test_df,building_id)

File saved as data/output/predictions_2024-02-03_11-52-16.csv
