In [None]:
import pandas as pd
import xgboost as xgb
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, ExtraTreesClassifier
from sklearn.metrics import mean_squared_error,classification_report
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import cross_val_score
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer
# from sklearn.datasets import make_classification

In [None]:
# Load the training data from a path relative to the notebook's working directory.
train_df = pd.read_csv('data/train.csv')

In [None]:
# Print the frame's summary (pandas `DataFrame.info`) to review the columns before preprocessing.
train_df.info()

In [None]:
# Display 10 randomly drawn rows; no random_state is passed, so the rows differ between runs.
train_df.sample(10)

### Scaling fire size and fire year

In [None]:
# Min-max scale FIRE_SIZE and FIRE_YEAR with ONE scaler fitted on both columns.
# MinMaxScaler scales each feature independently, so the resulting values are the
# same as fitting the columns one at a time. The difference is that `scaler` now
# remembers the fitted range of both columns (re-fitting the same scaler per column
# discarded the FIRE_SIZE fit), so the identical transform can be applied to other
# data with scaler.transform(df[['FIRE_SIZE', 'FIRE_YEAR']]).
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(train_df[['FIRE_SIZE', 'FIRE_YEAR']])
train_df['FIRE_SIZE_SCALED'] = scaled_values[:, 0]  # column order follows the selection above
train_df['FIRE_YEAR_SCALED'] = scaled_values[:, 1]

### Encoding the target and selecting important columns

In [None]:
# Replace the STATE column with integer class ids.
# The fitted encoder is kept in `state_encoder` so the ids can be mapped back to the
# original values (state_encoder.inverse_transform) or the same mapping applied to
# other data (state_encoder.transform); an anonymous LabelEncoder() loses that mapping.
# Run once per load: the column is overwritten, so re-running this cell would re-fit
# the encoder on the already encoded integers.
state_encoder = LabelEncoder()
train_df["STATE"] = state_encoder.fit_transform(train_df["STATE"])

In [None]:
# Hand-picked columns to keep; STATE is included here and split off as the target later.
good_cols = [
    "FIRE_YEAR_SCALED",
    "DISCOVERY_DOY",
    "CONT_DOY",
    "CONT_TIME",
    "FIRE_SIZE_SCALED",
    "LATITUDE",
    "LONGITUDE",
    "STATE",
    "FIRE_SIZE_CLASS_ENCODED",
]

# Additionally keep every column whose name contains this text
# (DataFrame.filter(like=...) is a substring match, not a strict prefix match).
prefix1 = 'STAT_CAUSE_DESCR_'
keep_cols1 = list(train_df.filter(like=prefix1).columns)

# Final selection: the hand-picked columns followed by the matched ones.
keep_cols = [*good_cols, *keep_cols1]

In [None]:
# Restrict the training frame to the selected columns (all rows kept).
df_interest = train_df.loc[:, keep_cols]

In [None]:
# Preview the first rows of the reduced frame to confirm the column selection.
df_interest.head()

In [None]:
# Features: every selected column except the target.
X = df_interest.drop(columns=['STATE'])
# Target: the STATE column (integer class ids after the label-encoding cell).
y = df_interest['STATE']

In [None]:
# (rows, feature columns) of the feature matrix before resampling.
X.shape

In [None]:
# Length of the target vector before resampling; should match the row count of X.
y.shape

In [None]:
# Oversample with SMOTE using a fixed seed.
# k_neighbors=2 is lower than the library default of 5.
# The resampling is applied to the full X / y, i.e. before the train/validation split.
smote = SMOTE(
    k_neighbors=2,
    random_state=42,
)
X_resampled, y_resampled = smote.fit_resample(X, y)

In [None]:
# Shape of the feature matrix after SMOTE; compare with X.shape to see how many rows were added.
X_resampled.shape

In [None]:
# Length of the target after SMOTE; should match the row count of X_resampled.
y_resampled.shape

In [None]:
# Hold out 20% of the SMOTE-resampled rows as a validation set (fixed seed).
# NOTE(review): the split is done AFTER oversampling, so the validation part contains
# synthetic rows, some of which may be interpolated from rows that end up in the
# training part. Validation scores are therefore likely optimistic — consider
# splitting first and resampling only the training portion.
X_train, X_val, y_train, y_val = train_test_split(
    X_resampled,
    y_resampled,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Hyperparameter search space for the XGBoost classifier.
# Log-uniform priors are used for the parameters that span several orders of magnitude.
search_space = {
    'learning_rate': Real(0.01, 1.0, prior='log-uniform'),
    'max_depth': Integer(1, 10),
    'n_estimators': Integer(50, 1000),
    'subsample': Real(0.1, 1.0, prior='uniform'),
    'gamma': Real(0, 1.0, prior='uniform'),
    'colsample_bytree': Real(0.1, 1.0, prior='uniform'),
    'reg_alpha': Real(1e-9, 1000, prior='log-uniform'),
    'reg_lambda': Real(1e-9, 1000, prior='log-uniform')
}

# Base estimator; the search overrides its hyperparameters for every candidate.
xgb_model = xgb.XGBClassifier()

# Bayesian optimisation over the search space: 50 sampled parameter settings,
# each scored with 5-fold cross-validation, using all available cores.
# random_state is fixed (same seed as the SMOTE and split cells) so that the sequence
# of sampled settings — and therefore the reported best parameters — is reproducible.
bayes_search = BayesSearchCV(
    xgb_model,
    search_space,
    n_iter=50,
    cv=5,
    n_jobs=-1,
    random_state=42
)

# Run the search on the training portion only; the validation set stays untouched.
bayes_search.fit(X_train, y_train)

# Report the best hyperparameters and their mean cross-validation score.
print("Best hyperparameters: ", bayes_search.best_params_)
print("Best mean cross-validation score: ", bayes_search.best_score_)

In [None]:
# Train the final classifier with the best hyperparameters found by the search.
# XGBClassifier takes its hyperparameters as keyword arguments, so the dict must be
# unpacked with **; passing the dict itself as a single positional argument does not
# apply the tuned values.
clf = xgb.XGBClassifier(**bayes_search.best_params_)
clf.fit(X_train, y_train)

In [None]:
# Predict the held-out validation rows and print per-class precision / recall / F1.
y_pred = clf.predict(X_val)
print(classification_report(y_true=y_val, y_pred=y_pred))

## Evaluating on the test data

In [None]:
# Load the test set from a path relative to the notebook's working directory.
test_df = pd.read_csv('data/test.csv')

# Use exactly the feature columns the classifier was fitted on, in the same order.
# Dropping only STATE would hand the model every other column of test.csv, which does
# not match the training features unless the file happens to contain nothing else.
# NOTE(review): this assumes test.csv already contains the derived columns
# (FIRE_SIZE_SCALED, FIRE_YEAR_SCALED, ...) computed the same way as for training;
# a KeyError here means the training preprocessing still has to be applied — confirm.
x_test = test_df[list(X.columns)]

# NOTE(review): the model predicts the integer ids produced by label-encoding STATE;
# verify that STATE in test.csv uses the same encoding before trusting the report.
y_test = test_df['STATE']

In [None]:
# Predict the test rows and print per-class precision / recall / F1.
predict = clf.predict(x_test)
print(classification_report(y_true=y_test, y_pred=predict))