In [1]:
import copy

import pandas as pd
# Display options: never truncate columns, and render up to 200 rows.
pd.set_option("display.max_columns", None)
# Use the fully qualified key. The bare "max_rows" pattern is ambiguous on
# pandas versions that also define "styler.render.max_rows", where
# set_option raises OptionError ("Pattern matched multiple keys").
pd.set_option("display.max_rows", 200)

from sklearn.experimental import enable_halving_search_cv
from sklearn.experimental import enable_hist_gradient_boosting

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import HalvingGridSearchCV, HalvingRandomSearchCV

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import make_scorer, f1_score

# Reached only if every import above resolved without raising.
print("import success")

import success


In [2]:
# Load the pre-processed CSV inputs; each is indexed by its "id" column.

# feature table
data = pd.read_csv("source/for_train.csv", index_col="id")

# target table
labels = pd.read_csv("source/train_labels.csv", index_col="id")

# Report both shapes so the row / column counts can be compared by eye.
for line_format, frame in (
    ("data => rows: %s, cols: %s", data),
    ("labels => rows: %s, cols: %s", labels),
):
    # DataFrame.shape is a (rows, cols) tuple, which fills both %s slots.
    print(line_format % frame.shape)

data => rows: 74250, cols: 122
labels => rows: 59400, cols: 1


In [3]:
# Separate the combined feature table into its train and test partitions.
# The "type" column only marks the partition, so it is dropped afterwards.
train = data[data.type.eq("train")].drop("type", axis=1)
test = data[data.type.eq("test")].drop("type", axis=1)

# Align the labels to the training rows by id instead of relying on both
# files happening to share the same row order. A missing id raises KeyError;
# duplicated ids in the labels file are caught by the length check.
target = labels.loc[train.index].values.ravel()
assert len(target) == len(train), "labels do not match the training rows one-to-one"

# Stratified 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    train,
    target,
    test_size=0.2,
    shuffle=True,
    stratify=target,
    random_state=42,
)

# Optional down-sampling for quick experiments: set `size` to a small number
# (e.g. 25) to search on a subset; len(y_train) keeps the full training split.
# size = 25
size = len(y_train)
X_train_pruned = X_train.head(size).copy()
y_train_pruned = copy.deepcopy(y_train[:size])

In [4]:
# Base estimator for the hyper-parameter search. It is seeded so that repeated
# searches score each candidate on identically grown forests.
rfc = RandomForestClassifier(
    criterion="entropy",
    max_features="sqrt",
    random_state=42,
)

# Candidate grid: 4 forest sizes x 6 depth limits (None = unlimited depth).
grid = {
    'n_estimators': list(range(1600, 3001, 400)),  # 1600, 2000, 2400, 2800
    'max_depth': [60, 80, 100, 110, 120, None],
}

# Successive-halving *grid* search (the variable keeps its historical name
# `hrscv` because later cells refer to it).
# refit=False: only the scores / best parameters are produced, no final model
# is trained on the full data by the search itself.
# error_score='raise': a failing candidate aborts the search instead of being
# scored silently.
hrscv = HalvingGridSearchCV(
    estimator=rfc,
    param_grid=grid,
    scoring=make_scorer(f1_score, average='weighted'),
    cv=2,
    error_score='raise',
    refit=False,
    verbose=10,
    random_state=42,
    n_jobs=4,
)

print("Ready!")

Ready!


In [5]:
# Runs the halving search on the (optionally pruned) training split.
# Left disabled here — presumably because it is expensive to re-run (TODO confirm).
# hrscv.fit(X_train_pruned, y_train_pruned)

In [6]:
# Inspect the winning configuration of the search (disabled along with the fit).
# NOTE(review): the search is constructed with refit=False, so best_estimator_
# should not exist if this is re-enabled — confirm against the sklearn docs
# and drop that line or switch refit on.
# print(hrscv.best_params_)
# print(hrscv.best_score_)
# print(hrscv.best_estimator_)

{'max_depth': 110, 'n_estimators': 2000}
0.782550678353571

In [7]:
# Fit the final classifier with the hyper-parameters reported by the search
# (n_estimators=2000, max_depth=110).
# random_state pins the bootstrap / feature sampling so the hold-out score and
# the submission are reproducible across runs.
# n_jobs=4 mirrors the search cell and only parallelises tree building.
rfc = RandomForestClassifier(
    criterion="entropy",
    n_estimators=2000,
    max_depth=110,
    max_features="sqrt",
    random_state=42,
    n_jobs=4,
)
rfc.fit(X_train, y_train)
print("Fit complete")

Fit complete


In [8]:
# Evaluate the fitted forest on the held-out split using the weighted F1 score.
predicted_vals = rfc.predict(X_test)
score = f1_score(y_true=y_test, y_pred=predicted_vals, average="weighted")

print(score)

0.7974290791389845


In [9]:
# Assemble the submission frame: the test index as a column plus the
# predicted class for each test row.
build = pd.DataFrame(test.index).assign(status_group=rfc.predict(test))

print(build.shape)

# Sanity checks: one row per test row, exactly two columns, and all three
# classes present among the predictions.
assert build.shape == (len(test.index), 2)
assert build["status_group"].nunique() == 3

build.head()

(14850, 2)


Unnamed: 0,id,status_group
0,50785,functional
1,51630,functional
2,17168,functional
3,45559,non functional
4,49871,functional


In [10]:
# Persist the submission without the positional index, then confirm completion.
build.to_csv(path_or_buf="source/output.csv", index=False)
print("success")


success
