# Time for some Random Forests!

![alt text](https://cdn-images-1.medium.com/max/592/1*i0o8mjFfCn-uD79-F1Cqkw.png "Random Forest Diagram")

So for this challenge, I decided to look into the scikit-learn Random Forest Classifier, which basically creates multiple decision trees and outputs the mode of the decision trees as the predicted class. 

First things first, import some functions from sklearn, pandas, numpy and matplotlib.

In [1]:
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import PowerTransformer, OrdinalEncoder, OneHotEncoder, PolynomialFeatures, KBinsDiscretizer
from sklearn.feature_selection import SelectPercentile
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.calibration import CalibratedClassifierCV

import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings("ignore")

And quickly create a baseline.

In [2]:
X = pd.read_csv("train_features.csv", index_col=0, header=0)
y = pd.read_csv("train_labels.csv", index_col=0, header=0)["status_group"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=17)

print((y_test == y_train.mode().values[0]).sum() / y_test.count())

0.5408249158249159


Great, let's look at Logistic Regression as a second baseline, while limiting the maximum number of categories for a categorical feature.

In [3]:
X = pd.read_csv("train_features.csv", index_col=0, header=0)
y = pd.read_csv("train_labels.csv", index_col=0, header=0)["status_group"]

X["date_recorded"] = pd.to_numeric(pd.to_datetime(X["date_recorded"]))

X["region_code"] = X["region_code"].astype(str)
X["district_code"] = X["district_code"].astype(str)

X[[c for c in X if X[c].dtype == 'object']] = X[[c for c in X if X[c].dtype == 'object']].astype(str).fillna("NAN")

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=17)

cols = [c for c in X_train if X_train[c].dtype == 'object' and len(X_train[c].astype('category').cat.categories) < 1000]
dropcols = [c for c in X_train if X_train[c].dtype == 'object']
        
o = OneHotEncoder(sparse=False, handle_unknown='ignore').fit(X_train[cols])
X_train = pd.concat([X_train.drop(columns=dropcols, axis=1), pd.DataFrame(o.transform(X_train[cols]), columns=o.get_feature_names(cols), index=X_train.index)], axis=1)
X_test = pd.concat([X_test.drop(columns=dropcols, axis=1), pd.DataFrame(o.transform(X_test[cols]), columns=o.get_feature_names(cols), index=X_test.index)], axis=1)

l = LogisticRegression().fit(X_train, y_train)
print(l.score(X_test, y_test))

0.5408249158249159


Not much better...

However, there are some weird values for some of the numerical features, so let's solve those first.

In [4]:
X = pd.read_csv("train_features.csv", index_col=0, header=0)
y = pd.read_csv("train_labels.csv", index_col=0, header=0)["status_group"]

X["date_recorded"] = pd.to_numeric(pd.to_datetime(X["date_recorded"]))

X["region_code"] = X["region_code"].astype(str)
X["district_code"] = X["district_code"].astype(str)

X[[c for c in X if X[c].dtype == 'object']] = X[[c for c in X if X[c].dtype == 'object']].astype(str).fillna("NAN")

X['longitude'] = X['longitude'].replace(0.0, np.nan)
X['longitude_bool'] = (X["longitude"] == np.nan).astype(int)

X['latitude'] = X['latitude'].replace(-2.000000e-08, np.nan)
X['latitude_bool'] = (X["longitude"] == np.nan).astype(int)

X['num_private'] = X['num_private'].replace(0.0, np.nan)
X["num_private_bool"] = (X["num_private"] == np.nan).astype(int)

X['construction_year'] = X['construction_year'].replace(0.0, np.nan)
X["construction_year_bool"] = (X["construction_year"] == np.nan).astype(int)

X['population'] = X['population'].replace(0.0, np.nan)
X["population_bool"] = (X["population"] == np.nan).astype(int)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=17)

longitude_gm = X_train.groupby(['region', 'district_code'])['longitude'].transform('mean')
X_train['longitude'] = X_train['longitude'].fillna(longitude_gm)
longitude_m = X_train['longitude'].mean()
X_train['longitude'] = X_train['longitude'].fillna(longitude_m)

X_test['longitude'] = X_test['longitude'].fillna(longitude_gm)
X_test['longitude'] = X_test['longitude'].fillna(longitude_m)


latitude_gm = X_train.groupby(['region', 'district_code'])['latitude'].transform('mean')
X_train['latitude'] = X_train['latitude'].fillna(latitude_gm)
latitude_m = X_train['latitude'].mean()
X_train['latitude'] = X_train['latitude'].fillna(latitude_m)

X_test['latitude'] = X_test['latitude'].fillna(latitude_gm)
X_test['latitude'] = X_test['latitude'].fillna(latitude_m)

num_private_m = X_train['num_private'].mean()
X_train['num_private'] = X_train['num_private'].fillna(num_private_m)
X_test['num_private'] = X_test['num_private'].fillna(num_private_m)

construction_year_m = X_train['construction_year'].mean()
X_train['construction_year'] = X_train['construction_year'].fillna(construction_year_m)
X_test['construction_year'] = X_test['construction_year'].fillna(construction_year_m)

population_m = X_train['population'].mean()
X_train['population'] = X_train['population'].fillna(population_m)
X_test['population'] = X_test['population'].fillna(population_m)

dropcols = [c for c in X_train if X_train[c].dtype == 'object']
        
o = OneHotEncoder(sparse=False, handle_unknown='ignore').fit(X_train[cols])
X_train = pd.concat([X_train.drop(columns=dropcols, axis=1), pd.DataFrame(o.transform(X_train[cols]), columns=o.get_feature_names(cols), index=X_train.index)], axis=1)
X_test = pd.concat([X_test.drop(columns=dropcols, axis=1), pd.DataFrame(o.transform(X_test[cols]), columns=o.get_feature_names(cols), index=X_test.index)], axis=1)

l = LogisticRegression().fit(X_train, y_train)
print(l.score(X_test, y_test))

0.5408249158249159


Still nothing huh...

Let's add some stuff! (Inverses, bins, and power transforms of numerical features)

In [6]:
X = pd.read_csv("train_features.csv", index_col=0, header=0)
y = pd.read_csv("train_labels.csv", index_col=0, header=0)["status_group"]

X["date_recorded"] = pd.to_numeric(pd.to_datetime(X["date_recorded"]))

X["region_code"] = X["region_code"].astype(str)
X["district_code"] = X["district_code"].astype(str)

X[[c for c in X if X[c].dtype == 'object']] = X[[c for c in X if X[c].dtype == 'object']].astype(str).fillna("NAN")

X['longitude'] = X['longitude'].replace(0.0, np.nan)
X['longitude_bool'] = (X["longitude"] == np.nan).astype(int)

X['latitude'] = X['latitude'].replace(-2.000000e-08, np.nan)
X['latitude_bool'] = (X["longitude"] == np.nan).astype(int)

X['num_private'] = X['num_private'].replace(0.0, np.nan)
X["num_private_bool"] = (X["num_private"] == np.nan).astype(int)

X['construction_year'] = X['construction_year'].replace(0.0, np.nan)
X["construction_year_bool"] = (X["construction_year"] == np.nan).astype(int)

X['population'] = X['population'].replace(0.0, np.nan)
X["population_bool"] = (X["population"] == np.nan).astype(int)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=17)

numcols = [c for c in X_train if X_train[c].dtype != 'object']
invcols = [c for c in X_train if X_train[c].dtype != 'object' and (X_train[c] == 0).sum() == 0]
cols = [c for c in X_train if X_train[c].dtype == 'object' and len(X_train[c].astype('category').cat.categories) < 1000]

longitude_gm = X_train.groupby(['region', 'district_code'])['longitude'].transform('mean')
X_train['longitude'] = X_train['longitude'].fillna(longitude_gm)
longitude_m = X_train['longitude'].mean()
X_train['longitude'] = X_train['longitude'].fillna(longitude_m)

X_test['longitude'] = X_test['longitude'].fillna(longitude_gm)
X_test['longitude'] = X_test['longitude'].fillna(longitude_m)


latitude_gm = X_train.groupby(['region', 'district_code'])['latitude'].transform('mean')
X_train['latitude'] = X_train['latitude'].fillna(latitude_gm)
latitude_m = X_train['latitude'].mean()
X_train['latitude'] = X_train['latitude'].fillna(latitude_m)

X_test['latitude'] = X_test['latitude'].fillna(latitude_gm)
X_test['latitude'] = X_test['latitude'].fillna(latitude_m)

num_private_m = X_train['num_private'].mean()
X_train['num_private'] = X_train['num_private'].fillna(num_private_m)
X_test['num_private'] = X_test['num_private'].fillna(num_private_m)

construction_year_m = X_train['construction_year'].mean()
X_train['construction_year'] = X_train['construction_year'].fillna(construction_year_m)
X_test['construction_year'] = X_test['construction_year'].fillna(construction_year_m)

population_m = X_train['population'].mean()
X_train['population'] = X_train['population'].fillna(population_m)
X_test['population'] = X_test['population'].fillna(population_m)

dropcols = [c for c in X_train if X_train[c].dtype == 'object']
        
o = OneHotEncoder(sparse=False, handle_unknown='ignore').fit(X_train[cols])
X_train = pd.concat([X_train.drop(columns=dropcols, axis=1), pd.DataFrame(o.transform(X_train[cols]), columns=o.get_feature_names(cols), index=X_train.index)], axis=1)
X_test = pd.concat([X_test.drop(columns=dropcols, axis=1), pd.DataFrame(o.transform(X_test[cols]), columns=o.get_feature_names(cols), index=X_test.index)], axis=1)

for c in invcols:
    X_train["{}(inv)".format(c)] = 1.0 / X_train[c]
    X_test["{}(inv)".format(c)] = 1.0 / X_test[c]
    
k = KBinsDiscretizer(n_bins=100, encode="onehot-dense", strategy="uniform").fit(X_train[numcols])
kcols = ["{}[{}]".format(c,i) for ind,c in enumerate(numcols) for i in range(k.n_bins_[ind])]
X_train = pd.concat([X_train, pd.DataFrame(k.transform(X_train[numcols]), columns=kcols, index=X_train.index)], axis=1)
X_test = pd.concat([X_test, pd.DataFrame(k.transform(X_test[numcols]), columns=kcols, index=X_test.index)], axis=1)

m = PowerTransformer().fit(X_train[numcols])
X_train[numcols] = m.transform(X_train[numcols])
X_test[numcols] = m.transform(X_test[numcols])

l = LogisticRegression().fit(X_train, y_train)
print(l.score(X_test, y_test))

0.7552188552188552


Some progress! Now let's add some interaction. However, there will be way too many terms, so let's limit it with a select percentile (ANOVA) to a max of 200 pre-polynomial and a max of 1000 post-polynomial.

In [7]:
X = pd.read_csv("train_features.csv", index_col=0, header=0)
y = pd.read_csv("train_labels.csv", index_col=0, header=0)["status_group"]

X["date_recorded"] = pd.to_numeric(pd.to_datetime(X["date_recorded"]))

X["region_code"] = X["region_code"].astype(str)
X["district_code"] = X["district_code"].astype(str)

X[[c for c in X if X[c].dtype == 'object']] = X[[c for c in X if X[c].dtype == 'object']].astype(str).fillna("NAN")

X['longitude'] = X['longitude'].replace(0.0, np.nan)
X['longitude_bool'] = (X["longitude"] == np.nan).astype(int)

X['latitude'] = X['latitude'].replace(-2.000000e-08, np.nan)
X['latitude_bool'] = (X["longitude"] == np.nan).astype(int)

X['num_private'] = X['num_private'].replace(0.0, np.nan)
X["num_private_bool"] = (X["num_private"] == np.nan).astype(int)

X['construction_year'] = X['construction_year'].replace(0.0, np.nan)
X["construction_year_bool"] = (X["construction_year"] == np.nan).astype(int)

X['population'] = X['population'].replace(0.0, np.nan)
X["population_bool"] = (X["population"] == np.nan).astype(int)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=17)

numcols = [c for c in X_train if X_train[c].dtype != 'object']
invcols = [c for c in X_train if X_train[c].dtype != 'object' and (X_train[c] == 0).sum() == 0]
cols = [c for c in X_train if X_train[c].dtype == 'object' and len(X_train[c].astype('category').cat.categories) < 1000]

longitude_gm = X_train.groupby(['region', 'district_code'])['longitude'].transform('mean')
X_train['longitude'] = X_train['longitude'].fillna(longitude_gm)
longitude_m = X_train['longitude'].mean()
X_train['longitude'] = X_train['longitude'].fillna(longitude_m)

X_test['longitude'] = X_test['longitude'].fillna(longitude_gm)
X_test['longitude'] = X_test['longitude'].fillna(longitude_m)

latitude_gm = X_train.groupby(['region', 'district_code'])['latitude'].transform('mean')
X_train['latitude'] = X_train['latitude'].fillna(latitude_gm)
latitude_m = X_train['latitude'].mean()
X_train['latitude'] = X_train['latitude'].fillna(latitude_m)

X_test['latitude'] = X_test['latitude'].fillna(latitude_gm)
X_test['latitude'] = X_test['latitude'].fillna(latitude_m)

num_private_m = X_train['num_private'].mean()
X_train['num_private'] = X_train['num_private'].fillna(num_private_m)
X_test['num_private'] = X_test['num_private'].fillna(num_private_m)

construction_year_m = X_train['construction_year'].mean()
X_train['construction_year'] = X_train['construction_year'].fillna(construction_year_m)
X_test['construction_year'] = X_test['construction_year'].fillna(construction_year_m)

population_m = X_train['population'].mean()
X_train['population'] = X_train['population'].fillna(population_m)
X_test['population'] = X_test['population'].fillna(population_m)

dropcols = [c for c in X_train if X_train[c].dtype == 'object']
        
o = OneHotEncoder(sparse=False, handle_unknown='ignore').fit(X_train[cols])
X_train = pd.concat([X_train.drop(columns=dropcols, axis=1), pd.DataFrame(o.transform(X_train[cols]), columns=o.get_feature_names(cols), index=X_train.index)], axis=1)
X_test = pd.concat([X_test.drop(columns=dropcols, axis=1), pd.DataFrame(o.transform(X_test[cols]), columns=o.get_feature_names(cols), index=X_test.index)], axis=1)

for c in invcols:
    X_train["{}(inv)".format(c)] = 1.0 / X_train[c]
    X_test["{}(inv)".format(c)] = 1.0 / X_test[c]
    
k = KBinsDiscretizer(n_bins=100, encode="onehot-dense", strategy="uniform").fit(X_train[numcols])
kcols = ["{}[{}]".format(c,i) for ind,c in enumerate(numcols) for i in range(k.n_bins_[ind])]
X_train = pd.concat([X_train, pd.DataFrame(k.transform(X_train[numcols]), columns=kcols, index=X_train.index)], axis=1)
X_test = pd.concat([X_test, pd.DataFrame(k.transform(X_test[numcols]), columns=kcols, index=X_test.index)], axis=1)

m = PowerTransformer().fit(X_train[numcols])
X_train[numcols] = m.transform(X_train[numcols])
X_test[numcols] = m.transform(X_test[numcols])

p_val1 = SelectPercentile(percentile=(200*100)//X_train.shape[1]).fit(X_train, y_train)
X_train = p_val1.transform(X_train)
X_test = p_val1.transform(X_test)

poly = PolynomialFeatures(degree=2).fit(X_train)
X_train = poly.transform(X_train)
X_test = poly.transform(X_test)

p_val2 = SelectPercentile(percentile=(1000*100)//X_train.shape[1]).fit(X_train, y_train)
X_train = p_val2.transform(X_train)
X_test = p_val2.transform(X_test)

l = LogisticRegression().fit(X_train, y_train)
print(l.score(X_test, y_test))

0.7361952861952862


Alright, so not so great. Let's look at Random Forest.

In [8]:
X = pd.read_csv("train_features.csv", index_col=0, header=0)
y = pd.read_csv("train_labels.csv", index_col=0, header=0)["status_group"]

X["date_recorded"] = pd.to_numeric(pd.to_datetime(X["date_recorded"]))

X["region_code"] = X["region_code"].astype(str)
X["district_code"] = X["district_code"].astype(str)

X[[c for c in X if X[c].dtype == 'object']] = X[[c for c in X if X[c].dtype == 'object']].astype(str).fillna("NAN")

X['longitude'] = X['longitude'].replace(0.0, np.nan)
X['longitude_bool'] = (X["longitude"] == np.nan).astype(int)

X['latitude'] = X['latitude'].replace(-2.000000e-08, np.nan)
X['latitude_bool'] = (X["longitude"] == np.nan).astype(int)

X['num_private'] = X['num_private'].replace(0.0, np.nan)
X["num_private_bool"] = (X["num_private"] == np.nan).astype(int)

X['construction_year'] = X['construction_year'].replace(0.0, np.nan)
X["construction_year_bool"] = (X["construction_year"] == np.nan).astype(int)

X['population'] = X['population'].replace(0.0, np.nan)
X["population_bool"] = (X["population"] == np.nan).astype(int)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=17)

ordin = OrdinalEncoder()
ordin.fit(y_train.values.reshape(-1,1))
y_tr_c = ordin.transform(y_train.values.reshape(-1,1)).reshape(-1)
y_te_c = ordin.transform(y_test.values.reshape(-1,1)).reshape(-1)

numcols = [c for c in X_train if X_train[c].dtype != 'object']
invcols = [c for c in X_train if X_train[c].dtype != 'object' and (X_train[c] == 0).sum() == 0]
cols = [c for c in X_train if X_train[c].dtype == 'object' and len(X_train[c].astype('category').cat.categories) < 1000]

longitude_gm = X_train.groupby(['region', 'district_code'])['longitude'].transform('mean')
X_train['longitude'] = X_train['longitude'].fillna(longitude_gm)
longitude_m = X_train['longitude'].mean()
X_train['longitude'] = X_train['longitude'].fillna(longitude_m)

X_test['longitude'] = X_test['longitude'].fillna(longitude_gm)
X_test['longitude'] = X_test['longitude'].fillna(longitude_m)


latitude_gm = X_train.groupby(['region', 'district_code'])['latitude'].transform('mean')
X_train['latitude'] = X_train['latitude'].fillna(latitude_gm)
latitude_m = X_train['latitude'].mean()
X_train['latitude'] = X_train['latitude'].fillna(latitude_m)

X_test['latitude'] = X_test['latitude'].fillna(latitude_gm)
X_test['latitude'] = X_test['latitude'].fillna(latitude_m)

num_private_m = X_train['num_private'].mean()
X_train['num_private'] = X_train['num_private'].fillna(num_private_m)
X_test['num_private'] = X_test['num_private'].fillna(num_private_m)

construction_year_m = X_train['construction_year'].mean()
X_train['construction_year'] = X_train['construction_year'].fillna(construction_year_m)
X_test['construction_year'] = X_test['construction_year'].fillna(construction_year_m)

population_m = X_train['population'].mean()
X_train['population'] = X_train['population'].fillna(population_m)
X_test['population'] = X_test['population'].fillna(population_m)

dropcols = [c for c in X_train if X_train[c].dtype == 'object']
        
o = OneHotEncoder(sparse=False, handle_unknown='ignore').fit(X_train[cols])
X_train = pd.concat([X_train.drop(columns=dropcols, axis=1), pd.DataFrame(o.transform(X_train[cols]), columns=o.get_feature_names(cols), index=X_train.index)], axis=1)
X_test = pd.concat([X_test.drop(columns=dropcols, axis=1), pd.DataFrame(o.transform(X_test[cols]), columns=o.get_feature_names(cols), index=X_test.index)], axis=1)

for c in invcols:
    X_train["{}(inv)".format(c)] = 1.0 / X_train[c]
    X_test["{}(inv)".format(c)] = 1.0 / X_test[c]
    
k = KBinsDiscretizer(n_bins=100, encode="onehot-dense", strategy="uniform").fit(X_train[numcols])
kcols = ["{}[{}]".format(c,i) for ind,c in enumerate(numcols) for i in range(k.n_bins_[ind])]
X_train = pd.concat([X_train, pd.DataFrame(k.transform(X_train[numcols]), columns=kcols, index=X_train.index)], axis=1)
X_test = pd.concat([X_test, pd.DataFrame(k.transform(X_test[numcols]), columns=kcols, index=X_test.index)], axis=1)

m = PowerTransformer().fit(X_train[numcols])
X_train[numcols] = m.transform(X_train[numcols])
X_test[numcols] = m.transform(X_test[numcols])

p_val1 = SelectPercentile(percentile=(200*100)//X_train.shape[1]).fit(X_train, y_train)
X_train = p_val1.transform(X_train)
X_test = p_val1.transform(X_test)

poly = PolynomialFeatures(degree=2).fit(X_train)
X_train = poly.transform(X_train)
X_test = poly.transform(X_test)

p_val2 = SelectPercentile(percentile=(1000*100)//X_train.shape[1]).fit(X_train, y_train)
X_train = p_val2.transform(X_train)
X_test = p_val2.transform(X_test)

clf = RandomForestClassifier(n_estimators=1000, verbose=1, n_jobs=-1, random_state=42).fit(X_train, y_tr_c)
print(clf.score(X_test, y_te_c))

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    2.7s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   13.3s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:   31.2s
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:   57.4s
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed:  1.2min finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    0.3s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:    0.7s
[Parallel(n_jobs=8)]: Done 784 tasks      | elapsed:    1.2s


0.7675925925925926


[Parallel(n_jobs=8)]: Done 1000 out of 1000 | elapsed:    1.6s finished


Great! Lets add some sigmoid calibration.

In [9]:
X = pd.read_csv("train_features.csv", index_col=0, header=0)
y = pd.read_csv("train_labels.csv", index_col=0, header=0)["status_group"]

X["date_recorded"] = pd.to_numeric(pd.to_datetime(X["date_recorded"]))

X["region_code"] = X["region_code"].astype(str)
X["district_code"] = X["district_code"].astype(str)

X[[c for c in X if X[c].dtype == 'object']] = X[[c for c in X if X[c].dtype == 'object']].astype(str).fillna("NAN")

X['longitude'] = X['longitude'].replace(0.0, np.nan)
X['longitude_bool'] = (X["longitude"] == np.nan).astype(int)

X['latitude'] = X['latitude'].replace(-2.000000e-08, np.nan)
X['latitude_bool'] = (X["longitude"] == np.nan).astype(int)

X['num_private'] = X['num_private'].replace(0.0, np.nan)
X["num_private_bool"] = (X["num_private"] == np.nan).astype(int)

X['construction_year'] = X['construction_year'].replace(0.0, np.nan)
X["construction_year_bool"] = (X["construction_year"] == np.nan).astype(int)

X['population'] = X['population'].replace(0.0, np.nan)
X["population_bool"] = (X["population"] == np.nan).astype(int)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=17)

ordin = OrdinalEncoder()
ordin.fit(y_train.values.reshape(-1,1))
y_tr_c = ordin.transform(y_train.values.reshape(-1,1)).reshape(-1)
y_te_c = ordin.transform(y_test.values.reshape(-1,1)).reshape(-1)

numcols = [c for c in X_train if X_train[c].dtype != 'object']
invcols = [c for c in X_train if X_train[c].dtype != 'object' and (X_train[c] == 0).sum() == 0]
cols = [c for c in X_train if X_train[c].dtype == 'object' and len(X_train[c].astype('category').cat.categories) < 1000]

longitude_gm = X_train.groupby(['region', 'district_code'])['longitude'].transform('mean')
X_train['longitude'] = X_train['longitude'].fillna(longitude_gm)
longitude_m = X_train['longitude'].mean()
X_train['longitude'] = X_train['longitude'].fillna(longitude_m)

X_test['longitude'] = X_test['longitude'].fillna(longitude_gm)
X_test['longitude'] = X_test['longitude'].fillna(longitude_m)


latitude_gm = X_train.groupby(['region', 'district_code'])['latitude'].transform('mean')
X_train['latitude'] = X_train['latitude'].fillna(latitude_gm)
latitude_m = X_train['latitude'].mean()
X_train['latitude'] = X_train['latitude'].fillna(latitude_m)

X_test['latitude'] = X_test['latitude'].fillna(latitude_gm)
X_test['latitude'] = X_test['latitude'].fillna(latitude_m)

num_private_m = X_train['num_private'].mean()
X_train['num_private'] = X_train['num_private'].fillna(num_private_m)
X_test['num_private'] = X_test['num_private'].fillna(num_private_m)

construction_year_m = X_train['construction_year'].mean()
X_train['construction_year'] = X_train['construction_year'].fillna(construction_year_m)
X_test['construction_year'] = X_test['construction_year'].fillna(construction_year_m)

population_m = X_train['population'].mean()
X_train['population'] = X_train['population'].fillna(population_m)
X_test['population'] = X_test['population'].fillna(population_m)

dropcols = [c for c in X_train if X_train[c].dtype == 'object']
        
o = OneHotEncoder(sparse=False, handle_unknown='ignore').fit(X_train[cols])
X_train = pd.concat([X_train.drop(columns=dropcols, axis=1), pd.DataFrame(o.transform(X_train[cols]), columns=o.get_feature_names(cols), index=X_train.index)], axis=1)
X_test = pd.concat([X_test.drop(columns=dropcols, axis=1), pd.DataFrame(o.transform(X_test[cols]), columns=o.get_feature_names(cols), index=X_test.index)], axis=1)

for c in invcols:
    X_train["{}(inv)".format(c)] = 1.0 / X_train[c]
    X_test["{}(inv)".format(c)] = 1.0 / X_test[c]
    
k = KBinsDiscretizer(n_bins=100, encode="onehot-dense", strategy="uniform").fit(X_train[numcols])
kcols = ["{}[{}]".format(c,i) for ind,c in enumerate(numcols) for i in range(k.n_bins_[ind])]
X_train = pd.concat([X_train, pd.DataFrame(k.transform(X_train[numcols]), columns=kcols, index=X_train.index)], axis=1)
X_test = pd.concat([X_test, pd.DataFrame(k.transform(X_test[numcols]), columns=kcols, index=X_test.index)], axis=1)

m = PowerTransformer().fit(X_train[numcols])
X_train[numcols] = m.transform(X_train[numcols])
X_test[numcols] = m.transform(X_test[numcols])

p_val1 = SelectPercentile(percentile=(200*100)//X_train.shape[1]).fit(X_train, y_train)
X_train = p_val1.transform(X_train)
X_test = p_val1.transform(X_test)

poly = PolynomialFeatures(degree=2).fit(X_train)
X_train = poly.transform(X_train)
X_test = poly.transform(X_test)

p_val2 = SelectPercentile(percentile=(1000*100)//X_train.shape[1]).fit(X_train, y_train)
X_train = p_val2.transform(X_train)
X_test = p_val2.transform(X_test)

clf = CalibratedClassifierCV(RandomForestClassifier(n_estimators=1000, verbose=1, n_jobs=-1, random_state=42), method="sigmoid", cv=3).fit(X_train, y_tr_c)
print(clf.score(X_test, y_te_c))

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    1.5s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:    8.2s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:   19.3s
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:   35.5s
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed:   46.0s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    0.4s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:    1.0s
[Parallel(n_jobs=8)]: Done 784 tasks      | elapsed:    2.0s
[Parallel(n_jobs=8)]: Done 1000 out of 1000 | elapsed:    2.5s finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    1.7s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:    9.

0.7882154882154883


[Parallel(n_jobs=8)]: Done 1000 out of 1000 | elapsed:    1.8s finished


Alright, let's leave it as that and predict the submission.

In [10]:
X_t = pd.read_csv("test_features.csv", index_col=0, header=0)
indexes = X_t.index.values

X_t["date_recorded"] = pd.to_numeric(pd.to_datetime(X_t["date_recorded"]))
X_t["region_code"] = X_t["region_code"].astype(str)
X_t["district_code"] = X_t["district_code"].astype(str)
X_t[[c for c in X_t if X_t[c].dtype == 'object']] = X_t[[c for c in X_t if X_t[c].dtype == 'object']].astype(str).fillna("NAN")
X_t['longitude'] = X_t['longitude'].replace(0.0, np.nan)
X_t['longitude_bool'] = (X_t["longitude"] == np.nan).astype(int)
X_t['latitude'] = X_t['latitude'].replace(-2.000000e-08, np.nan)
X_t['latitude_bool'] = (X_t["longitude"] == np.nan).astype(int)
X_t['num_private'] = X_t['num_private'].replace(0.0, np.nan)
X_t["num_private_bool"] = (X_t["num_private"] == np.nan).astype(int)
X_t['construction_year'] = X_t['construction_year'].replace(0.0, np.nan)
X_t["construction_year_bool"] = (X_t["construction_year"] == np.nan).astype(int)
X_t['population'] = X_t['population'].replace(0.0, np.nan)
X_t["population_bool"] = (X_t["population"] == np.nan).astype(int)
X_t['longitude'] = X_t['longitude'].fillna(longitude_gm)
X_t['longitude'] = X_t['longitude'].fillna(longitude_m)
X_t['latitude'] = X_t['latitude'].fillna(latitude_gm)
X_t['latitude'] = X_t['latitude'].fillna(latitude_m)
X_t['num_private'] = X_t['num_private'].fillna(num_private_m)
X_t['construction_year'] = X_t['construction_year'].fillna(construction_year_m)
X_t['population'] = X_t['population'].fillna(population_m)
X_t = pd.concat([X_t.drop(columns=dropcols, axis=1), pd.DataFrame(o.transform(X_t[cols]), columns=o.get_feature_names(cols), index=X_t.index)], axis=1)
for c in invcols:
    X_t["{}(inv)".format(c)] = 1.0 / X_t[c]
X_t = pd.concat([X_t, pd.DataFrame(k.transform(X_t[numcols]), columns=kcols, index=X_t.index)], axis=1)
X_t[numcols] = m.transform(X_t[numcols])
X_t = p_val1.transform(X_t)
X_t = poly.transform(X_t)
X_t = p_val2.transform(X_t)

pd.DataFrame(np.concatenate([indexes.reshape(-1,1), ordin.inverse_transform(clf.predict(X_t).reshape(-1,1))], axis=1), columns=["id", "status_group"]).to_csv("submission.csv", index=False)

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    0.3s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:    0.9s
[Parallel(n_jobs=8)]: Done 784 tasks      | elapsed:    1.7s
[Parallel(n_jobs=8)]: Done 1000 out of 1000 | elapsed:    2.1s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    0.3s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:    0.9s
[Parallel(n_jobs=8)]: Done 784 tasks      | elapsed:    1.6s
[Parallel(n_jobs=8)]: Done 1000 out of 1000 | elapsed:    2.1s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    0.3s
[Paral