In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import cohen_kappa_score

In [2]:
inputfile = '../data/test/mexico_k_1_layers_5.csv'
profile_file = '../data/profiles.csv'

def get_data():
    profiles_file = pd.read_csv(profile_file)
    profiles_file = profiles_file[['profile_id', 'cwrb_reference_soil_group']]
    data = pd.read_csv(inputfile)
    data = profiles_file.merge(data, how="inner", left_on=[
        'profile_id'], right_on=['profile_id'])

    data = remove_small_classes(data, 15)
    
    y = data.cwrb_reference_soil_group.astype(str)
    X = data.drop(['cwrb_reference_soil_group'], axis=1)

    return X, y

def remove_small_classes(df, min):
    uniques = df.cwrb_reference_soil_group.unique()
    for u in uniques:
        cnt = df[df.cwrb_reference_soil_group == u].shape[0]
        if cnt < min:
            df = df[df.cwrb_reference_soil_group != u]
            print('Deleting {} with {} occurrences'.format(u, cnt))
    return df

def to_binary(y, label):
    final_y = np.zeros(y.shape)
    for i, val in enumerate(y):
        if val == label:
            final_y[i] = 1
    return final_y
            

# Test on a single class

In [3]:
X, y = get_data()

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y)

y_train_binary = to_binary(y_train, 'Leptosols')
y_test_binary = to_binary(y_test, 'Leptosols')


Deleting Plinthosols with 6 occurrences
Deleting Histosols with 10 occurrences


In [4]:
# Original RF
clf_RF = RandomForestClassifier(min_samples_split=6,
                                n_estimators=1300, min_samples_leaf=2,
                                oob_score=True, class_weight="balanced", n_jobs=2)

clf_RF.fit(X_train.drop('profile_id', axis=1), y_train)

preds_RF_val = clf_RF.predict(X_test.drop('profile_id', axis=1))

print(cohen_kappa_score(preds_RF_val, y_test))


0.47539389814854205


In [26]:
# One VS All RF
clf_RF = RandomForestClassifier(min_samples_split=6,
                                n_estimators=1300, min_samples_leaf=2,
                                oob_score=True, class_weight="balanced", n_jobs=2)

clf_RF.fit(X_train.drop('profile_id', axis=1), y_train_binary)

preds_RF_val = clf_RF.predict(X_test.drop('profile_id', axis=1))

print(cohen_kappa_score(preds_RF_val, y_test_binary))


0.7434351335206649


# Create an Ensemble

In [5]:
ensemble = {}
for label in list(list(y.value_counts().keys())):
    y_train_binary = to_binary(y_train, label)
    y_test_binary = to_binary(y_test, label)
    
    clf_RF = RandomForestClassifier(min_samples_split=6,
                                n_estimators=1300, min_samples_leaf=2,
                                oob_score=True, class_weight="balanced", n_jobs=-1)
    clf_RF.fit(X_train.drop('profile_id', axis=1), y_train_binary)
    
    ensemble[label] = {'y_train': y_train_binary, 'y_test': y_test_binary, 'classifier': clf_RF}
    
    preds_RF_val = clf_RF.predict(X_test.drop('profile_id', axis=1))
    print(f'{label}: {cohen_kappa_score(preds_RF_val, y_test_binary)}')

    
    

Regosols: 0.4574669091516147
Phaeozems: 0.15445755817830242
Leptosols: 0.7544537448886723
Vertisols: 0.5862256093244933
Luvisols: 0.35526525410227816
Calcisols: 0.424404739228161
Cambisols: -0.0029054117566633675
Umbrisols: 0.5892233717858837
Solonchaks: 0.7681245225362873
Kastanozems: 0.05798211429729916
Chernozems: 0.05371891476025614
Fluvisols: 0.06772633033863174
Gleysols: 0.2094720579617494
Arenosols: 0.4311750374812594
Acrisols: 0.24988516306844277
Andosols: 0.39373814041745736
Planosols: 0.1873781935438149
Solonetz: 0.13110043105583913
Durisols: 0.3615712257453857
Gypsisols: 0.3615712257453857
Nitisols: -0.0011874721686209178
Lixisols: 0.0


In [6]:
def get_prediction(ensemble, X):
    preds = np.zeros(shape=(len(X), len(ensemble.keys())))
    for i, label in enumerate(list(ensemble.keys())):
        # Select only the probability of being this class
        preds_true = ensemble[label]['classifier'].predict_proba(X)[:,1]

        # Transform to [profile, [pred_label_1, ..., pred_label_n]]
        for j, p in enumerate(preds_true):
            preds[j][i] = p

    final_labels = []
    for p in preds:
        final_labels.append(list(ensemble.keys())[np.argmax(p)])
    
    return final_labels


In [7]:
preds = get_prediction(ensemble, X_test.drop('profile_id', axis=1))
print(cohen_kappa_score(preds, y_test))


0.46815470086778177


In [61]:
X_train.drop('profile_id', axis=1).values[0][0]

array([-0.93456948, -0.85811313, -0.47500972, -0.3608269 ,  0.02318173,
        0.82786882, -1.71417383,  0.14283889,  0.28680917, -0.27422442,
       -0.38584765, -0.1509784 , -0.22807206, -0.41336883, -0.47500972,
       -0.01979091,  0.02318173, -0.09190198, -1.4714568 , -0.14551303,
        0.28680917, -0.66300472,  0.28574424,  0.11476502, -0.47500972,
        0.43492374,  0.02318173, -0.42039155, -1.39055125, -0.72221688,
        0.62658086, -0.40699156,  0.28574424,  0.11476502, -0.47500972,
        0.43492374,  0.02318173, -0.42039155, -1.39055125, -0.72221688,
        0.62658086, -0.40699156,  0.28574424,  0.11476502, -0.47500972,
        0.43492374,  0.02318173, -0.42039155, -1.39055125, -0.72221688,
        0.62658086, -0.40699156,  3.        ])