In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, train_test_split, cross_val_score, GridSearchCV
from imblearn.over_sampling import RandomOverSampler, SMOTE
from sklearn.metrics import confusion_matrix, accuracy_score, cohen_kappa_score, make_scorer



In [2]:
def remove_small_classes(df, min):
    uniques = df.cwrb_reference_soil_group.unique()
    for u in uniques:
        cnt = df[df.cwrb_reference_soil_group == u].shape[0]
        if cnt < min:
            df = df[df.cwrb_reference_soil_group != u]
            print('Deleting {} with {} occurrences'.format(u, cnt))

    return df

def plot_confusion_matrix(y_true, y_pred, classes,
                            normalize=True,
                            title=None,
                            cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    if not title:
        if normalize:
            title = 'Normalized confusion matrix'
        else:
            title = 'Confusion matrix, without normalization'

    # Compute confusion matrix
    cm = confusion_matrix(y_true, y_pred)

    # Only use the labels that appear in the data
    #classes = classes[unique_labels(y_true, y_pred)]
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    fig, ax = plt.subplots(figsize=(11, 10))
    im = ax.imshow(cm, interpolation='nearest', cmap=cmap)
    #ax.figure.colorbar(im, ax=ax)
    # We want to show all ticks...
    ax.set(xticks=np.arange(cm.shape[1]),
           yticks=np.arange(cm.shape[0]),
           # ... and label them with the respective list entries
           xticklabels=classes, yticklabels=classes,
           ylabel='True label',
           xlabel='Predicted label')

    # Rotate the tick labels and set their alignment.
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
             rotation_mode="anchor")

    # Loop over data dimensions and create text annotations.
    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            ax.text(j, i, format(cm[i, j], fmt),
                    ha="center", va="center",
                    color="white" if cm[i, j] > thresh else "black")
    fig.tight_layout()
    plt.show()

In [3]:
df = pd.read_csv('./test/mexico_k_1_layers_3.csv')
profiles = pd.read_csv('./profiles.csv')
profiles = profiles[['profile_id', 'cwrb_reference_soil_group']]
df = profiles.merge(df, how="inner", left_on=['profile_id'], right_on=['profile_id'])
df = df.drop(columns=list(
        df.loc[:, df.columns.str.contains('profile_layer_id')]))

df = remove_small_classes(df, 11)

y = df.cwrb_reference_soil_group
X = df.drop('cwrb_reference_soil_group', axis=1)
X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.25)

Deleting Plinthosols with 6 occurrences
Deleting Histosols with 10 occurrences


In [9]:
df.columns
#df.cwrb_reference_soil_group.value_counts().plot(kind='bar', title='Count');

Index(['profile_id', 'cwrb_reference_soil_group', 'upper_depth', 'lower_depth',
       'tceq_value_avg', 'clay_value_avg', 'elcosp_value_avg',
       'orgc_value_avg', 'phaq_value_avg', 'sand_value_avg', 'silt_value_avg',
       'latitude', 'longitude', 'upper_depth_1', 'lower_depth_1',
       'tceq_value_avg_1', 'clay_value_avg_1', 'elcosp_value_avg_1',
       'orgc_value_avg_1', 'phaq_value_avg_1', 'sand_value_avg_1',
       'silt_value_avg_1', 'upper_depth_2', 'lower_depth_2',
       'tceq_value_avg_2', 'clay_value_avg_2', 'elcosp_value_avg_2',
       'orgc_value_avg_2', 'phaq_value_avg_2', 'sand_value_avg_2',
       'silt_value_avg_2', 'n_layers'],
      dtype='object')

# Undersampling

## Meter todos a 40 (menor classe)

In [38]:
#over_df.cwrb_reference_soil_group.value_counts().plot(kind='bar', title='Count');
#under_df.cwrb_reference_soil_group.value_counts().plot(kind='bar', title='Count');
#X_resampled = pd.DataFrame(X_resampled, columns=df.columns)
X_resampled.cwrb_reference_soil_group.value_counts().plot(kind='bar', title='Count');


AttributeError: 'DataFrame' object has no attribute 'cwrb_reference_soil_group'

In [17]:
smaller_class = X_train.cwrb_reference_soil_group.value_counts().keys()[-1]
smaller_size = X_train.cwrb_reference_soil_group.value_counts()[-1]
under_df = X_train[X_train['cwrb_reference_soil_group'] == smaller_class]
for cl in X.cwrb_reference_soil_group.unique():
    if cl != smaller_class:
        df_class = X_train[X_train['cwrb_reference_soil_group'] == cl]
        df_class = df_class.sample(smaller_size)
        under_df = pd.concat([under_df, df_class], axis=0)
    

# Oversampling

In [4]:
ros = RandomOverSampler(random_state=0)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)
X_resampled = pd.DataFrame(X_resampled, columns=df.columns)

In [31]:
X_resampled, y_resampled = SMOTE().fit_resample(X_train, y_train)
X_resampled = pd.DataFrame(X_resampled, columns=df.drop('cwrb_reference_soil_group', axis=1).columns)

In [37]:
X_train



Unnamed: 0,profile_id,upper_depth,lower_depth,tceq_value_avg,clay_value_avg,elcosp_value_avg,orgc_value_avg,phaq_value_avg,sand_value_avg,silt_value_avg,...,upper_depth_2,lower_depth_2,tceq_value_avg_2,clay_value_avg_2,elcosp_value_avg_2,orgc_value_avg_2,phaq_value_avg_2,sand_value_avg_2,silt_value_avg_2,n_layers
809,208751,-0.935450,-0.797677,-0.476077,-0.359981,-0.177891,0.237596,-1.310456,0.045703,0.457324,...,-0.453983,-0.184674,-0.476077,1.800002,-0.177891,0.237596,-1.714944,-1.107578,-0.732219,2
4636,216995,-0.935450,-0.879411,2.453906,-0.814715,-0.338980,0.106116,0.792880,1.198985,-0.902154,...,-0.935450,-0.879411,2.453906,-0.814715,-0.338980,0.106116,0.792880,1.198985,-0.902154,1
1778,211394,-0.935450,-0.286842,0.261025,0.208435,1.271915,0.303336,0.711982,-0.242617,0.117455,...,-0.935450,-0.286842,0.261025,0.208435,1.271915,0.303336,0.711982,-0.242617,0.117455,1
3413,215685,-0.935450,-0.552476,-0.476077,0.549485,-0.359116,0.697776,-0.016095,-1.011471,0.967128,...,0.252168,0.244427,-0.476077,-1.383131,-0.379253,-0.551283,-0.258788,2.063946,-1.581893,5
2596,213836,-0.935450,-1.022445,-0.476077,-0.701031,0.023471,-0.222583,-1.714944,0.718451,-0.222415,...,-0.807058,-0.184674,-0.476077,-0.928398,0.023471,0.040376,-1.229559,1.102878,-0.562284,2
4774,217134,-0.935450,-0.940711,-0.476077,-0.928398,-0.379253,0.894996,-1.391354,0.910664,-0.222415,...,-0.004614,-0.143808,-0.476077,-0.701031,-0.379253,-0.419803,-1.310456,0.334024,0.457324,3
4905,217265,-0.935450,-0.858977,1.133264,-1.155764,-0.359116,-0.419803,1.116470,0.910664,0.117455,...,-0.935450,-0.858977,1.133264,-1.155764,-0.359116,-0.419803,1.116470,0.910664,0.117455,1
5868,218250,-0.935450,-0.552476,3.363000,0.776852,-0.338980,0.566296,0.469290,-0.915364,0.457324,...,0.476852,0.550929,3.363000,0.776852,-0.359116,-0.025363,0.388392,-1.011471,0.627259,6
1911,211833,-0.935450,-0.879411,-0.476077,0.435802,0.023471,-0.156843,1.278265,-0.242617,-0.222415,...,0.123777,0.040093,-0.476077,-0.246298,0.990008,-0.551283,1.116470,-0.338724,0.967128,5
2508,213567,-0.935450,-0.879411,0.347021,-0.359981,-0.177891,-0.419803,0.550187,-0.146510,0.797193,...,-0.935450,-0.879411,0.347021,-0.359981,-0.177891,-0.419803,0.550187,-0.146510,0.797193,1


# Testing


In [32]:
clf = RandomForestClassifier(
        n_estimators=1300, n_jobs=-1, min_samples_leaf=2, min_samples_split=6, oob_score=True)#,class_weight='balanced')

train = X_resampled
train_y = y_resampled

In [34]:
kappa_scorer = make_scorer(cohen_kappa_score)

#res = cross_val_score(clf, train, train_y, cv=10, verbose=2, scoring=kappa_scorer, n_jobs=-1)
print(res, 'avg: ', np.mean(res))


clf.fit(train, train_y)
y_pred = clf.predict(X_test)


#labels = list(y.unique())
#labels.sort()
#plot_confusion_matrix(y_test,y_pred, classes=labels)

[0.82440476 0.84345238 0.84761905 0.8375     0.86607143 0.87321429
 0.87703436 0.89210368 0.89873418 0.893912  ] avg:  0.8654046112115733


In [35]:
cohen_kappa_score(y_test,y_pred)

0.4884657530238077

# Other tests


In [54]:
tuning_params = [
    {'n_estimators': [1,5,10,20,100, 150, 200, 500, 1000, 1200, 2000], "n_jobs": [-1]}]
scores = ['f1_weighted', 'accuracy',
          'precision_weighted', 'recall_weighted']
print("# Tuning hyper-parameters for %s" % scores)
print()
gs = GridSearchCV(RandomForestClassifier(), tuning_params, cv=5,
                  scoring=scores, refit='precision_weighted', return_train_score=True)
gs.fit(X_train, y_train)
results = gs.cv_results_

# Tuning hyper-parameters for ['f1_weighted', 'accuracy', 'precision_weighted', 'recall_weighted']



  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision'

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


KeyboardInterrupt: 

In [None]:
plt.figure(figsize=(13, 13))
plt.title("GridSearchCV evaluating using multiple scorers simultaneously",
          fontsize=16)

plt.xlabel("n_estimators")
plt.ylabel("Score")

ax = plt.gca()

# Get the regular numpy array from the MaskedArray
X_axis = np.array(results['param_n_estimators'].data, dtype=float)

for scorer, color in zip(sorted(scores), ['g', 'k']):
    for sample, style in (('train', '--'), ('test', '-')):
        sample_score_mean = results['mean_%s_%s' % (sample, scorer)]
        sample_score_std = results['std_%s_%s' % (sample, scorer)]
        ax.fill_between(X_axis, sample_score_mean - sample_score_std,
                        sample_score_mean + sample_score_std,
                        alpha=0.1 if sample == 'test' else 0, color=color)
        ax.plot(X_axis, sample_score_mean, style, color=color,
                alpha=1 if sample == 'test' else 0.7,
                label="%s (%s)" % (scorer, sample))

    best_index = np.nonzero(results['rank_test_%s' % scorer] == 1)[0][0]
    best_score = results['mean_test_%s' % scorer][best_index]

    # Plot a dotted vertical line at the best score for that scorer marked by x
    ax.plot([X_axis[best_index], ] * 2, [0, best_score],
            linestyle='-.', color=color, marker='x', markeredgewidth=3, ms=8)

    # Annotate the best score for that scorer
    ax.annotate("%0.2f" % best_score,
                (X_axis[best_index], best_score + 0.005))

plt.legend(loc="best")
plt.grid('off')
plt.show()

In [53]:
df

Unnamed: 0,profile_id,cwrb_reference_soil_group,upper_depth,lower_depth,tceq_value_avg,clay_value_avg,elcosp_value_avg,orgc_value_avg,phaq_value_avg,sand_value_avg,...,tceq_value_avg_150,clay_value_avg_150,elcosp_value_avg_150,orgc_value_avg_150,phaq_value_avg_150,sand_value_avg_150,silt_value_avg_150,latitude_150,longitude_150,n_layers
0,205798,Regosols,1.311395,1.245665,-0.476077,-1.496814,-0.177891,-0.091103,0.145700,2.063946,...,-0.476077,-1.496814,-0.177891,-0.091103,0.145700,2.063946,-1.411958,-1.737948,0.817429,4
1,205801,Regosols,-0.486081,-0.389009,-0.476077,-1.269448,-0.177891,0.040376,-1.229559,0.814558,...,-0.476077,-1.269448,-0.177891,0.040376,-1.229559,0.814558,0.457324,-1.742299,0.833226,2
2,205803,Cambisols,1.889155,1.449999,-0.476077,-1.042081,-0.177891,-0.091103,-0.582378,0.814558,...,-0.476077,-1.042081,-0.177891,-0.091103,-0.582378,0.814558,0.117455,-1.747177,0.993230,5
3,205811,Regosols,0.573146,0.346594,-0.476077,-0.359981,-0.177891,0.040376,-0.258788,0.237917,...,-0.476077,-0.359981,-0.177891,0.040376,-0.258788,0.237917,0.117455,-1.760329,0.873154,3
4,205820,Cambisols,2.338523,1.449999,-0.476077,-0.814715,-0.177891,-0.091103,-0.501481,1.391198,...,-0.476077,-0.814715,-0.177891,-0.091103,-0.501481,1.391198,-1.242023,-1.773810,0.885816,6
5,205821,Regosols,-0.935450,-0.715943,-0.476077,-0.246298,-0.177891,0.040376,-0.744173,0.334024,...,-0.476077,-0.246298,-0.177891,0.040376,-0.744173,0.334024,-0.222415,-1.773831,0.921302,1
6,205826,Regosols,-0.357690,-0.389009,-0.476077,-0.814715,-0.177891,0.040376,-1.310456,1.006771,...,-0.476077,-0.814715,-0.177891,0.040376,-1.310456,1.006771,-0.562284,-1.777646,1.002172,2
7,205828,Regosols,-0.935450,-0.675076,-0.476077,-1.383131,-0.177891,0.040376,-1.472251,0.814558,...,-0.476077,-1.383131,-0.177891,0.040376,-1.472251,0.814558,0.627259,-1.780819,0.921989,1
8,205829,Regosols,0.605244,0.550929,-0.476077,-1.383131,-0.177891,0.040376,-0.582378,0.910664,...,-0.476077,-1.383131,-0.177891,0.040376,-0.582378,0.910664,0.457324,-1.782081,0.965411,3
9,205830,Regosols,1.728666,1.266098,-0.476077,-1.383131,-0.177891,-0.091103,-0.582378,1.967839,...,-0.476077,-1.383131,-0.177891,-0.091103,-0.582378,1.967839,-1.411958,-1.783317,0.897184,3
