In [1]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder
from xgboost import XGBClassifier

In [2]:
import pandas as pd
import numpy as np

sample_train_df = pd.read_csv("data/s5e6/train.csv")
numerical_features = [x for x in sample_train_df.describe().columns if x != "id"]
categorical_features = [x for x in sample_train_df.columns if x not in numerical_features and x != "id"]
numerical_features, categorical_features

(['Temparature',
  'Humidity',
  'Moisture',
  'Nitrogen',
  'Potassium',
  'Phosphorous'],
 ['Soil Type', 'Crop Type', 'Fertilizer Name'])

In [3]:
import glob
files = glob.glob("data/s5e6/*.csv")
files

['data/s5e6/test.csv',
 'data/s5e6/previous_competition.csv',
 'data/s5e6/train.csv',
 'data/s5e6/sample_submission.csv']

In [7]:
test_file = files[0]
prev_file = files[1]
train_file = files[2]
test_df = pd.read_csv(test_file).drop(columns = ["id"])
train_df = pd.read_csv(train_file).drop(columns = ["id"])
train_df_append = pd.read_csv(prev_file)

train_df.columns, test_df.columns, train_df_append.columns

(Index(['Temparature', 'Humidity', 'Moisture', 'Soil Type', 'Crop Type',
        'Nitrogen', 'Potassium', 'Phosphorous', 'Fertilizer Name'],
       dtype='object'),
 Index(['Temparature', 'Humidity', 'Moisture', 'Soil Type', 'Crop Type',
        'Nitrogen', 'Potassium', 'Phosphorous'],
       dtype='object'),
 Index(['Temparature', 'Humidity', 'Moisture', 'Soil Type', 'Crop Type',
        'Nitrogen', 'Potassium', 'Phosphorous', 'Fertilizer Name'],
       dtype='object'))

In [9]:
train_df = pd.concat([train_df, train_df_append], ignore_index = True)
train_df.tail(10)

Unnamed: 0,Temparature,Humidity,Moisture,Soil Type,Crop Type,Nitrogen,Potassium,Phosphorous,Fertilizer Name
949990,36,59,32,Sandy,Wheat,34,17,17,28-28
949991,26,54,30,Black,Millets,35,14,0,20-20
949992,27,50,47,Black,Paddy,22,7,28,20-20
949993,36,68,38,Sandy,Paddy,19,1,4,20-20
949994,32,51,45,Clayey,Paddy,27,8,2,20-20
949995,32,71,61,Black,Tobacco,23,1,25,20-20
949996,35,72,47,Loamy,Millets,38,1,32,17-17-17
949997,28,50,61,Sandy,Maize,10,11,14,14-35-14
949998,29,57,63,Loamy,Ground Nuts,7,10,4,DAP
949999,25,72,42,Sandy,Wheat,38,2,6,17-17-17


In [10]:
## one-hot encoding of the categorical features.
oe = OrdinalEncoder()
train_df[categorical_features[:-1]] = oe.fit_transform(train_df[categorical_features[:-1]])
test_df[categorical_features[:-1]] = oe.transform(test_df[categorical_features[:-1]])

In [11]:
## transform the label as well.
target_feature = categorical_features[-1]
le = LabelEncoder()
train_df[target_feature] = le.fit_transform(train_df[target_feature])
train_df.head()

Unnamed: 0,Temparature,Humidity,Moisture,Soil Type,Crop Type,Nitrogen,Potassium,Phosphorous,Fertilizer Name
0,37,70,36,1.0,8.0,36,4,5,4
1,27,69,65,4.0,4.0,30,6,18,4
2,29,63,32,4.0,4.0,24,12,16,2
3,35,62,54,4.0,0.0,39,12,4,0
4,35,58,43,3.0,6.0,37,2,16,5


In [12]:
full_y, full_x = train_df[target_feature], train_df.drop(columns = [target_feature])
train_x, test_x, train_y, test_y = train_test_split(full_x, full_y, test_size = 0.2, random_state = 42, stratify = full_y)

In [28]:
model_xg_multi = XGBClassifier(
    objective = "multi:softprob",
    num_class = len(np.unique(train_y)),
    n_estimates = 4000,
    learning_rate = 0.03,
    max_depth = 12,
    colsmaple_bytree = 0.467,
    early_stopping_rounds = 100,
    reg_alpha = 2.7,
    reg_lambda = 1.4,
    gamma = 0.26,
    enable_categorical = True,
    tree_method = 'hist',
    max_delta_step = 4,
    subsample = 0.86,
    random_state = 13
)

model_xg_multi.fit(train_x, train_y, eval_set = [(test_x, test_y)])

[0]	validation_0-mlogloss:1.94532


Parameters: { "colsmaple_bytree", "n_estimates" } are not used.

  self.starting_round = model.num_boosted_rounds()


[1]	validation_0-mlogloss:1.94475
[2]	validation_0-mlogloss:1.94422
[3]	validation_0-mlogloss:1.94366
[4]	validation_0-mlogloss:1.94313
[5]	validation_0-mlogloss:1.94259
[6]	validation_0-mlogloss:1.94206
[7]	validation_0-mlogloss:1.94153
[8]	validation_0-mlogloss:1.94104
[9]	validation_0-mlogloss:1.94054
[10]	validation_0-mlogloss:1.94005
[11]	validation_0-mlogloss:1.93957
[12]	validation_0-mlogloss:1.93911
[13]	validation_0-mlogloss:1.93865
[14]	validation_0-mlogloss:1.93820
[15]	validation_0-mlogloss:1.93777
[16]	validation_0-mlogloss:1.93732
[17]	validation_0-mlogloss:1.93688
[18]	validation_0-mlogloss:1.93649
[19]	validation_0-mlogloss:1.93604
[20]	validation_0-mlogloss:1.93561
[21]	validation_0-mlogloss:1.93519
[22]	validation_0-mlogloss:1.93480
[23]	validation_0-mlogloss:1.93441
[24]	validation_0-mlogloss:1.93400
[25]	validation_0-mlogloss:1.93362
[26]	validation_0-mlogloss:1.93323
[27]	validation_0-mlogloss:1.93285
[28]	validation_0-mlogloss:1.93248
[29]	validation_0-mlogloss:1.

In [14]:
def create_balanced_splits():
    train_splits = {}
    for t in range(len(le.classes_)):
        pos_idx = train_y[train_y == t].index
        pos_samples = train_x.loc[pos_idx]
        pos_labels = train_y.loc[pos_idx]

        n_pos = len(pos_idx)
        neg_idx = train_y[train_y != t].index
        neg_x = train_x.loc[neg_idx]
        neg_y = train_y.loc[neg_idx]
        
        neg_x_sampled, _, neg_y_sampled, _ = train_test_split(neg_x, neg_y, train_size = n_pos, stratify = neg_y, random_state = 42)
        binary_x = pd.concat([pos_samples, neg_x_sampled])
        binary_y = pd.Series([1] * n_pos + [0] * n_pos, index = binary_x.index)

        ## shuffle.
        binary_x = binary_x.sample(frac = 1, random_state = 42)
        binary_y = binary_y.loc[binary_x.index]

        train_splits[t] = (binary_x, binary_y)
    
    return train_splits

In [15]:
train_splits = create_balanced_splits()

In [33]:
models_xg_bins = {}
for t_x, (btx, bty) in train_splits.items():
    model_tx = XGBClassifier(
        objective="binary:logistic",
        n_estimators = 1000,
        learning_rate = 0.03,
        max_depth = 12,
        colsmaple_bytree = 0.467,
        # early_stopping_rounds = 100,
        reg_alpha = 2.7,
        reg_lambda = 1.4,
        gamma = 0.26,
        enable_categorical = True,
        tree_method = 'hist',
        max_delta_step = 4,
        subsample = 0.86,
        random_state = 13,
        eval_metric="logloss"     # Optional but recommended for binary classification
    )

    model_tx.fit(btx, bty)
    models_xg_bins[t_x] = model_tx

Parameters: { "colsmaple_bytree" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "colsmaple_bytree" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "colsmaple_bytree" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "colsmaple_bytree" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "colsmaple_bytree" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "colsmaple_bytree" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "colsmaple_bytree" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [34]:
### Evaluation function.
def _get_score(actual, predicted):
    score = 0.0
    hits = 0
    seen = set()
    for i, pred in enumerate(predicted):
        if pred == np.int64(actual) and pred not in seen:
            hits += 1
            score += hits / (i + 1.0)
            seen.add(pred)
    
    return score ## since actual is ONE entity.


def get_best_and_full_accuracy_xgboost(ty, top3_probs):
    test_yl = ty.tolist()
    first_acc_l = [x for idx, x in enumerate(test_yl) if np.int64(x) == top3_probs[idx][0]]
    # full_acc_l = [x for idx, x in enumerate(test_yl) if np.int64(x) in top3_probs[idx]]
    score_accl_l = [_get_score(x ,top3_probs[idx]) for idx, x in enumerate(test_yl)]

    print(f"First accuracy: {(len(first_acc_l) / len(test_yl)):.2f}")
    print(f"Score accuracy: {(np.mean(score_accl_l)):.2f}")

In [35]:
def predict_and_score_xgboost_multi(model):
    y_pred_probs = model.predict_proba(test_x)
    top3_probs = np.argsort(y_pred_probs, axis = 1)[:, -3:][:, ::-1]
    get_best_and_full_accuracy_xgboost(test_y, top3_probs)

In [36]:
def interpret_predictions(predictions, k = 3):
    final_pred_array = np.zeros((test_x.shape[0], len(le.classes_)))
    for t_x, pred_arr in predictions.items():
        final_pred_array[:, t_x] = predictions[t_x][:, 1]
    topk_probs = np.argsort(final_pred_array, axis = 1)[:, -k:][:, ::-1]
    return topk_probs

def predict_and_score_xgboost_bins(models_dict):
    predictions = {}
    for t_x, model_tx in models_dict.items():
        predictions[t_x] = model_tx.predict_proba(test_x)
    top3_probs = interpret_predictions(predictions)
    get_best_and_full_accuracy_xgboost(test_y, top3_probs)    

In [37]:
predict_and_score_xgboost_multi(model_xg_multi)

First accuracy: 0.22
Score accuracy: 0.36


In [38]:
predict_and_score_xgboost_bins(models_xg_bins)

First accuracy: 0.27
Score accuracy: 0.41
