In [1]:
##Import the libraries

#!pip install xgboost
import numpy as np
import os
import pandas as pd
import random
from tqdm import tqdm
import xgboost as xgb
import scipy
from sklearn.metrics import fbeta_score
from PIL import Image

In [2]:
# Set the seed 
random_seed = 1
random.seed(random_seed)
np.random.seed(random_seed)

In [3]:
# Load data
train_path = 'train-jpg/'
test_path = 'test-jpg/'
train = pd.read_csv('train_v2.csv')  #Load label data
test = pd.read_csv('sample_submission_v2.csv') # load sample submission format file


In [4]:
# Function to extract the image features
def extract_features(df, data_path):
    im_features = df.copy()

    N = len(im_features.image_name.values)

    r_mean = np.zeros(N)
    g_mean = np.zeros(N)
    b_mean = np.zeros(N)

    r_std = np.zeros(N)
    g_std = np.zeros(N)
    b_std = np.zeros(N)

    r_max = np.zeros(N)
    g_max = np.zeros(N)
    b_max = np.zeros(N)

    r_min = np.zeros(N)
    g_min = np.zeros(N)
    b_min = np.zeros(N)

    r_kurtosis = np.zeros(N)
    g_kurtosis = np.zeros(N)
    b_kurtosis = np.zeros(N)
    
    r_skewness = np.zeros(N)
    g_skewness = np.zeros(N)
    b_skewness = np.zeros(N)

    for i, image_name in enumerate(tqdm(im_features.image_name.values, miniters=1000)): 
        im = Image.open(data_path + image_name + '.jpg')
        im = np.array(im)[:,:,:3]

        r = im[:,:,0].ravel()
        g = im[:,:,1].ravel()
        b = im[:,:,2].ravel()
        
        r_mean[i] = np.mean(r)
        g_mean[i] = np.mean(g)
        b_mean[i] = np.mean(b)

        r_std[i] = np.std(r)
        g_std[i] = np.std(g)
        b_std[i] = np.std(b)

        r_max[i] = np.max(r)
        g_max[i] = np.max(g)
        b_max[i] = np.max(b)

        r_min[i] = np.min(r)
        g_min[i] = np.min(g)
        b_min[i] = np.min(b)

        r_kurtosis[i] = scipy.stats.kurtosis(r)
        g_kurtosis[i] = scipy.stats.kurtosis(g)
        b_kurtosis[i] = scipy.stats.kurtosis(b)
        
        r_skewness[i] = scipy.stats.skew(r)
        g_skewness[i] = scipy.stats.skew(g)
        b_skewness[i] = scipy.stats.skew(b)


    im_features['r_mean'] = r_mean
    im_features['g_mean'] = g_mean
    im_features['b_mean'] = b_mean

    im_features['rgb_mean_mean'] = (r_mean + g_mean + b_mean)/3.0

    im_features['r_std'] = r_std
    im_features['g_std'] = g_std
    im_features['b_std'] = b_std

    im_features['rgb_mean_std'] = (r_std + g_std + b_std)/3.0

    im_features['r_max'] = r_max
    im_features['g_max'] = g_max
    im_features['b_max'] = b_max

    im_features['rgb_mean_max'] = (r_max + r_max + b_max)/3.0

    im_features['r_min'] = r_min
    im_features['g_min'] = g_min
    im_features['b_min'] = b_min

    im_features['rgb_mean_min'] = (r_min + g_min + b_min)/3.0

    im_features['r_range'] = r_max - r_min
    im_features['g_range'] = g_max - g_min
    im_features['b_range'] = b_max - b_min

    im_features['r_kurtosis'] = r_kurtosis
    im_features['g_kurtosis'] = g_kurtosis
    im_features['b_kurtosis'] = b_kurtosis
    
    im_features['r_skewness'] = r_skewness
    im_features['g_skewness'] = g_skewness
    im_features['b_skewness'] = b_skewness
    
    return im_features

In [5]:
# Extract features
print('Extracting train features')
train_features = extract_features(train, train_path)
print('Extracting test features')
test_features = extract_features(test, test_path)

  0%|          | 0/40479 [00:00<?, ?it/s]

Extracting train features


100%|██████████| 40479/40479 [21:17<00:00, 31.69it/s]
  0%|          | 0/61191 [00:00<?, ?it/s]

Extracting test features


100%|██████████| 61191/61191 [32:05<00:00, 31.79it/s]  


In [7]:
# pickle the data

# import pickle
# train_feat = open('train_features_xgb.pickle', 'wb')
# pickle.dump(train_features, train_feat, protocol=4)
# train_feat.close()

# test_feat = open('test_features_xgb.pickle', 'wb')
# pickle.dump(test_features, test_feat, protocol=4)
# test_feat.close()

In [5]:
#load the pickled features:
import pickle
pickle_XGtrain = open('train_features_xgb.pickle', 'rb')
train_features = pickle.load(pickle_XGtrain)

pickle_yGtrain = open('test_features_xgb.pickle', 'rb')
test_features = pickle.load(pickle_yGtrain)

In [6]:
# Prepare data
X = np.array(train_features.drop(['image_name', 'tags'], axis=1))
y_train = []

flatten = lambda l: [item for sublist in l for item in sublist]
labels = np.array(list(set(flatten([l.split(' ') for l in train_features['tags'].values]))))

label_map = {l: i for i, l in enumerate(labels)}
inv_label_map = {i: l for l, i in label_map.items()}

In [7]:
for tags in tqdm(train.tags.values, miniters=1000):
    targets = np.zeros(17)
    for t in tags.split(' '):
        targets[label_map[t]] = 1 
    y_train.append(targets)
    
y = np.array(y_train, np.uint8)

100%|██████████| 40479/40479 [00:00<00:00, 311913.23it/s]


In [8]:
print('X.shape = ' + str(X.shape))
print('y.shape = ' + str(y.shape))

X.shape = (40479, 25)
y.shape = (40479, 17)


In [13]:
#for i in range(50):
print(y[4])
#Multiclass indicators in y are present

[1 0 0 1 1 0 1 1 0 0 0 0 0 0 0 0 0]


In [19]:
n_classes = y.shape[1]

#Create X train and test dataset

X_test = np.array(test_features.drop(['image_name', 'tags'], axis=1))
X_train = np.array(train_features.drop(['image_name', 'tags'], axis=1))

In [21]:
# Train and predict with one-vs-all strategy
y_pred = np.zeros((X_test.shape[0], n_classes))
train_pred = np.zeros((X_train.shape[0], n_classes))

In [22]:
print('Wait for it ....')
for class_i in tqdm(range(n_classes), miniters=1): 
    model = xgb.XGBClassifier(max_depth=5, learning_rate=0.1, n_estimators=100, \
                              silent=True, objective='binary:logistic', nthread=-1, \
                              gamma=0, min_child_weight=1, max_delta_step=0, \
                              subsample=1, colsample_bytree=1, colsample_bylevel=1, \
                              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, \
                              base_score=0.5, seed=random_seed, missing=None)
    model.fit(X, y[:, class_i])
    y_pred[:, class_i] = model.predict_proba(X_test)[:, 1]
    train_pred[:, class_i] = model.predict_proba(X_train)[:, 1]

  0%|          | 0/17 [00:00<?, ?it/s]

Wait for it ....


100%|██████████| 17/17 [00:40<00:00,  2.35s/it]


In [23]:
y_pred[10]

array([5.52039742e-02, 1.08458789e-03, 9.24624794e-04, 2.41803601e-02,
       1.75561965e-01, 1.84955250e-03, 9.98823419e-02, 9.98930871e-01,
       7.26763438e-03, 9.69229102e-01, 1.54007962e-02, 1.91045064e-03,
       1.30265400e-01, 1.11160226e-01, 1.53335016e-02, 9.71562695e-04,
       3.95965675e-04])

In [35]:
train_pred.shape

(40479, 17)

In [25]:
xg_results = pd.DataFrame(y_pred, columns = labels)
xg_results[:10]

Unnamed: 0,habitation,blooming,conventional_mine,clear,agriculture,blow_down,road,primary,slash_burn,partly_cloudy,bare_ground,selective_logging,water,cultivation,artisinal_mine,cloudy,haze
0,0.002174,0.001037,7.5e-05,0.99605,0.011664,0.00049,0.013481,0.998071,0.000145,0.00047,0.001654,0.000964,0.0457,0.002583,5.3e-05,0.000838,0.0101
1,0.001164,0.022799,6.9e-05,0.997537,0.010154,0.003608,0.013992,0.999713,0.000584,0.00248,0.002214,0.001206,0.033249,0.010839,4.8e-05,0.000196,0.000237
2,0.049324,0.000666,0.001554,0.100048,0.415631,0.000397,0.184673,0.993752,0.006896,0.913825,0.023416,0.001242,0.179434,0.14025,0.004264,0.001258,0.001376
3,0.003516,0.014136,9.6e-05,0.897158,0.358396,0.004822,0.035604,0.999271,0.010487,0.187719,0.002899,0.021896,0.147314,0.373048,5.3e-05,0.000175,0.000614
4,0.035261,0.000122,0.01049,0.063129,0.165018,7.4e-05,0.095166,0.848635,0.000475,0.549432,0.019123,0.000174,0.187398,0.072228,0.011306,0.323804,0.000748
5,0.000868,0.001959,6.9e-05,0.998237,0.007874,0.000585,0.005532,0.999693,0.000171,0.000827,0.000255,0.000173,0.023956,0.005134,4.9e-05,0.000158,0.000221
6,0.316631,0.001473,0.000587,0.156717,0.588099,0.000528,0.411588,0.989614,0.008487,0.839021,0.016196,0.003157,0.101179,0.157153,0.001061,0.008882,0.006538
7,0.926075,0.000119,0.011024,0.570237,0.393328,0.000237,0.756333,0.820249,0.000429,0.253441,0.017242,0.000338,0.305981,0.045304,0.002167,0.018932,0.025107
8,0.000529,0.000187,7.3e-05,0.997802,0.008735,0.000589,0.005051,0.999099,0.000113,0.000542,0.000798,0.00019,0.014152,0.002789,5.6e-05,0.000188,0.001635
9,0.00432,0.000102,0.0001,0.228761,0.545365,0.000198,0.071043,0.988821,0.002177,0.012337,0.011976,0.000655,0.180562,0.185905,5.4e-05,0.003374,0.855365


In [26]:
xg_train_res = pd.DataFrame(train_pred, columns = labels)
xg_train_res[:10]

Unnamed: 0,habitation,blooming,conventional_mine,clear,agriculture,blow_down,road,primary,slash_burn,partly_cloudy,bare_ground,selective_logging,water,cultivation,artisinal_mine,cloudy,haze
0,0.001883,0.000399,7.9e-05,0.226477,0.022625,0.000135,0.012316,0.990906,0.000108,0.000608,0.001385,0.000657,0.03424,0.003723,5.7e-05,0.004155,0.662653
1,0.232562,0.001776,0.000293,0.923149,0.937287,0.001081,0.470157,0.995969,0.00616,0.041268,0.012548,0.00555,0.225862,0.296143,0.002088,0.000446,0.000728
2,0.000746,0.001864,6.9e-05,0.997585,0.005462,0.000363,0.004895,0.999673,7.9e-05,0.000478,0.000262,6.3e-05,0.0177,0.003105,5.4e-05,0.000179,0.000276
3,0.000748,0.001976,6.9e-05,0.998075,0.014581,0.000439,0.005696,0.999592,0.000115,0.000492,0.000424,0.000502,0.014249,0.007352,5.8e-05,0.000157,0.001941
4,0.484765,0.004533,0.000335,0.89658,0.725722,0.001586,0.364156,0.959052,0.0201,0.033154,0.169749,0.028448,0.22043,0.279277,0.000202,0.00012,0.000699
5,0.076947,0.000239,0.000118,0.069836,0.249633,0.000169,0.311797,0.783625,0.000233,0.055279,0.000953,0.000913,0.320221,0.044244,0.00021,0.142006,0.601161
6,0.084798,0.000261,0.000721,0.902319,0.840131,0.000336,0.55829,0.991417,0.016904,0.006866,0.011421,0.004664,0.330327,0.358034,0.000548,0.000432,0.122194
7,0.002099,0.000531,0.000114,0.131926,0.038298,0.000163,0.015969,0.81717,0.000205,0.000832,0.005893,0.002925,0.097312,0.015868,5.7e-05,0.085541,0.808053
8,0.050083,0.006754,0.000284,0.979291,0.757353,0.006321,0.080613,0.999578,0.016249,0.019142,0.022703,0.037169,0.15384,0.649536,0.000107,0.000105,0.00046
9,0.037538,0.000105,0.002091,0.651742,0.616565,7.3e-05,0.481671,0.874904,0.001441,0.066047,0.023422,0.000401,0.361999,0.080957,0.038274,0.163409,0.177883


In [20]:
xg_results.to_csv('result_xgb_0.3.csv', index=False)

In [47]:
#Pickle the result
# import pickle
res = open('result_xgb_0.5.pickle', 'wb')
pickle.dump(xg_results, res, protocol=4)
res.close()

In [16]:
#Check predicted rare labels
print("Blow downs: ",len(xg_results[xg_results['blow_down']>0.2]))
print("conventional_mine: ",len(xg_results[xg_results['conventional_mine']>0.2]))
print("selective_logging: ",len(xg_results[xg_results['selective_logging']>0.2]))
print("slash_burn: ",len(xg_results[xg_results['slash_burn']>0.2]))
print("Cultivation: ",len(xg_results[xg_results['cultivation']>0.2]))


Blow downs:  23
conventional_mine:  34
selective_logging:  267
slash_burn:  56
Cultivation:  11461


In [29]:
#create prediction flag
preds = [' '.join(labels[y_pred_row > 0.2]) for y_pred_row in y_pred]

In [30]:
#Create a  dataframe wiht test images and labels 
subm = pd.DataFrame()
subm['image_name'] = test_features.image_name.values
subm['tags'] = preds
subm[:10]

Unnamed: 0,image_name,tags
0,test_0,clear primary
1,test_1,clear primary
2,test_2,agriculture primary partly_cloudy
3,test_3,clear agriculture primary cultivation
4,test_4,primary partly_cloudy cloudy
5,test_5,clear primary
6,test_6,habitation agriculture road primary partly_cloudy
7,test_7,habitation clear agriculture road primary part...
8,test_8,clear primary
9,test_9,clear agriculture primary haze


In [36]:
#Create flags for prediction flags
train_preds = [' '.join(labels[y_pred_row > 0.2]) for y_pred_row in train_pred]

In [39]:
#Create a  dataframe wiht train images and labels 
trainfrm = pd.DataFrame()
trainfrm['image_name'] = train_features.image_name.values
trainfrm['pred_tags'] = train_preds
trainfrm[:10]

Unnamed: 0,image_name,pred_tags
0,train_0,clear primary haze
1,train_1,habitation clear agriculture road primary wate...
2,train_2,clear primary
3,train_3,clear primary
4,train_4,habitation clear agriculture road primary wate...
5,train_5,agriculture road primary water haze
6,train_6,clear agriculture road primary water cultivation
7,train_7,primary haze
8,train_8,clear agriculture primary cultivation
9,train_9,clear agriculture road primary water


In [46]:

df_train= pd.read_csv('train_v2.csv')
df_train[:10]

Unnamed: 0,image_name,tags
0,train_0,haze primary
1,train_1,agriculture clear primary water
2,train_2,clear primary
3,train_3,clear primary
4,train_4,agriculture clear habitation primary road
5,train_5,haze primary water
6,train_6,agriculture clear cultivation primary water
7,train_7,haze primary
8,train_8,agriculture clear cultivation primary
9,train_9,agriculture clear cultivation primary road


In [47]:
# Join to train_df to get the original 
df_train=df_train.merge(trainfrm,on='image_name')
df_train[:5]

Unnamed: 0,image_name,tags,pred_tags
0,train_0,haze primary,clear primary haze
1,train_1,agriculture clear primary water,habitation clear agriculture road primary wate...
2,train_2,clear primary,clear primary
3,train_3,clear primary,clear primary
4,train_4,agriculture clear habitation primary road,habitation clear agriculture road primary wate...


In [48]:
#Create one hot encoding to compare the predicted and original labels 
df_train['tags1'] = df_train['tags'].apply(lambda x: x.split(' '))
df_train['pred_tags1'] = df_train['pred_tags'].apply(lambda x: x.split(' '))

In [50]:
df_train_actual=df_train[['image_name','tags1']]
df_train_actual[:2]

Unnamed: 0,image_name,tags1
0,train_0,"[haze, primary]"
1,train_1,"[agriculture, clear, primary, water]"


In [51]:
from sklearn.preprocessing import MultiLabelBinarizer
xgbst = MultiLabelBinarizer()

actual_results = pd.DataFrame(xgbst.fit_transform(df_train_actual['tags1']),
                   columns=xgbst.classes_,
                   index=df_train_actual.index)

actual_results

Unnamed: 0,agriculture,artisinal_mine,bare_ground,blooming,blow_down,clear,cloudy,conventional_mine,cultivation,habitation,haze,partly_cloudy,primary,road,selective_logging,slash_burn,water
0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0
1,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1
2,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0
3,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0
4,1,0,0,0,0,1,0,0,0,1,0,0,1,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40474,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0
40475,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
40476,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0
40477,1,0,0,0,0,1,0,0,0,0,0,0,1,1,0,0,0


In [52]:
df_train_pred=df_train[['image_name','pred_tags1']]
df_train_pred[:2]

Unnamed: 0,image_name,pred_tags1
0,train_0,"[clear, primary, haze]"
1,train_1,"[habitation, clear, agriculture, road, primary..."


In [55]:
from sklearn.preprocessing import MultiLabelBinarizer
xgbst = MultiLabelBinarizer()

pred_results = pd.DataFrame(xgbst.fit_transform(df_train_pred['pred_tags1']),
                   columns=xgbst.classes_,
                   index=df_train_pred.index)

pred_results

Unnamed: 0,agriculture,artisinal_mine,bare_ground,blooming,blow_down,clear,cloudy,conventional_mine,cultivation,habitation,haze,partly_cloudy,primary,road,selective_logging,slash_burn,water
0,0,0,0,0,0,1,0,0,0,0,1,0,1,0,0,0,0
1,1,0,0,0,0,1,0,0,1,1,0,0,1,1,0,0,1
2,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0
3,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0
4,1,0,0,0,0,1,0,0,1,1,0,0,1,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40474,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0
40475,1,0,0,0,0,0,1,0,0,0,0,1,1,1,0,0,0
40476,1,0,1,0,0,1,0,0,0,1,0,0,1,1,0,0,0
40477,1,0,0,0,0,1,0,0,0,0,0,0,1,1,0,0,1


In [56]:
#get precision -recall report
from sklearn.metrics import classification_report
import numpy as np
print(classification_report(actual_results, pred_results, target_names=labels))

                   precision    recall  f1-score   support

       habitation       0.62      0.96      0.75     12315
         blooming       0.73      0.86      0.79       339
conventional_mine       0.64      0.42      0.51       862
            clear       0.74      0.27      0.40       332
      agriculture       0.97      0.35      0.51        98
        blow_down       0.89      0.99      0.94     28431
             road       0.72      0.96      0.82      2089
          primary       0.94      0.77      0.85       100
       slash_burn       0.41      0.70      0.52      4477
    partly_cloudy       0.55      0.83      0.66      3660
      bare_ground       0.59      0.87      0.71      2697
selective_logging       0.76      0.95      0.84      7261
            water       0.96      1.00      0.98     37513
      cultivation       0.54      0.91      0.68      8071
   artisinal_mine       0.73      0.26      0.39       340
           cloudy       1.00      0.19      0.31       

In [52]:
subm.to_csv('submission_xgb_0.5.csv', index=False)