In [4]:
import numpy as np
import pandas as pd
import os
import cv2
import matplotlib.pyplot as plt
import keras
from sklearn.model_selection import KFold
from tensorflow.keras.layers import Dropout, Flatten, Dense, InputLayer
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, Callback, ReduceLROnPlateau
from tensorflow.keras import Sequential
from tensorflow.keras.applications.vgg16 import VGG16
from tensorflow.keras.applications.resnet import ResNet50
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.optimizers import Adam, SGD
from tensorflow.keras.backend import clear_session
import tensorflow as tf
from keras import backend as K

In [5]:
import pickle
INPUT_SHAPE = (128, 128, 3) # Image Dimensions
BATCH_SIZE = 128
DROPOUT_RATE = 0.5
EPOCHS = 25
LR = 0.0001 # Learning Rate
REG_STRENGTH = 0.01 # Regularization Strength
NFOLDS =2 # No of folds for cross validation
WORKERS = 4 # Multithreading no of threads
MAXQ = 10 # Max Queue size for multithreading
THRES = [0.2] * 17 # Threshold for truth value of label, applied on sigmoid output.

In [6]:
pd.read_csv('/home/jupyter/train_v2.csv')

Unnamed: 0,image_name,tags
0,train_0,haze primary
1,train_1,agriculture clear primary water
2,train_2,clear primary
3,train_3,clear primary
4,train_4,agriculture clear habitation primary road
...,...,...
40474,train_40474,clear primary
40475,train_40475,cloudy
40476,train_40476,agriculture clear primary
40477,train_40477,agriculture clear primary road


In [7]:
TRAIN_PATH = '/home/jupyter/train-jpg'
TRAIN_CSV_PATH = 'rare.csv'

df_train = pd.read_csv('train_v2.csv')
# Construct dataframes holding training and test data information
df_train_rare = pd.read_csv(TRAIN_CSV_PATH, index_col=0)

# These files are numpy Arrays with shape Nx1
X_train_files_rare = np.array(df_train_rare['image_name'].tolist())
X_train_files_rare.reshape((X_train_files_rare.shape[0], 1))
y_train_rare = np.array(df_train_rare['tags'].tolist())

In [8]:
df_train_rare

Unnamed: 0,image_name,tags
5958,train_5958.jpg,clear primary
11498,train_11498.jpg,blow_down
12188,train_12188.jpg,artisinal_mine
29246,train_29246.jpg,selective_logging
17744,train_17744.jpg,clear cultivation primary
...,...,...
6590,train_6590.jpg,clear primary
21302,train_21302.jpg,clear primary
12962,train_12962.jpg,bare_ground
32612,train_32612.jpg,partly_cloudy primary


In [9]:
rare = sorted(['conventional_mine', 'blow_down', 'slash_burn', 'blooming', 'artisinal_mine', 'selective_logging', 'bare_ground'])
labels_count = {}

for tag in df_train_rare['tags'].values:
    labels_in_tag = tag.split(' ')
    for label in labels_in_tag:
        if label in labels_count:
            labels_count[label] += 1
        else:
            labels_count[label] = 0
import operator
sorted(labels_count.items(), key=operator.itemgetter(1))

[('conventional_mine', 99),
 ('blow_down', 100),
 ('cloudy', 114),
 ('haze', 138),
 ('habitation', 194),
 ('slash_burn', 208),
 ('cultivation', 212),
 ('blooming', 331),
 ('artisinal_mine', 338),
 ('selective_logging', 339),
 ('water', 356),
 ('partly_cloudy', 366),
 ('road', 389),
 ('agriculture', 611),
 ('bare_ground', 861),
 ('clear', 1378),
 ('primary', 1849)]

In [10]:
rare

['artisinal_mine',
 'bare_ground',
 'blooming',
 'blow_down',
 'conventional_mine',
 'selective_logging',
 'slash_burn']

In [29]:
def load_data(df_train, labels):
    X = []
    y = []

    for img_name, tags in df_train.values:
        try:
            img = cv2.imread('/home/jupyter/train-jpg/{}'.format(img_name))
            X.append(cv2.resize(img, (128,128)))

            y_label = np.zeros(len(labels))
            for tag in tags.split(' '):
                try:
                    idx = labels.index(tag)
                    y_label[idx] = 1
                except Exception as e:
                    pass
                
            y.append(y_label)
        
        except Exception as e:
            print('Issue with loading', img_name)
            
    return X, y

In [31]:
X, y = load_data(df_train_rare, rare)
X = np.array(X).reshape(-1,128,128,3)
X = X/255.
y = np.array(y)

In [6]:
# rare_x = open('rare_X', 'wb')
# pickle.dump(X, rare_X)
# rare_X.close()

# rare_y = open('rare_Y', 'wb')
# pickle.dump(y, rare_y)
# rare_y.close()

# pickle_result = open('rare_X', 'rb')
# X = pickle.load(pickle_result)

# pickle_result = open('rare_Y', 'rb')
# y = pickle.load(pickle_result)

In [9]:
from sklearn.model_selection import train_test_split
import time
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1, random_state = int(time.time()))

In [10]:
X_train.shape

(3762, 128, 128, 3)

In [11]:
def learning_curve(model_fit, key='acc', ylim=(0.8, 1.01)):
    plt.figure(figsize=(12,6))
    plt.plot(model_fit.history[key])
    plt.plot(model_fit.history['val_' + key])
    plt.title('Learning Curve')
    plt.ylabel(key.title())
    plt.xlabel('Epoch')
    plt.ylim(ylim)
    plt.legend(['train', 'test'], loc='best')
    plt.show()

In [14]:
def fbeta_score_K(y_true, y_pred):
    beta_squared = 4

    tp = K.sum(y_true * y_pred) + K.epsilon()
    fp = K.sum(y_pred) - tp
    fn = K.sum(y_true) - tp

    precision = tp / (tp + fp)
    recall = tp / (tp + fn)

    result = (beta_squared + 1) * (precision * recall) / (beta_squared * precision + recall + K.epsilon())
    return result

In [18]:
optimizer = Adam(0.0001, decay=0.00001)
base_model = VGG16(weights='imagenet', include_top=False, input_shape=(128, 128, 3))

# We will create a different CNN for detecting the rare labels.
# Note that the dimension of the dense layer at the end in 7x1(7 rare labels) NOT 17x1.

for layer in base_model.layers:
    layer.trainable = False
    model = Sequential([
    base_model,
    Flatten(), 
    Dense(128, activation='relu'),
    Dropout(0.2),
    Dense(7, activation='sigmoid')])

model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=[fbeta_score_K])
model.summary()

Model: "sequential_75"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
vgg16 (Model)                (None, 4, 4, 512)         14714688  
_________________________________________________________________
flatten_75 (Flatten)         (None, 8192)              0         
_________________________________________________________________
dense_150 (Dense)            (None, 128)               1048704   
_________________________________________________________________
dropout_75 (Dropout)         (None, 128)               0         
_________________________________________________________________
dense_151 (Dense)            (None, 7)                 903       
Total params: 15,764,295
Trainable params: 1,049,607
Non-trainable params: 14,714,688
_________________________________________________________________


In [19]:
for layer in model.layers[0].layers[1:]:
    layer.trainable = True

for layer in model.layers[0].layers:
    print(layer.name, layer. trainable)

model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=[fbeta_score_K])
model.summary()

input_4 False
block1_conv1 True
block1_conv2 True
block1_pool True
block2_conv1 True
block2_conv2 True
block2_pool True
block3_conv1 True
block3_conv2 True
block3_conv3 True
block3_pool True
block4_conv1 True
block4_conv2 True
block4_conv3 True
block4_pool True
block5_conv1 True
block5_conv2 True
block5_conv3 True
block5_pool True
Model: "sequential_75"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
vgg16 (Model)                (None, 4, 4, 512)         14714688  
_________________________________________________________________
flatten_75 (Flatten)         (None, 8192)              0         
_________________________________________________________________
dense_150 (Dense)            (None, 128)               1048704   
_________________________________________________________________
dropout_75 (Dropout)         (None, 128)               0         
__________________________________________________

In [46]:
aug = keras.preprocessing.image.ImageDataGenerator(rotation_range=20, zoom_range=0.15,
                         width_shift_range=0.2, height_shift_range=0.2, shear_range=0.15,
                         horizontal_flip=True, fill_mode="nearest")
 
model_fit = model.fit_generator(aug.flow(X, y, batch_size=128),
                        validation_data=(X_val, y_val), steps_per_epoch=len(X) // 128,
                        epochs=100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [47]:
from sklearn.metrics import fbeta_score
y_pred = model.predict(X_val, batch_size=128)
score = fbeta_score(y_val, np.array(y_pred) > 0.2, beta=2, average='samples')
print("F beta score: ", score)
print("Error: %.2f%%" % (100-score*100))

F beta score:  0.4963924963924964
Error: 50.36%


  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


In [80]:
# model.save('rare_model.h5')
# model.save_weights('rare_model_weights.h5')

In [61]:
# y_t1 = np.where(y_t > 0.2, 1, 0)
# pd.DataFrame(y_t1, columns=rare).astype(int).to_csv('rare_pred.csv')
# pd.DataFrame(y_val, columns=rare).astype(int).to_csv('rare_actual.csv')

### Train Data

In [88]:
X_rare, y_rare = load_data(df_train, rare)
X_rare_train = np.array(X_rare).reshape(-1,128,128,3)
X_rare_train = X_rare_train/255.
y_rare_train = np.array(y_rare)

In [102]:
y_pred_rare_train = model.predict(X_rare_train, batch_size=128)
# file = open('y_pred_rare_train', 'wb')
# pickle.dump(y_pred_train, file)
# file.close()

In [104]:
y_rare_train_full = np.where(y_pred_rare_train > 0.2, 1, 0)
pd.DataFrame(y_rare_train_full, columns=rare).astype(int).to_csv('rare_pred_train.csv')
pd.DataFrame(y_rare_train, columns=rare).astype(int).to_csv('rare_actual_train.csv')

In [139]:
# df_rare_pred_train = pd.read_csv('rare_pred_train.csv', index_col=0)

In [106]:
from sklearn.metrics import classification_report, confusion_matrix
train_predictions = pd.read_csv('2 - train_pred.csv')
train_predictions.head()

Unnamed: 0,image_name,tags,pred_tags
0,train_0.jpg,haze primary,clear haze primary
1,train_1.jpg,agriculture clear primary water,agriculture clear partly_cloudy primary road w...
2,train_2.jpg,clear primary,clear primary
3,train_3.jpg,clear primary,clear primary
4,train_4.jpg,agriculture clear habitation primary road,agriculture habitation partly_cloudy primary road


In [111]:
labels = ['agriculture', 'artisinal_mine', 'bare_ground', 'blooming', 'blow_down', 'clear', 'cloudy', 'conventional_mine', 'cultivation', 'habitation', 'haze', 'partly_cloudy', 'primary', 'road', 'selective_logging', 'slash_burn', 'water']

In [112]:
def return_Binary(tags):
    tags = tags.split(' ')
    return [1 if x in tags else 0 for x in labels]

In [113]:
train_predictions['pred_binary'] = train_predictions['pred_tags'].map(return_Binary)
train_predictions['actual_binary'] = train_predictions['tags'].map(return_Binary)
train_predictions.head()
pred_toArray = np.array(list(train_predictions['pred_binary']))
actual_toArray = np.array(list(train_predictions['actual_binary']))

Unnamed: 0,image_name,tags,pred_tags,pred_binary,actual_binary
0,train_0.jpg,haze primary,clear haze primary,"[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, ..."
1,train_1.jpg,agriculture clear primary water,agriculture clear partly_cloudy primary road w...,"[1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, ...","[1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, ..."
2,train_2.jpg,clear primary,clear primary,"[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, ...","[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, ..."
3,train_3.jpg,clear primary,clear primary,"[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, ...","[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, ..."
4,train_4.jpg,agriculture clear habitation primary road,agriculture habitation partly_cloudy primary road,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, ...","[1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, ..."


In [114]:
# Previos Matrix without Rare label prediction

                   precision    recall  f1-score   support

      agriculture       0.89      0.63      0.74     17287
   artisinal_mine       0.00      0.14      0.01         7
      bare_ground       0.04      0.16      0.07       232
         blooming       0.20      0.10      0.14       649
        blow_down       0.00      0.00      0.00        29
            clear       0.71      0.95      0.81     21278
           cloudy       0.87      0.75      0.80      2436
conventional_mine       0.00      0.00      0.00         0
      cultivation       0.06      0.62      0.11       445
       habitation       0.74      0.48      0.58      5580
             haze       0.65      0.63      0.64      2782
    partly_cloudy       0.99      0.34      0.51     21332
          primary       1.00      0.96      0.98     38899
             road       0.85      0.66      0.75     10324
selective_logging       0.00      0.00      0.00         7
       slash_burn       0.00      0.00      0.00       

  'recall', 'true', average, warn_for)


### Classification Report after the rare label training
### Imporved F1 scores for the rare labels

In [144]:
df_predBinary = pd.DataFrame(pred_toArray, columns=labels)
df_actualBinary = pd.DataFrame(actual_toArray, columns=labels)
for tag in rare:
    df_predBinary[tag] = df_rare_pred_train[tag]
print(classification_report(df_predBinary, df_actualBinary, target_names=labels))

                   precision    recall  f1-score   support

      agriculture       0.89      0.63      0.74     17287
   artisinal_mine       1.00      0.65      0.78       525
      bare_ground       1.00      0.15      0.25      5915
         blooming       0.94      0.15      0.26      2046
        blow_down       0.99      0.10      0.18       951
            clear       0.71      0.95      0.81     21278
           cloudy       0.87      0.75      0.80      2436
conventional_mine       1.00      0.57      0.73       175
      cultivation       0.06      0.62      0.11       445
       habitation       0.74      0.48      0.58      5580
             haze       0.65      0.63      0.64      2782
    partly_cloudy       0.99      0.34      0.51     21332
          primary       1.00      0.96      0.98     38899
             road       0.85      0.66      0.75     10324
selective_logging       0.94      0.23      0.37      1401
       slash_burn       1.00      0.12      0.21      1

## Test Data

In [145]:
TEST_PATH = 'test-jpg/test-jpg'
TEST_CSV_PATH = 'sample_submission_v2.csv'

In [150]:
def load_data_test(df_train):
    X = []

    for img_name, tags in df_train.values:
        try:
            img = cv2.imread('test-jpg/test-jpg/{}'.format(img_name + '.jpg'))
            X.append(cv2.resize(img, (128,128)))
        
        except Exception as e:
            print('Issue with loading', img_name)
            
    return X

In [152]:
df_test = pd.read_csv(TEST_CSV_PATH)
X_test = load_data_test(df_test)

In [153]:
X_test = np.array(X_test).reshape(-1,128,128,3)
X_test = X_test/255.

In [155]:
# Importing the results from existing model
result = pd.read_csv('results_rerun2.csv', index_col=0)

In [158]:
# Rare label prediction with test data
y_pred_rare_test = model.predict(X_test, batch_size=128)
# file = open('y_pred_rare_test', 'wb')
# pickle.dump(y_pred_rare_test, file)
# file.close()

In [188]:
# Applying threshold of 0.2
y_pred_rare_test_thresh2 = np.where(y_pred_rare_test > 0.2, 1, 0)
df_pred_rare_test_thresh2 = pd.DataFrame(y_pred_rare_test_thresh2, columns=rare).astype(int)

# df_pred_rare_test_thresh2.to_csv('rare_pred_test_thresh2.csv')

In [167]:
# We will get the probabilities from the rare label detection and use it to scale up the rare label probabilities of the existing model. 
for tag in rare:
    result[tag] = df_pred_rare_test[tag]

In [169]:
preds = []
for i in range(result.shape[0]):
    a = result.ix[[i]]
    a = a.apply(lambda x: x > THRES, axis=1)
    a = a.transpose()
    a = a.loc[a[i] == True]
    ' '.join(list(a.index))
    preds.append(' '.join(list(a.index)))
    
df_test['tags'] = preds
df_test['image_name'] = df_test['image_name'].astype(str).str.slice(stop=-4)
df_test.to_csv('rare_test-submit.csv', index=False)

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#ix-indexer-is-deprecated
  This is separate from the ipykernel package so we can avoid doing imports until


# Kaggle score - 0.91599

In [172]:
# result.to_csv('results_rare_test.csv')

# Threshold 0.5 for rare

In [178]:
# Rare label prediction with test data
y_pred_rare_test = model.predict(X_test, batch_size=128)

In [185]:
y_pred_rare_test_thresh5 = np.where(y_pred_rare_test > 0.5, 1, 0)
df_pred_rare_test_thresh5 = pd.DataFrame(y_pred_rare_test_thresh5, columns=rare).astype(int)
df_pred_rare_test_thresh5.to_csv('rare_pred_test_thresh5.csv')

In [194]:
for tag in rare:
    result[tag] = df_pred_rare_test_thresh5[tag]

In [195]:
preds = []
for i in range(result.shape[0]):
    a = result.ix[[i]]
    a = a.apply(lambda x: x > THRES, axis=1)
    a = a.transpose()
    a = a.loc[a[i] == True]
    ' '.join(list(a.index))
    preds.append(' '.join(list(a.index)))
    
df_test['tags'] = preds
# df_test['image_name'] = df_test['image_name'].astype(str).str.slice(stop=-4)
df_test.to_csv('rare_test-submit_thresh5.csv', index=False)

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#ix-indexer-is-deprecated
  This is separate from the ipykernel package so we can avoid doing imports until


# Kaggle Score: 0.92006
### Here the score increased, but it penalized the rare label accuracy. 