In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn import cross_validation
from sklearn.preprocessing import LabelEncoder, StandardScaler
from scipy.misc import imread, imresize
import os
import matplotlib.pyplot as ply
from keras.utils.np_utils import to_categorical
from PIL import Image
%matplotlib inline

Using TensorFlow backend.


In [2]:
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')
image_data = {}
for image_file in os.listdir('data/images'):
    image_data[image_file.split(".")[0]] = imresize(imread(os.path.join('data/images', image_file)),(128,128)).astype(np.float32)

In [3]:
train.head

<bound method DataFrame.head of        id                       species   margin1   margin2   margin3  \
0       1                   Acer_Opalus  0.007812  0.023438  0.023438   
1       2         Pterocarya_Stenoptera  0.005859  0.000000  0.031250   
2       3          Quercus_Hartwissiana  0.005859  0.009766  0.019531   
3       5               Tilia_Tomentosa  0.000000  0.003906  0.023438   
4       6            Quercus_Variabilis  0.005859  0.003906  0.048828   
5       8          Magnolia_Salicifolia  0.070312  0.093750  0.033203   
6      10           Quercus_Canariensis  0.021484  0.031250  0.017578   
7      11                 Quercus_Rubra  0.000000  0.000000  0.037109   
8      14               Quercus_Brantii  0.005859  0.001953  0.033203   
9      15                Salix_Fragilis  0.000000  0.000000  0.009766   
10     17               Zelkova_Serrata  0.019531  0.031250  0.001953   
11     18         Betula_Austrosinensis  0.001953  0.001953  0.023438   
12     20          

In [None]:
def load_images():
    dr = "data/images/"
    ims = [Image.open(dr + f) for f in os.listdir(dr) if f.endswith('jpg')]
    max_height = max(i.height for i in ims)
    max_width = max(i.width for i in ims)
    newims = []
    img_rows, img_cols = 0, 0
    for im in ims[:1]:
        new_im = im.crop((0, 0, max_width, max_height))
        print new_im.size

        new_im.thumbnail((100,100))
        print new_im.size


        img_rows, img_cols = max(img_rows, new_im.height), max(img_cols, new_im.width)
        
        newims.append(np.asarray(new_im))
    print(img_rows,img_cols)
    return np.stack(newims),img_rows, img_cols
ims, img_row, img_col = load_images()

In [12]:
def prepare_data(train, image_data, test):
    le = LabelEncoder().fit(train.species)
    labels = le.transform(train.species)
    labels_cat = to_categorical(labels)
    classes = list(le.classes_)

    test_ids = test.id
    train_ids = train.id
    image_train = np.array([image_data[str(_)] for _ in train_ids])
    image_train = image_train.reshape((image_train.shape[0],1,256,256))
    image_test = np.array([image_data[str(_)] for _ in test_ids])
    image_test = image_test.reshape((image_test.shape[0],1,256,256))
    
    train = train.drop(['id', 'species'], axis=1)
    test = test.drop(['id'], axis=1)

    scaler = StandardScaler().fit(train)
    train = scaler.transform(train)
    test = scaler.transform(test)


    return train, labels_cat, classes, test_ids, test, image_train, image_test

In [13]:
train, labels, classes, test_ids, test, image_train, image_test = prepare_data(train, image_data, test)

AttributeError: 'numpy.ndarray' object has no attribute 'species'

In [None]:
sss = cross_validation.StratifiedShuffleSplit(labels, 1, test_size=0.2, random_state=23 )

for tr, ts in sss:
    train_x, test_x = train[tr], train[ts]
    train_y, test_y = labels[tr], labels[ts]

In [None]:
# Score: 0.77232. Rank: 326
rf = RandomForestClassifier(n_estimators=70, max_features=14, min_samples_split=1)
rf = rf.fit(train_x, train_y)
rf.score(test_x, test_y)

In [None]:
cv_score = cross_validation.cross_val_score(rf, train, labels, cv=5)
print cv_score
print np.mean(cv_score)

In [None]:
preds = rf.predict_proba(test)

In [None]:
def make_submit(preds):
    submission = pd.DataFrame(preds, columns=classes)
    submission.insert(0, 'id', test_ids)
    submission.reset_index()
    submission.to_csv('data/submit.csv', index=False)

In [None]:
ply.imshow(image_data['250'],cmap='Set3')

In [None]:
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout, Flatten, Merge
from keras.layers.convolutional import Convolution2D, MaxPooling2D

#experiment with dropouts
img_model = Sequential()
img_model.add(Convolution2D(64, 5, 5, border_mode="same", input_shape=(1, 256, 256)))
img_model.add(Activation("relu"))
img_model.add(MaxPooling2D(pool_size=(2, 2), strides=(2,2), border_mode="same"))
img_model.add(Convolution2D(32, 5, 5, border_mode="same"))
img_model.add(Activation("relu"))
img_model.add(MaxPooling2D(pool_size=(2, 2), strides=(2,2), border_mode="same"))
img_model.add(Flatten())
img_model.add(Dense(1024))
img_model.add(Activation("relu"))
img_model.add(Dropout(0.5))

feat_model = Sequential()
feat_model.add(Dense(1024, input_dim=192))
feat_model.add(Activation("sigmoid"))

model = Sequential()
#try dot product
model.add(Merge([img_model, feat_model], mode='concat'))              
model.add(Dense(512))
model.add(Activation("sigmoid"))
model.add(Dense(99))
model.add(Activation("softmax"))
if os.path.exist("weights.h5"):
    model.load_weights("weights.h5")
model.compile(optimizer="rmsprop", loss="categorical_crossentropy", metrics=["accuracy"])

In [None]:
model.fit([image_train, train], labels, nb_epoch=100, batch_size=128)
model.save_weights("weights.h5")

In [None]:
preds = model.predict([image_test, test])
make_submit(preds)