In [18]:
# imports and constants

import csv # importing csv to work with train_masks.csv
import os # importing os to list the image directories
from time import gmtime, strftime # importing some stuff from time to check the speed for algorithm

import numpy as np # importing numpy for work with matrices
from scipy.spatial import KDTree # importing KDTree for fast neighbor searching

# Number of characters in the ".tif" extension; sliced off file names when
# the result rows are built.
TIF = len(".tif")
# Slack added to the nearest-neighbour distance when collecting candidates
# with query_ball_point.
EPS = 0.001

# .npz archives: the first of each pair feeds the KDTree search, the "REAL"
# one is used to re-rank the candidates found by that search.
TRAIN, REAL_TRAIN = "input/new_train_data.npz", "input/train_data.npz"
TEST, REAL_TEST = "input/new_test_data.npz", "input/test_data.npz"

# CSV read to build the answers lookup, and CSV written at the very end.
ANSWERS = "train_masks.csv"
SUBMISSION = "input/submission.csv"

In [24]:
def _list_visible(directory):
    """Return the sorted names of the non-hidden entries of `directory`.

    Pure-Python stand-in for `!ls directory`: it runs outside IPython and
    raises FileNotFoundError for a missing directory rather than putting
    the shell's error text into the list of names.
    """
    return sorted(name for name in os.listdir(directory) if not name.startswith("."))


test_names = _list_visible("input/new_test")

# Keep only the entries whose name does not contain "mask".
train_names = [name for name in _list_visible("input/new_train") if "mask" not in name]

# The exact order of the names is irrelevant as long as it stays fixed:
# later cells build the image arrays in this order and map indices back
# to names through the same lists.
print(len(test_names), len(train_names))

5508 5635


In [25]:
def _load_flattened(archive_path, names):
    """Read the arrays stored under `names` in an .npz archive.

    Each array is flattened to 1-D; the rows are stacked into one array
    and converted to int before the archive is closed.
    """
    with np.load(archive_path, mmap_mode='r') as archive:
        return np.array([archive[name].flatten() for name in names]).astype(int)


# Timestamps are printed in UTC (gmtime); author's note: add 3 hours.
stamp_format = "%Y-%m-%d %H:%M:%S"
print(strftime(stamp_format, gmtime()))

train_images = _load_flattened(TRAIN, train_names)
test_images = _load_flattened(TEST, test_names)

print(strftime(stamp_format, gmtime()))

2016-07-25 14:24:09
2016-07-25 14:26:30


In [26]:
# UTC stamp taken right before the build (author's note: add 3 hours).
build_started = strftime("%Y-%m-%d %H:%M:%S", gmtime())
print(build_started)

# Index every flattened training image for nearest-neighbour queries.
tree = KDTree(data=train_images, leafsize=200)

# UTC stamp taken right after the build (author's note: add 3 hours).
build_finished = strftime("%Y-%m-%d %H:%M:%S", gmtime())
print(build_finished)

2016-07-25 14:26:43
2016-07-25 14:34:02


In [27]:
# Lookup table: "<column 0>_<column 1>.tif" -> column 2 for every row of the
# answers CSV, so an answer can be fetched by training file name.
# Every row is stored as-is; a header row, if the file has one, simply
# becomes one more (unused) entry.
# newline='' hands line-ending handling to the csv module, as its
# documentation requires for files passed to csv.reader.
with open(ANSWERS, newline='') as train_masks:
    answers = {
        "{}_{}.tif".format(row[0], row[1]): row[2]
        for row in csv.reader(train_masks)
    }

In [28]:
print(strftime("%Y-%m-%d %H:%M:%S", gmtime())) # don't forget to add 3 hours

result = []

# Built once so candidate names can be selected with a list of indices.
train_name_array = np.array(train_names)

# Both archives are opened once for the whole run; re-opening them for every
# test image is loop-invariant work.
with np.load(REAL_TRAIN, mmap_mode='r') as real_train, \
        np.load(REAL_TEST, mmap_mode='r') as real_test:
    for test_name, test_image in zip(test_names, test_images):
        # Pass 1: distance to the closest training image in `tree`, then
        # every training image within that distance plus EPS.
        nearest_dist, _ = tree.query(test_image, k=1, p=2)
        neighbor_indices = tree.query_ball_point(test_image, nearest_dist + EPS)

        # Pass 2: re-rank those candidates using the arrays stored under the
        # same names in the REAL_* archives.
        real_neighbors = np.array(
            [real_train[name].flatten() for name in train_name_array[neighbor_indices]]
        ).astype(int)
        real_example = real_test[test_name].flatten()

        real_tree = KDTree(real_neighbors, leafsize=10)
        _, best = real_tree.query(real_example, k=1, p=2)
        # `best` indexes the candidate list; map it back to a training name.
        real_neighbor_name = train_names[neighbor_indices[best]]

        # Row: test file name without its ".tif" suffix, answer of the winner.
        result.append((test_name[:-TIF], answers[real_neighbor_name]))

result.sort()

print(strftime("%Y-%m-%d %H:%M:%S", gmtime())) # don't forget to add 3 hours

2016-07-25 14:39:44
2016-07-26 01:32:08


In [36]:
# Convert the first field of every result row to int so the rows sort
# numerically rather than as strings.
int_result = sorted((int(name), pixels) for name, pixels in result)

In [37]:
# Write the submission file: a header row followed by one row per result.
# newline='' stops the text layer from translating the "\r\n" that
# csv.writer already emits, which would otherwise yield "\r\r\n" (blank
# lines between rows) on Windows.
with open(SUBMISSION, "w", newline='') as submission:
    writer = csv.writer(submission)
    writer.writerow(['img', 'pixels'])
    writer.writerows(int_result)