In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import cv2 as cv
import sys

def progressbar(it, prefix="", size=60, out=sys.stdout): # Python3.3+
    count = len(it)
    def show(j):
        x = int(size*j/count)
        print("{}[{}{}] {}/{}".format(prefix, "#"*x, "."*(size-x), j, count), 
                end='\r', file=out, flush=True)
    show(0)
    for i, item in enumerate(it):
        yield item
        show(i+1)
    print("\n", flush=True, file=out)

### Data Loading

In [6]:
db = pd.read_csv("dict/overall.csv")

trainDb = db[db.group == "train"].reset_index(drop=True)
testDb = db[db.group == "test"].reset_index(drop=True)

In [7]:
# Get Validation Set Indices

nTrain = trainDb.shape[0]
nTest = testDb.shape[0]

np.random.seed(42)

msk = np.zeros(nTrain, dtype=int)
msk[:nTest] = 1
np.random.shuffle(msk)
msk = msk.astype(bool)

In [9]:
trainImages = np.zeros((trainDb.shape[0],300,300,3),dtype='uint8')
testImages = np.zeros((testDb.shape[0],300,300,3),dtype='uint8')

In [7]:
for i in progressbar(trainDb.index,'Computing: ',20):
    img = plt.imread("data/raw/{}".format(trainDb.file[i]))
    img = cv.resize(img, dsize=(300,300))
    trainImages[i] = img

for i in progressbar(testDb.index,'Computing: ',20):
    img = plt.imread("data/raw/{}".format(testDb.file[i]))
    img = cv.resize(img, dsize=(300,300))
    testImages[i] = img

Computing: [####################] 10479/10479



In [8]:
np.save("data/trainImages",trainImages)
np.save("data/testImages",testImages)

In [30]:
trainLabels = np.array([np.array((x,y)) for x,y in zip(trainDb.lat,trainDb.lng)])
testLabels = np.array([np.array((x,y)) for x,y in zip(testDb.lat,testDb.lng)])

In [31]:
np.save("data/trainLabels",trainLabels)
np.save("data/testLabels",testLabels)

In [3]:
trainImages = np.load("data/trainImages.npy")
testImages = np.load("data/testImages.npy")

trainLabels = np.load("data/trainLabels.npy")
testLabels = np.load("data/testLabels.npy")

In [8]:
valImages = trainImages[msk]
valLabels = trainLabels[msk]

trainImages = trainImages[~msk]
trainLabels = trainLabels[~msk]

In [9]:
print(trainLabels[1170])
print(valLabels[1170])

[38.2720508 -1.199847 ]
[38.42535736 -5.15649189]


In [11]:
np.save("data/trainImages",trainImages)
np.save("data/trainLabels",trainLabels)

np.save("data/valImages",valImages)
np.save("data/valLabels",valLabels)

In [24]:
db.group[db[db.group == "train"].index[msk]] = "validation"

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  db.group[db[db.group == "train"].index[msk]] = "validation"


In [25]:
db.group.value_counts()

train         7859
validation    2620
test          2620
Name: group, dtype: int64

In [26]:
db.to_csv("dict/overall.csv",index=False,encoding="utf-8")