In [3]:
import os 
import json
import cv2
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from utils import *
from img_utils import *
import importlib
import torch
import PIL
from torchvision.transforms import v2


from tensorflow import keras

from tensorflow.keras.utils import to_categorical

from tensorflow.keras.models import Sequential
from tensorflow.keras.models import load_model
from tensorflow.keras.models import Model


from tensorflow.keras.applications import EfficientNetB0, EfficientNetB1, EfficientNetB2, EfficientNetB3, EfficientNetB4, EfficientNetB5, EfficientNetB6, EfficientNetB7
from tensorflow.keras.applications import MobileNet, MobileNetV3Large, MobileNetV3Small, MobileNetV2


from tensorflow.keras import regularizers
from tensorflow.keras import layers
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense
from tensorflow.keras.layers import Conv1D, Conv2D, MaxPool2D, MaxPool1D, BatchNormalization
from tensorflow.keras.layers import Input, Flatten, Dense
from tensorflow.keras.layers import Activation, Dropout, Flatten, Dense


from tensorflow.keras.losses import categorical_crossentropy
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.optimizers import Adam, RMSprop
from keras.callbacks import EarlyStopping

from img_utils import plot_hist

%load_ext autoreload
%autoreload 2
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Prepraing the Data

## Loading the Data

In [4]:
path = "../nybolig-scrape/output"
houses_df_ = data_to_DF(path, max_houses=1000)

[ WARN:0@35.867] global loadsave.cpp:244 findDecoder imread_(''): can't open/read file: check file path/integrity
[ WARN:0@42.352] global loadsave.cpp:244 findDecoder imread_(''): can't open/read file: check file path/integrity


In [134]:
from utils import remove_outliers

#Take only Copenhagen, and Ejerlejligheder
houses_df = houses_df_[(houses_df_['postal_code'] >= 1000) & (houses_df_['postal_code'] <= 2920)]
houses_df = houses_df[houses_df['type'] == 'ejerlejlighed']
houses_df = remove_outliers(houses_df, 'price')
#remove if image_floorplan is None
houses_df = houses_df[houses_df['image_floorplan'].notna()]

In [135]:
#Preprocess the images 
resize = True 
gray_scale = False 
threshhold = True 
img_width = 224 
img_height = 224

num_labels = 5

houses_df['labels'], bins = pd.qcut(houses_df['price'], q=num_labels, labels=False, retbins=True) 
id_2_label = {i: label for i, label in enumerate(bins)}
label_2_id = {label: i for i, label in enumerate(bins)}

In [136]:
images = preprocess_images(houses_df, column_name="image_floorplan", width=img_width, height=img_height, resize=resize, gray_scale=gray_scale, threshhold=threshhold)

In [137]:
print("len house", len(houses_df))
print("len images", len(images))
houses_df['image_floorplan_pp'] = images.tolist()

  0%|          | 0/44 [45:01<?, ?it/s]

len house 272
len images 272





In [138]:
display(houses_df.head(1))

Unnamed: 0,address,postal_code,type,price,size,basement_size,rooms,year_built,year_rebuilt,energy_label,image_floorplan,labels,image_floorplan_pp
0,Amagerfælledvej 138 2. tv 2300 København S,2300,ejerlejlighed,3995000,89,,3,2018,,A2015,"[[[255, 255, 255], [255, 255, 255], [255, 255,...",2,"[[[255, 255, 255], [255, 255, 255], [255, 255,..."


# Using Hugginface 
Guide: https://huggingface.co/blog/fine-tune-vit

In [118]:
# Split into training and validation sets
from sklearn.model_selection import train_test_split
from datasets import Dataset

train_df, test_df = train_test_split(houses_df, test_size=0.2, random_state=0)
train_df, valid_df = train_test_split(train_df, test_size=0.20, random_state=0)

train_df_prepared = train_df[['image_floorplan_pp', 'labels']]
valid_df_prepared = valid_df[['image_floorplan_pp', 'labels']]
test_df_prepared = test_df[['image_floorplan_pp', 'labels']]

#Turn them into hugginface datasets
train_dataset = Dataset.from_pandas(train_df_prepared)
valid_dataset = Dataset.from_pandas(valid_df_prepared)
test_dataset = Dataset.from_pandas(test_df_prepared)

#print keys 
print(train_dataset.features)

{'image_floorplan_pp': Sequence(feature=Sequence(feature=Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None), length=-1, id=None), length=-1, id=None), 'labels': Value(dtype='int64', id=None), '__index_level_0__': Value(dtype='int64', id=None)}


In [126]:
from transformers import AutoModelForImageClassification, Trainer, TrainingArguments, ViTImageProcessor


model_name_or_path = 'google/vit-base-patch16-224-in21k'
processor = ViTImageProcessor.from_pretrained(model_name_or_path)

def transform(example_batch):
    # Take a list of PIL images and turn them to pixel values
    inputs = processor([x for x in np.array(example_batch['image_floorplan_pp'])], return_tensors='pt')

    # Don't forget to include the labels!
    inputs['labels'] = example_batch['labels']
    return inputs

prepared_train = train_dataset.with_transform(transform)
prepared_eval = valid_dataset.with_transform(transform)
prepared_test = test_dataset.with_transform(transform)

In [120]:
def collate_fn(batch):
    return {
        'pixel_values': torch.stack([x['pixel_values'] for x in batch]),
        'labels': torch.tensor([x['labels'] for x in batch])
    }

In [121]:
from datasets import load_metric

metric = load_metric("accuracy")
def compute_metrics(p):
    return metric.compute(predictions=np.argmax(p.predictions, axis=1), references=p.label_ids)

In [122]:
from transformers import ViTForImageClassification
model = ViTForImageClassification.from_pretrained(
    model_name_or_path,
    num_labels=(num_labels+1),
    id2label=id_2_label,
    label2id=label_2_id
)

Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224-in21k and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [123]:
from transformers import TrainingArguments

training_args = TrainingArguments(
  output_dir="./vit-base-beans",
  per_device_train_batch_size=16,
  evaluation_strategy="steps",
  num_train_epochs=4,

  save_steps=100,
  eval_steps=100,
  logging_steps=10,
  learning_rate=2e-4,
  save_total_limit=2,
  remove_unused_columns=False,
  push_to_hub=False,
  report_to='tensorboard',
  load_best_model_at_end=True,
)

In [124]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=collate_fn,
    compute_metrics=compute_metrics,
    train_dataset=prepared_train,
    eval_dataset=prepared_eval,
    tokenizer=processor,
)

In [125]:
train_results = trainer.train()
trainer.save_model()
trainer.log_metrics("train", train_results.metrics)
trainer.save_metrics("train", train_results.metrics)
trainer.save_state()

                                      
  0%|          | 0/44 [07:24<?, ?it/s]         

{'loss': 1.6861, 'learning_rate': 0.00015454545454545454, 'epoch': 0.91}


                                      
  0%|          | 0/44 [12:46<?, ?it/s]         

{'loss': 1.5421, 'learning_rate': 0.00010909090909090909, 'epoch': 1.82}


                                      
  0%|          | 0/44 [18:42<?, ?it/s]         

{'loss': 1.3495, 'learning_rate': 6.363636363636364e-05, 'epoch': 2.73}


                                      
  0%|          | 0/44 [23:49<?, ?it/s]         

{'loss': 1.1239, 'learning_rate': 1.8181818181818182e-05, 'epoch': 3.64}


                                      
100%|██████████| 44/44 [25:17<00:00, 34.49s/it]


{'train_runtime': 1517.406, 'train_samples_per_second': 0.456, 'train_steps_per_second': 0.029, 'train_loss': 1.3856831030412153, 'epoch': 4.0}
***** train metrics *****
  epoch                    =        4.0
  train_loss               =     1.3857
  train_runtime            = 0:25:17.40
  train_samples_per_second =      0.456
  train_steps_per_second   =      0.029


In [127]:
metrics = trainer.evaluate(prepared_test)
trainer.log_metrics("eval", metrics)
trainer.save_metrics("eval", metrics)

100%|██████████| 7/7 [01:04<00:00,  9.17s/it]

***** eval metrics *****
  epoch                   =        4.0
  eval_accuracy           =     0.4364
  eval_loss               =     1.4122
  eval_runtime            = 0:01:14.62
  eval_samples_per_second =      0.737
  eval_steps_per_second   =      0.094





# Using Keras + Huggingface
Guide: https://www.philschmid.de/image-classification-huggingface-transformers-keras

### Processing Data

In [160]:
from transformers import ViTFeatureExtractor
from tensorflow import keras
from tensorflow.keras import layers

model_id = 'google/vit-base-patch16-224-in21k'
feature_extractor = ViTFeatureExtractor.from_pretrained(model_id)

In [177]:

data_augmentation = keras.Sequential(
    [
        layers.Resizing(feature_extractor.size, feature_extractor.size),
        layers.Rescaling(1./255),
        layers.RandomFlip("horizontal"),
        layers.RandomRotation(factor=0.01),
        layers.RandomZoom(
            height_factor=0.05, width_factor=0.05
        ),
    ],
    name="data_augmentation",
)

def augmentation(examples):
    # print(examples["img"])
    examples["pixel_values"] = [data_augmentation(image) for image in np.array(examples["image_floorplan_pp"])]
    return examples

# basic processing (only resizing)
def process(examples):
    examples.update(feature_extractor(np.array(examples['image_floorplan_pp']), ))
    return examples

houses_ds = houses_df[['image_floorplan_pp', 'labels']]
#Turn into huggingface dataset
houses_ds = Dataset.from_pandas(houses_ds)
processed_houses_ds = houses_ds.map(process, batched=True)

test_size = 0.15
processed_dataset = processed_houses_ds.shuffle().train_test_split(test_size=test_size)

Map: 100%|██████████| 272/272 [02:27<00:00,  1.85 examples/s]


### HyperParamters

In [179]:
from huggingface_hub import HfFolder
import tensorflow as tf

id_2_label = {i: label for i, label in enumerate(bins)}
label_2_id = {label: i for i, label in enumerate(bins)}

num_train_epochs = 5
train_batch_size = 32
eval_batch_size = 32
learning_rate = 3e-5
weight_decay_rate=0.01
num_warmup_steps=0
output_dir=model_id.split("/")[1]
#hub_token = HfFolder.get_token() # or your token directly "hf_xxx"
hub_model_id = f'{model_id.split("/")[1]}-euroSat'

### Converting the dataset to a tf.data.Dataset

In [180]:
from transformers import DefaultDataCollator

# Data collator that will dynamically pad the inputs received, as well as the labels.
data_collator = DefaultDataCollator(return_tensors="tf")

# converting our train dataset to tf.data.Dataset
tf_train_dataset = processed_dataset["train"].to_tf_dataset(
   columns=['pixel_values'],
   label_cols=["labels"],
   shuffle=True,
   batch_size=train_batch_size,
   collate_fn=data_collator)

# converting our test dataset to tf.data.Dataset
tf_eval_dataset = processed_dataset["test"].to_tf_dataset(
   columns=['pixel_values'],
   label_cols=["labels"],
   shuffle=True,
   batch_size=eval_batch_size,
   collate_fn=data_collator)

### Downloading and finetuning

In [181]:
from transformers import TFViTForImageClassification, create_optimizer
import tensorflow as tf

# create optimizer wight weigh decay
num_train_steps = len(tf_train_dataset) * num_train_epochs
optimizer, lr_schedule = create_optimizer(
    init_lr=learning_rate,
    num_train_steps=num_train_steps,
    weight_decay_rate=weight_decay_rate,
    num_warmup_steps=num_warmup_steps,
)

# load pre-trained ViT model
model = TFViTForImageClassification.from_pretrained(
    model_id,
    num_labels=num_labels+1,
    id2label=id_2_label,
    label2id=label_2_id,
)

# define loss
loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

# define metrics
metrics=[
    tf.keras.metrics.SparseCategoricalAccuracy(name="accuracy"),
    tf.keras.metrics.SparseTopKCategoricalAccuracy(3, name="top-3-accuracy"),
]

# compile model
model.compile(optimizer=optimizer,
              loss=loss,
              metrics=metrics
              )

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFViTForImageClassification: ['pooler.dense.weight', 'pooler.dense.bias']
- This IS expected if you are initializing TFViTForImageClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFViTForImageClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFViTForImageClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [174]:
# alternatively create Image Classification model using Keras Layer and ViTModel
# here you can also add the processing layers of keras

import tensorflow as tf
from transformers import TFViTModel

base_model = TFViTModel.from_pretrained('google/vit-base-patch16-224-in21k')


# inputs
pixel_values = tf.keras.layers.Input(shape=(3,224,224), name='pixel_values', dtype='float32')

# model layer
vit = base_model.vit(pixel_values)[0]
classifier = tf.keras.layers.Dense(6, activation='softmax', name='outputs')(vit[:, 0, :])

# model
keras_model = tf.keras.Model(inputs=pixel_values, outputs=classifier)


All PyTorch model weights were used when initializing TFViTModel.

All the weights of TFViTModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFViTModel for predictions without further training.


In [182]:
train_results = model.fit(
    tf_train_dataset,
    validation_data=tf_eval_dataset,
    epochs=num_train_epochs,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [183]:
model.evaluate(tf_eval_dataset)




[1.6334806680679321, 0.1463414579629898, 0.7804877758026123]

# Here we go again, but hopefully simpler

In [210]:
from transformers import AutoImageProcessor, TFViTForImageClassification

import tensorflow as tf

from datasets import load_dataset
image = np.array(houses_df['image_floorplan_pp'].iloc[0])

image_processor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-224")

model = TFViTForImageClassification.from_pretrained("google/vit-base-patch16-224")

inputs = image_processor(image, return_tensors="tf")

logits = model(**inputs).logits

# model predicts one of the 1000 ImageNet classes

predicted_label = int(tf.math.argmax(logits, axis=-1))

print(model.config.id2label[predicted_label])

All PyTorch model weights were used when initializing TFViTForImageClassification.

All the weights of TFViTForImageClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFViTForImageClassification for predictions without further training.


bannister, banister, balustrade, balusters, handrail


In [211]:
print(model.config.id2label[predicted_label])

bannister, banister, balustrade, balusters, handrail


In [184]:
from tensorflow.keras import Sequential
from tensorflow.keras.layers import (
    Conv2D,
    BatchNormalization,
    LayerNormalization,
    Dense,
    Input,
    Embedding,
    MultiHeadAttention,
    Layer,
    Add,
    Resizing,
    Rescaling,
    Permute,
    Flatten,
    RandomFlip,
    RandomRotation,
    RandomContrast,
    RandomBrightness
)
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras.metrics import CategoricalAccuracy, TopKCategoricalAccuracy
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import image_dataset_from_directory

from transformers import ViTConfig, ViTModel, AutoImageProcessor, TFViTModel


In [None]:
resize_rescale_reshape = Sequential([
    Resizing(224, 224),
    Rescaling(1./255),
    # transformer expects image shape (3,224,224)
    Permute((3,1,2))
])

base_model = TFViTForImageClassification.from_pretrained('google/vit-base-patch16-224-in21k')
inputs = Input(shape=(224,224,3))
x = resize_rescale_reshape(inputs)
x = base_model.vit(x)[0][:,0,:] 
output= Dense(5, activation='softmax')(x) 

vit_model = Model(inputs=inputs, outputs=output)

In [196]:
vit_model.compile(
    optimizer = Adam(learning_rate = 0.001),
    loss = CategoricalCrossentropy(),
    metrics = [CategoricalAccuracy(name='accuracy')]
)

In [227]:
vit_model.summary()

Model: "model_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, 224, 224, 3)]     0         
                                                                 
 sequential_11 (Sequential)  (None, 3, 224, 224)       0         
                                                                 
 vit (TFViTMainLayer)        TFBaseModelOutputWithPo   85798656  
                             oling(last_hidden_state             
                             =(None, 197, 768),                  
                              pooler_output=None, hi             
                             dden_states=None, atten             
                             tions=None)                         
                                                                 
 tf.__operators__.getitem_3  (None, 768)               0         
  (SlicingOpLambda)                                        

In [237]:
train_df, test_df = train_test_split(houses_df, test_size=0.2, random_state=0)
train_df, valid_df = train_test_split(train_df, test_size=0.20, random_state=0)


#Load the data using tf.data.Dataset
train_ds = tf.data.Dataset.from_tensor_slices((np.array(train_df['image_floorplan_pp']), train_df['labels']))
valid_ds = tf.data.Dataset.from_tensor_slices((np.array(valid_df['image_floorplan_pp']), valid_df['labels']))
test_ds = tf.data.Dataset.from_tensor_slices((np.array(test_df['image_floorplan_pp']), test_df['labels']))

ValueError: Failed to convert a NumPy array to a Tensor (Unsupported object type list).

In [230]:
#show the prepared dataset
print(prepared_train.features)


{'image_floorplan_pp': Sequence(feature=Sequence(feature=Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None), length=-1, id=None), length=-1, id=None), 'labels': Value(dtype='int64', id=None), '__index_level_0__': Value(dtype='int64', id=None)}


In [231]:
#Train the model
history = vit_model.fit(
    prepared_train, 
    validation_data=prepared_eval,
    epochs=5
)

ValueError: Failed to find data adapter that can handle input: <class 'datasets.arrow_dataset.Dataset'>, <class 'NoneType'>