In [10]:
#warstwy przetwarzania wstępnego odpwiadają mapowaniu kolumn zadeklarowanych w źródłach danych np csv
import numpy as np
import pandas as pd
import tensorflow as tf

from tensorflow.keras import layers

In [11]:
dataset_url = 'http://storage.googleapis.com/download.tensorflow.org/data/petfinder-mini.zip'
csv_file = 'datasets/petfinder-mini/petfinder-mini.csv'

tf.keras.utils.get_file('petfinder-mini.zip', dataset_url,extract=True,cache_dir='.')
dataframe = pd.read_csv(csv_file)

In [12]:
dataframe.head()

Unnamed: 0,Type,Age,Breed1,Gender,Color1,Color2,MaturitySize,FurLength,Vaccinated,Sterilized,Health,Fee,Description,PhotoAmt,AdoptionSpeed
0,Cat,3,Tabby,Male,Black,White,Small,Short,No,No,Healthy,100,Nibble is a 3+ month old ball of cuteness. He ...,1,2
1,Cat,1,Domestic Medium Hair,Male,Black,Brown,Medium,Medium,Not Sure,Not Sure,Healthy,0,I just found it alone yesterday near my apartm...,2,0
2,Dog,1,Mixed Breed,Male,Brown,White,Medium,Medium,Yes,No,Healthy,0,Their pregnant mother was dumped by her irresp...,7,3
3,Dog,4,Mixed Breed,Female,Black,Brown,Medium,Short,Yes,No,Healthy,150,"Good guard dog, very alert, active, obedience ...",8,2
4,Dog,1,Mixed Breed,Male,Black,No Color,Medium,Short,No,No,Healthy,0,This handsome yet cute boy is up for adoption....,3,2


In [13]:
#Modyfikacja kolumny AdoptionSpeed -> 0 - brak adopcji, 1 - adopcja
dataframe['target'] = np.where(dataframe['AdoptionSpeed']==4,0,1)

dataframe = dataframe.drop(columns=['AdoptionSpeed','Description'])

In [14]:
dataframe.head()

Unnamed: 0,Type,Age,Breed1,Gender,Color1,Color2,MaturitySize,FurLength,Vaccinated,Sterilized,Health,Fee,PhotoAmt,target
0,Cat,3,Tabby,Male,Black,White,Small,Short,No,No,Healthy,100,1,1
1,Cat,1,Domestic Medium Hair,Male,Black,Brown,Medium,Medium,Not Sure,Not Sure,Healthy,0,2,1
2,Dog,1,Mixed Breed,Male,Brown,White,Medium,Medium,Yes,No,Healthy,0,7,1
3,Dog,4,Mixed Breed,Female,Black,Brown,Medium,Short,Yes,No,Healthy,150,8,1
4,Dog,1,Mixed Breed,Male,Black,No Color,Medium,Short,No,No,Healthy,0,3,1


In [15]:
#podział ramki na zbiory: treningowy, walidacyjny i testowy
train,val,test = np.split(dataframe.sample(frac=1),[int(0.8*len(dataframe)),int(0.9*len(dataframe))])

In [16]:
print(f'zbiór[ramka] treningowy: {len(train)}')
print(f'zbiór[ramka] walidacyjny: {len(val)}')
print(f'zbiór[ramka] testowy: {len(test)}')

zbiór[ramka] treningowy: 9229
zbiór[ramka] walidacyjny: 1154
zbiór[ramka] testowy: 1154


In [17]:
#tworzenie potoku wejściowego > utworzenie DataSet
def df_to_dataset(dataframe, shuffle=True, batch_size=32):
    df = dataframe.copy()
    labels = df.pop('target')
    df = {key:value[:,tf.newaxis] for key,value in dataframe.items()}
    ds = tf.data.Dataset.from_tensor_slices((dict(df),labels))
    if shuffle:
        ds = ds.shuffle(buffer_size=len(dataframe))
    ds = ds.batch(batch_size)
    ds = ds.prefetch(batch_size)
    return ds

In [18]:
batch_size = 5
train_ds = df_to_dataset(train,batch_size=batch_size)

  df = {key:value[:,tf.newaxis] for key,value in dataframe.items()}


In [19]:
[(train_features,label_batch)] = train_ds.take(1)
print(f'every features: {list(train_features.keys())}')
print(f'a batch of ages: {train_features["Age"]}')
print(f'a batch of targets: {label_batch}')

every features: ['Type', 'Age', 'Breed1', 'Gender', 'Color1', 'Color2', 'MaturitySize', 'FurLength', 'Vaccinated', 'Sterilized', 'Health', 'Fee', 'PhotoAmt', 'target']
a batch of ages: [[ 2]
 [ 2]
 [12]
 [ 2]
 [ 3]]
a batch of targets: [1 1 0 1 1]


In [21]:
#zastosowanie warstw przetwarzania wstępnego:
#Normalization - normalizacja danych wejściowych
#CategoryEncoding - zamienia elmenty kategorialne w liczbach całkowitych na reprezentacje jednoaktywne,wieloaktywne i gęste
#StringLookup - zmienia wartości kategorii ciągów na indeksy liczb całkowitych
#IntegerLookup  Zmienia całkowite wartości kategoryczne na indeksy całkowite

#ujednolicenie dystrybucji danych
def get_normalization_layer(name,dataset):
    normalizer = layers.Normalization(axis=None)
    feature_ds = dataset.map(lambda x,y:x[name])
    normalizer.adapt(feature_ds)
    return normalizer

In [22]:
photo_count_col = train_features['PhotoAmt']
layer = get_normalization_layer('PhotoAmt',train_ds)
layer(photo_count_col)

Instructions for updating:
Lambda fuctions will be no more assumed to be used in the statement where they are used, or at least in the same block. https://github.com/tensorflow/tensorflow/issues/56089


<tf.Tensor: shape=(5, 1), dtype=float32, numpy=
array([[-0.5132344 ],
       [ 0.4487811 ],
       [-0.19256255],
       [ 1.4107965 ],
       [ 1.0901247 ]], dtype=float32)>

In [23]:
def get_category_encoding_layer(name,dataset,dtype,max_tokens=None):
    if dtype == 'string':
        index = layers.StringLookup(max_tokens=max_tokens)
    else:
        index = layers.IntegerLookup(max_tokens=max_tokens)

    feature_ds = dataset.map(lambda x,y:x[name])
    index.adapt(feature_ds)

    encoder = layers.CategoryEncoding(num_tokens=index.vocabulary_size())
    return lambda feature:encoder(index(feature))

In [24]:
test_type_col = train_features['Type']
test_type_layer = get_category_encoding_layer(name='Type',dataset=train_ds,dtype='string')
test_type_layer(test_type_col)

<tf.Tensor: shape=(5, 3), dtype=float32, numpy=
array([[0., 1., 0.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 1., 0.],
       [0., 1., 0.]], dtype=float32)>

In [25]:
test_age_col = train_features['Age']
test_age_layer = get_category_encoding_layer(name='Age',dataset=train_ds,dtype='int64',max_tokens=5)
test_age_layer(test_age_col)

<tf.Tensor: shape=(5, 5), dtype=float32, numpy=
array([[0., 1., 0., 0., 0.],
       [0., 1., 0., 0., 0.],
       [1., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0.],
       [0., 0., 1., 0., 0.]], dtype=float32)>