<a href="https://www.kaggle.com/code/kingcee/nftprediction-finalsubmission?scriptVersionId=144823436" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [None]:
## Clean Up Data

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
train_df = pd.read_csv('/kaggle/input/how-much-would-you-pay-for-a-fake-cat/train.csv')
test_df = pd.read_csv('/kaggle/input/how-much-would-you-pay-for-a-fake-cat/test.csv')
train_df.head()

In [None]:
train_df['price']

In [None]:
# Our dtype is object, which generally means data is interpreted as a string
def clean_price(price_str):
    """Convert a price such as ``'1,234'`` into a float.

    Thousands separators (commas) are removed from string input before
    conversion. Values that are already numeric are passed straight to
    ``float`` so the function can safely be re-applied to a cleaned column.

    Args:
        price_str: The price as text (e.g. ``'1,234.5'``) or as a number.

    Returns:
        The price as a ``float``.

    Raises:
        ValueError: If the text left after removing commas is not a number.
    """
    if isinstance(price_str, str):
        price_str = price_str.replace(",", "")
    return float(price_str)

clean_price('1,234')

In [None]:
# apply takes a function and runs it on every item in your series. It returns the result as a new column

train_df['clean_price'] = train_df['price'].apply(clean_price)

train_df['clean_price']

## Now that data is clean, it can be visualized with a histogram

In [None]:
plt.hist(train_df['clean_price'])
plt.show()

In [None]:
# The distribution is heavily skewed, and the highest price is over 100 ETH.
# Prices that high are anomalous, so we cap the column at 100 ETH (and floor it at 0).
train_df['clean_price'] = train_df['clean_price'].clip(0,100)

# Plot the histogram again, now using the capped prices.
plt.hist(train_df['clean_price'])
plt.show()

## Next, we look at speed, since it might affect the value of a cat

In [None]:
train_df['speed'].value_counts()

In [None]:
# It's the number we care about, so we can write a function to extract it and normalize it.
# The re (regular expressions) library lets you search and manipulate text.
import re

# Number of minutes represented by each cooldown unit suffix.
unit_map = {
    'm': 1,
    'h': 60,
    'd': 60*24,
    'w': 60*24*7,
}

def clean_speed(speed_str):
    """Convert a speed string from the dataset into a cooldown in minutes.

    The cooldown is the parenthesised part of the string, e.g. the ``(4h)``
    in ``"Plodding (4h)"``: a whole number followed by one of the unit
    letters in ``unit_map`` (m, h, d, w). The first such group is used.

    Args:
        speed_str: Speed description such as ``"Plodding (4h)"``.

    Returns:
        The cooldown expressed in minutes, as an ``int``.

    Raises:
        ValueError: If the string contains no ``(<number><unit>)`` group.
    """
    # Raw string: "\(" and "\d" are regex escapes, not Python string escapes
    # (non-raw, they trigger invalid-escape warnings on recent Pythons).
    match = re.search(r'\((\d+)([mhdw])\)', speed_str)
    if match is None:
        raise ValueError(f"no cooldown such as '(24h)' found in speed {speed_str!r}")
    cooldown_count = int(match.group(1))
    cooldown_unit = match.group(2)
    return cooldown_count * unit_map[cooldown_unit]

train_df['clean_speed'] = train_df['speed'].apply(clean_speed)
train_df['clean_speed']

In [None]:
#test
clean_speed("Plodding (4h)")

## Visualize cooldown_times for a better assessment

In [None]:
plt.hist(train_df['clean_speed'])
plt.show()

In [None]:
# Visualize the relationship between cooldown and price.
# The points are almost transparent so that dense regions show up darker.
fig, ax = plt.subplots()
ax.scatter(train_df['clean_speed'], train_df['clean_price'], c=(0, .5, 1, 0.05), s=100)
ax.set_title('Cooldown vs Price')
ax.set_xlabel('Cooldown (minutes)')
ax.set_ylabel('Price (Eth)')
plt.show()

In [None]:
# This graph makes it a little difficult to see what's going on.
# One tool we can use to get a sense of the relationship between these things is their correlation coefficient.
# If the coefficient is positive and large, a higher number in one value is likely to go along with a high number in the other.
# If the coefficient is negative and large, it means the opposite: as one value goes up, the other goes down.
train_df['clean_speed'].corr(train_df['clean_price'])

In [None]:
def predict_price(cooldown, b=100, m=-0.01):
    """Predict a price with the straight line ``price = b + m * cooldown``.

    Args:
        cooldown: Cooldown in minutes.
        b: Intercept, i.e. the default price of a cat with zero cooldown.
        m: Slope, i.e. the amount the price changes for each minute of
            cooldown (negative means the price falls as cooldown grows).

    Returns:
        The predicted price. With the defaults this is the hand-picked
        guess ``100 - 0.01 * cooldown``.
    """
    return b + (m * cooldown)

train_df['predicted_price'] = train_df['clean_speed'].apply(predict_price)
train_df['predicted_price']

In [None]:
plt.scatter(train_df['clean_speed'], train_df['clean_price'], c=(0, .5, 1, 0.05), s=100)
plt.scatter(train_df['clean_speed'], train_df['predicted_price'], c=(1, .01, .01, 0.1))
plt.title('Cooldown vs Price')
plt.xlabel('Cooldown (minutes)')
plt.ylabel('Price (Eth)')
plt.show()

In [None]:
# "Loss" measures how far our predictions are from the right answers.
# Here it is the mean absolute difference between actual and predicted price.
import numpy as np
(train_df['clean_price'] - train_df['predicted_price']).abs().mean()

In [None]:
# So on average, we are 69 ETH away from the right answer. This is ~100k, so not too great, but it's a start.
# Instead of picking random values for m and b, what if we let an algorithm pick the best fit for us
# and use the values that produce the lowest error?
# This is the idea behind linear regression!

from sklearn.linear_model import LinearRegression

# Create LinearRegression Model
lr_model = LinearRegression()
# Train unit using .fit() method


In [None]:
train_df['clean_speed']

In [None]:
# pass in input data and output data.
# We have outlined Questions and Answers
# Input data and Output Data
# Question is: Based on x ('clean_speed') speed, what should the price be.
# so we train data with a cleaned up speed and a cleaned up price
lr_model.fit(train_df[['clean_speed']], train_df['clean_price'])

In [None]:
# we predict the same way we train, but we leave out the price since we are predicting as if we don't know it.
train_df['lr_predicted_price'] = lr_model.predict(train_df[['clean_speed']])
train_df['lr_predicted_price']


In [None]:
plt.scatter(train_df['clean_speed'], train_df['clean_price'], c=(0, .5, 1, .05), s=100)
plt.scatter(train_df['clean_speed'], train_df['lr_predicted_price'], c=(1, .01, .01, .1), s=10)
plt.title('Cooldown vs Price')
plt.xlabel('Cooldown (minutes)')
plt.ylabel('Price (eth)')
plt.show()

In [None]:
np.mean(np.abs(train_df['clean_price'] - train_df['lr_predicted_price']))

In [None]:
test_df

In [None]:
test_df['clean_speed'] = test_df['speed'].apply(clean_speed)
test_df['clean_speed']

In [None]:
test_df['price'] = lr_model.predict(test_df[['clean_speed']])

## Creating a Model with the Log of Our Data

In [None]:
# Add log(cooldown) as a second feature.
# Each frame's feature must come from that frame's own 'clean_speed' column:
# assigning a train_df-derived Series to test_df would align on the index and
# give the test rows another cat's value (or NaN where the index has no match).
train_df['log_clean_speed'] = np.log(train_df['clean_speed'])
test_df['log_clean_speed'] = np.log(test_df['clean_speed'])

In [None]:
lr_more_features_model = LinearRegression()
lr_more_features_model.fit(train_df[['clean_speed', 'log_clean_speed']], train_df['clean_price'])
train_df['more_features_prediction'] = lr_more_features_model.predict(train_df[['clean_speed', 'log_clean_speed']])

In [None]:
# Actual prices (blue) against the predictions of the two-feature model (red).
# The red series must be 'more_features_prediction'; plotting 'lr_predicted_price'
# here would just redraw the earlier single-feature fit.
plt.scatter(train_df['clean_speed'], train_df['clean_price'], c=(0, .5, 1, .05), s=100)
plt.scatter(train_df['clean_speed'], train_df['more_features_prediction'], c=(1, .01, .01, .1), s=10)
plt.title('Cooldown vs Price')
plt.xlabel('Cooldown (minutes)')
plt.ylabel('Price (eth)')
plt.show()

In [None]:
np.mean(np.abs(train_df['clean_price'] - train_df['more_features_prediction']))

In [None]:
test_df['price'] = lr_more_features_model.predict(test_df[['clean_speed', 'log_clean_speed']])

In [None]:
submission_columns = test_df[['id', 'price']]
submission_columns.to_csv('submission.csv', index=False)

In [None]:
# Packages Used
import pandas as pd # Our old friend pandas for importing our tabular data
import cv2 # cv2 to manipulating images
import os # os for navigating our filesystem

In [None]:
#import training set
train_df = pd.read_csv('/kaggle/input/how-much-would-you-pay-for-a-fake-cat/train.csv')
train_df.head()

In [None]:
# All our images are in this directory:
image_directory = '/kaggle/input/how-much-would-you-pay-for-a-fake-cat/images/images'
sorted(os.listdir(image_directory))[:10]

In [None]:
# If we open one of our image files, we can see the data inside it
file_path = os.path.join(image_directory, '00068810bf2226f2.png')
# cv2.imread takes an IMREAD_* flag; COLOR_* constants are conversion codes
# meant for cv2.cvtColor, so passing one here only works by numeric accident.
im = cv2.imread(file_path, cv2.IMREAD_COLOR)
if im is None:
    # cv2.imread reports a missing or unreadable file by returning None
    raise FileNotFoundError(f'could not read image: {file_path}')
# OpenCV loads channels in BGR order; convert to RGB, which plt.imshow expects
im = cv2.cvtColor(im, cv2.COLOR_BGR2RGB)
im

In [None]:
# Our image is a 3 DIMENSIONAL array
# The first 2 dimensions are simple: the x and y axes of our image
# The 3rd dimension is the color dimension: this holds information about how much red, blue, green and transparency that pixel has
# We can use matplot lib to show that image data in the way we expect

import matplotlib.pyplot as plt

plt.imshow(im)

In [None]:
# The image is literally just a numpy array, so we can see the exact dimensions by looking at the .shape property
im.shape

In [None]:
# For the sake of demonstrating that there's nothing "special" about this array, let's play around with it:
import numpy as np

red_copy = im.copy()
# Add in a wider integer type and clamp to the channel maximum. Adding 150
# directly to an 8-bit channel wraps around (e.g. 200 + 150 -> 94), which
# would make the brightest pixels darker instead of redder.
# NOTE(review): assumes an integer pixel dtype (e.g. uint8) - confirm.
channel_max = np.iinfo(red_copy.dtype).max
red_copy[:,:,0] = np.clip(red_copy[:,:,0].astype(np.int64) + 150, 0, channel_max)

# Now everything is more red!
plt.imshow(red_copy)

In [None]:
# 3000x3000 is probably bigger than we need, and it might blow up our kaggle environment if we try to store all 2k of the photos
# Let's try resizing, and looking at the resulting image to see how much detail we lose:
for h in [2000, 1000, 500, 200, 100, 50, 20]:
    resized_im = cv2.resize(im, (h, h,))
    plt.title(f'{h}x{h}')
    plt.imshow(resized_im)
    plt.show()

In [None]:
# 200x200 seems to keep a lot of the important detail, but will make our resulting images less than 1% the size of the originals!
# This is something you can play with, but lets resize everything to 200x200--this will speed up future work

In [None]:
import os
import tqdm
resized_directory = '/kaggle/working/resized'
# cv2.imwrite does not create missing directories - it just returns False -
# so make sure the output directory exists. exist_ok keeps the cell re-runnable.
os.makedirs(resized_directory, exist_ok=True)
for filename in tqdm.tqdm(os.listdir(image_directory)):
    # cv2.imread takes an IMREAD_* flag (COLOR_* codes belong to cv2.cvtColor)
    im = cv2.imread(os.path.join(image_directory, filename), cv2.IMREAD_COLOR)
    if im is None:
        # Not a readable image: report it and carry on with the rest
        tqdm.tqdm.write(f'skipping unreadable file: {filename}')
        continue
    im = cv2.resize(im, (200, 200))
    write_path = os.path.join(resized_directory, filename)
    if not cv2.imwrite(write_path, im):
        raise IOError(f'failed to write resized image: {write_path}')

In [None]:
import numpy as np
import pandas as pd
import os
import cv2
import tqdm
import matplotlib.pyplot as plt

In [None]:
train_df = pd.read_csv('/kaggle/input/how-much-would-you-pay-for-a-fake-cat/train.csv')

image_dir = '/kaggle/working/resized'
image_data = []
for img_id in tqdm.tqdm(train_df['id']):
    img_path = os.path.join(image_dir, img_id + '.png')
    # Here, to keep our model simple we're reading in grayscale, but
    # we can play with this later
    im = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)
    if im is None:
        # cv2.imread returns None for a missing or unreadable file; fail with
        # the offending path instead of a TypeError on the division below
        raise FileNotFoundError(f'could not read image: {img_path}')
    im = im / 255
    # Append a trailing axis of length 1 to the 2-D grayscale array
    im = np.expand_dims(im, axis=-1)
    image_data.append(im)

In [None]:
type(image_data)

In [None]:
list

In [None]:
for image in image_data[:4]:
    plt.imshow(image, cmap='gray')
    plt.show()

In [None]:
input_data = np.array(image_data).astype(np.float16)

In [None]:
type(input_data)

In [None]:
train_df['price'] = train_df['price'].apply(lambda p: p.replace(',', ''))
labels = train_df['price'].values.astype(np.float16)
labels = np.clip(labels, 0, 100)
labels

In [None]:
np.max(labels)

In [None]:
# Reality check, do we have the same number of input images and labels 
print(len(input_data))
print(len(labels))

In [None]:
input_data[0].shape

In [None]:
import tensorflow as tf

simple_model = tf.keras.models.Sequential([
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(units=1)
])

simple_model.compile(loss='mean_absolute_error', optimizer='adam')

history = simple_model.fit(input_data, labels, epochs=100, batch_size=16, verbose=1)

In [None]:
simple_model.summary()

In [None]:
dense_weights = simple_model.layers[1].get_weights()[0]
dense_weights

In [None]:
dense_weights.reshape(200,200).shape

In [None]:
plt.imshow(dense_weights.reshape(200,200), cmap='gray')

In [None]:
import tensorflow as tf

# Flatten each image, pass it through a 64-unit hidden layer, then regress to one price.
# NOTE(review): no activation is passed to Dense(units=64); with the Keras default
# (no activation) the two stacked Dense layers still compose to a linear function
# of the pixels. Consider activation='relu' if a non-linear model is intended.
model = tf.keras.models.Sequential([
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(units=64),
    tf.keras.layers.Dense(units=1)
])

# Mean absolute error is the training loss, optimized with Adam.
model.compile(loss='mean_absolute_error', optimizer='adam')
history = model.fit(input_data, labels, epochs=20, batch_size=16, verbose=1)

In [None]:
model.summary()

In [None]:
# Let's actually visualize how our performance changes over time
plt.plot(history.history['loss'])

In [None]:
# Lets create a validation set so we can get a sense of how well our model generalizes:

model = tf.keras.models.Sequential([
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(units=256),
    tf.keras.layers.Dense(units=1)
])

model.compile(loss='mean_absolute_error', optimizer='adam')
history = model.fit(
    input_data,
    labels,
    validation_split=0.1,
    epochs=50, batch_size=16, verbose=0)
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.legend(['loss', 'val_loss'], loc='upper left')

## Making Predictions on Test Data

In [None]:
test_df = pd.read_csv('/kaggle/input/how-much-would-you-pay-for-a-fake-cat/test.csv')

image_dir = '/kaggle/working/resized'
test_image_data = []
for img_id in tqdm.tqdm(test_df['id']):
    img_path = os.path.join(image_dir, img_id + '.png')
    # Here, to keep our model simple we're reading in grayscale, but
    # we can play with this later
    im = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)
    if im is None:
        # cv2.imread returns None for a missing or unreadable file; fail with
        # the offending path instead of a TypeError on the division below
        raise FileNotFoundError(f'could not read image: {img_path}')
    im = im / 255
    # Append a trailing axis of length 1 to the 2-D grayscale array
    im = np.expand_dims(im, axis=-1)
    test_image_data.append(im)

In [None]:
test_input_data = np.array(test_image_data).astype(np.float16)

In [None]:
predictions = model.predict(test_input_data)

In [None]:
test_df['price'] = predictions

In [None]:
submission_df = test_df[['id', 'price']]

In [None]:
submission_df.to_csv('submission.csv', index=False)

In [None]:
import numpy as np
import pandas as pd
import os
import cv2
import tqdm
import matplotlib.pyplot as plt

train_df = pd.read_csv('/kaggle/input/how-much-would-you-pay-for-a-fake-cat/train.csv')
test_df = pd.read_csv('/kaggle/input/how-much-would-you-pay-for-a-fake-cat/test.csv')

image_dir = '/kaggle/working/resized'

def load_image_data(ids):
    """Load the resized PNG for every id and scale its pixels by 1/255.

    Each image is read from ``image_dir`` as ``<id>.png`` with OpenCV's
    default (colour) mode, divided by 255, and given one extra trailing axis
    of length 1.

    Args:
        ids: Iterable of image id strings (file names without ``.png``).

    Returns:
        A list with one float array per id, in the order of ``ids``.

    Raises:
        FileNotFoundError: If an image is missing or cannot be decoded.
    """
    image_data = []
    for img_id in tqdm.tqdm(ids):
        img_path = os.path.join(image_dir, img_id + '.png')
        im = cv2.imread(img_path)
        if im is None:
            # cv2.imread returns None instead of raising on a bad path
            raise FileNotFoundError(f'could not read image: {img_path}')
        im = im / 255
        # The trailing singleton axis is kept on purpose: a later cell indexes
        # it away with [:, :, :, :, 0], which would fail without it.
        im = np.expand_dims(im, axis=-1)
        image_data.append(im)
    return image_data

train_images = load_image_data(train_df['id'])
test_images = load_image_data(test_df['id'])

In [None]:
train_input_data = np.array(train_images).astype(np.float16)
test_input_data = np.array(test_images).astype(np.float16)

In [None]:
train_df['price'] = train_df['price'].apply(lambda p: p.replace(',', ''))
train_labels = train_df['price'].values.astype(np.float16)
train_labels = np.clip(train_labels, 0, 100)
train_labels

In [None]:
# Lets check our dimensions
print(type(train_input_data))
print(train_input_data.shape)
print(type(train_labels))
print(train_labels.shape)

In [None]:
# Hmmm we have an extra dimension at the end of our input data; let's investigate
train_input_data[0][0][0] # This should show us the pixel from the first image

In [None]:
# We don't really need each rgb value to have its own dimension, so we can remove that
# Keep all the                      images, rows, columns, pixels, but just the first value in each color dimension
train_input_data = train_input_data[:,      :,    :,       :,      0]
test_input_data = test_input_data[:,:,:,:,0]

In [None]:
train_input_data[0].shape

In [None]:
train_input_data[0][0][0] 

In [None]:
train_input_data.shape

In [None]:
import tensorflow as tf
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(train_input_data, train_labels, test_size=.2)

In [None]:
# Let's test it out with our model from last time!
import tensorflow as tf
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(train_input_data, train_labels, test_size=.2)

# model = tf.keras.models.Sequential([
#     tf.keras.layers.Flatten(),
#     tf.keras.layers.Dense(units=256, activation='relu'),
#     tf.keras.layers.Dense(units=1)
# ])

# model.compile(loss='mean_absolute_error', optimizer='adam')
# history = model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=10, batch_size=16, verbose=1)

In [None]:
# Hmmm, it didn't really do all that much better; some of this can be a result of the random way we broke up our training and testing data.
# Let's try multiple different train/test splits to get a better sense of how it will perform on unseen data.
from tensorflow import keras 

tf.keras.backend.clear_session()
# final_val_losses = []
# for i in range(4):
#     X_train, X_test, y_train, y_test = train_test_split(train_input_data, train_labels, test_size=.2, random_state=i)
#     tf.keras.backend.clear_session()
#     keras.utils.set_random_seed(1)
#     model = tf.keras.models.Sequential([
#         tf.keras.layers.Flatten(),
#         tf.keras.layers.Dense(units=256, activation='relu'),
#         tf.keras.layers.Dense(units=1)
#     ])

#     model.compile(loss='mean_absolute_error', optimizer='adam')
#     history = model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=10, batch_size=16, verbose=0)
#     final_val_loss = history.history['val_loss'][-1]
#     print(final_val_loss)
#     final_val_losses.append(final_val_loss)
# print('Average Val Loss:')
# print(np.mean(final_val_losses))

In [None]:
X_train, X_test, y_train, y_test = train_test_split(train_input_data, train_labels, test_size=.2, random_state=1)

In [None]:
x = np.array([
    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
    [0, 0, 1, 1, 1, 1, 1, 0, 0, 0],
    [0, 0, 1, 0, 0, 0, 1, 0, 0, 0],
    [0, 0, 1, 0, 0, 0, 1, 0, 0, 0],
    [0, 0, 1, 0, 0, 0, 1, 0, 0, 0],
    [0, 0, 1, 1, 1, 1, 1, 0, 0, 0],
    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
    [0, 0, 0, 0, 0, 0, 0, 0, 0,0 ],
    [0, 0, 0, 0, 0, 0, 0, 0, 0,0 ],
]).astype(float)

In [None]:
plt.imshow(x, cmap='gray')

In [None]:
# 2x2 average pooling by hand: every cell of new_x is the mean of one
# non-overlapping 2x2 window of x.
new_x = np.zeros((5, 5), dtype=float)
for i in range(x.shape[0] // 2):
    for j in range(x.shape[1] // 2):
        new_value = x[2 * i:2 * i + 2, 2 * j:2 * j + 2].mean()
        new_x[i, j] = new_value

In [None]:
new_x

In [None]:
plt.imshow(new_x, cmap='gray')

In [None]:
plt.imshow(x, cmap='gray')

In [None]:
# Right now our dense neural network kind of assumes that any two pixels are equally related
# But we know that's not true--pixels close together are more likely to share a meaningful relationship

In [None]:
tf.keras.backend.clear_session()
# One small convolution + pooling stage, then a dense regression head.
cnn_model = tf.keras.models.Sequential([
    tf.keras.layers.Conv2D(8, kernel_size=(3, 3), activation="relu"),
    tf.keras.layers.MaxPooling2D(pool_size=(2, 2)),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dropout(.5),
    tf.keras.layers.Dense(units=128, activation='relu'),
    tf.keras.layers.Dense(units=1)
])

cnn_model.compile(loss='mean_absolute_error', optimizer='adam')
# Fit on the training split only. (X_test, y_test) was carved out of
# train_input_data, so fitting on the full array would put the validation
# rows in the training set and make val_loss look better than it really is.
history = cnn_model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=30, batch_size=16, verbose=1)

In [None]:
cnn_model.summary()

In [None]:
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.legend(['loss', 'val_loss'], loc='upper left')

In [None]:
test_df['price'] = cnn_model.predict(test_input_data)

In [None]:
submission_df = test_df[['id', 'price']]
submission_df.to_csv('submission.csv', index=False)

In [None]:
import numpy as np
import pandas as pd
import os
import cv2
import tqdm
import matplotlib.pyplot as plt

train_df = pd.read_csv('/kaggle/input/how-much-would-you-pay-for-a-fake-cat/train.csv')
test_df = pd.read_csv('/kaggle/input/how-much-would-you-pay-for-a-fake-cat/test.csv')

image_dir = '/kaggle/working/resized'

def load_image_data(ids):
    """Load the resized PNG for every id and scale its pixels by 1/255.

    Each image is read from ``image_dir`` as ``<id>.png`` with OpenCV's
    default (colour) mode and divided by 255. The arrays are returned exactly
    as OpenCV shapes them, with no extra trailing axis, so that stacking them
    gives the rank the image model's ``Input(shape=(200,200,3,))`` declares.

    Args:
        ids: Iterable of image id strings (file names without ``.png``).

    Returns:
        A list with one float array per id, in the order of ``ids``.

    Raises:
        FileNotFoundError: If an image is missing or cannot be decoded.
    """
    image_data = []
    for img_id in tqdm.tqdm(ids):
        img_path = os.path.join(image_dir, img_id + '.png')
        im = cv2.imread(img_path)
        if im is None:
            # cv2.imread returns None instead of raising on a bad path
            raise FileNotFoundError(f'could not read image: {img_path}')
        # No np.expand_dims here: nothing downstream strips a singleton axis,
        # and the model input is declared without one.
        image_data.append(im / 255)
    return image_data

train_images = load_image_data(train_df['id'])
test_images = load_image_data(test_df['id'])

In [None]:
train_image_data = np.array(train_images).astype(np.float16)
test_image_data = np.array(test_images).astype(np.float16)

In [None]:
train_df['price'] = train_df['price'].apply(lambda p: p.replace(',', ''))
train_labels = train_df['price'].values.astype(np.float16)
train_labels = np.clip(train_labels, 0, 100)
train_labels

In [None]:
import re

# Number of minutes represented by each cooldown unit suffix.
unit_map = {
    'm': 1,
    'h': 60,
    'd': 60*24,
    'w': 60*24*7,
}

def clean_speed(speed_str):
    """Convert a speed string from our dataset into a cooldown in minutes.

    The cooldown is the parenthesised part of the string, e.g. ``(24h)``:
    a whole number followed by one of the unit letters in ``unit_map``
    (m, h, d, w). The first such group is used.

    Args:
        speed_str: Speed description such as ``"Plodding (4h)"``.

    Returns:
        The cooldown expressed in minutes, as an ``int``.

    Raises:
        ValueError: If the string contains no ``(<number><unit>)`` group.
    """
    # Raw string: "\(" and "\d" are regex escapes, not Python string escapes
    # (non-raw, they trigger invalid-escape warnings on recent Pythons).
    match = re.search(r'\((\d+)([mhdw])\)', speed_str)
    if match is None:
        raise ValueError(f"no cooldown such as '(24h)' found in speed {speed_str!r}")
    cooldown_count = int(match.group(1))
    cooldown_unit = match.group(2)
    return cooldown_count * unit_map[cooldown_unit]

train_speed = train_df['speed'].apply(clean_speed)
test_speed = test_df['speed'].apply(clean_speed)

In [None]:
from sklearn.model_selection import train_test_split

# image_train, image_val, speed_train, speed_val, label_train, label_val = train_test_split(
#     train_image_data, train_speed, train_labels, test_size=.2, random_state=1)

In [None]:
# print(image_train.shape)
# print(image_val.shape)
# print(speed_train.shape)
# print(speed_val.shape)
# print(label_train.shape)
# print(label_val.shape)


In [None]:
from tensorflow.keras.layers import Input, Dense, Concatenate, Flatten, Conv2D, MaxPooling2D, BatchNormalization
from tensorflow.keras import Model

# Image Model: conv -> pool -> conv -> flatten -> dense
image_input = Input(shape=(200,200,3,))
image_model = Conv2D(16, (3, 3), activation="relu")(image_input)
image_model = MaxPooling2D((2,2))(image_model)
# Chain the second convolution onto the pooled feature map. Applying it to
# image_input instead would silently discard the two layers above.
image_model = Conv2D(6, (3, 3), activation="relu")(image_model)
image_model = Flatten()(image_model)
image_model = Dense(32, activation='relu')(image_model)
image_model = Model(inputs=image_input, outputs=image_model)

# Speed Model: a single dense layer on the scalar cooldown feature
speed_input = Input(shape=(1,))
speed_model = Dense(32, activation="sigmoid")(speed_input)
speed_model = Model(inputs=speed_input, outputs=speed_model)

# combine the output of the two branches
combined_layer = Concatenate()([image_model.output, speed_model.output])

# apply a FC layer and then a regression prediction on the
# combined outputs
final_model = Dense(2, activation="relu")(combined_layer)
final_model = Dense(1, activation="linear")(final_model)
# our model will accept the inputs of the two branches and
# then output a single value
model = Model(
    inputs=[image_model.input, speed_model.input],
    outputs=final_model)

In [None]:
from tensorflow.keras.optimizers import Adam
model.compile(loss='mean_absolute_error', optimizer=Adam(learning_rate=0.004))
model.summary()

In [None]:
history = model.fit(
    x=(train_image_data, train_speed),
    y=train_labels,
    validation_split=0.2,
    epochs=40)

In [None]:
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.legend(['loss', 'val_loss'], loc='upper left')

In [None]:
test_df['price'] = model.predict((test_image_data, test_speed))

In [None]:
plt.hist(test_df['price'])

In [None]:
test_df[['id', 'price']].to_csv('submission.csv', index=False)