In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy import stats, linalg
import torch
from fastai.vision.all import *
import image_tabular
import re
import os
import shutil
import stat
import random

In [None]:
# Read the train and test CSV files into DataFrames.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        "C:/worksspace/purr-price-prediction/train.csv",
        "C:/worksspace/purr-price-prediction/test.csv",
    )
)

# Approach: (1) predict from the images only; (2) combine the images with the tabular data, using the image_tabular library for that purpose.
# Note: image_tabular no longer seems to be maintained.
# Open question: what is the cooldown time?

In [None]:
train_data

In [None]:
train_data.dtypes

In [None]:
# first, we want to check how the speed influences the price (correlation, general trend)

train_data[train_data["price"].str.contains(",")]

In [None]:
# Strip thousands separators (e.g. "1,234.5" -> "1234.5") so the column can be
# cast to float afterwards. The vectorised .str accessor passes missing values
# through as NaN, whereas evaluating `"," in x` on a NaN float raises a TypeError.
train_data["price"] = train_data["price"].str.replace(",", "", regex=False)

In [None]:
train_data[train_data["price"].str.contains(",")]

In [None]:
# Cast the cleaned price strings to floating point numbers.
train_data["price"] = train_data["price"].astype(float)

In [None]:
train_data["speed"].unique()

In [None]:
# re.search(r"\(([A-Za-z0-9]+)\)", 'Snappy (10m)').group(1)
def get_time(s):
    """
    function extracts the time token in parentheses from a speed label and converts it to minutes
    :param s: string containing the time in parentheses, e.g. 'Snappy (10m)'
    :return: the time in minutes, as computed by parse_time_string
    :raises ValueError: if s contains no parenthesised alphanumeric token
    """
    found = re.search(r"\(([A-Za-z0-9]+)\)", s)
    if found is None:
        # without this guard the failure is an opaque AttributeError on None.group
        raise ValueError(f"no '(<time>)' token found in speed label: {s!r}")
    return parse_time_string(found.group(1))

# we want to convert all time strings to numbers (unit is minute)
def parse_time_string(s):
    """
    function converts a compact duration string to a number of minutes
    :param s: string made of digits followed by a unit, e.g. '10m', '4h', '2d', '1w'
    :return: int, the duration in minutes
    :raises ValueError: if s does not start with '<digits><letters>' or the unit is not recognised
    """
    parsed = re.match(r"([0-9]+)([a-z]+)", s, re.I)
    if parsed is None:
        raise ValueError(f"cannot parse time string: {s!r}")
    num = int(parsed.group(1))
    # the regex is case-insensitive, so normalise the unit before comparing it
    unit = parsed.group(2).lower()
    # checked in this order; the first marker letter contained in the unit wins
    minutes_per_unit = (("m", 1), ("h", 60), ("d", 60 * 24), ("w", 60 * 24 * 7))
    for marker, factor in minutes_per_unit:
        if marker in unit:
            return factor * num
    # an unrecognised unit must not be silently counted as weeks
    raise ValueError(f"unknown time unit {unit!r} in {s!r}")

In [None]:
stuff = re.match(r"([0-9]+)([a-z]+)", "10m", re.I).groups()
"m" in stuff[1]

In [None]:
# Replace every speed label by the number of minutes that get_time extracts from it.
train_data["speed"] = train_data["speed"].apply(get_time)

In [None]:
train_data["id"]

In [None]:
train_data.dtypes

In [None]:
sns.histplot(data=train_data, x="speed")
# when plotting count or density, be careful with continuous data

In [None]:
sns.displot(x=train_data["price"], bins=25, kde=True)

In [None]:
sns.boxplot(x=train_data["price"])

In [None]:
# back up the original data; copy() is required because a plain assignment only
# binds a second name to the same DataFrame object
train_data_original = train_data.copy()

In [None]:
train_data_original

In [None]:
# Drop every row whose price is above 20; the backup frame itself is left untouched.
overpriced_rows = train_data_original.index[train_data_original["price"] > 20]
train_data = train_data_original.drop(index=overpriced_rows)

In [None]:
train_data

In [None]:
sns.boxplot(x=train_data["price"])

In [None]:
# the relation between speed and price
sns.lineplot(data=train_data, x="speed", y="price", errorbar=None)
# it seems that generally very long cooling time corresponds to lower prices

In [None]:
# the correlation matrix suggests the same kind of weak correlation
# numeric_only=True: the frame still holds string columns (e.g. "image"), which
# pandas >= 2.0 refuses to correlate instead of silently skipping them
sns.heatmap(train_data.corr(numeric_only=True), annot=True)

In [None]:
# forward slashes instead of backslashes: the non-raw literal contained invalid
# escape sequences, which Python warns about and will eventually reject
path = Path("C:/worksspace/purr-price-prediction/images/images")

In [None]:
full_paths_im = get_image_files(path)

In [None]:
# checkout the first image
Image.open(full_paths_im[0]).to_thumb(256, 256)

In [None]:
# test

first_file_name = full_paths_im[0]
first_file_name

In [None]:
# test

"png" in first_file_name.name
name_test = os.path.basename("./images/7d46fd31038904e2.png")
train_data.loc[train_data['image'] == "./images/"+name_test, "price"] 

In [None]:
# copy all the images whose corresponding row was not removed from the training
# data to a different directory
# also we don't want to edit the original images
root_path = "C:/worksspace/purr-price-prediction/"

# make sure the target directory exists; os.listdir raises FileNotFoundError otherwise
os.makedirs(root_path + "images/images_copy/", exist_ok=True)

# empty the directory so that a re-run starts from a clean state
# (looping over an empty listing is a no-op, so no emptiness check is needed)
all_files = os.listdir(root_path + "images/images_copy/")

for f in all_files:
    os.remove(root_path + "images/images_copy/" + f)

# shutil.copyfile(src = root_path + "images/images/7d46fd31038904e2.png", dst = root_path + "images/images_copy/7d46fd31038904e2.png")

In [None]:
all_original_files = os.listdir(root_path + "images/images/")

# image references still present in train_data; a set gives a constant-time
# membership test instead of one DataFrame scan per file, and the lookup no
# longer relies on a helper that is only defined in a later cell
kept_images = set(train_data["image"])

for f in all_original_files:
    if "./images/" + f in kept_images:
        # dst is anchored at root_path like src, so the copies do not depend on
        # the current working directory
        shutil.copy(src = root_path + "images/images/" + f, dst = root_path + "images/images_copy/" + f)

In [None]:
# test: check whether the lookup finds a price for one given file name
# NOTE(review): locate_price is defined in a later cell, so on a fresh kernel
# this cell only works after that cell has been run — consider moving the
# definition above this point.
locate_price('00068810bf2226f2.png').empty

In [None]:
# function for getting labels, in our case it's price
def locate_price(filename):
    """
    function looks up the price stored in train_data for an image file name
    :param filename: string e.g. '00068810bf2226f2.png'
    :return: pandas Series holding the "price" value of every matching row (empty if no row matches)
    """
    image_key = "./images/" + filename
    row_matches = train_data["image"] == image_key
    return train_data.loc[row_matches, "price"]

def get_price(full_path_im):
    """
    function returns the price of an image according to the full path to the image
    :param full_path_im: Path, e.g. Path('C:/worksspace/purr-price-prediction/images/images/00068810bf2226f2.png')
    :return: float
    :raises ValueError: if the file name matches no row, or more than one row, of the training data
    """
    # Path.name is already the bare file name, so no os.path.basename is needed
    filename = full_path_im.name
    prices = locate_price(filename)
    if len(prices) != 1:
        raise ValueError(f"expected exactly one price for {filename!r}, found {len(prices)}")
    # calling float() on a Series is deprecated in recent pandas; pick the element explicitly
    return float(prices.iloc[0])

In [None]:


# list(set(random.sample(list(train_data.index), round(num_rows))))

def splitter(df):
    """
    function randomly splits the items of df into a training part (80%) and a validation part (20%)
    :param df: any sized collection, e.g. a data frame or a list of image paths
    :return: tuple (train_indices, valid_indices) of lists with integer positions into df
    """
    # size the split from the argument rather than from the global train_data
    num_rows = len(df)
    num_train = round(num_rows * 0.8)

    # random.sample draws without replacement, so the positions are already unique
    train_positions = set(random.sample(range(num_rows), num_train))

    train_indices = sorted(train_positions)
    # set membership keeps this pass linear instead of quadratic
    valid_indices = [i for i in range(num_rows) if i not in train_positions]

    return train_indices, valid_indices

In [None]:
# Images are the inputs; the price returned by get_price is the regression target.
dblock = DataBlock(
    blocks = (ImageBlock, RegressionBlock),
    get_items = get_image_files,
    get_y = get_price,
    # fixed seed so that the train/validation split is the same on every run
    splitter = RandomSplitter(valid_pct=0.2, seed=42),
    item_tfms = Resize(224),
)

In [None]:
# join the parts with pathlib's "/" operator instead of appending a
# backslash-separated literal, which contained an invalid escape sequence and
# produced mixed separators after root_path's trailing "/"
path = Path(root_path) / "images" / "images_copy"
dsets = dblock.datasets(path)

In [None]:
dsets.train[3]

In [None]:
dls = dblock.dataloaders(path)

In [None]:
dls.show_batch()

In [None]:
xb,yb = dls.one_batch()
xb.shape,yb.shape

# 64 is the default batch size
# the dataloader converts each input image to a tensor of size [3, 224, 224]

# TODO: deal with the splitter, batch size

In [None]:
# y_range squashes the prediction through a sigmoid into the given interval, so it
# has to cover the target prices; (-1, 1) can never reach a price above 1.
# The upper bound gets a small margin because a sigmoid only approaches its limits.
# NOTE(review): assumes prices are non-negative — confirm against the data.
price_upper = float(train_data["price"].max()) * 1.1
learn = vision_learner(dls, resnet18, y_range=(0, price_upper))

In [None]:
learn.loss_func
# FlattenedLoss of MSELoss(): mean squared error loss

In [None]:
learn.lr_find()

In [None]:
# fastai v2 (fastai.vision.all) has no DatasetType: the training set is selected
# with ds_idx=0 and the number of displayed samples with max_n
learn.show_results(ds_idx=0, max_n=4, figsize=(8,10))