# Import Libraries

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [1]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import LabelEncoder
import zipfile
import os
from matplotlib.image import imread
from google.colab import files
import matplotlib.pyplot as plt


from keras.applications.vgg16 import VGG16, preprocess_input

from tensorflow.keras.preprocessing import image

In [2]:
!nvidia-smi  #Checking GPU Information

/bin/bash: line 1: nvidia-smi: command not found


# Loading Dataset from Official Link in Colab

In [4]:
!apt-get install -y megatools

# this is a code block to download the visuelle 2.0 dataset from MEGA.nz(official link)
# the official link is taken from https://humaticslab.github.io/forecasting/visuelle
# this code downloads a .zip file to the path mentioned
# the below unzip function unzips the visulle 2.0 dataset to the desired path

url = 'https://mega.nz/file/2FdClL6K#zlt5MQS10glkHlR6o7Y9P7fkC1Kly2ZwHd9JpRiy8sQ'
path = '/content/data'
!mkdir -p {path}
!megadl {url} --path {path}

def unzip_file(zip_path, extract_path):

    if not os.path.exists(zip_path):
        print(f"The file {zip_path} does not exist")
        return

    os.makedirs(extract_path, exist_ok=True)

    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(extract_path)
        print(f"Files extracted to {extract_path}")

unzip_file('/content/data/visuelle2.zip', '/content/data/visuelle2.0/')

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following NEW packages will be installed:
  megatools
0 upgraded, 1 newly installed, 0 to remove and 45 not upgraded.
Need to get 207 kB of archives.
After this operation, 898 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy/universe amd64 megatools amd64 1.10.3-1build1 [207 kB]
Fetched 207 kB in 1s (269 kB/s)
Selecting previously unselected package megatools.
(Reading database ... 121752 files and directories currently installed.)
Preparing to unpack .../megatools_1.10.3-1build1_amd64.deb ...
Unpacking megatools (1.10.3-1build1) ...
Setting up megatools (1.10.3-1build1) ...
Processing triggers for man-db (2.10.2-1) ...
[0KDownloaded visuelle2.zip
Files extracted to /content/data/visuelle2.0/


# Loading and Displaying Data from CSV File

This code snippet reads a CSV file named 'sales.csv' from the specified path using Pandas.
The loaded data is stored in a Pandas DataFrame named 'data'.
Subsequently, the 'head()' function is used to display the first few rows of the DataFrame.

1. Ensure that the files are located in the correct directory ("/.../visuelle2/...").


In [None]:
path = str('/content/data/visuelle2.0/visuelle2/')
data = pd.read_csv(path + "sales.csv")
display(data.head())
print('\nColumns:',data.columns)

Unnamed: 0.1,Unnamed: 0,external_code,retail,season,category,color,image_path,fabric,release_date,restock,...,2,3,4,5,6,7,8,9,10,11
0,0,5,36,SS17,long sleeve,grey,PE17/00005.png,acrylic,2016-11-28,22,...,1.0,1.0,2.0,1.0,0.0,0.0,2.0,0.0,0.0,0.0
1,1,2,51,SS17,long sleeve,violet,PE17/00002.png,acrylic,2016-11-28,17,...,1.0,0.0,0.0,2.0,0.0,0.0,0.0,1.0,1.0,0.0
2,2,5,10,SS17,long sleeve,grey,PE17/00005.png,acrylic,2016-11-28,15,...,1.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0
3,3,9,41,SS17,culottes,yellow,PE17/00009.png,scuba crepe,2016-11-28,32,...,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
4,4,5,13,SS17,long sleeve,grey,PE17/00005.png,acrylic,2016-11-28,26,...,4.0,0.0,3.0,0.0,2.0,1.0,0.0,0.0,0.0,0.0



Columns: Index(['Unnamed: 0', 'external_code', 'retail', 'season', 'category', 'color',
       'image_path', 'fabric', 'release_date', 'restock', '0', '1', '2', '3',
       '4', '5', '6', '7', '8', '9', '10', '11'],
      dtype='object')


### regarding corrupted image

In [None]:
#the dataset downloaded from official link contains a corrupted image
#to crosscheck the image
image_raw = imread(path + "images/AI19/04442.png")
print(image_raw.shape)

# Displaying the image
plt.figure(figsize=[12,8])
plt.imshow(image_raw)

# to upload the image manually (download from visuelle 1.0 dataset)
# first we remove the corrupted image, we change directory,
# upload the image and revert back to previous directory path


os.remove(path + "images/AI19/04442.png")
os.chdir(path + "images/AI19/")
files.upload()
os.chdir('/content/data/')

# Features encoding

This code snippet focuses on encoding categorical variables and images within a Pandas DataFrame. The categorical variables, namely "category," "color," and "fabric," are encoded using the LabelEncoder from scikit-learn. The resulting encoded values are stored in new columns named "category_encoded," "color_encoded," and "fabric_encoded."

Furthermore, images specified in the "image_path" column are processed using a pre-trained VGG16 model to extract features. The 'encode_image' function handles the loading, preprocessing, and feature extraction for each image path. The resulting image features are stored in the "image_features" column of the DataFrame.

Noted Points:
1. Ensure that DataFrame contains the required columns: "category," "color," "fabric," and "image_path."
2. Ensure that no image file is corrupted

In [None]:
# Encode categorical variables using LabelEncoder
le_category = LabelEncoder()
data["category_encoded"] = le_category.fit_transform(data["category"])

le_color = LabelEncoder()
data["color_encoded"] = le_color.fit_transform(data["color"])

le_fabric = LabelEncoder()
data["fabric_encoded"] = le_fabric.fit_transform(data["fabric"])

# Extracting image features

In [None]:
model = VGG16(weights="imagenet", include_top=False) #VGG16 CNN model, faster and better than inception v3 model

image_features_dict = {}  # Store features for efficient retrieval

def encode_image(image_path):
    if image_path not in image_features_dict:
        # Loads and preprocess the image (only if not already encoded)
        img = image.load_img(path +'images/' + image_path, target_size=(224, 224))
        x = image.img_to_array(img)
        x = np.expand_dims(x, axis=0)
        x = preprocess_input(x) # input shape (1*224*224*3)

        # Extract features using VGG16 -> output shape (1*7*7*512)
        features = model.predict(x, verbose=0).flatten()  # Flatten -> shape (25088,)

        # Store features in dictionary for future retrieval
        # taking mean - reducing 25088 features to one value
        image_features_dict[image_path] = features.mean()
        if int(len(image_features_dict))%1000 == 0:
          print ("step: ", len(image_features_dict))

    return image_features_dict[image_path]  # Retrieve stored features

# Apply encoding efficiently using the dictionary
data["image_features"] = data["image_path"].apply(encode_image)

# Note:
#  1. 'PE17/00410.png', 'PE17/00585.png' are same images.
#  2. 'PE17/00423.png', 'PE18/01924.png' are same images.
# so unique images -> 5353 images


In [None]:
#@title alternative to running the above code

# image features dictionary is stored as a pickle file
# no need to run the cnn model
files.upload() # upload "" dataimg_flat_df.pkl ""
img_features = pd.read_pickle('/content/data/img_flat_df.pkl')
img_mean_df = pd.DataFrame(img_features.mean(axis=1), columns=['image_features'])
data = data.merge(img_mean_df, left_on='image_path', right_index=True)

In [None]:
#@title Encoding Release dates feature
# to encode the release dates
# release dates are encoded in ascending order, such that lower numbers
# represent older dates and higher numbers represents recent dates

date_encoder = LabelEncoder()

# create a separate df for unique release dates, sort it and apply encoding
dates_df = pd.DataFrame(data['release_date'].unique()).reset_index(drop=True).sort_values(0)
dates_df['date_encoded'] = date_encoder.fit_transform(dates_df[0])
dates_df.set_index(0, inplace=True)

# merge the encoding to data file
data = data.merge(dates_df, left_on='release_date', right_index=True)

In [None]:
data.head()

Unnamed: 0.1,Unnamed: 0,external_code,retail,season,category,color,image_path,fabric,release_date,restock,...,7,8,9,10,11,category_encoded,color_encoded,fabric_encoded,image_features,date_encoded
0,0,5,36,SS17,long sleeve,grey,PE17/00005.png,acrylic,2016-11-28,22,...,0.0,2.0,0.0,0.0,0.0,11,4,0,2.499182,0
2,2,5,10,SS17,long sleeve,grey,PE17/00005.png,acrylic,2016-11-28,15,...,1.0,1.0,0.0,0.0,1.0,11,4,0,2.499182,0
4,4,5,13,SS17,long sleeve,grey,PE17/00005.png,acrylic,2016-11-28,26,...,1.0,0.0,0.0,0.0,0.0,11,4,0,2.499182,0
5,5,5,41,SS17,long sleeve,grey,PE17/00005.png,acrylic,2016-11-28,15,...,1.0,0.0,0.0,0.0,1.0,11,4,0,2.499182,0
1,1,2,51,SS17,long sleeve,violet,PE17/00002.png,acrylic,2016-11-28,17,...,0.0,0.0,1.0,1.0,0.0,11,7,0,2.578126,0


In [None]:
df = data.copy()
# Print the resulting DataFrame
print(df.columns)


Index(['Unnamed: 0', 'external_code', 'retail', 'season', 'category', 'color',
       'image_path', 'fabric', 'release_date', 'restock', '0', '1', '2', '3',
       '4', '5', '6', '7', '8', '9', '10', '11', 'category_encoded',
       'color_encoded', 'fabric_encoded', 'image_features', 'date_encoded'],
      dtype='object')


# Exracting goolge trend features


In [None]:
# Read the sales data frame from a csv file
sales_df = pd.read_csv("/content/drive/MyDrive/Master_Thesis/Datasets/visuelle2/sales.csv")

# Read the gtrends data frame from a csv file
gtrends_df = pd.read_csv("/content/drive/MyDrive/Master_Thesis/Datasets/visuelle2/vis2_gtrends_data.csv")

# Create an empty list to store the trends data
gtrends = []

# Iterate over each row of the sales data frame
for index, row in sales_df.iterrows():
    # Get the product information from the row
    cat, col, fab, start_date = row["category"], row["color"], row["fabric"], row["release_date"]

    # Convert the start date to a datetime object
    start_date = pd.to_datetime(start_date)

    # Calculate the end date as the start date plus 52 weeks
    end_date = start_date + pd.DateOffset(weeks=52)

    # Format the start and end dates as strings in YYYY-MM-DD format
    start_date_str = start_date.strftime("%Y-%m-%d")
    end_date_str = end_date.strftime("%Y-%m-%d")

    # Get the gtrends data for the corresponding category, color, and fabric
    # Use the loc function to filter the gtrends data frame by the date range and the columns
    gtrends_data = gtrends_df.loc[(gtrends_df["date"] >= start_date_str) & (gtrends_df["date"] <= end_date_str), [cat, col, fab]]

    # Get the values of the three columns as numpy arrays
    cat_gtrend = gtrends_data[cat].values
    col_gtrend = gtrends_data[col].values
    fab_gtrend = gtrends_data[fab].values

    # Stack the three arrays into a 2D array
    # google trends of category, color, fabric of the products for the past 52 weeks
    multitrends = np.vstack([cat_gtrend, col_gtrend, fab_gtrend])

    # Append the 2D array to the gtrends list
    gtrends.append(multitrends)


# Adding to dataframe

In this code, we add new columns to the DataFrame (`df`) based on the Google Trends data stored in the "gtrends" list:

In [None]:
df["cat_gtrend"] = df.apply(lambda x: gtrends[x.name][0], axis=1)
df["col_gtrend"] = df.apply(lambda x: gtrends[x.name][1], axis=1)
df["fab_gtrend"] = df.apply(lambda x: gtrends[x.name][2], axis=1)

# Save final dataframe

In [None]:
#save the file
df.to_pickle('/content/dataFinal.pkl') # final preprocessed data
df.head(5)

Unnamed: 0.1,Unnamed: 0,external_code,retail,season,category,color,image_path,fabric,release_date,restock,...,10,11,category_encoded,color_encoded,fabric_encoded,image_features,date_encoded,cat_gtrend,col_gtrend,fab_gtrend
0,0,5,36,SS17,long sleeve,grey,PE17/00005.png,acrylic,2016-11-28,22,...,0.0,0.0,11,4,0,2.499182,0,"[85.0, 84.0, 76.0, 65.0, 61.0, 59.0, 54.0, 53....","[20.0, 18.0, 18.0, 16.0, 18.0, 19.0, 17.0, 18....","[68.0, 69.0, 71.0, 78.0, 72.0, 69.0, 66.0, 69...."
2,2,5,10,SS17,long sleeve,grey,PE17/00005.png,acrylic,2016-11-28,15,...,0.0,1.0,11,4,0,2.499182,0,"[85.0, 84.0, 76.0, 65.0, 61.0, 59.0, 54.0, 53....","[20.0, 18.0, 18.0, 16.0, 18.0, 19.0, 17.0, 18....","[68.0, 69.0, 71.0, 78.0, 72.0, 69.0, 66.0, 69...."
4,4,5,13,SS17,long sleeve,grey,PE17/00005.png,acrylic,2016-11-28,26,...,0.0,0.0,11,4,0,2.499182,0,"[85.0, 84.0, 76.0, 65.0, 61.0, 59.0, 54.0, 53....","[20.0, 18.0, 18.0, 16.0, 18.0, 19.0, 17.0, 18....","[68.0, 69.0, 71.0, 78.0, 72.0, 69.0, 66.0, 69...."
5,5,5,41,SS17,long sleeve,grey,PE17/00005.png,acrylic,2016-11-28,15,...,0.0,1.0,11,4,0,2.499182,0,"[85.0, 84.0, 76.0, 65.0, 61.0, 59.0, 54.0, 53....","[20.0, 18.0, 18.0, 16.0, 18.0, 19.0, 17.0, 18....","[68.0, 69.0, 71.0, 78.0, 72.0, 69.0, 66.0, 69...."
1,1,2,51,SS17,long sleeve,violet,PE17/00002.png,acrylic,2016-11-28,17,...,1.0,0.0,11,7,0,2.578126,0,"[85.0, 84.0, 76.0, 65.0, 61.0, 59.0, 54.0, 53....","[56.0, 58.0, 59.0, 54.0, 58.0, 55.0, 58.0, 59....","[68.0, 69.0, 71.0, 78.0, 72.0, 69.0, 66.0, 69...."
