In [202]:
# Import libraries
import os
import glob
import time
import re

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from gensim.models import Word2Vec

import sklearn
import tensorflow as tf
import torch

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelEncoder


# Silence every warning raised for the rest of the notebook
import warnings
warnings.filterwarnings('ignore')


# Notebook configuration: task name, data locations and tunable sizes
task = 'task1'
train_path = './data/train/'
test_path = f'./data/test/sessions_test_{task}.csv'
PREDS_PER_SESSION = 100
slice_size = 1_000                  # Rows read from each CSV to limit memory use; None reads every row
non_text_cols = ['id', 'locale', 'price']   # Columns excluded from text cleaning

In [203]:
# Read a CSV file, optionally limited to its first rows
def load_data(file_path, slice_size=None):
    """Read a CSV file into a DataFrame.

    Parameters
    ----------
    file_path
        Path or file-like object handed to ``pd.read_csv``.
    slice_size
        Number of rows to read. Any falsy value (``None``, ``0``) reads the
        whole file.

    Returns
    -------
    pandas.DataFrame
        The parsed rows.
    """
    if not slice_size:
        return pd.read_csv(file_path)
    return pd.read_csv(file_path, nrows=slice_size)


# Function to clean products data
def clean_data(df, skip_cols=None):
    """Normalise the text columns of a products table.

    Missing values in every column are replaced with the string 'unknown'.
    Each object-dtype column that is not listed in ``skip_cols`` is then
    lower-cased, stripped of HTML tags, non-ASCII characters and special
    characters, and has its whitespace collapsed and trimmed. A value that is
    left empty by this cleaning is also set to 'unknown': an empty string
    written to CSV is parsed back as a missing value by ``pd.read_csv``, so
    leaving it blank would reintroduce the gaps this function fills.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to clean. It is modified in place.
    skip_cols : collection of str, optional
        Names of columns whose text must not be cleaned (they still get their
        missing values filled). Defaults to the notebook-level
        ``non_text_cols`` list.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after cleaning.
    """
    # Resolve the default lazily so an explicit argument never touches the global
    if skip_cols is None:
        skip_cols = non_text_cols

    # Fill missing values
    df.fillna('unknown', inplace=True)

    # Substitutions applied in order; later steps rely on earlier ones
    # (e.g. whitespace is collapsed before the ends are trimmed)
    pattern = [
        (r'<.*?>', ''),                 # Remove HTML tags
        (r'[^\x00-\x7F]+', ' '),        # Replace non-ASCII runs (incl. accented letters) with a space
        (r'[^a-zA-Z0-9\s]', ''),        # Remove special characters
        (r'\s+', ' '),                  # Collapse whitespace runs
        (r'^\s+|\s+?$', ''),            # Remove leading and trailing spaces
    ]

    # Clean text
    for col in df.select_dtypes('object').columns:
        if col in skip_cols:
            continue
        text = df[col].str.lower()
        for p, r in pattern:
            text = text.str.replace(p, r, regex=True)
        # Values made only of stripped characters end up empty; treat them as missing
        df[col] = text.mask(text == '', 'unknown')

    return df


# Function to get products data
def get_products_data(train_path, slice_size):
    """Return the cleaned products table, reusing a CSV cache when one exists.

    If ``products_train_processed.csv`` is present in ``train_path`` it is
    loaded (limited to ``slice_size`` rows). Otherwise ``products_train.csv``
    is loaded, cleaned with ``clean_data`` and returned.

    The cache file is only written when the whole raw file was processed
    (``slice_size`` falsy). Writing it from a slice would leave a truncated
    cache that later full runs would silently reuse.

    NOTE(review): a truncated cache produced by an earlier sliced run cannot be
    detected here; delete the processed file before a full run.

    Parameters
    ----------
    train_path : str
        Directory containing the products CSV files.
    slice_size : int or None
        Number of rows to load; a falsy value loads everything.

    Returns
    -------
    pandas.DataFrame
        The cleaned products table.
    """
    processed_file = os.path.join(train_path, 'products_train_processed.csv')
    raw_file = os.path.join(train_path, 'products_train.csv')

    # Reuse previously cleaned data when available
    if os.path.exists(processed_file):
        return load_data(processed_file, slice_size)

    train_prod = clean_data(load_data(raw_file, slice_size))

    # Persist only complete results so the cache never holds a partial table
    if not slice_size:
        train_prod.to_csv(processed_file, index=False)
    return train_prod


# Read the three input tables; the products table is returned already cleaned
train_prod = get_products_data(train_path, slice_size)
train_sess = load_data(f'{train_path}/sessions_train.csv', slice_size)
test_sess = load_data(test_path, slice_size)

In [205]:
# Preview the first row of the products table returned by get_products_data
train_prod.head(1)

Unnamed: 0,id,locale,title,price,brand,color,size,model,material,author,desc
0,B005ZSSN10,DE,red dragon amberjack 3 steel tip 22 gramm wolf...,30.95,red dragon,unknown,unknown,rdd0089,unknown,unknown,amberjacks steel dartpfeile sind verf gbar in ...


In [206]:
# Preview the first row of the training sessions table
train_sess.head(1)

Unnamed: 0,prev_items,next_item,locale
0,['B09W9FND7K' 'B09JSPLN1M'],B09M7GY217,DE
