In [2]:
import pandas as pd
import numpy as np

df = pd.read_csv('data/train.csv')
df = df[df["user_id"].isin([0,1,2,3])]

df = pd.read_csv("data/item_metadata.csv")

titles = df["title"].dropna()
all_words = [word for title in titles for word in title.split()]

uqw = set(all_words)
print(f"Number of unique words in title: {len(uqw)}")

Number of unique words in title: 219347


In [4]:
print(titles[0])
print(titles[1])
print(titles[2])

Howard LC0008 Leather Conditioner, 8-Ounce (4-Pack)
Yes to Tomatoes Detoxifying Charcoal Cleanser (Pack of 2) with Charcoal Powder, Tomato Fruit Extract, and Gingko Biloba Leaf Extract, 5 fl. oz.
Eye Patch Black Adult with Tie Band (6 Per Pack)


In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_features=50000)
X = vectorizer.fit_transform(df["title"].fillna(""))
print(X.shape[1])

50000


In [3]:
# Example using gensim's Word2Vec
from gensim.models import Word2Vec

titles = df["title"].dropna().apply(str.split).tolist()
stores = df["store"].dropna().apply(str.split).tolist()
description = df["description"].dropna().apply(str.split).tolist()
category = df["category"].dropna().apply(str.split).tolist()
model = Word2Vec(titles, vector_size=100, window=5, min_count=2, workers=4)

# Average vectors per title
def get_vector(title):
    words = title.split()
    vectors = [model.wv[word] for word in words if word in model.wv]
    return np.mean(vectors, axis=0) if vectors else np.zeros(model.vector_size)

df["title"] = df["title"].fillna("").apply(get_vector)
df["store"] = df["store"].fillna("").apply(get_vector)
df["description"] = df["description"].fillna("").apply(get_vector)
df["category"] = df["category"].fillna("").apply(get_vector)


In [4]:
print(df["title"].head())
print(df["category"])

0    [-0.39734843, 0.22038518, 0.38574108, 0.026300...
1    [-0.6260919, 0.27584687, 0.5380544, -0.4122063...
2    [0.23102407, 0.2777291, 0.24306421, -0.2691973...
3    [-0.8889685, -0.40278447, 0.2526689, -0.497835...
4    [-0.2482645, 0.29172987, 1.1735274, -0.3074607...
Name: title, dtype: object
0         [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
1         [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
2         [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
3         [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
4         [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
                                ...                        
262129    [0.3575052, -0.86605746, -1.1214151, 1.2633457...
262130    [0.3575052, -0.86605746, -1.1214151, 1.2633457...
262131    [0.3575052, -0.86605746, -1.1214151, 1.2633457...
262132    [0.3575052, -0.86605746, -1.1214151, 1.2633457...
262133    [0.3575052, -0.86605746, -1.1214151, 1.2633457...
Name: category, Length: 262134, dtype:

In [12]:
print(df.columns)
### KATEGORIE DO WEKTORYZACJI
"""
1. main_category
2. title
3. store
5. description
6. category
"""
#### WYRZUCIĆ
"""
0. features   -> narazie zróbmy uproszczoną wersje
1. categories -> narazie zróbmy uproszczoną wersje
2. image_count
3. has_images
4. image_urls
5. images
"""
# print(df.head())
print(df["category"][0])

Index(['parent_asin', 'main_category', 'title', 'average_rating',
       'rating_number', 'price', 'store', 'features', 'description', 'images',
       'categories', 'image_count', 'has_images', 'image_urls', 'category'],
      dtype='object')
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0.]


In [None]:
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer


high_cardinality_cols = ['parent_asin', 'image_urls', 'images', 'has_images','features', 'categories']

categorical_cols = [col for col in df.select_dtypes(include='object').columns
                    if col not in high_cardinality_cols]



In [None]:
# STEP 3: Apply Bag of Words transformation to each categorical column
for col in categorical_cols:
    print(f"Processing column: {col}")

    # Convert categorical values into space-separated string tokens (if needed)
    text_data = df[col].fillna("").astype(str)

    try:
        vectorizer = CountVectorizer(max_features=1000, stop_words=None)
        X = vectorizer.fit_transform(text_data)
        bow_df = pd.DataFrame(X.toarray(), columns=[f"{col}_{feat}" for feat in vectorizer.get_feature_names_out()])
        bow_features.append(bow_df)
    except ValueError as e:
        print(f"Skipping column '{col}' due to error: {e}")

    # Initialize CountVectorizer
    # vectorizer = CountVectorizer(stop_words='english')

    # Transform the column into BoW representation
    # X_bow = vectorizer.fit_transform(text_data)

    # Create a DataFrame from the result with meaningful column names
    # bow_df = pd.DataFrame(X_bow.toarray(), columns=[f"{col}_{feat}" for feat in vectorizer.get_feature_names_out()])

    # bow_features.append(bow_df)

# STEP 4: Concatenate all BoW DataFrames horizontally
bow_result = pd.concat(bow_features, axis=1)


# Optional: Add numerical columns if you want to retain them
numerical_cols = df.select_dtypes(include=['number']).reset_index(drop=True)
final_df = pd.concat([numerical_cols, bow_result], axis=1)

# Show result

Processing column: main_category
Processing column: title
Processing column: store
Processing column: description
Processing column: category


In [1]:
final_df

NameError: name 'final_df' is not defined