### Setting everything up for the notebook

#### Importing required libraries

In [1]:
import os
import polars as pl
import numpy as np
import torch
from transformers import pipeline, AutoTokenizer, AutoModel

2024-12-08 08:33:32.326354: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-12-08 08:33:33.724332: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


#### Definition of needed functions

In [2]:
def generate_embedding(text: str):
    """
    Embed a piece of text as a flat list of floats.

    The text is passed through the notebook-level `tokenizer` and `model`
    objects (both must already be defined when this function is called).
    The model's last hidden state is averaged over dim 1, squeezed, and
    converted to a plain Python list.

    Parameters:
    ----------
    text : str
        Text to embed.

    Returns:
    -------
    list[float]
        The mean of the last hidden state for `text`.

    Example Usage:
    --------------
    vector = generate_embedding("Why did the chicken cross the road?")
    print(vector[:5])  # first few components of the embedding
    """
    encoded = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    # Inference only, so gradient tracking is switched off for the forward pass.
    with torch.no_grad():
        hidden_states = model(**encoded).last_hidden_state
    pooled = hidden_states.mean(dim=1)
    return pooled.squeeze().tolist()

#### Ensure reproducibility

In [3]:
# Fix the NumPy and PyTorch random seeds so repeated runs use the same random state.
SEED = 42
np.random.seed(SEED)
# Last expression of the cell: torch.manual_seed returns the Generator shown as the cell output.
torch.manual_seed(SEED)

<torch._C.Generator at 0x7f6d41da32b0>

### 1. Load Data

In [4]:
# Read the pre-split ratings and the joke texts from the processed-data folder.
#   train_data / test_data: userId, jokeId, rating
#   items:                  jokeId, jokeText
train_data, test_data, items = (
    pl.read_parquet(parquet_path)
    for parquet_path in (
        "../data/processed/train_data.parquet",      # TRAIN ONLY
        "../data/processed/test_data.parquet",       # TEST ONLY
        "../data/processed/shuffled_jokes.parquet",  # joke texts
    )
)

### 2. Generate Joke-Level Text Features (Independent of Train/Test)

In [5]:
# Two size features computed from the joke text alone (no ratings involved).
joke_text_expr = pl.col("jokeText")
text_length_expr = joke_text_expr.str.lengths().alias("text_length")
# Word count = number of pieces obtained by splitting on a single space.
word_count_expr = joke_text_expr.str.split(" ").arr.lengths().alias("word_count")

items = items.with_columns([text_length_expr, word_count_expr])

### 3. Sentiment Analysis

In [6]:
# Disabled experiment: one sentiment score per joke from a Hugging Face pipeline.
# NOTE(review): "distilbert-base-uncased" looks like a base checkpoint rather than one
# fine-tuned for sentiment classification; if so, its scores would not be meaningful.
# TODO confirm before drawing conclusions from this experiment.
# sentiment_analyzer = pipeline("sentiment-analysis", model="distilbert-base-uncased", device=0)
# items = items.with_columns(
#     pl.col("jokeText").apply(lambda txt: sentiment_analyzer(txt)[0]["score"]).alias("sentiment_score")
# )

> Sentiment analysis showed no effect in my case with jokes, so the cell above is left commented out.

### 4. Compute Embeddings for Each Joke

In [7]:
# Load the tokenizer and encoder that generate_embedding() looks up in the notebook namespace.
EMBEDDING_CHECKPOINT = "sentence-transformers/all-MiniLM-L6-v2"
tokenizer = AutoTokenizer.from_pretrained(EMBEDDING_CHECKPOINT)
model = AutoModel.from_pretrained(EMBEDDING_CHECKPOINT)

# Embed the jokes one at a time, then attach the vectors to items as a new column.
embeddings = list(map(generate_embedding, items["jokeText"]))
items = items.with_columns(pl.Series(name="embeddings", values=embeddings))

### 5. Compute Rating-Based Features From Train Only (No Leakage)

In [8]:
def _rating_summary(ratings, key, count_alias, mean_alias, std_alias):
    """Group `ratings` by `key` and return count / mean / std of its `rating` column."""
    rating = pl.col("rating")
    return ratings.groupby(key).agg([
        pl.count("rating").alias(count_alias),
        rating.mean().alias(mean_alias),
        rating.std().alias(std_alias),
    ])


# Per-joke rating statistics, computed from the training ratings only.
joke_stats = _rating_summary(
    train_data, "jokeId",
    count_alias="num_ratings",
    mean_alias="avg_rating",
    std_alias="rating_std",
)

# Per-user rating statistics, computed from the training ratings only.
user_stats = _rating_summary(
    train_data, "userId",
    count_alias="num_ratings_user",
    mean_alias="avg_user_rating",
    std_alias="user_rating_std_dev",
)

In [9]:
# Sanity check: per-column null counts for items after adding the text features and embeddings.
items.null_count()

jokeId,jokeText,text_length,word_count,embeddings
u32,u32,u32,u32,u32
0,0,0,0,0


In [10]:
# Fallback values used to fill statistics that are missing after a join.
# Everything here comes from train_data, either directly or via joke_stats / user_stats.
train_has_rows = train_data.height > 0
train_has_multiple_rows = train_data.height > 1  # guard used for the std entries

overall_rating_mean = float(train_data["rating"].mean()) if train_has_rows else 5.0  # fallback mean rating
overall_rating_std = float(train_data["rating"].std()) if train_has_multiple_rows else 0.0

global_means = {
    "num_ratings": float(joke_stats["num_ratings"].mean()) if joke_stats.height > 0 else 0.0,
    "avg_rating": overall_rating_mean,
    "rating_std": overall_rating_std,
    "num_ratings_user": float(user_stats["num_ratings_user"].mean()) if user_stats.height > 0 else 1.0,
    "avg_user_rating": overall_rating_mean,
    "user_rating_std_dev": overall_rating_std,
}

# Attach the per-joke rating stats to items, then fill any gaps with the fallbacks.
# Even though we can skip the fill in our case, it is kept as a proof of concept.
items = items.join(joke_stats, on="jokeId", how="left").with_columns([
    pl.col(stat).fill_null(global_means[stat])
    for stat in ("num_ratings", "avg_rating", "rating_std")
])

> At this point, `items` holds the embeddings, the text features, and the joke-level rating statistics (the latter computed from `train_data` only).
> We'll save it as `items_with_all_features.parquet` for future clustering.

In [11]:
# Persist the joke table (text features, embeddings, train-derived rating stats) to disk.
items.write_parquet("../data/processed/items_with_all_features.parquet")

### 6. Prepare Train and Test Feature Datasets Without Embeddings

> Since embeddings can be large and are not needed directly in the train/test sets, we exclude the embeddings column from the train/test outputs.

In [12]:
# Joke-level columns copied onto every rating row; the embeddings column is deliberately left out.
joke_feature_columns = [
    "jokeId", "text_length", "word_count",
    "num_ratings", "avg_rating", "rating_std",
]
joke_features = items.select(joke_feature_columns)

# Train: joke features, then user stats, then fallbacks for any user stat that is still null.
train_data = (
    train_data
    .join(joke_features, on="jokeId", how="left")
    .join(user_stats, on="userId", how="left")
    .with_columns([
        pl.col(stat).fill_null(global_means[stat])
        for stat in ("num_ratings_user", "avg_user_rating", "user_rating_std_dev")
    ])
)

# Test: the same two joins; the null fill for test_data is done in a later cell.
test_data = (
    test_data
    .join(joke_features, on="jokeId", how="left")
    .join(user_stats, on="userId", how="left")
)

In [13]:
# Sanity check: per-column null counts for test_data after the joins, before the fill step.
test_data.null_count()

userId,jokeId,rating,text_length,word_count,num_ratings,avg_rating,rating_std,num_ratings_user,avg_user_rating,user_rating_std_dev
u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32
0,0,0,0,0,0,0,0,0,0,0


In [14]:
# Replace any null user-level stat in test_data with its train-derived fallback.
# As with the earlier fills, this is kept in case it is needed in the future.
user_stat_columns = ("num_ratings_user", "avg_user_rating", "user_rating_std_dev")
test_data = test_data.with_columns([
    pl.col(stat).fill_null(global_means[stat]) for stat in user_stat_columns
])

### 7. Save Final Train and Test Features Without Embeddings

In [15]:
# Turn the rating into a 0/1 label: 1 when rating >= 0, otherwise 0.
# The `rating` column is overwritten in place, so run this cell once per kernel session:
# re-running it would map the existing 0/1 labels to all 1s.
binary_rating = (pl.col("rating") >= 0).cast(pl.Int8).alias("rating")

train_data = train_data.with_columns(binary_rating)
test_data = test_data.with_columns(binary_rating)

# Write the final feature tables (no embeddings column) to disk.
train_data.write_parquet("../data/processed/train_features.parquet")
test_data.write_parquet("../data/processed/test_features.parquet")