## Part 1: Training using active learning and pseudo labeling

In [1]:
import pandas as pd
from xgboost import XGBClassifier
from tqdm import tqdm
import warnings

warnings.filterwarnings("ignore")

In [2]:
df = pd.read_csv("../dataset/yelp_pseudo_features.csv")
df.tail()

Unnamed: 0,business_id,date,review_id,stars,text,type,user_id,cool,useful,funny,scores_pooling_mean,scores_pooling_max,scores_pooling_min,scores_pooling_std,scores_pooling_sum,scores_concat,scores_shift_psm,scores_shift_ratio
10495,,,,1,Dining here guarantees you’ll leave with a les...,,,,,,-0.0516,-0.0516,-0.0516,0.0,-0.0516,-0.0516,0.0,0.0
10496,,,,1,This amusement park feels like an endurance te...,,,,,,0.7351,0.7351,0.7351,0.0,0.7351,0.7351,0.0,0.0
10497,,,,1,"The library inspires nostalgia, mostly for bet...",,,,,,0.4404,0.4404,0.4404,0.0,0.8808,0.4404|0.4404,0.0,0.0
10498,,,,1,"The spa offers relaxation, though mostly for y...",,,,,,0.26335,0.5267,0.0,0.26335,0.5267,0.5267|0.0,0.277413,1.0
10499,,,,1,"You’ll leave this bar with stories to tell, mo...",,,,,,-0.26415,-0.0516,-0.4767,0.21255,-0.5283,-0.0516|-0.4767,0.0,0.0


In [3]:
# Mark the generated data as positive.
df["is_generated"] = df["review_id"].isna().astype(int)

In [4]:
# Random sampling the initial samples (30 positive and 30 negatives)
real_samples = df[df["is_generated"] == 0].sample(30, random_state=42)
generated_samples = df[df["is_generated"] == 1].sample(30, random_state=42)

# Selected features from previous step:
feature_cols = ["scores_pooling_mean", "scores_pooling_max", "scores_pooling_min", "scores_pooling_std",
                "scores_pooling_sum", "scores_shift_psm", "scores_shift_ratio"]

# Construction the training set:
initial_samples = pd.concat([real_samples, generated_samples])
X_train = initial_samples[feature_cols]
y_train = initial_samples["is_generated"]
remaining_data = df.drop(initial_samples.index)


### 1.1 Active Learning

In [59]:
def train_active_learning(X_train, y_train, remaining_samples, target_positive_cnt=500, step=50):
    print(remaining_samples.shape)
    iteration = 0
    positive_cnt = 30
    classifier = XGBClassifier(use_label_encoder=False, eval_metric="logloss", random_state=42)

    while positive_cnt < target_positive_cnt:
        iteration += 1
        classifier.fit(X_train, y_train)

        X_remaining = remaining_samples[feature_cols]
        remaining_samples["probability"] = classifier.predict_proba(X_remaining)[:, 1]

        sorted_samples = remaining_samples.sort_values(by="probability", ascending=False)
        high_conf_samples = sorted_samples.head(step)
        low_conf_samples = sorted_samples.tail(step)

        if high_conf_samples.empty and low_conf_samples.empty:
            break

        X_train = pd.concat([X_train, high_conf_samples[feature_cols], low_conf_samples[feature_cols]])

        ##### Querying the actual labels #####
        y_high_conf = high_conf_samples["is_generated"]
        y_low_conf = low_conf_samples["is_generated"]
        y_train = pd.concat([y_train, y_high_conf, y_low_conf])

        remaining_samples = remaining_samples.drop(index=high_conf_samples.index)
        remaining_samples = remaining_samples.drop(index=low_conf_samples.index)

        positive_cnt += high_conf_samples.shape[0]
        print(f"Iteration {iteration}, cur positive samples count: {positive_cnt}")

    return classifier

In [72]:
classifier = train_active_learning(X_train, y_train, remaining_data, target_positive_cnt=500, step=50)

# Final top_500 results with the highest probability
df["probability"] = classifier.predict_proba(df[feature_cols])[:, 1]
top_500_samples = df.sort_values(by="probability", ascending=False).head(500)

output = top_500_samples[["probability", "is_generated", "text"]]

output.to_csv("../output/top500_active_learning.csv", index=False)

# Calculating raw Precision@500 without double check from GPT
precision_at_500 = top_500_samples["is_generated"].sum() / len(top_500_samples)
print(f"Precision@500: {precision_at_500:.4f}")

(10440, 20)
Iteration 1, cur positive samples count: 80
Iteration 2, cur positive samples count: 130
Iteration 3, cur positive samples count: 180
Iteration 4, cur positive samples count: 230
Iteration 5, cur positive samples count: 280
Iteration 6, cur positive samples count: 330
Iteration 7, cur positive samples count: 380
Iteration 8, cur positive samples count: 430
Iteration 9, cur positive samples count: 480
Iteration 10, cur positive samples count: 530
Precision@500: 0.5800


### 1.2 Pseudo labeling


In [66]:
def train_pseudo_labeling(X_train, y_train, remaining_samples, target_positive_cnt=500, step=50):
    print(remaining_samples.shape)
    iteration = 0
    positive_cnt = 30
    classifier = XGBClassifier(use_label_encoder=False, eval_metric="logloss", random_state=42)

    while positive_cnt < target_positive_cnt:
        iteration += 1
        classifier.fit(X_train, y_train)

        X_remaining = remaining_samples[feature_cols]
        remaining_samples["probability"] = classifier.predict_proba(X_remaining)[:, 1]

        sorted_samples = remaining_samples.sort_values(by="probability", ascending=False)
        high_conf_samples = sorted_samples.head(step)
        low_conf_samples = sorted_samples.tail(step)

        if high_conf_samples.empty and low_conf_samples.empty:
            break

        X_train = pd.concat([X_train, high_conf_samples[feature_cols], low_conf_samples[feature_cols]])

        ##### Assigning the pseudo labels #####
        y_high_conf = pd.Series(1, index=high_conf_samples.index)
        y_low_conf = pd.Series(0, index=low_conf_samples.index)
        y_train = pd.concat([y_train, y_high_conf, y_low_conf])

        remaining_samples = remaining_samples.drop(index=high_conf_samples.index)
        remaining_samples = remaining_samples.drop(index=low_conf_samples.index)

        positive_cnt += high_conf_samples.shape[0]
        print(f"Iteration {iteration}, cur positive samples count: {positive_cnt}")

    return classifier


In [71]:
classifier = train_pseudo_labeling(X_train, y_train, remaining_data, target_positive_cnt=500, step=50)

# Final top_500 results with the highest probability
df["probability"] = classifier.predict_proba(df[feature_cols])[:, 1]
top_500_samples = df.sort_values(by="probability", ascending=False).head(500)

output = top_500_samples[["probability", "is_generated", "text"]]

output.to_csv("../output/top500_pseudo_labeling.csv", index=False)

# Calculating raw Precision@500 without double check from GPT
precision_at_500 = top_500_samples["is_generated"].sum() / len(top_500_samples)
print(f"Precision@500: {precision_at_500:.4f}")

(10440, 20)
Iteration 1, cur positive samples count: 80
Iteration 2, cur positive samples count: 130
Iteration 3, cur positive samples count: 180
Iteration 4, cur positive samples count: 230
Iteration 5, cur positive samples count: 280
Iteration 6, cur positive samples count: 330
Iteration 7, cur positive samples count: 380
Iteration 8, cur positive samples count: 430
Iteration 9, cur positive samples count: 480
Iteration 10, cur positive samples count: 530
Precision@500: 0.3180


## Part 2: evaluating the results using LLM (GPT-4o) as ground truth.

In [75]:
from openai import OpenAI
import os

api_key = "sk"
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY", api_key))


def gpt_review_check(df, verbose=False):
    df["gpt_check"] = 0
    for idx, row in tqdm(df[df["is_generated"] == 0].iterrows()):
        review_text = row["text"]

        prompt = f"Review text:{review_text}. Does this review seem sarcastic? Answer 'Yes' or 'No'."

        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system",
                 "content": "You are an assistant tasked with determining whether a given Yelp review is sarcastic or not. Sample sarcastic review may like \"The server was efficient at ignoring us.\""},
                {"role": "user", "content": prompt}
            ],
            max_tokens=150
        )

        response_text = response.choices[0].message.content.strip().lower()
        if "yes" in response_text:
            df.at[idx, "gpt_check"] = 1

        if verbose:
            print("-----------------------------------------")
            print("Check result:", response_text)
            print(review_text)
    return df

In [79]:
def calculate_precision_at_k(df, k):
    df = df.sort_values(by="probability", ascending=False).head(k)
    positive_samples = ((df["is_generated"] == 1) | (df["gpt_check"] == 1)).sum()
    p_at_k = positive_samples / k
    print(f"Precision@{k}: {p_at_k:.4f}")
    return p_at_k


def eval_results(file_path):
    print("-----------------------------------------")
    print("Evaluating result:", file_path)
    df_res = pd.read_csv(file_path)
    df_res = gpt_review_check(df_res, verbose=False)
    for k in [10, 20, 30, 50, 100, 200, 500]:
        calculate_precision_at_k(df_res, k)

In [80]:
eval_results("../output/top500_active_learning.csv")

-----------------------------------------
Evaluating result: ../output/top500_active_learning.csv


210it [01:40,  2.09it/s]

Precision@10: 1.0000
Precision@20: 1.0000
Precision@30: 0.9667
Precision@50: 0.9800
Precision@100: 0.9800
Precision@200: 0.8800
Precision@500: 0.6380





In [81]:
eval_results("../output/top500_pseudo_labeling.csv")

-----------------------------------------
Evaluating result: ../output/top500_pseudo_labeling.csv


341it [02:26,  2.32it/s]

Precision@10: 0.5000
Precision@20: 0.5000
Precision@30: 0.4333
Precision@50: 0.4800
Precision@100: 0.4300
Precision@200: 0.4200
Precision@500: 0.3860



