# Weak Labelling for Trust Scoring



### 1. Import Libraries

In [11]:
import pandas as pd
import numpy as np
import warnings
# NOTE(review): this suppresses every warning category for the rest of the
# notebook, so pandas/numpy deprecation and dtype warnings will not be shown.
warnings.filterwarnings('ignore')

### 2. Load Reviews Dataset

In [12]:
# Prefer the cleaned CSV. If it is missing, fall back to the raw Amazon JSON,
# and to the scraped CSV when that JSON has no 'reviewText' column.
try:
    df = pd.read_csv("../data/processed/reviews_clean.csv")
except FileNotFoundError:
    print("Clean processed data not found, checking raw data...")
    df = pd.read_json("../data/raw/AMAZON_FASHION.json", lines=True)
    if 'reviewText' not in df.columns:
        df = pd.read_csv("../data/raw/scraped_reviews.csv")

# Raw Amazon field names -> column names used by the labelling rules below.
# Columns that are absent from the frame are simply left untouched by rename.
column_mapping = {
    'reviewerID': 'user_id',
    'asin': 'product_id',
    'reviewText': 'review_text',
    'overall': 'rating',
    'unixReviewTime': 'review_timestamp',
    'reviewTime': 'review_date_str'
}
df = df.rename(columns=column_mapping)

# Use the pre-cleaned text when the frame has it; missing reviews become "".
text_column = 'clean_review_text' if 'clean_review_text' in df.columns else 'review_text'
df['review_text'] = df[text_column].fillna("")

# Build review_date from the epoch-seconds column when present, otherwise
# parse the date string column.
if 'review_timestamp' not in df.columns:
    df['review_date'] = pd.to_datetime(df['review_date_str'])
else:
    df['review_date'] = pd.to_datetime(df['review_timestamp'], unit='s')

print(f"Dataset Schema: {df.columns.tolist()}")
df.head()

Dataset Schema: ['user_id', 'product_id', 'rating', 'review_text', 'summary', 'verified', 'review_timestamp', 'clean_review_text', 'review_date']


Unnamed: 0,user_id,product_id,rating,review_text,summary,verified,review_timestamp,clean_review_text,review_date
0,A1D4G1SNUZWQOT,7106116521,5,exactly what i needed.,perfect replacements!!,True,1413763200,exactly what i needed.,2014-10-20
1,A3DDWDH9PX2YX2,7106116521,2,"i agree with the other review, the opening is ...","I agree with the other review, the opening is ...",True,1411862400,"i agree with the other review, the opening is ...",2014-09-28
2,A2MWC41EW7XL15,7106116521,4,love these... i am going to order another pack...,My New 'Friends' !!,False,1408924800,love these... i am going to order another pack...,2014-08-25
3,A2UH2QQ275NV45,7106116521,2,too tiny an opening,Two Stars,True,1408838400,too tiny an opening,2014-08-24
4,A89F3LQADZBS5,7106116521,3,okay,Three Stars,False,1406419200,okay,2014-07-27


### 3. Rule 1: Review Length < 15 words AND Rating is 1 or 5


In [13]:
# Word count per review: cast to str, split on whitespace, count the tokens.
df["review_length"] = (
    df["review_text"]
    .astype(str)
    .map(lambda text: len(text.split()))
)

# Rule 1 fires for reviews under 15 words that also carry a 1- or 5-star rating.
df["rule_short_extreme"] = (
    df["review_length"].lt(15) & df["rating"].isin([1, 5])
).astype(int)

print(f"Rule 1 Triggered: {df['rule_short_extreme'].sum()} reviews")

Rule 1 Triggered: 284279 reviews


### 4. Rule 2: User Posts > 3 Reviews per Day


In [14]:
# Calendar day of each review (time-of-day dropped) so reviews can be grouped per day.
df['review_day'] = df['review_date'].dt.date

# For every row, the number of reviews its user posted on that same day;
# transform keeps the result aligned with the original rows.
df['daily_count'] = (
    df.groupby(['user_id', 'review_day'])['user_id']
    .transform('count')
)

# Rule 2 fires when a user posted more than 3 reviews on the same day.
df["rule_high_frequency"] = df["daily_count"].gt(3).astype(int)

print(f"Rule 2 Triggered: {df['rule_high_frequency'].sum()} reviews")

Rule 2 Triggered: 13194 reviews


### 5. Rule 3: Rating Deviation ≥ 3 from Product Mean


In [15]:
# Mean rating of each product, broadcast back onto every review of that product.
# The mean includes the review being scored.
product_mean_rating = df.groupby("product_id")["rating"].transform("mean")
df["product_mean_rating"] = product_mean_rating

# Absolute distance between a review's rating and its product's mean rating.
df["rating_deviation"] = (df["rating"] - df["product_mean_rating"]).abs()

# Rule 3 fires when that distance is at least 3 stars.
df["rule_rating_deviation"] = df["rating_deviation"].ge(3).astype(int)

print(f"Rule 3 Triggered: {df['rule_rating_deviation'].sum()} reviews")

Rule 3 Triggered: 16232 reviews


### 6. Rule 4: Duplicate Reviews (exact text match; near-duplicates are not detected)


In [16]:
# Exact duplicate check: a review is flagged when its text is shared verbatim
# with at least one other row (keep=False marks every member of a duplicate group).
#
# Blank texts are excluded. Missing reviews are filled with "" when the data is
# loaded, so without this guard every empty review would count as a copy of all
# the other empty reviews and pick up the weight-2 duplicate penalty.
df["rule_duplicate"] = (
    df.duplicated(subset=['review_text'], keep=False)
    & df["review_text"].astype(str).str.strip().ne("")
).astype(int)

print(f"Rule 4 Triggered: {df['rule_duplicate'].sum()} reviews")

Rule 4 Triggered: 151850 reviews


### 7. Weighted Fake Score Calculation

- Duplicate text: Weight = 2 (Very strong)
- High frequency: Weight = 2 (Strong)
- Rating deviation: Weight = 1 (Medium)
- Short + extreme: Weight = 1 (Weak)

In [17]:
# Weighted sum of the four binary rule flags:
# duplicate and high-frequency count 2 each, deviation and short+extreme count 1 each.
df["fake_score"] = (
    df["rule_duplicate"].mul(2)
    .add(df["rule_high_frequency"].mul(2))
    .add(df["rule_rating_deviation"])
    .add(df["rule_short_extreme"])
)

# Number of reviews at each score, ordered by score.
print("Fake Score Distribution:")
print(df["fake_score"].value_counts().sort_index())

Fake Score Distribution:
fake_score
0    528623
1    194702
2     58272
3     88432
4      7927
5      4333
6       114
Name: count, dtype: int64


### 8. Final Weak Label Assignment

**FAKE (1)** → fake_score ≥ 2  
**REAL (0)** → fake_score < 2

In [18]:
# Binary weak label: 1 (fake) when the weighted score reaches 2, otherwise 0 (real).
df["fake_label"] = df["fake_score"].ge(2).astype(int)

print("Weak Labelling Complete.")

Weak Labelling Complete.


### 9. Confidence-Based Labeling

**High Fake** → fake_score ≥ 3  
**High Real** → fake_score = 0  
**Uncertain** → fake_score ∈ {1, 2}

In [19]:
# Three confidence bands derived from fake_score:
# >= 3 -> "high_fake", == 0 -> "high_real", everything else -> "uncertain".
df["label_confidence"] = np.select(
    [df["fake_score"] >= 3, df["fake_score"] == 0],
    ["high_fake", "high_real"],
    default="uncertain",
)

print("\nConfidence Distribution:")
print(df["label_confidence"].value_counts())
# Absolute count and share of reviews that fall into the uncertain band.
print(f"\nUncertain samples: {(df['label_confidence'] == 'uncertain').sum()} ({(df['label_confidence'] == 'uncertain').sum() / len(df) * 100:.2f}%)")


Confidence Distribution:
label_confidence
high_real    528623
uncertain    252974
high_fake    100806
Name: count, dtype: int64

Uncertain samples: 252974 (28.67%)


### 10. Label Distribution Check

In [20]:
# Share of each weak label, expressed as a percentage of all reviews.
print("Label Distribution (Percentage):")
print(df["fake_label"].value_counts(normalize=True).mul(100))
print("\nNote: This weighted approach is more academically sound than simple OR logic.")

Label Distribution (Percentage):
fake_label
0    81.972183
1    18.027817
Name: proportion, dtype: float64

Note: This weighted approach is more academically sound than simple OR logic.


### 11. Save Labeled Dataset

In [21]:
# Persist the frame with every rule, score and label column (row index omitted).
output_path = "../data/processed/labeled_reviews.csv"
df.to_csv(output_path, index=False)

# One print call; sep="\n" produces the same two lines as separate prints.
print(
    f"Labeled dataset saved to: {output_path}",
    f"\nColumns saved: {df.columns.tolist()}",
    sep="\n",
)

Labeled dataset saved to: ../data/processed/labeled_reviews.csv

Columns saved: ['user_id', 'product_id', 'rating', 'review_text', 'summary', 'verified', 'review_timestamp', 'clean_review_text', 'review_date', 'review_length', 'rule_short_extreme', 'review_day', 'daily_count', 'rule_high_frequency', 'product_mean_rating', 'rating_deviation', 'rule_rating_deviation', 'rule_duplicate', 'fake_score', 'fake_label', 'label_confidence']


### 12. Convert Fake Score to Continuous Trust Score

The `fake_score` is inverted and normalized into a continuous `trust_score` ∈ [0, 1]:

- `fake_score = 0`   → `trust_score = 1.0` (fully trusted)
- `fake_score = max` (the highest score observed in the dataset) → `trust_score = 0.0` (least trusted)

Both `fake_score` and `trust_score` are retained in the final dataset.

In [None]:
# Convert fake_score to a continuous trust_score in [0, 1] by inverting it and
# scaling by the largest fake_score observed in the data.
max_fake = df["fake_score"].max()

if max_fake > 0:
    df["trust_score"] = 1 - (df["fake_score"] / max_fake)
else:
    # No rule fired for any review (max is 0), or the frame is empty (max is NaN).
    # Dividing by 0 would turn every trust score into NaN, so treat every review
    # as fully trusted, matching the fake_score = 0 -> trust_score = 1.0 mapping.
    df["trust_score"] = 1.0

print("Trust Score Statistics:")
print(df["trust_score"].describe().round(4))
print(f"\nfake_score retained: {'fake_score' in df.columns}")
print(f"trust_score range: [{df['trust_score'].min():.4f}, {df['trust_score'].max():.4f}]")

In [None]:
# Write the final frame to disk; fake_score is kept next to trust_score and the
# row index is omitted.
trust_output_path = "../data/processed/trust_scored_dataset.csv"
df.to_csv(trust_output_path, index=False)

# One print call; sep="\n" produces the same two lines as separate prints.
print(
    f"Trust-scored dataset saved to: {trust_output_path}",
    f"\nColumns saved: {df.columns.tolist()}",
    sep="\n",
)