In [1]:
import pandas as pd
from datasets import load_dataset
import joblib
import os
import numpy as np
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from sklearn.preprocessing import StandardScaler, FunctionTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer

from datetime import datetime
# import multiprocessing as mp
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


# 1. Data Import

In [2]:
full_path = "../data/amazon_reviews_beauty.joblib"
file_name = "amazon_reviews_beauty.joblib"
if os.path.exists(full_path):
    print(f"The file {file_name} exists in the folder.")
    # Load the joblib file
    df = joblib.load('../data/amazon_reviews_beauty.joblib')
else:
    print(f"The file {file_name} does not exists in the folder. Importing...")
    # Load the review dataset
    review_dataset = load_dataset("McAuley-Lab/Amazon-Reviews-2023", "raw_review_All_Beauty", split="full", trust_remote_code=True)
    # Load the metadata dataset
    meta_dataset = load_dataset("McAuley-Lab/Amazon-Reviews-2023", "raw_meta_All_Beauty", split="full", trust_remote_code=True)
    # Convert datasets to pandas DataFrames
    df_reviews = pd.DataFrame(review_dataset).sample(n=10000, random_state=2024)
    df_meta = pd.DataFrame(meta_dataset)
    # Merge the datasets on parent_asin
    df = pd.merge(df_reviews, df_meta, on='parent_asin', how='left', suffixes=('_review', '_meta'))
    # Save the DataFrame as a joblib file
    joblib.dump(df, '../data/amazon_reviews_beauty.joblib') 

The file amazon_reviews_beauty.joblib exists in the folder.


In [3]:
df.columns

Index(['rating', 'title_review', 'text', 'images_review', 'asin',
       'parent_asin', 'user_id', 'timestamp', 'helpful_vote',
       'verified_purchase', 'main_category', 'title_meta', 'average_rating',
       'rating_number', 'features', 'description', 'price', 'images_meta',
       'videos', 'store', 'categories', 'details', 'bought_together',
       'subtitle', 'author'],
      dtype='object')

In [4]:
df.shape

(10000, 25)

# 2. Outliers scoring

## 2.1 Firt version

In [48]:
class TemporalOutlierDetector(BaseEstimator, TransformerMixin):
    """
    Detect temporal outliers based on review counts within specified time windows.

    Parameters:
    -----------
    time_column : str, default='timestamp'
        Name of the column containing timestamp information.
    window_size : str, default='D'
        Size of the time window for grouping reviews (e.g., 'D' for daily).

    Methods:
    --------
    fit(X, y=None)
        Fit method required by scikit-learn's BaseEstimator. Returns self.
    transform(X)
        Transform method to calculate temporal outlier flags based on review counts.
        Returns a numpy array of temporal outlier flags.

    Explanation:
    ------------
    This class groups the data by time periods defined by `window_size`, counts reviews within each period,
    and identifies temporal outliers based on a threshold of review counts.
    """
    def __init__(self, time_column='timestamp', window_size='D'):
        self.time_column = time_column
        self.window_size = window_size

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        """
        Transform method to calculate temporal outlier flags based on review counts.

        Parameters:
        -----------
        X : pandas DataFrame
            Input data with timestamp information.

        Returns:
        --------
        numpy.ndarray
            Numpy array containing temporal outlier flags (1 for outlier, 0 otherwise).
        """
        X = X.copy()
        X['datetime'] = pd.to_datetime(X[self.time_column], unit='s', errors='coerce')
        X['review_count'] = X.groupby(X['datetime'].dt.to_period(self.window_size))['datetime'].transform('count')
        X['temporal_outlier'] = (X['review_count'] > X['review_count'].quantile(0.95)).astype(int)
        return X[['temporal_outlier']].values  # Return numpy array

class BehavioralOutlierDetector(BaseEstimator, TransformerMixin):
    """
    Detect behavioral outliers based on user review patterns and rating deviations.

    Parameters:
    -----------
    user_column : str, default='user_id'
        Name of the column containing user identifiers.
    time_column : str, default='timestamp'
        Name of the column containing timestamp information.
    rating_column : str, default='rating'
        Name of the column containing rating information.
    window_size : str, default='D'
        Size of the time window for grouping reviews (e.g., 'D' for daily).
    review_threshold : int, default=3
        Threshold for identifying high-frequency users based on review counts.
    rating_deviation_threshold : float, default=1.5
        Threshold for identifying users with rating deviations from overall averages.

    Methods:
    --------
    fit(X, y=None)
        Fit method required by scikit-learn's BaseEstimator. Returns self.
    transform(X)
        Transform method to calculate behavioral outlier flags based on user behavior.
        Returns a numpy array of behavioral outlier flags.

    Explanation:
    ------------
    This class identifies behavioral outliers among users based on two criteria: high-frequency reviewing
    and rating deviations. It computes user review counts and average ratings, identifies high-frequency
    users and users with rating deviations, and flags outliers accordingly.
    """
    
    def __init__(self, user_column='user_id', time_column='timestamp', rating_column='rating', window_size='D', review_threshold=3, rating_deviation_threshold=1.5):
        self.user_column = user_column
        self.time_column = time_column
        self.rating_column = rating_column
        self.window_size = window_size
        self.review_threshold = review_threshold
        self.rating_deviation_threshold = rating_deviation_threshold

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        """
        Transform method to calculate behavioral outlier flags based on user behavior.

        Parameters:
        -----------
        X : pandas DataFrame
            Input data with user, timestamp, and rating information.

        Returns:
        --------
        numpy.ndarray
            Numpy array containing behavioral outlier flags (1 for outlier, 0 otherwise).
        """
        X = X.copy()
        X['datetime'] = pd.to_datetime(X[self.time_column], unit='s', errors='coerce')
        
        user_review_counts = X.groupby([self.user_column, X['datetime'].dt.to_period(self.window_size)]).size().reset_index(name='review_count')
        high_frequency_users = user_review_counts[user_review_counts['review_count'] > self.review_threshold][self.user_column].unique()
        
        user_avg_ratings = X.groupby(self.user_column)[self.rating_column].mean()
        overall_avg_rating = X[self.rating_column].mean()
        deviating_users = user_avg_ratings[abs(user_avg_ratings - overall_avg_rating) > self.rating_deviation_threshold].index
        
        X['high_frequency_outlier'] = X[self.user_column].isin(high_frequency_users).astype(int)
        X['rating_deviation_outlier'] = X[self.user_column].isin(deviating_users).astype(int)
        
        return X[['high_frequency_outlier', 'rating_deviation_outlier']].values  # Return numpy array

# def text_outliers(text_data, threshold=0.1):
#     vectorizer = TfidfVectorizer(stop_words='english', max_features=1000)
#     tfidf_matrix = vectorizer.fit_transform(text_data)
#     cosine_sim = cosine_similarity(tfidf_matrix, dense_output=False)
#     avg_similarity = np.mean(cosine_sim, axis=1)
#     return (avg_similarity < threshold).astype(int).reshape(-1, 1)  # Return 2D numpy array

def text_outliers(text_data, z_score_threshold):
    """
    Identify text outliers based on cosine similarity scores.

    Parameters:
    -----------
    text_data : list or array-like
        List of text documents to analyze.
    z_score_threshold : float
        Threshold value for identifying outliers based on z-scores of cosine similarities.

    Returns:
    --------
    numpy.ndarray
        Numpy array containing text outlier flags (1 for outlier, 0 otherwise).

    Explanation:
    ------------
    This function calculates cosine similarity scores for text documents represented as TF-IDF vectors,
    computes z-scores based on the distribution of these scores, and identifies outliers based on a
    dynamically determined threshold (`z_score_threshold`).
    """
    vectorizer = TfidfVectorizer(stop_words='english', max_features=1000)
    tfidf_matrix = vectorizer.fit_transform(text_data)
    cosine_sim = cosine_similarity(tfidf_matrix, dense_output=False)
    avg_similarity = np.mean(cosine_sim, axis=1)
    
    # Print debug information
    print("Average similarity (first 10):", avg_similarity[:10])
    
    mean_sim = np.mean(avg_similarity)
    std_sim = np.std(avg_similarity)
    z_scores = (avg_similarity - mean_sim) / std_sim
    
    # Print more debug information
    print("Mean similarity:", mean_sim)
    print("Standard deviation of similarity:", std_sim)
    print("Z-scores (first 10):", z_scores[:10])
    
    return (z_scores < -z_score_threshold).astype(int).reshape(-1, 1)

# def main_code(df):
# Prepare the data
print("Preparing data...")
df['helpful_vote'] = pd.to_numeric(df['helpful_vote'], errors='coerce')
df['price'] = pd.to_numeric(df['price'], errors='coerce')
df['verified_purchase'] = df['verified_purchase'].astype(int)
# Ensure text columns are converted to strings
text_columns = ['title_review', 'text', 'main_category', 'title_meta', 'description']
for col in text_columns:
    df[col] = df[col].astype(str)
df['combined_text'] = df['title_review'] + ' ' + df['text'] + ' ' + df['main_category'] + ' ' + df['title_meta'] + ' ' + df['description']

# Handle missing values
numeric_columns = df.select_dtypes(include=[np.number]).columns
for col in numeric_columns:
    df[col] = df[col].fillna(df[col].mean())

# For non-numeric columns, fill with a placeholder
non_numeric_columns = df.select_dtypes(exclude=[np.number]).columns
for col in non_numeric_columns:
    df[col] = df[col].fillna("Unknown")

# Convert timestamp to datetime and handle invalid values
df['timestamp'] = pd.to_datetime(df['timestamp'], unit='s', errors='coerce')

print("Data preparation complete.")

# Define the feature pipeline
feature_pipeline = ColumnTransformer([
    ('num', StandardScaler(), ['rating', 'helpful_vote', 'verified_purchase', 'price', 'average_rating', 'rating_number']),
    ('text', FunctionTransformer(text_outliers, kw_args={'z_score_threshold': 35}), 'combined_text'),
    ('temporal', TemporalOutlierDetector(), ['timestamp']),
    ('behavioral', BehavioralOutlierDetector(), ['user_id', 'timestamp', 'rating'])
])

# Fit and transform the data using the pipeline
print("Running outlier detection pipeline...")
with tqdm(total=5, desc="Processing") as pbar:
    # Transform features
    X_transformed = feature_pipeline.fit_transform(df)
    pbar.update(1)

    # Ensure X_transformed is a 2D numpy array
    X_transformed = np.asarray(X_transformed)
    if X_transformed.ndim == 1:
        X_transformed = X_transformed.reshape(-1, 1)

    # Isolation Forest
    iso_forest = IsolationForest(contamination=0.1, random_state=42, n_jobs=-1)
    iso_forest_outliers = iso_forest.fit_predict(X_transformed)
    pbar.update(1)

    # Local Outlier Factor
    lof = LocalOutlierFactor(n_neighbors=20, contamination=0.1, n_jobs=-1)
    lof_outliers = lof.fit_predict(X_transformed)
    pbar.update(1)

    # Add outlier results to the dataframe
    df['isolation_forest_outlier'] = (iso_forest_outliers == -1).astype(int)
    df['lof_outlier'] = (lof_outliers == -1).astype(int)
    # Convert small float values to 0 or 1
    df['text_outlier'] = (X_transformed[:, 0] > 0.5).astype(int)
    df['temporal_outlier'] = (X_transformed[:, 1] > 0.5).astype(int)
    df['high_frequency_outlier'] = (X_transformed[:, 2] > 0.5).astype(int)
    df['rating_deviation_outlier'] = (X_transformed[:, 3] > 0.5).astype(int)
    pbar.update(1)

    # Calculate overall outlier score
    df['outlier_score'] = (df['isolation_forest_outlier'] + 
                           df['lof_outlier'] + 
                           df['text_outlier'] + 
                           df['temporal_outlier'] + 
                           df['high_frequency_outlier'] + 
                           df['rating_deviation_outlier'])
    # Mark as outlier if at least two methods flagged it as an outlier
    df['is_outlier'] = (df['outlier_score'] >= 2).astype(int)
    pbar.update(1)

print("Outlier detection complete.")

# # Identify samples most likely to be outliers
# top_outliers = df.nlargest(10, 'outlier_score')
# print("\nTop 10 potential outliers:")
# print(top_outliers[['rating', 'helpful_vote', 'verified_purchase', 'price', 'outlier_score', 'title_review', 'main_category', 'high_frequency_outlier', 'rating_deviation_outlier']])

# # Calculate metrics
# total_samples = len(df)
# print(f"\nOutlier Detection Results:")
# for outlier_type in ['isolation_forest_outlier', 'lof_outlier', 'text_outlier', 'temporal_outlier', 'high_frequency_outlier', 'rating_deviation_outlier']:
#     count = df[outlier_type].sum()
#     print(f"{outlier_type}: {count} outliers ({count/total_samples:.2%})")

# Assuming df is your DataFrame
# main_code(df)

Preparing data...
Data preparation complete.
Running outlier detection pipeline...


Processing:  20%|█████████████████▌                                                                      | 1/5 [00:02<00:10,  2.72s/it]

Average similarity (first 10): [[0.03610338]
 [0.02530873]
 [0.04091656]
 [0.05355231]
 [0.02495647]
 [0.02806567]
 [0.0405637 ]
 [0.03375799]
 [0.04532672]
 [0.02853394]]
Mean similarity: 0.035800519662544855
Standard deviation of similarity: 0.011623486776102018
Z-scores (first 10): [[ 0.0260563 ]
 [-0.90263711]
 [ 0.44014649]
 [ 1.52723454]
 [-0.93294272]
 [-0.66544998]
 [ 0.40978894]
 [-0.17572411]
 [ 0.81956509]
 [-0.62516367]]


Processing: 100%|████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:03<00:00,  1.41it/s]

Outlier detection complete.





In [46]:
# Identify samples most likely to be outliers
top_outliers = df.nlargest(10, 'outlier_score')
print("\nTop 10 potential outliers:")
top_outliers#[['rating', 'helpful_vote', 'verified_purchase', 'price', 'outlier_score', 'title_review', 'main_category', 'high_frequency_outlier', 'rating_deviation_outlier']]


Top 10 potential outliers:


Unnamed: 0,rating,title_review,text,images_review,asin,parent_asin,user_id,timestamp,helpful_vote,verified_purchase,...,author,combined_text,isolation_forest_outlier,lof_outlier,text_outlier,temporal_outlier,high_frequency_outlier,rating_deviation_outlier,outlier_score,is_outlier
4354,5.0,All the perfect star reviews on here are correct,All the perfect star reviews on here are corre...,[],B01D2IXB20,B01D2IXB20,AED24UIY2S3ACO4U5XAHOVF4HU6Q,NaT,15,1,...,Unknown,All the perfect star reviews on here are corre...,1,1,1,1,0,1,5,1
77,5.0,I pray this sproduct NEVER gets discontinued !,I have usded this product around my eyes for o...,[],B01IA95GV0,B01IA95GV0,AFA3ROXMMCIMZVLXNGYA2ZXZDNTQ,NaT,5,0,...,Unknown,I pray this sproduct NEVER gets discontinued !...,1,1,1,1,0,0,4,1
312,5.0,Looks beautiful,Works perfectly in the salon for color process...,[],B005EUEK4S,B005EUEK4S,AFOMGW34K26S4NHKWSMYTSBITTQQ,NaT,2,1,...,Unknown,Looks beautiful Works perfectly in the salon f...,1,1,1,0,0,1,4,1
508,5.0,Duradero,Siempre que uso este perfume me alagan y creen...,[],B08739QVNW,B0BTJ6SYKB,AE7UXIOTHPPS54LJV2G6EHMTCBFQ,NaT,4,1,...,Unknown,Duradero Siempre que uso este perfume me alaga...,0,1,1,1,0,1,4,1
1049,5.0,Perfect 😍,I lovvvvveeeeee it!!!! You can't go wrong with...,"[{'attachment_type': 'IMAGE', 'large_image_url...",B06Y22GS6X,B06Y22GS6X,AEBPCYFU5LWAYDN35KZEBND77SCQ,NaT,4,0,...,Unknown,Perfect 😍 I lovvvvveeeeee it!!!! You can't go ...,1,1,1,1,0,0,4,1
1481,5.0,If you have smelly pits you need this!!!,Before anything you won’t receive expired deod...,[],B072Y3CGRX,B072Y3CGRX,AHPICTZYV5JFT55CXJ2EOAERLYJQ,NaT,0,0,...,Unknown,If you have smelly pits you need this!!! Befor...,1,1,1,0,0,1,4,1
1728,5.0,So Cute!,The headbands are so cute! I love the colors ...,[],B072K1ZW8L,B09FP8PP2K,AFLR6AKBXXIYLBTERI5KAG3I7TTA,NaT,11,1,...,Unknown,So Cute! The headbands are so cute! I love th...,1,1,1,1,0,0,4,1
3174,5.0,The hint you’ve all been waiting for..,Listen up ya’ll - so many people (me included)...,[],B07V6RQGRR,B07V6RQGRR,AFDRB6KJKCI67P3LTWAIULAZGOJA,NaT,21,0,...,Unknown,The hint you’ve all been waiting for.. Listen ...,1,1,1,1,0,0,4,1
3243,5.0,Fantastic!,I really like this product. There is a learnin...,"[{'attachment_type': 'IMAGE', 'large_image_url...",B07ZJKVVLW,B07ZJKVVLW,AFD6UD3I66OFS3ZNXACZKTSZPYVQ,NaT,389,1,...,Unknown,Fantastic! I really like this product. There i...,1,1,1,1,0,0,4,1
3297,5.0,Amazing Deal,Amazing wig! I can't believe how nice this is...,[],B00ITMWBFI,B00ITMWBFI,AFVWTIFV725AOUFM32QM74EPQIJA,NaT,10,0,...,Unknown,Amazing Deal Amazing wig! I can't believe how...,1,1,1,1,0,0,4,1


In [47]:
# Calculate metrics
total_samples = len(df)
print(f"\nOutlier Detection Results:")
for outlier_type in ['isolation_forest_outlier', 'lof_outlier', 'text_outlier', 'temporal_outlier', 'high_frequency_outlier', 'rating_deviation_outlier']:
    count = df[outlier_type].sum()
    print(f"{outlier_type}: {count} outliers ({count/total_samples:.2%})")


Outlier Detection Results:
isolation_forest_outlier: 1000 outliers (10.00%)
lof_outlier: 1000 outliers (10.00%)
text_outlier: 6080 outliers (60.80%)
temporal_outlier: 550 outliers (5.50%)
high_frequency_outlier: 0 outliers (0.00%)
rating_deviation_outlier: 540 outliers (5.40%)


In [22]:
df[['isolation_forest_outlier', 'lof_outlier', 'text_outlier', 'temporal_outlier', 'high_frequency_outlier', 'rating_deviation_outlier']]

Unnamed: 0,isolation_forest_outlier,lof_outlier,text_outlier,temporal_outlier,high_frequency_outlier,rating_deviation_outlier
0,0,0,1,0,0,0
1,0,0,0,0,0,0
2,0,0,1,0,0,0
3,0,0,1,0,0,0
4,0,0,0,0,0,0
...,...,...,...,...,...,...
9995,0,1,1,0,0,0
9996,0,0,0,0,0,0
9997,0,0,1,0,0,0
9998,0,1,1,0,0,0


In [63]:
np.mean(df["parent_asin"].value_counts().values)

np.float64(1.3542795232936078)

In [73]:
df.groupby('parent_asin').size()

parent_asin
1477044280     1
6041134546     1
B000050FDE     3
B000068PBJ     1
B000068PBL     1
              ..
B0C7WQK2QW     1
B0C9CWKY9G    17
B0CC929DZZ     1
B0CDH5TH82     2
B0CDNZ7F2V     4
Length: 7384, dtype: int64

## 2.2 Second version

In [5]:
class TemporalOutlierDetector(BaseEstimator, TransformerMixin):
    def __init__(self, time_column='timestamp', window_size='D', group_column='parent_asin'):
        self.time_column = time_column
        self.window_size = window_size
        self.group_column = group_column

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = X.copy()
        X['datetime'] = pd.to_datetime(X[self.time_column], unit='s', errors='coerce')
        X['review_count'] = X.groupby([self.group_column, X['datetime'].dt.to_period(self.window_size)])['datetime'].transform('count')
        X['temporal_outlier'] = (X['review_count'] > X.groupby(self.group_column)['review_count'].transform(lambda x: x.quantile(0.95))).astype(int)
        return X[['temporal_outlier']].values


class BehavioralOutlierDetector(BaseEstimator, TransformerMixin):
    def __init__(self, user_column='user_id', time_column='timestamp', rating_column='rating', window_size='D', review_threshold=3, rating_deviation_threshold=1.5, group_column='parent_asin'):
        self.user_column = user_column
        self.time_column = time_column
        self.rating_column = rating_column
        self.window_size = window_size
        self.review_threshold = review_threshold
        self.rating_deviation_threshold = rating_deviation_threshold
        self.group_column = group_column

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = X.copy()
        X['datetime'] = pd.to_datetime(X[self.time_column], unit='s', errors='coerce')

        # High-frequency user detection
        user_review_counts = X.groupby([self.group_column, self.user_column, X['datetime'].dt.to_period(self.window_size)]).size().reset_index(name='review_count')
        high_frequency_users = user_review_counts[user_review_counts['review_count'] > self.review_threshold][self.user_column].unique()

        # Rating deviation detection
        user_avg_ratings = X.groupby([self.group_column, self.user_column])[self.rating_column].mean().reset_index()
        overall_avg_ratings = X.groupby(self.group_column)[self.rating_column].mean().reset_index()
        deviating_users = user_avg_ratings.merge(overall_avg_ratings, on=self.group_column, suffixes=('_user', '_overall'))
        deviating_users['rating_deviation'] = abs(deviating_users[f'{self.rating_column}_user'] - deviating_users[f'{self.rating_column}_overall'])
        deviating_users = deviating_users[deviating_users['rating_deviation'] > self.rating_deviation_threshold][self.user_column].unique()

        X['high_frequency_outlier'] = X[self.user_column].isin(high_frequency_users).astype(int)
        X['rating_deviation_outlier'] = X[self.user_column].isin(deviating_users).astype(int)

        return X[['high_frequency_outlier', 'rating_deviation_outlier']].values

# def text_outliers(text_data, description_data, group_data, z_score_threshold):
#     """
#     Identify text outliers based on cosine similarity scores with product descriptions.

#     Parameters:
#     -----------
#     text_data : list or array-like
#         List of text documents to analyze (combined text for each review).
#     description_data : list or array-like
#         List of product descriptions corresponding to each text document.
#     group_data : list or array-like
#         List of group identifiers (e.g., parent_asin) indicating which product each review belongs to.
#     z_score_threshold : float
#         Threshold value for identifying outliers based on z-scores of cosine similarities.

#     Returns:
#     --------
#     numpy.ndarray
#         Numpy array containing text outlier flags (1 for outlier, 0 otherwise).

#     Explanation:
#     ------------
#     This function calculates cosine similarity scores between each review's combined text and the corresponding
#     product description within each product group. It computes z-scores based on the distribution of these scores 
#     and identifies outliers based on a dynamically determined threshold (`z_score_threshold`).
#     """
#     # Vectorize the text data and description data
#     vectorizer = TfidfVectorizer(stop_words='english', max_features=1000)
    
#     combined_text_tfidf = vectorizer.fit_transform(text_data)
#     description_tfidf = vectorizer.transform(description_data)
    
#     # Initialize array to store outlier flags
#     outlier_flags = np.zeros(len(text_data))
    
#     # Process each group separately
#     for group_id in np.unique(group_data):
#         # Get indices for the current group
#         group_indices = np.where(group_data == group_id)[0]
        
#         # Extract the text data and description data for the current group
#         group_texts = combined_text_tfidf[group_indices]
#         group_description = description_tfidf[group_indices[0]]  # Assume description is the same for all in group
        
#         # Compute cosine similarity between each review's combined text and the group's description
#         cosine_sim = np.array([cosine_similarity(group_texts[i], group_description).flatten()[0]
#                                for i in range(len(group_texts))])
        
#         # Calculate z-scores for the cosine similarities
#         mean_sim = np.mean(cosine_sim)
#         std_sim = np.std(cosine_sim)
#         z_scores = (cosine_sim - mean_sim) / std_sim
        
#         # Identify outliers based on z-scores
#         group_outliers = (z_scores < -z_score_threshold).astype(int)
        
#         # Update the outlier flags for the current group
#         outlier_flags[group_indices] = group_outliers
    
#     return outlier_flags.reshape(-1, 1)

def text_outliers(text_data, description_data, group_data, z_score_threshold):
    group_texts = text_data
    description_tfidf = description_data

    group_indices = np.where(group_data == group_data[0])[0]
    group_description = description_tfidf[group_indices[0]]  # Assuming description is the same for all in group

    # Compute cosine similarity between each review's combined text and the group's description
    cosine_sim = np.array([cosine_similarity(group_texts[i], group_description).flatten()[0]
                           for i in range(group_texts.shape[0])])

    # Calculate z-scores for the cosine similarities
    mean_sim = np.mean(cosine_sim)
    std_sim = np.std(cosine_sim)
    z_scores = (cosine_sim - mean_sim) / std_sim

    # Identify outliers based on z-score threshold
    outliers = np.where(z_scores > z_score_threshold)[0]

    return outliers


class TextOutlierTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, z_score_threshold=1.5):
        self.z_score_threshold = z_score_threshold

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        # Assuming X is a DataFrame and the columns are named as expected
        combined_text = X['combined_text'].values
        description = X['description'].values
        parent_asin = X['parent_asin'].values
        
        return text_outliers(combined_text, description, parent_asin, self.z_score_threshold)




print("Preparing data...")
df['helpful_vote'] = pd.to_numeric(df['helpful_vote'], errors='coerce')
df['price'] = pd.to_numeric(df['price'], errors='coerce')
df['verified_purchase'] = df['verified_purchase'].astype(int)
# Ensure text columns are converted to strings
text_columns = ['title_review', 'text', 'main_category', 'title_meta', 'description']
for col in text_columns:
    df[col] = df[col].astype(str)
df['combined_text'] = df['title_review'] + ' ' + df['text'] + ' ' + df['main_category'] + ' ' + df['title_meta'] + ' ' + df['description']

# Handle missing values
numeric_columns = df.select_dtypes(include=[np.number]).columns
for col in numeric_columns:
    df[col] = df[col].fillna(df[col].mean())

# For non-numeric columns, fill with a placeholder
non_numeric_columns = df.select_dtypes(exclude=[np.number]).columns
for col in non_numeric_columns:
    df[col] = df[col].fillna("Unknown")

# Convert timestamp to datetime and handle invalid values
df['timestamp'] = pd.to_datetime(df['timestamp'], unit='s', errors='coerce')

print("Data preparation complete.")


feature_pipeline = ColumnTransformer([
    ('num', StandardScaler(), ['rating', 'helpful_vote', 'verified_purchase', 'price', 'average_rating', 'rating_number']),
    # ('text', TextOutlierTransformer(z_score_threshold=1.5), ['combined_text', 'description', 'parent_asin']),
    ('temporal', TemporalOutlierDetector(group_column='parent_asin'), ['timestamp', 'parent_asin']),
    ('behavioral', BehavioralOutlierDetector(group_column='parent_asin'), ['user_id', 'timestamp', 'rating', 'parent_asin'])
])


Preparing data...
Data preparation complete.


In [6]:
print("Running outlier detection pipeline...")
with tqdm(total=5, desc="Processing") as pbar:
    X_transformed = feature_pipeline.fit_transform(df)
    pbar.update(1)

    X_transformed = np.asarray(X_transformed)
    if X_transformed.ndim == 1:
        X_transformed = X_transformed.reshape(-1, 1)

    iso_forest = IsolationForest(contamination=0.1, random_state=42, n_jobs=-1)
    iso_forest_outliers = iso_forest.fit_predict(X_transformed)
    pbar.update(1)

    lof = LocalOutlierFactor(n_neighbors=20, contamination=0.1, n_jobs=-1)
    lof_outliers = lof.fit_predict(X_transformed)
    pbar.update(1)

    df['isolation_forest_outlier'] = (iso_forest_outliers == -1).astype(int)
    df['lof_outlier'] = (lof_outliers == -1).astype(int)
    df['text_outlier'] = (X_transformed[:, 0] > 0.5).astype(int)
    df['temporal_outlier'] = (X_transformed[:, 1] > 0.5).astype(int)
    df['high_frequency_outlier'] = (X_transformed[:, 2] > 0.5).astype(int)
    df['rating_deviation_outlier'] = (X_transformed[:, 3] > 0.5).astype(int)
    pbar.update(1)

    df['outlier_score'] = (df['isolation_forest_outlier'] + 
                           df['lof_outlier'] + 
                           df['text_outlier'] + 
                           df['temporal_outlier'] + 
                           df['high_frequency_outlier'] + 
                           df['rating_deviation_outlier'])
    df['is_outlier'] = (df['outlier_score'] >= 2).astype(int)
    pbar.update(1)

print("Outlier detection complete.")


Running outlier detection pipeline...


Processing: 100%|████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:03<00:00,  1.38it/s]

Outlier detection complete.





In [7]:
# Calculate metrics
total_samples = len(df)
print(f"\nOutlier Detection Results:")
for outlier_type in ['isolation_forest_outlier', 'lof_outlier', 'text_outlier', 'temporal_outlier', 'high_frequency_outlier', 'rating_deviation_outlier']:
    count = df[outlier_type].sum()
    print(f"{outlier_type}: {count} outliers ({count/total_samples:.2%})")


Outlier Detection Results:
isolation_forest_outlier: 1000 outliers (10.00%)
lof_outlier: 1000 outliers (10.00%)
text_outlier: 6080 outliers (60.80%)
temporal_outlier: 550 outliers (5.50%)
high_frequency_outlier: 0 outliers (0.00%)
rating_deviation_outlier: 540 outliers (5.40%)


## 2.3. Third version

Dataset characteristics:

Amazon reviews typically have multiple reviews per product.
However, in your case, you mentioned having a maximum of 20 observations per product and often just 1.
This low number of observations per group can indeed cause issues with outlier detection algorithms that rely on local density or isolation concepts.


Problems with the current approach:

Isolation Forest and LOF work best with larger sample sizes.
With only 1-20 observations per group, these methods can't effectively identify outliers within each product group.
It may lead to unreliable results or errors, as we've seen.


Suggested approach:
Instead of grouping by 'parent_asin', it would be more appropriate to apply these methods to the entire dataset. Here's a revised strategy:
a. Use global outlier detection:
Apply Isolation Forest and LOF to the entire dataset without grouping.
b. Retain product-specific features:
Include product-specific features (e.g., price, average_rating) in the analysis to capture product-level variations.
c. Focus on review-specific features:
Emphasize features that are specific to individual reviews (e.g., rating, helpful_vote, verified_purchase).
d. Consider temporal and behavioral outliers:
Keep the TemporalOutlierDetector and BehavioralOutlierDetector as they are, as these can still provide valuable insights at the product group level.

In [37]:
class TemporalOutlierDetector(BaseEstimator, TransformerMixin):
    def __init__(self, time_column='timestamp', window_size='D', group_column='parent_asin'):
        self.time_column = time_column
        self.window_size = window_size
        self.group_column = group_column

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = X.copy()
        X['datetime'] = pd.to_datetime(X[self.time_column], unit='s', errors='coerce')
        X['review_count'] = X.groupby([self.group_column, X['datetime'].dt.to_period(self.window_size)])['datetime'].transform('count')
        X['temporal_outlier'] = (X['review_count'] > X.groupby(self.group_column)['review_count'].transform(lambda x: x.quantile(0.95))).astype(int)
        return X[['temporal_outlier']].values


class BehavioralOutlierDetector(BaseEstimator, TransformerMixin):
    def __init__(self, user_column='user_id', time_column='timestamp', rating_column='rating', window_size='D', review_threshold=3, rating_deviation_threshold=1.5, group_column='parent_asin'):
        self.user_column = user_column
        self.time_column = time_column
        self.rating_column = rating_column
        self.window_size = window_size
        self.review_threshold = review_threshold
        self.rating_deviation_threshold = rating_deviation_threshold
        self.group_column = group_column

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = X.copy()
        X['datetime'] = pd.to_datetime(X[self.time_column], errors='coerce')

        # High-frequency user detection
        user_review_counts = X.groupby([self.group_column, self.user_column, X['datetime'].dt.to_period(self.window_size)]).size().reset_index(name='review_count')
        high_frequency_users = user_review_counts[user_review_counts['review_count'] > self.review_threshold][self.user_column].unique()

        # Rating deviation detection
        user_avg_ratings = X.groupby([self.group_column, self.user_column])[self.rating_column].mean().reset_index()
        overall_avg_ratings = X.groupby(self.group_column)[self.rating_column].mean().reset_index()
        deviating_users = user_avg_ratings.merge(overall_avg_ratings, on=self.group_column, suffixes=('_user', '_overall'))
        deviating_users['rating_deviation'] = abs(deviating_users[f'{self.rating_column}_user'] - deviating_users[f'{self.rating_column}_overall'])
        deviating_users = deviating_users[deviating_users['rating_deviation'] > self.rating_deviation_threshold][self.user_column].unique()

        # Combine both outlier types into a single score
        X['behavioral_outlier'] = ((X[self.user_column].isin(high_frequency_users) | 
                                    X[self.user_column].isin(deviating_users))).astype(int)

        return X[['behavioral_outlier']].values

class GroupwiseIsolationForest(BaseEstimator, TransformerMixin):
    def __init__(self, contamination=0.1, random_state=42, max_samples=1000, n_estimators=100):
        self.contamination = contamination
        self.random_state = random_state
        self.max_samples = max_samples
        self.n_estimators = n_estimators
        self.group_models = {}

    def fit(self, X, y=None):
        groups = X[X.columns[-1]]
        features = X.iloc[:, :-1]
        
        for group in groups.unique():
            group_mask = (groups == group)
            group_features = features[group_mask]
            
            # If the group has more samples than max_samples, take a random sample
            if len(group_features) > self.max_samples:
                group_features = group_features.sample(n=self.max_samples, random_state=self.random_state)
            
            iso_forest = IsolationForest(contamination=self.contamination, 
                                         random_state=self.random_state, 
                                         max_samples=min(len(group_features), 256),
                                         n_estimators=self.n_estimators, n_jobs=-1)
            iso_forest.fit(group_features)
            self.group_models[group] = iso_forest
        
        return self

    def transform(self, X):
        groups = X[X.columns[-1]]
        features = X.iloc[:, :-1]
        outlier_scores = np.zeros(X.shape[0])
        
        for group in groups.unique():
            group_mask = (groups == group)
            group_features = features[group_mask]
            
            if group in self.group_models:
                outlier_scores[group_mask] = self.group_models[group].decision_function(group_features)
            else:
                # If we encounter a new group during transform, we'll use the global mean
                outlier_scores[group_mask] = np.mean(list(self.group_models.values())[0].decision_function(group_features))
        
        return outlier_scores.reshape(-1, 1)
        
class GroupwiseLOF(BaseEstimator, TransformerMixin):
    def __init__(self, n_neighbors=20, contamination=0.1, min_group_size=10):
        self.n_neighbors = n_neighbors
        self.contamination = contamination
        self.min_group_size = min_group_size
        self.group_models = {}

    def fit(self, X, y=None):
        groups = X[X.columns[-1]]
        features = X.iloc[:, :-1]
        
        for group in groups.unique():
            group_mask = (groups == group)
            group_features = features[group_mask]
            
            if len(group_features) >= self.min_group_size:
                n_neighbors = min(self.n_neighbors, len(group_features) - 1)
                lof = LocalOutlierFactor(n_neighbors=n_neighbors, contamination=self.contamination, n_jobs=-1)
                lof.fit(group_features)
                self.group_models[group] = lof
            else:
                print(f"Warning: Group {group} has fewer than {self.min_group_size} samples. Skipping LOF for this group.")
        
        return self

    def transform(self, X):
        groups = X[X.columns[-1]]
        features = X.iloc[:, :-1]
        outlier_scores = np.zeros(X.shape[0])
        
        for group in groups.unique():
            group_mask = (groups == group)
            group_features = features[group_mask]
            
            if group in self.group_models:
                outlier_scores[group_mask] = -self.group_models[group].negative_outlier_factor_
            else:
                # For groups without a model (due to small size), assign a neutral score
                outlier_scores[group_mask] = 0
        
        return outlier_scores.reshape(-1, 1)

    def transform(self, X):
        groups = X[X.columns[-1]]
        features = X.iloc[:, :-1]
        outlier_scores = np.zeros(X.shape[0])
        
        for group in groups.unique():
            group_mask = (groups == group)
            group_features = features[group_mask]
            
            if group in self.group_models:
                outlier_scores[group_mask] = -self.group_models[group].negative_outlier_factor_
            else:
                # If we encounter a new group during transform, we'll use the global mean
                outlier_scores[group_mask] = np.mean(-list(self.group_models.values())[0].negative_outlier_factor_)
        
        return outlier_scores.reshape(-1, 1)


class TextOutlierTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, z_score_threshold=1.5):
        self.z_score_threshold = z_score_threshold
        self.vectorizer = TfidfVectorizer(stop_words='english', max_features=1000)

    def fit(self, X, y=None):
        combined_text = X['combined_text_review'].values
        self.vectorizer.fit(combined_text)
        return self

    def transform(self, X):
        combined_text = X['combined_text_review'].values
        description = X['combined_text_product'].values
        parent_asin = X['parent_asin'].values
        
        combined_text_tfidf = self.vectorizer.transform(combined_text)
        description_tfidf = self.vectorizer.transform(description)
        
        outlier_scores = []
        
        for group_id in np.unique(parent_asin):
            group_mask = (parent_asin == group_id)
            group_texts = combined_text_tfidf[group_mask]
            group_description = description_tfidf[group_mask][0]  # Assuming description is the same for all in group
            
            cosine_sim = cosine_similarity(group_texts, group_description)
            
            mean_sim = np.mean(cosine_sim)
            std_sim = np.std(cosine_sim)
            z_scores = (cosine_sim - mean_sim) / std_sim
            
            group_outlier_scores = (z_scores < -self.z_score_threshold).astype(float)
            outlier_scores.extend(group_outlier_scores)
        
        return np.array(outlier_scores).reshape(-1, 1)

In [38]:
print("Preparing data...")
df['helpful_vote'] = pd.to_numeric(df['helpful_vote'], errors='coerce')
df['price'] = pd.to_numeric(df['price'], errors='coerce')
df['verified_purchase'] = df['verified_purchase'].astype(int)
# Ensure text columns are converted to strings
text_columns = ['title_review', 'text', 'main_category', 'title_meta', 'description']
for col in text_columns:
    df[col] = df[col].astype(str)
df['combined_text_review'] = df['title_review'] + ' ' + df['text']
df['combined_text_product'] = df['main_category'] + ' ' + df['title_meta'] + ' ' + df['description']
# Handle missing values
numeric_columns = df.select_dtypes(include=[np.number]).columns
for col in numeric_columns:
    df[col] = df[col].fillna(df[col].mean())
# For non-numeric columns, fill with a placeholder
non_numeric_columns = df.select_dtypes(exclude=[np.number]).columns
for col in non_numeric_columns:
    df[col] = df[col].fillna("Unknown")
# Convert timestamp to datetime and handle invalid values
df['timestamp'] = pd.to_datetime(df['timestamp'], unit='s', errors='coerce')
print("Data preparation complete.")

feature_pipeline = ColumnTransformer([
    ('num', StandardScaler(), ['rating', 'helpful_vote', 'verified_purchase', 'price', 'average_rating', 'rating_number']),
    ('text', TextOutlierTransformer(z_score_threshold=1.5), ['combined_text_review', 'combined_text_product', 'parent_asin']),
    ('temporal', TemporalOutlierDetector(group_column='parent_asin'), ['timestamp', 'parent_asin']),
    ('behavioral', BehavioralOutlierDetector(group_column='parent_asin'), ['user_id', 'timestamp', 'rating', 'parent_asin']),
])

print("Running outlier detection pipeline...")
with tqdm(total=5, desc="Processing") as pbar:
    X_transformed = feature_pipeline.fit_transform(df)
    pbar.update(1)

    X_transformed = np.asarray(X_transformed)
    if X_transformed.ndim == 1:
        X_transformed = X_transformed.reshape(-1, 1)

    iso_forest = IsolationForest(contamination=0.1, random_state=42, n_jobs=-1)
    iso_forest_outliers = iso_forest.fit_predict(X_transformed)
    pbar.update(1)

    lof = LocalOutlierFactor(n_neighbors=20, contamination=0.1, n_jobs=-1)
    lof_outliers = lof.fit_predict(X_transformed)
    pbar.update(1)

    df['isolation_forest_outlier'] = (iso_forest_outliers == -1).astype(int)
    df['lof_outlier'] = (lof_outliers == -1).astype(int)
    df['text_outlier'] = (X_transformed[:, 0] > 0.5).astype(int)
    df['temporal_outlier'] = (X_transformed[:, 1] > 0.5).astype(int)
    df['high_frequency_outlier'] = (X_transformed[:, 2] > 0.5).astype(int)
    df['rating_deviation_outlier'] = (X_transformed[:, 3] > 0.5).astype(int)
    pbar.update(1)

    df['outlier_score'] = (df['isolation_forest_outlier'] + 
                           df['lof_outlier'] + 
                           df['text_outlier'] + 
                           df['temporal_outlier'] + 
                           df['high_frequency_outlier'] + 
                           df['rating_deviation_outlier'])
    df['is_outlier'] = (df['outlier_score'] >= 2).astype(int)
    pbar.update(1)

print("Outlier detection complete.")

Preparing data...
Data preparation complete.
Running outlier detection pipeline...


  z_scores = (cosine_sim - mean_sim) / std_sim
Processing: 100%|████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:13<00:00,  2.77s/it]

Outlier detection complete.





In [39]:
# Calculate metrics
total_samples = len(df)
print(f"\nOutlier Detection Results:")
for outlier_type in ['temporal_outlier', 'behavioral_outlier', 'isolation_forest_outlier', 'lof_outlier', 'text_outlier']:
    count = df[outlier_type].sum()
    print(f"{outlier_type}: {count} outliers ({count/total_samples:.2%})")


Outlier Detection Results:
temporal_outlier: 550 outliers (5.50%)
behavioral_outlier: 550 outliers (5.50%)
isolation_forest_outlier: 1000 outliers (10.00%)
lof_outlier: 1000 outliers (10.00%)
text_outlier: 6080 outliers (60.80%)


In [41]:
df['is_outlier'].value_counts(normalize=True)*100

is_outlier
0    84.35
1    15.65
Name: proportion, dtype: float64

In [43]:
df['text_outlier'].value_counts(normalize=True)

text_outlier
1    0.608
0    0.392
Name: proportion, dtype: float64

In [49]:
df[lambda x: (x['text_outlier']==1) & (x['description'].apply(lambda x: len(x) > 0))][['title_review','text', 'main_category', 'title_meta', 'description']][0:50]

Unnamed: 0,title_review,text,main_category,title_meta,description
0,Spray,WOW what a great scent and it works very well.,All Beauty,Soft & Dri Soft Scent Aerosol Anti-Perspirant ...,[]
2,...,Great product!,All Beauty,Grip Hair Pins U Shaped Bobby Pins Hair Bun St...,[]
3,Five Stars,Wooow thank you,All Beauty,Argan Oil Hair Treatment Gift Set - 3 Value Pa...,[]
7,Here's a tutorial for the product.,[[VIDEOID:b856b4ca5487125ecf1cfc26024ff47b]] R...,All Beauty,"Vitamin C Serum For Face, Topical Facial Serum...","[""Drop A Decade From Your Face from using our ..."
9,NYC lipsticks are pigment rich,Good color,All Beauty,NYC Expert Last Lip Color - 449 Creamy Mauve,[]
13,So cute,If you are scrolling for a review to help you ...,All Beauty,LEEYDESIGN Cat Ears Headband Party Headbands G...,"['Product description', 'Item Features :', 'St..."
14,I can breath.,"These are great! After wearing a mask all day,...",All Beauty,"Mask Bracket, Protect Lipstick Lips - Internal...","['Unisex-adult', 'Protect Female makeup, do no..."
15,Great Product,"These were perfect! Exactly as described, and ...",All Beauty,Pandahall 10pcs Golden Iron Clip-on Earring Co...,[]
16,Outstanding quality and value👍,Love these products everything I have received...,All Beauty,Beard Brush & Comb Set for Men – 100% Boar Bri...,[]
17,Wife is happy!,Purchased as a gift for my wife. She is total...,All Beauty,Emjoi Micro-Pedi POWER Pro Callus Remover,"[""The Corded rechargeable Micro-Pedi Pro is id..."
