In [1]:
import pandas as pd
import re

# Load your dataset
data = pd.read_csv('/content/reviews_supplements.csv')
df = pd.DataFrame(data)

# Updated clean_text function to handle non-string values (like NaN)
def clean_text(text):
    if isinstance(text, str):  # Check if the value is a string
        text = re.sub(r'[^A-Za-z0-9\s]', '', text)  # Remove non-alphanumeric characters
        return text.strip()
    return text  # If not a string, return the value as is (e.g., NaN)

# Apply the cleaning function to 'title' and 'text' columns
df['title'] = df['title'].apply(clean_text)
df['text'] = df['text'].apply(clean_text)

# Ensure 'helpful_vote' column is numeric
df['helpful_vote'] = pd.to_numeric(df['helpful_vote'], errors='coerce')

# Handle any NaN values in 'helpful_vote' (optional, you can choose to drop or fill them)
df['helpful_vote'].fillna(0, inplace=True)  # Filling with 0, but you can choose another approach

# Convert 'verified_purchase' to boolean
df['verified_purchase'] = df['verified_purchase'].apply(lambda x: True if x == True else False)

# Drop 'date', 'time', and 'timestamp' columns
df = df.drop(columns=['date', 'time', 'timestamp'])

# Explicitly cast each column to its appropriate datatype
df = df.astype({
    'rating': 'string',
    'title': 'string',
    'text': 'string',
    'asin': 'string',
    'parent_asin': 'string',
    'user_id': 'string',
    'helpful_vote': 'string',
    'verified_purchase': 'string'
})

# Save the cleaned DataFrame to a CSV file
df.to_csv('/content/cleaned_reviews_supplements.csv', index=False)

# Display cleaned DataFrame
print(df.dtypes)  # To confirm data types
print(df)         # Display the cleaned DataFrame

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['helpful_vote'].fillna(0, inplace=True)  # Filling with 0, but you can choose another approach


rating               string[python]
title                string[python]
text                 string[python]
asin                 string[python]
parent_asin          string[python]
user_id              string[python]
helpful_vote         string[python]
verified_purchase    string[python]
dtype: object
      rating                                      title  \
0          4                  B Complex in gel cap form   
1          5                                 Five Stars   
2          5                                 Five Stars   
3          5               Vitamin Shoppe Dry Vitamin A   
4          5        Un producto que compro regularmente   
...      ...                                        ...   
16666      5                                       Love   
16667      5                           Used to use this   
16668      5                                works great   
16669      4  That this product is unbelievably perfect   
16670      5                                 Othe

In [2]:
!git clone https://github.com/gretelai/gdpr-helpers.git
!cd gdpr-helpers; pip install -Uqq .
!pip install --upgrade tabulate

fatal: destination path 'gdpr-helpers' already exists and is not an empty directory.
  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for gdpr-helpers (setup.py) ... [?25l[?25hdone
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
bigframes 1.19.0 requires tabulate>=0.9, but you have tabulate 0.8.9 which is incompatible.[0m[31m
Collecting tabulate
  Using cached tabulate-0.9.0-py3-none-any.whl.metadata (34 kB)
Using cached tabulate-0.9.0-py3-none-any.whl (35 kB)
Installing collected packages: tabulate
  Attempting uninstall: tabulate
    Found existing installation: tabulate 0.8.9
    Uninstalling tabulate-0.8.9:
      Successfully uninstalled tabulate-0.8.9
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts

In [3]:
import os
if not os.getcwd().endswith('gdpr-helpers'):
    os.chdir('gdpr-helpers')

import glob
from gdpr_helpers import Anonymizer, reports  # Import reports module
import pandas as pd # Import pandas for DataFrame operations
search_pattern = "/content/cleaned_reviews_supplements.csv"

am = Anonymizer(
    project_name="gdpr-workflow",
    run_mode="cloud",
    transforms_config="src/config/transforms_config.yaml",
    synthetics_config="src/config/synthetics_config.yaml",
    endpoint="https://api.gretel.cloud"
    )

def modified_ner_report(report):
    """Modified ner_report function to handle the TypeError and ValueError."""
    try:
        df = pd.DataFrame(report)
    except ValueError:
        # If ValueError occurs due to uneven lengths, handle it (e.g., return a message)
        return {"md": "**NER report data has uneven lengths. Cannot create DataFrame.**"}

    # Check if the DataFrame is empty
    if df.empty:
        return {"md": "**No entities detected** by named entity recognition."}
    # Check the number of detected entities using len() instead of comparing to an integer
    num_entities = len(df.entities_detected.value_counts().keys()) if 'entities_detected' in df else 0  # Handle missing column
    if num_entities > 3:
        ner_text = "<b>Multiple PII types</b> detected by named entity recognition."
    elif num_entities > 0: # Check if any entities are detected
        ner_text = "<b>PII detected</b> by named entity recognition."
    else:
        ner_text = "**No entities detected** by named entity recognition."
    return {"md": ner_text}

# Replace the original ner_report with the modified one
reports.ner_report = modified_ner_report # Overwriting original ner_report

for dataset_path in glob.glob(search_pattern):
    am.anonymize(dataset_path=dataset_path)

Found cached Gretel credentials
Using endpoint https://api.gretel.cloud
Logged in as hitlergocrazy@gmail.com ✅
Follow along with model training at: https://console.gretel.ai/proj_2n5I6nJGQZa5EOv9CSzHCYvIbAM
Anonymizing '/content/cleaned_reviews_supplements.csv'
**NER report data has uneven lengths. Cannot create DataFrame.**
Transforms finished.
Processing time: 5.354 seconds
Record count: 16671

Columns transformed
| field_name   | transform_type   |   transformed_count |
|:-------------|:-----------------|--------------------:|
| user_id      | hash             |               16671 |




Actgan - Job completed: : 306it [06:56,  1.36s/it, epoch=597, loss_g=-1.45, loss_d=0.0081, loss_r=0]


dict_keys(['left_rows', 'left_cols', 'right_rows', 'right_cols', 'fields', 'left_correlation', 'right_correlation', 'correlation_difference', 'memorized_lines', 'mean_absolute_error', 'average_divergence', 'left_pca', 'right_pca', 'synthetic_data_quality_score', 'field_correlation_stability', 'principal_component_stability', 'field_distribution_stability', 'privacy_protection_level', 'membership_inference_attack_score', 'attribute_inference_attack_score', 'data_privacy_score', 'model_id', 'project_name', 'model_name', 'model_type', 'fatal_error', 'summary', 'total_time_seconds', 'total_billed_seconds', 'total_input_bytes', 'total_output_bytes', 'total_billed_bytes', 'job_status', 'job_type'])


Synthesis finished.
Lines memorized: 0

Model training time: 395.458 seconds

Job status: completed

Job type: actgan

Privacy report
|                        | value    |
|:-----------------------|:---------|
| outlier_filter         | Medium   |
| similarity_filter      | Medium   |
| overfitt

  data = data.fillna("")
  if data[0] != data[1]:
  data = data.fillna("")
  if data[0] != data[1]:


KeyError: 'html'

### Ignore the HTML error above, it shows the failure in presenting the report in html form, which can be found on grete's dashboard as well