In [1]:
import pandas as pd
import re
import datetime
import pytz # PST time zone

## Constants

In [None]:
# Build the timestamp suffix that makes the export file name distinct per run.
# The clock is read directly in US Pacific time.
# NOTE: the format is day-month_hour-minute-second and carries no year.
pst = pytz.timezone("America/Los_Angeles")
pst_time = datetime.datetime.now(tz=pst)
timestamp = pst_time.strftime("%d-%m_%H-%M-%S")

In [2]:
# Input: JSON-lines file of generated sequences (path relative to the working directory).
FILE_PATH = "generated_sequences_no_dp (2).jsonl"
# Output: CSV that receives the sampled rows; the name embeds the run timestamp.
EXPORT_FILE_PATH = f"sample_data_for_quality_check_{timestamp}.csv"

## Read the data

In [3]:
# Load the JSON-lines input: one JSON object per line becomes one row.
raw_data = pd.read_json(FILE_PATH, lines=True)

In [4]:
# Preview the first two rows of the loaded frame.
raw_data.head(2)

Unnamed: 0,generated_text
0,"System prompt : Given the Product Title, Produ..."
1,"System prompt : Given the Product Title, Produ..."


In [5]:
# Show the full text of the first record to see the " | "-separated field layout
# that the parsing step has to handle.
raw_data['generated_text'].iloc[0]

"System prompt : Given the Product Title, Product Category, Review Rating and Review Title, you are required to generate the Review | Product Title: Case for Galaxy Note 9,Cutebe Shockproof Series Hard PC+ TPU Bumper Protective Case for Samsung Galaxy Note 9 Crystal | Product Category: Cell Phones & Accessories | Review Rating: 4 | Review Title: Not a bad price for protection and cuteness | Review: Looks and works great. It was a little little on the loose fitting side but now it's fine. I've dropped my phone quite a bit and my phone has come out fine. I have a tempered glass screen protector on it and I'm pretty sure that's what saved my phone. I don't think this case would have protected it. I'm not sure how well it would have protected the camera on the back of the phone. It is a little bit loose and I've had it come off a few times. I haven't had it fall off yet though. I would recommend this case. It's a great price for a cute case that gives you some protection. I would recommend

## Parse the data

In [6]:
# Regex source for each field of a generated record. A record has the form
#   "System prompt : ... | Product Title: ... | ... | Review Title: ... | Review: ..."
# Every field except the last is terminated by " |"; "Review" takes the rest of the text.
patterns = {
    "System prompt": r"System prompt : (.*?) \|",
    "Product Title": r"Product Title: (.*?) \|",
    "Product Category": r"Product Category: (.*?) \|",
    "Review Rating": r"Review Rating: (\d+) \|",
    "Review Title": r"Review Title: (.*?) \|",
    "Review": r"Review: (.*)"
}

# Compiled once instead of on every call. re.DOTALL lets "." match newlines so
# that a multi-line value (typically the review body) is captured in full
# rather than being cut at the first line break or not matched at all.
_compiled_patterns = {
    key: re.compile(pattern, re.DOTALL) for key, pattern in patterns.items()
}


def extract_fields(text):
    """Split one generated record into its named fields.

    Parameters
    ----------
    text : str
        Raw generated text containing the " | "-separated fields.

    Returns
    -------
    dict
        One entry per key in ``patterns``. The value is the captured string
        (the rating is returned as text, not as a number), or ``None`` when
        the field is not found. If ``text`` is not a string (for example a
        missing value), every field is ``None``.
    """
    # Missing or non-text input cannot be searched; report all fields as absent.
    if not isinstance(text, str):
        return {key: None for key in _compiled_patterns}

    fields = {}
    for key, regex in _compiled_patterns.items():
        # Search once per field and reuse the match object.
        match = regex.search(text)
        fields[key] = match.group(1) if match else None
    return fields

In [7]:
# Parse every raw record into its fields, one column per field, keeping the
# row index of `raw_data` so the result lines up with the original frame.
# The frame is built from the list of per-row dicts in one call;
# `.apply(pd.Series)` would construct a Series per row, which is very slow.
extracted_data = pd.DataFrame(
    raw_data["generated_text"].map(extract_fields).tolist(),
    index=raw_data.index,
)

In [8]:
# Place the parsed field columns next to the original columns, aligned on the row index.
data = pd.concat(objs=[raw_data, extracted_data], axis="columns")


## Analysis and Saving

In [9]:
# Preview the first row, now including the parsed field columns.
data.head(1)

Unnamed: 0,generated_text,System prompt,Product Title,Product Category,Review Rating,Review Title,Review
0,"System prompt : Given the Product Title, Produ...","Given the Product Title, Product Category, Rev...","Case for Galaxy Note 9,Cutebe Shockproof Serie...",Cell Phones & Accessories,4,Not a bad price for protection and cuteness,Looks and works great. It was a little little ...


In [10]:
# Total number of records available for sampling.
len(data)

10000

In [11]:
# Draw a 5% random sample for the quality checks; the fixed seed makes the
# same rows come back on every run.
sample_data = data.sample(random_state=42, frac=0.05)

In [12]:
# Number of rows in the sample.
len(sample_data)

500

In [13]:
# Quality checks on the sample.
# Check 1: number of missing values in each column (all columns are counted).
sample_data.isnull().sum()

Unnamed: 0,0
generated_text,0
System prompt,0
Product Title,0
Product Category,0
Review Rating,0
Review Title,2
Review,0


In [14]:
# 2. Count rows in the sample that are flagged as duplicates of another row.
sample_data.duplicated().sum()

0

In [15]:
# 3. Inspect the dtype of each column. The parsed fields come from regex
#    captures, so they are text; "Review Rating" needs an explicit cast before
#    any numeric analysis.
sample_data.dtypes

Unnamed: 0,0
generated_text,object
System prompt,object
Product Title,object
Product Category,object
Review Rating,object
Review Title,object
Review,object


In [16]:
# Summary statistics per column of the sample.
sample_data.describe()

Unnamed: 0,generated_text,System prompt,Product Title,Product Category,Review Rating,Review Title,Review
count,500,500,500,500,500,498,500
unique,500,1,485,16,5,450,500
top,"System prompt : Given the Product Title, Produ...","Given the Product Title, Product Category, Rev...",Anker Portable Charger PowerCore 20100mAh - Ul...,Cell Phones & Accessories,5,Five Stars,I really like this phone case that's the only ...
freq,1,500,3,415,309,18,1


In [17]:
# Write the sampled rows to the export CSV (without the index column) and
# report where the file went.
sample_data.to_csv(path_or_buf=EXPORT_FILE_PATH, index=False)
print(f"Sample data saved to {EXPORT_FILE_PATH}")

Sample data saved to sample_data_for_quality_check.csv
