# Google Play Digital Health Apps Reviews - CSV to JSON Conversion

This notebook processes the Google Play digital health apps reviews CSV file to create a JSON file with the following format:
```
{ 
  app_title1: [[score1, clean_content1], [score2, clean_content2], [score3, clean_content3], ...],
  app_title2: [[score1, clean_content1], [score2, clean_content2], [score3, clean_content3], ...]
}
```

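We'll only keep reviews whose `clean_content` contains more than 3 spaces (i.e., roughly five or more words). Rows with a missing `app_title`, `score`, or `clean_content` are dropped before this filter is applied.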

In [2]:
# Import necessary libraries
import pandas as pd
import json
import os

## 1. Load and Examine the CSV File

In [3]:
# Location of the interim reviews CSV, relative to this notebook
file_path = '../data/interim/google_play_digital_health_apps_reviews.csv'

# Fail fast with a clear message when the input file is missing
csv_is_present = os.path.exists(file_path)
if not csv_is_present:
    raise FileNotFoundError(f"File not found: {file_path}")

# Parse the CSV into a DataFrame
df = pd.read_csv(file_path)

# Summarise the dataset: row count, column names, then a preview of the first rows
row_total = df.shape[0]
column_names = list(df.columns)
print(f"Total number of rows: {row_total}")
print(f"Columns: {column_names}")
df.head()

Total number of rows: 6612
Columns: ['reviewId', 'userName', 'userImage', 'content', 'score', 'thumbsUpCount', 'reviewCreatedVersion', 'at', 'replyContent', 'repliedAt', 'appVersion', 'app_id', 'app_title', 'year', 'month', 'yearmonth', 'review_length', 'sentiment', 'clean_content']


Unnamed: 0,reviewId,userName,userImage,content,score,thumbsUpCount,reviewCreatedVersion,at,replyContent,repliedAt,appVersion,app_id,app_title,year,month,yearmonth,review_length,sentiment,clean_content
0,233306d7-3824-4bee-ad33-75503e313e6b,Sherry Northington,https://play-lh.googleusercontent.com/a/ACg8oc...,App doesn't properly graph readings. It puts a...,1,0,4.2.2(4),2025-04-05 22:13:01,,,4.2.2(4),com.ochsner.digitalmedicine,Digital Medicine,2025,4,2025-04,496,Negative,app doesnt properly graph readings puts readin...
1,d9d27d40-472b-4627-9073-fe9b8179e8f5,Teresa Weimer,https://play-lh.googleusercontent.com/a-/ALV-U...,👍 good,4,0,4.2.1(5),2025-04-02 15:34:02,,,4.2.1(5),com.ochsner.digitalmedicine,Digital Medicine,2025,4,2025-04,6,Positive,good
2,21086650-4b58-40bc-a4f2-62c1a3a023bc,Josiclyn Ortego,https://play-lh.googleusercontent.com/a/ACg8oc...,love it,5,0,4.2.1(5),2025-03-31 11:19:57,,,4.2.1(5),com.ochsner.digitalmedicine,Digital Medicine,2025,3,2025-03,7,Positive,love
3,cc0b6f1e-7c59-44fe-9ff5-07eb946e8be9,Mary Jenkins,https://play-lh.googleusercontent.com/a/ACg8oc...,Great,5,0,4.1.0(6),2025-03-12 15:57:13,,,4.1.0(6),com.ochsner.digitalmedicine,Digital Medicine,2025,3,2025-03,5,Positive,great
4,8302a08d-3830-419a-b862-7b8892b3e9b0,John Ware,https://play-lh.googleusercontent.com/a/ACg8oc...,Works as expected.,5,0,,2025-03-09 10:44:07,,,,com.ochsner.digitalmedicine,Digital Medicine,2025,3,2025-03,18,Positive,works expected


## 2. Filter and Process the Data

In [4]:
# Reviews must contain MORE than this many spaces in clean_content to be kept;
# very short reviews ("good", "love it") carry little usable text.
MIN_SPACES = 3

# Keep only the required columns and remove any rows with missing values.
# A new name is used for this stage so the unfiltered frame stays inspectable.
df_complete = df[['app_title', 'score', 'clean_content']].dropna()

# Count initial number of rows
initial_count = len(df_complete)
print(f"Number of rows after removing NaN values: {initial_count}")

# Filter rows where clean_content has more than MIN_SPACES spaces
has_enough_spaces = df_complete['clean_content'].str.count(' ') > MIN_SPACES
df_filtered = df_complete[has_enough_spaces]

# Count final number of rows
final_count = len(df_filtered)
removed_count = initial_count - final_count
# Guard the ratio: with no rows left after dropna the division would raise
# ZeroDivisionError, so report 0% removed instead.
removed_share = removed_count / initial_count if initial_count else 0.0
print(f"Number of rows after filtering for >{MIN_SPACES} spaces: {final_count}")
print(f"Removed {removed_count} rows ({removed_share:.2%} of data)")

# Show sample of filtered data
df_filtered.head()

Number of rows after removing NaN values: 6578
Number of rows after filtering for >3 spaces: 4047
Removed 2531 rows (38.48% of data)


Unnamed: 0,app_title,score,clean_content
0,Digital Medicine,1,app doesnt properly graph readings puts readin...
6,Digital Medicine,5,everyone nice professional caring procedure co...
8,Digital Medicine,1,working tmobile revvl opening tried deleting a...
10,Digital Medicine,1,second page said allow bluetooth connect would...
11,Digital Medicine,5,traveling use new application blood pressure c...


## 3. Create Dictionary and Convert to JSON

In [6]:
# Convert the filtered data to the JSON-ready mapping
#   {app_title: [[score, clean_content], [score, clean_content], ...]}
#
# groupby yields every app_title exactly once, so the dictionary can be built
# directly without a membership check. The two columns are read column-wise
# with .tolist() instead of row-by-row with iterrows(): it is much faster and
# yields plain Python scalars, which json.dump can serialize.
app_reviews = {
    app_title: [
        [score, content]
        for score, content in zip(group['score'].tolist(), group['clean_content'].tolist())
    ]
    for app_title, group in df_filtered.groupby('app_title')
}

# Print the number of unique apps
print(f"Number of unique apps: {len(app_reviews)}")

# Display stats about number of reviews per app
review_counts = {app: len(reviews) for app, reviews in app_reviews.items()}
review_count_series = pd.Series(review_counts)

print("\nStats on reviews per app:")
print(f"Min: {review_count_series.min()}")
print(f"Max: {review_count_series.max()}")
print(f"Mean: {review_count_series.mean():.2f}")
print(f"Median: {review_count_series.median()}")

# Show sample of the first 3 reviews for the first 3 apps
sample_dict = {app: reviews[:3] for app, reviews in list(app_reviews.items())[:3]}

sample_dict

Number of unique apps: 14

Stats on reviews per app:
Min: 5
Max: 804
Mean: 289.07
Median: 113.5


{'Ada – check your health': [[5,
   'great work lead treatmenti still use great app'],
  [5,
   'already idea symptoms well dealing app asked good questions gave exact answer suspected symptoms already great app'],
  [5, 'best app know learn medicine diagnosis etc']],
 'Calcium Digital Health AI': [[5,
   'love monitor health goals keep track parents medications app straightforward helpful'],
  [5,
   'great app tracking fitness familys health easy use keeps everything one place highly recommend'],
  [5,
   'fantastic job tracking health metrics comprehensively streamlined userfriendly making daily health management less chore']],
 'Digital Healthcare Solutions': [[1,
   'app would fine didnt uninstall reinstall every time used new update fix'],
  [5,
   'allows doctor know steps take help stay healthy quicker diagnose better'],
  [5,
   'app wonderful way keeping doctor informed condition day day basis instead trying condense monthsmonths worth symptoms visit']]}

## 4. Save the Result to JSON File

In [None]:
# Define the output file name
output_file = '../data/processed/google_play_reviews.json'

# Make sure the destination directory exists; on a fresh checkout the
# processed/ folder may be missing and open(..., 'w') would fail.
output_dir = os.path.dirname(output_file)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

# Write the dictionary to a JSON file.
# ensure_ascii=False keeps non-ASCII characters (e.g. the dash in some app
# titles) readable in the file instead of \u-escaping them.
with open(output_file, 'w', encoding='utf-8') as json_file:
    json.dump(app_reviews, json_file, ensure_ascii=False, indent=2)

# Check the file size
file_size = os.path.getsize(output_file) / (1024 * 1024)  # Convert to MB
print(f"JSON file saved as '{output_file}' ({file_size:.2f} MB)")

JSON file saved as '../data/processed/app_reviews.json' (0.55 MB)


## 5. Verify the JSON Format

In [8]:
# Read the JSON file back to verify its format
with open(output_file, 'r', encoding='utf-8') as json_file:
    loaded_data = json.load(json_file)

# Check if the structure matches what we expect
print(f"Loaded JSON has {len(loaded_data)} app titles")
print(f"Format verification: Each value is a list? {all(isinstance(v, list) for v in loaded_data.values())}")

# Display the first app and its first review.
# next(..., None) avoids a StopIteration when the file holds no apps at all.
first_app = next(iter(loaded_data), None)
if first_app is None:
    print("\nNo apps found in the JSON file")
else:
    print(f"\nSample - First app: {first_app}")
    if loaded_data[first_app]:
        # Each review is a [score, clean_content] pair; truncate the text,
        # not the pair (slicing the pair would never shorten anything).
        first_score, first_text = loaded_data[first_app][0]
        preview = first_text[:100] + ('...' if len(first_text) > 100 else '')
        print(f"First review (score {first_score}): {preview}")

Loaded JSON has 14 app titles
Format verification: Each value is a list? True

Sample - First app: Ada – check your health
First review: [5, 'great work lead treatmenti still use great app']...
