In [None]:
!pip install tqdm pandas

In [None]:
import json
import glob, os
import pandas as pd
from pathlib import Path
from tqdm import tqdm

## Extract the conbini visits from Google Takeout data to a single JSON file

In [None]:
# Root folder of the unpacked Google Takeout export; edit this to point elsewhere.
GOOGLE_TAKEOUT_DATA_DIRECTORY = "./Takeout"

# Recursively gather the JSON files that sit one directory level below any
# "Semantic Location History" folder found under the root.
takeout_root = Path(GOOGLE_TAKEOUT_DATA_DIRECTORY)
files = [*takeout_root.rglob("Semantic Location History/*/*.json")]
print(len(files))

In [None]:
# Substrings that identify a convenience-store visit. Kept lowercase because
# they are matched against lowercased place names.
SEARCH_STRINGS = [
    "7-eleven",
    "familymart",
    "lawson",
]

In [None]:
# Collect every place visit whose location name contains one of the
# SEARCH_STRINGS (case-insensitive substring match).
conbinis_visits = []

for file in tqdm(files):
    # Open inside a context manager so the handle is closed promptly, and pin
    # the encoding so non-ASCII place names decode identically on every
    # platform instead of depending on the locale default.
    with open(file, encoding="utf-8") as f:
        data: list[dict] = json.load(f)["timelineObjects"]
    for d in data:
        clean_visit = d.get("placeVisit")
        if not clean_visit:
            continue
        # A visit with no location or no name cannot match any search string;
        # the empty-string default makes it fall through without a KeyError.
        name = clean_visit.get("location", {}).get("name", "").lower()
        # any() records the visit once even when several search strings match.
        if any(element in name for element in SEARCH_STRINGS):
            # Drop the alternative location candidates before storing the visit.
            clean_visit.pop("otherCandidateLocations", None)
            conbinis_visits.append(clean_visit)

### Save as a JSON file

In [None]:
print(len(conbinis_visits))

with open("conbinis_visits.json", "w") as f:
    json.dump(conbinis_visits, f, indent=2, ensure_ascii=False)

## Use the JSON to analyze data

### Load the JSON file as Pandas dataframe

In [None]:
with open('conbinis_visits.json', 'r') as f:
    data = json.load(f)

df = pd.json_normalize(data)
df['startDatetime'] = pd.to_datetime(df['duration.startTimestamp'])
df['endDatetime'] = pd.to_datetime(df['duration.endTimestamp'])
df.info()

### Number of conbini visits

In [None]:
# One row per recorded visit, so the row count is the number of visits.
df.shape[0]

### Number of unique conbinis visited

In [None]:
# Count distinct place IDs, ignoring rows where the ID is missing.
len(df['location.placeId'].dropna().unique())

### Earliest conbini visit

In [None]:
# idxmin() returns an index *label*, so look the row up with label-based .loc
# rather than positional .iloc (the two only coincide on a default 0..n-1 index).
df.loc[df['startDatetime'].idxmin(), ['location.name', 'location.address', 'startDatetime']]

### Top 3 most visited conbinis

In [None]:
# One row per distinct place (first occurrence kept), annotated with how many
# visits that place ID has in df, ordered from most to least visited.
df2 = (
    df[['location.placeId', 'location.name', 'location.address']]
    .drop_duplicates(subset=["location.placeId"], keep="first")
    .assign(
        visitsNb=lambda places: places['location.placeId'].map(
            df['location.placeId'].value_counts()
        )
    )
    .sort_values(by='visitsNb', ascending=False)
)

df2.head(3)[['location.name', 'location.address', 'visitsNb']]