In [1]:
import json
from pathlib import Path
import pandas as pd

# Load the discussion-thread export that lives one directory above the notebook
# and flatten it into a DataFrame; nested objects become dotted column names
# (for example "user.name").
path = Path("..", "Python Coding for Public Policy, Spring 2024 discussion threads.json")

# A context manager closes the file handle as soon as parsing finishes, and the
# explicit UTF-8 encoding avoids depending on the platform's default codec.
with path.open(encoding="utf-8") as export_file:
    data = json.load(export_file)

threads = pd.json_normalize(data)
# threads

In [2]:
# Print a summary of the threads frame: row count, plus each column's non-null count and dtype.
threads.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35 entries, 0 to 34
Data columns (total 21 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   url             35 non-null     object
 1   type            35 non-null     object
 2   number          35 non-null     int64 
 3   title           35 non-null     object
 4   category        35 non-null     object
 5   subcategory     35 non-null     object
 6   subsubcategory  35 non-null     object
 7   votes           35 non-null     int64 
 8   views           35 non-null     int64 
 9   unique_views    35 non-null     int64 
 10  private         35 non-null     bool  
 11  anonymous       35 non-null     bool  
 12  endorsed        35 non-null     bool  
 13  created_at      35 non-null     object
 14  text            35 non-null     object
 15  document        35 non-null     object
 16  comments        35 non-null     object
 17  user.name       35 non-null     object
 18  user.email  

In [3]:
# Give every element of each thread's "comments" value its own row, discard the
# missing entries, then flatten the remaining records into columns.
comment_records = threads["comments"].explode().dropna()
comments = pd.json_normalize(comment_records)
# comments

In [4]:
# Give every element of each thread's "answers" value its own row, discard the
# missing entries, then flatten the remaining records into columns.
answer_records = threads["answers"].explode().dropna()
replies = pd.json_normalize(answer_records)
# replies

In [5]:
# Stack threads, comments and replies into a single frame of posts.
# reset_index() gives the combined frame a fresh 0..n-1 index and keeps each
# row's previous index label in a new "index" column.
post_frames = [threads, comments, replies]
posts = (
    pd.concat(post_frames)
    .reset_index()
)
# posts

In [6]:
# Parse the "created_at" strings into datetimes.
# utc=True converts every value to timezone-aware UTC, so the column gets a
# single datetime dtype even if individual timestamps carry different UTC
# offsets. The later cells rely on that: they compare against a UTC-aware cutoff
# and call .dt.tz_convert(), both of which need a timezone-aware datetime column.
posts["created_at"] = pd.to_datetime(posts["created_at"], utc=True)
# posts["created_at"]

In [7]:
from datetime import datetime, timedelta
import pytz

# Cutoff for "recent": two weeks before the moment this cell runs (UTC-aware).
two_weeks_ago = datetime.now(pytz.UTC) - timedelta(weeks=2)

# Keep posts whose author role is not "admin" (i.e. exclude the instructors)
# and that were created on or after the cutoff, newest first.
not_instructor = posts["user.role"] != "admin"
is_recent = posts["created_at"] >= two_weeks_ago
filtered_posts = posts[not_instructor & is_recent].sort_values(
    "created_at", ascending=False
)

# filtered_posts

In [None]:
from html import escape as escape_html

from IPython.display import HTML


def _escape_cell(value):
    """HTML-escape string cells; return non-string values (e.g. missing data) unchanged."""
    return escape_html(value, quote=False) if isinstance(value, str) else value


def _link_cell(url):
    """Return an "Open" anchor pointing at *url*, or an empty cell when the URL is missing."""
    if pd.isna(url):
        return ""
    return f'<a href="{escape_html(str(url))}">Open</a>'


# Select the report columns first and take an explicit copy, so the column
# assignments below act on an independent frame rather than on a slice
# (which can raise SettingWithCopyWarning).
output = filtered_posts[[
    "user.name",
    "created_at",
    "url",
    # "title",
    "text",
]].copy()

# Show only the calendar date of each post, expressed in US Eastern time.
output["created_at"] = output["created_at"].dt.tz_convert("America/New_York").dt.date

# The Styler writes cell values into the HTML verbatim, so user-authored strings
# must be escaped here; otherwise characters such as "<" or "&" in a post would
# be interpreted as markup and corrupt the rendered table.
output["user.name"] = output["user.name"].map(_escape_cell)
output["text"] = output["text"].map(_escape_cell)

# https://stackoverflow.com/a/20043785/358804
output["url"] = output["url"].map(_link_cell)

# https://stackoverflow.com/a/56881411/358804
styled = output.style.set_properties(**{
    'text-align': 'left',
    'white-space': 'pre-wrap',
})

# No escape=False here: that is an option of DataFrame.to_html(), not of
# Styler.to_html(), which does not escape cell contents in the first place.
HTML(styled.to_html())