In [21]:
import json
import pandas as pd
from datetime import datetime
import re
import os

In [22]:
def get_filename(path):
    """Return the last component of ``path`` with its final extension removed.

    Only the last extension is stripped, so ``"archive.tar.gz"`` becomes
    ``"archive.tar"``. Directory components are discarded.
    """
    base_name = os.path.basename(path)
    stem, _extension = os.path.splitext(base_name)
    return stem

In [23]:
# filename = "../../data/raw/facebook_grouped.json"
def get_json_data(file_path):
    """Load and return the parsed JSON document stored at ``file_path``.

    The file is decoded as UTF-8 explicitly instead of relying on the
    platform's default encoding, so non-ASCII text is read the same way on
    every operating system.

    Args:
        file_path: Path to the JSON file to read.

    Returns:
        The deserialized JSON content (whatever ``json.load`` produces).

    Raises:
        OSError: If the file cannot be opened (e.g. ``FileNotFoundError``).
        json.JSONDecodeError: If the content is not valid JSON.
    """
    with open(file_path, "r", encoding="utf-8") as file:
        return json.load(file)

In [24]:
def clean_text(text):
    """Normalize a piece of free text.

    Steps, in order:
      1. Lowercase the text.
      2. Remove every run of non-whitespace characters that starts with
         ``http`` (this covers ``http://`` and ``https://`` links).
      3. Remove every character that is neither a word character nor
         whitespace (underscores are word characters and are kept).

    Whitespace is not collapsed, so removing a link can leave consecutive
    spaces behind.

    Args:
        text: The string to clean, or ``None``.

    Returns:
        The cleaned string; ``''`` when ``text`` is ``None``.
    """
    # A JSON ``null`` arrives here as None; treat it as empty text instead of
    # failing on ``None.lower()``.
    if text is None:
        return ''

    text = text.lower()
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'[^\w\s]', '', text)
    return text

In [32]:
def create_preprocessed_json(file_path, output_root='../../data/processed'):
    """Flatten a month-grouped JSON export into per-month post and comment lists.

    The input is read with ``get_json_data`` and iterated as a sequence of
    groups, each providing a ``"month"`` key and a ``"posts"`` list. For every
    post a reduced record is kept (month, cleaned title/body, date, time,
    score, num_comments, flair); for every comment of that post a reduced
    record is kept (cleaned body, score, date, time). Identifiers, authors and
    URLs are not written to the output.

    Two files are written once, after all months have been processed, into
    ``<output_root>/<input file stem>_preprocessed/``:

    * ``posts_by_month.json``    -- ``{month: [post_record, ...]}``
    * ``comments_by_month.json`` -- ``{month: [comment_record, ...]}``

    Because writing happens only after the whole input was processed, a
    failure while processing leaves no partially written output, and an empty
    input still produces two files containing ``{}``.

    Args:
        file_path: Path to the grouped JSON file.
        output_root: Directory under which the ``*_preprocessed`` folder is
            created. Defaults to the location the notebook has always used.

    Returns:
        None.

    Raises:
        KeyError: If a group, post or comment lacks a required key.
        ValueError: If a post's ``created_utc`` does not match
            ``%Y-%m-%dT%H:%M:%S``.
        OSError: If the input cannot be read or the output cannot be written.
    """
    filename = get_filename(file_path)
    data = get_json_data(file_path)

    # Dictionaries to store the posts and comments by month.
    posts_by_month = {}
    comments_by_month = {}

    for month_group in data:
        month = month_group["month"]
        posts = month_group["posts"]

        # setdefault keeps records from an earlier group with the same month.
        month_posts = posts_by_month.setdefault(month, [])
        month_comments = comments_by_month.setdefault(month, [])

        for post in posts:
            # Split the timestamp into separate date and time strings.
            dt_object = datetime.strptime(post['created_utc'], "%Y-%m-%dT%H:%M:%S")
            date_part = dt_object.date().isoformat()
            time_part = dt_object.strftime("%H:%M:%S")

            month_posts.append({
                'month': month,
                'title': clean_text(post['title']),
                'date': date_part,
                'time': time_part,
                'score': post['score'],
                'num_comments': post['num_comments'],
                'flair': post['flair'],
                'body': clean_text(post.get('body', '')),
            })

            # NOTE(review): comments are stamped with the parent post's
            # date/time, not a timestamp of their own -- confirm this is intended.
            for comment in post["comments"]:
                month_comments.append({
                    'comment_body': clean_text(comment['body']),
                    'comment_score': comment['score'],
                    'date': date_part,
                    'time': time_part,
                })

    # Save the preprocessed data once, after every month has been collected
    # (previously both files were rewritten in full on each loop iteration).
    processed_dir = os.path.join(output_root, f'{filename}_preprocessed')
    os.makedirs(processed_dir, exist_ok=True)
    with open(os.path.join(processed_dir, 'posts_by_month.json'), 'w', encoding='utf-8') as f:
        json.dump(posts_by_month, f, indent=2)
    with open(os.path.join(processed_dir, 'comments_by_month.json'), 'w', encoding='utf-8') as f:
        json.dump(comments_by_month, f, indent=2)

In [33]:
# Preprocess the raw grouped export. For this input the results are written to
# ../../data/processed/facebook_grouped_preprocessed/ as posts_by_month.json
# and comments_by_month.json.
create_preprocessed_json("../../data/raw/facebook_grouped.json")