In [13]:
import csv
import json
import os

def aggregate_and_convert_to_nosql(csv_files, json_folder):
    """Merge rows from several CSV files into one ``users.json`` document list.

    Every row that carries a non-empty ``display_name`` value is folded into a
    single dictionary keyed by that display name. Columns are merged with
    ``dict.update``, so a column that appears in several files keeps the value
    from the row processed last. Rows whose ``display_name`` is missing or
    empty are skipped (files without that column contribute nothing).

    NOTE(review): this notebook defines another function with the same name
    (the posts aggregation); whichever cell ran last is the one in scope.
    NOTE(review): ``display_name`` is used as the unique key; rows sharing a
    display name are merged into one record -- confirm that is intended.

    Args:
        csv_files: Iterable of paths to CSV files that start with a header row.
        json_folder: Directory that receives ``users.json``; created if absent.

    Returns:
        None. The result is written to ``<json_folder>/users.json``.

    Raises:
        OSError: If a CSV file cannot be opened or the JSON file cannot be
            written.
    """
    os.makedirs(json_folder, exist_ok=True)

    # display_name -> merged record for that user
    users_data = {}

    for csv_path in csv_files:
        # newline='' hands raw line endings to the csv module, so newlines
        # embedded in quoted fields survive unchanged (see csv module docs).
        with open(csv_path, 'r', encoding='utf-8', newline='') as csv_handle:
            for row in csv.DictReader(csv_handle):
                user_id = row.get('display_name')

                # None: the file has no such column (or the row is short);
                # '': the cell is empty. Neither identifies a user.
                if not user_id:
                    continue

                record = users_data.setdefault(user_id, {'display_name': user_id})
                record.update(row)

    users_json_path = os.path.join(json_folder, 'users.json')
    with open(users_json_path, 'w', encoding='utf-8') as users_json_file:
        json.dump(list(users_data.values()), users_json_file, indent=2)

# Input CSV files (one per table) and the directory that receives the JSON output
csv_files = [
    'badges.csv',
    'post_history.csv',
    'post_links.csv',
    'posts_answers.csv',
    'posts_moderator_nomination.csv',
    'posts_orphaned_tag_wiki.csv',
    'posts_privilege_wiki.csv',
    'posts_questions.csv',
    'posts_tag_wiki.csv',
    'stackoverflow_posts.csv',
    'tags.csv',
    'users.csv',
    'votes.csv',
]
json_folder = 'path/to/json_output3'

# Run the aggregation over every listed file; the result lands in json_folder
aggregate_and_convert_to_nosql(csv_files, json_folder)


In [12]:
import csv
import json
import os

def _to_int(value, default=0):
    """Return ``value`` parsed as an int, or ``default`` if missing or non-numeric."""
    try:
        return int(value)
    except (TypeError, ValueError):
        # TypeError: value is None (absent column / short row);
        # ValueError: empty or non-numeric text.
        return default


def aggregate_and_convert_to_nosql(csv_files, json_folder):
    """Aggregate post rows from several CSV files into ``posts.json``.

    Rows that lack an ``id`` or a ``post_type_id`` value are skipped. The first
    row seen for a given ``id`` defines that post's fields; later rows with the
    same ``id`` do not overwrite them. Integer fields that are missing, empty
    or not numeric are stored as ``0`` instead of aborting the whole run.

    Rows read from a file whose base name is ``posts_answers`` additionally
    append an entry to the ``answers`` list of the post with the same ``id``.

    NOTE(review): ``answer_id`` is read from a column of that name; if the
    CSV has no such column every answer gets ``''`` -- confirm the schema.
    NOTE(review): this notebook defines another function with the same name
    (the users aggregation); whichever cell ran last is the one in scope.

    Args:
        csv_files: Iterable of paths to CSV files that start with a header row.
        json_folder: Directory that receives ``posts.json``; created if absent.

    Returns:
        None. The result is written to ``<json_folder>/posts.json``.

    Raises:
        OSError: If a CSV file cannot be opened or the JSON file cannot be
            written.
    """
    os.makedirs(json_folder, exist_ok=True)

    # post id -> aggregated post document
    aggregated_posts = {}

    for csv_path in csv_files:
        # The file's base name (without extension) identifies the source table
        table_name = os.path.splitext(os.path.basename(csv_path))[0]

        # newline='' hands raw line endings to the csv module, so newlines
        # embedded in quoted fields (e.g. post bodies) survive unchanged.
        with open(csv_path, 'r', encoding='utf-8', newline='') as csv_handle:
            for row in csv.DictReader(csv_handle):
                post_id = row.get('id', '')

                # Skip rows that cannot be identified as a post
                if not post_id or not row.get('post_type_id'):
                    continue

                if post_id not in aggregated_posts:
                    aggregated_posts[post_id] = {
                        'post_id': post_id,
                        'title': row.get('title', ''),
                        'body': row.get('body', ''),
                        'accepted_answer_id': row.get('accepted_answer_id', ''),
                        'answer_count': _to_int(row.get('answer_count')),
                        'comment_count': _to_int(row.get('comment_count')),
                        'community_owned_date': row.get('community_owned_date', ''),
                        'creation_date': row.get('creation_date', ''),
                        'favorite_count': _to_int(row.get('favorite_count')),
                        'last_activity_date': row.get('last_activity_date', ''),
                        'last_edit_date': row.get('last_edit_date', ''),
                        'last_editor_display_name': row.get('last_editor_display_name', ''),
                        'last_editor_user_id': row.get('last_editor_user_id', ''),
                        'owner_display_name': row.get('owner_display_name', ''),
                        'owner_user_id': row.get('owner_user_id', ''),
                        'parent_id': row.get('parent_id', ''),
                        'post_type_id': _to_int(row.get('post_type_id')),
                        'score': _to_int(row.get('score')),
                        'tags': row.get('tags', ''),
                        'view_count': _to_int(row.get('view_count')),
                        'answers': []
                    }

                # Rows from the answers table also add an entry to 'answers'
                if table_name == 'posts_answers':
                    answer = {
                        'answer_id': row.get('answer_id', ''),
                        'answer_body': row.get('body', ''),
                        'answer_user_id': row.get('owner_user_id', '')
                    }
                    aggregated_posts[post_id]['answers'].append(answer)

    json_file_path = os.path.join(json_folder, 'posts.json')
    with open(json_file_path, 'w', encoding='utf-8') as json_file:
        json.dump(list(aggregated_posts.values()), json_file, indent=2)

# Input CSV files (one per table) and the directory that receives the JSON output
csv_files = [
    'badges.csv',
    'post_history.csv',
    'post_links.csv',
    'posts_answers.csv',
    'posts_moderator_nomination.csv',
    'posts_orphaned_tag_wiki.csv',
    'posts_privilege_wiki.csv',
    'posts_questions.csv',
    'posts_tag_wiki.csv',
    'stackoverflow_posts.csv',
    'tags.csv',
    'users.csv',
    'votes.csv',
]
json_folder = 'path/to/json_output3'

# Run the aggregation over every listed file; the result lands in json_folder
aggregate_and_convert_to_nosql(csv_files, json_folder)
