In [6]:
import csv
import json
import os

def aggregate_and_convert_to_nosql(csv_files, json_folder):
    """Convert each CSV file into its own JSON file of nested documents.

    Every row is reshaped into a user/badges/posts/comments document and
    stored as ``{row id: {table name: document}}``. For each input file a
    ``<table name>.json`` file is written containing the list of those
    per-id mappings. When an id occurs more than once in a file, the last
    row wins.

    Args:
        csv_files: Iterable of CSV file paths. Every file needs an ``id``
            column.
        json_folder: Output directory; created if it does not exist.

    Raises:
        KeyError: If a row has no ``id`` column.
        OSError: If a CSV file cannot be read or a JSON file cannot be
            written.
    """
    os.makedirs(json_folder, exist_ok=True)

    for csv_path in csv_files:
        # The table name is the file name without directory and extension.
        table_name = os.path.splitext(os.path.basename(csv_path))[0]

        # Documents of this table only: {row id: {table name: document}}.
        aggregated_data = {}

        # newline='' hands line splitting to the csv module, so line breaks
        # inside quoted fields reach the output unchanged.
        with open(csv_path, 'r', encoding='utf-8', newline='') as csv_handle:
            for row in csv.DictReader(csv_handle):
                unique_id = row['id']

                # A '*_column' cell, when present, names the real column to
                # read; otherwise the default column name is used.
                username_column = row.get('user_name_column', 'username')
                email_column = row.get('email_address_column', 'email')

                # The remaining '*_column' keys are template placeholders
                # read as literal column names, so those fields keep their
                # defaults unless the CSV really has such columns.
                document = {
                    'id': unique_id,
                    "username": row.get(username_column, ''),
                    "email": row.get(email_column, ''),
                    "reputation": row.get('reputation_column', 0),
                    "creation_date": row.get('creation_date_column', ''),
                    "last_access_date": row.get('last_access_date_column', ''),
                    "location": row.get('location_column', ''),
                    "badges": {
                        "gold": row.get('gold_badges_column', 0),
                        "silver": row.get('silver_badges_column', 0),
                        "bronze": row.get('bronze_badges_column', 0),
                    },
                    "posts": [
                        {
                            "post_id": row.get(f'post_id_column_{slot}', ''),
                            "title": row.get(f'post_title_column_{slot}', ''),
                            "creation_date": row.get(f'post_creation_date_column_{slot}', ''),
                        }
                        for slot in (1, 2)
                    ],
                    "comments": [
                        {
                            "comment_id": row.get(f'comment_id_column_{slot}', ''),
                            "text": row.get(f'comment_text_column_{slot}', ''),
                            "creation_date": row.get(f'comment_creation_date_column_{slot}', ''),
                        }
                        for slot in (1, 2)
                    ],
                }
                aggregated_data.setdefault(unique_id, {})[table_name] = document

        # One output file per input table.
        json_file_path = os.path.join(json_folder, f'{table_name}.json')
        with open(json_file_path, 'w', encoding='utf-8') as json_file:
            json.dump(list(aggregated_data.values()), json_file, indent=2)

# Input tables (one CSV per table) and the directory receiving the JSON output
csv_files = [
    'badges.csv',
    'post_history.csv',
    'post_links.csv',
    'posts_answers.csv',
    'posts_moderator_nomination.csv',
    'posts_orphaned_tag_wiki.csv',
    'posts_privilege_wiki.csv',
    'posts_questions.csv',
    'posts_tag_wiki.csv',
    'stackoverflow_posts.csv',
    'tags.csv',
    'users.csv',
    'votes.csv',
]
json_folder = 'path/to/json_output'

# Run the conversion over every listed CSV file
aggregate_and_convert_to_nosql(csv_files=csv_files, json_folder=json_folder)


In [20]:
import os
import csv
import json

def aggregate_and_convert_to_nosql(csv_files, json_folder):
    """Merge all CSV files into a single ``aggregated_data.json`` file.

    Each row is reshaped into a user/badges/posts/comments document and
    stored as ``{row id: {table name: document}}`` in one mapping shared by
    all input files, so rows from different tables that carry the same
    ``id`` value land in the same entry. The list of entries is written to
    ``aggregated_data.json`` inside ``json_folder``.

    Args:
        csv_files: Iterable of CSV file paths. Every file needs an ``id``
            column.
        json_folder: Output directory; created if it does not exist.

    Raises:
        KeyError: If a row has no ``id`` column.
        OSError: If a CSV file cannot be read or the JSON file cannot be
            written.
    """
    os.makedirs(json_folder, exist_ok=True)

    # Shared across every input file: {row id: {table name: document}}.
    aggregated_data = {}

    for csv_path in csv_files:
        # The table name is the file name without directory and extension.
        table_name = os.path.splitext(os.path.basename(csv_path))[0]

        # newline='' hands line splitting to the csv module, so line breaks
        # inside quoted fields reach the output unchanged.
        with open(csv_path, 'r', encoding='utf-8', newline='') as csv_handle:
            for row in csv.DictReader(csv_handle):
                unique_id = row['id']

                # Columns a table does not have fall back to the defaults.
                document = {
                    'id': unique_id,
                    "username": row.get('displayname', ''),
                    "email": row.get('email_hash', ''),
                    "reputation": row.get('reputation', 0),
                    "creation_date": row.get('creation_date', ''),
                    "last_access_date": row.get('last_access_date', ''),
                    "location": row.get('location', ''),
                    "badges": {
                        "gold": row.get('gold_badges', 0),
                        "silver": row.get('silver_badges', 0),
                        "bronze": row.get('bronze_badges', 0),
                    },
                    "posts": [
                        {
                            "post_id": row.get('post_id', ''),
                            "title": row.get('title', ''),
                            "creation_date": row.get('creation_date_post', ''),
                        },
                    ],
                    "comments": [
                        {
                            "comment_id": row.get('comment_id', ''),
                            "text": row.get('text', ''),
                            "creation_date": row.get('creation_date_comment', ''),
                        },
                    ],
                    "additional_field_1": row.get('additional_field_1', ''),
                    "additional_field_2": row.get('additional_field_2', ''),
                }
                # A later row with the same id and table replaces the earlier.
                aggregated_data.setdefault(unique_id, {})[table_name] = document

    json_file_path = os.path.join(json_folder, 'aggregated_data.json')
    with open(json_file_path, 'w', encoding='utf-8') as json_file:
        json.dump(list(aggregated_data.values()), json_file, indent=2)

# Input tables (one CSV per table) and the directory receiving the JSON output
csv_files = [
    'badges.csv',
    'post_history.csv',
    'post_links.csv',
    'posts_answers.csv',
    'posts_moderator_nomination.csv',
    'posts_orphaned_tag_wiki.csv',
    'posts_privilege_wiki.csv',
    'posts_questions.csv',
    'posts_tag_wiki.csv',
    'stackoverflow_posts.csv',
    'tags.csv',
    'users.csv',
    'votes.csv',
]
json_folder = 'path/to/json_output'

# Run the conversion over every listed CSV file
aggregate_and_convert_to_nosql(csv_files=csv_files, json_folder=json_folder)


In [22]:
import csv
import json
import os

def _read_mapped(row, placeholder, default_column, fallback=''):
    """Return the cell of the column that ``placeholder`` maps to.

    If ``row`` has a ``placeholder`` column, its value is taken as the name
    of the column to read; otherwise ``default_column`` is read. ``fallback``
    is returned when the resolved column is absent.
    """
    return row.get(row.get(placeholder, default_column), fallback)


def aggregate_and_convert_to_nosql(csv_files, json_folder):
    """Convert each CSV file into its own JSON file of nested documents.

    Every row is reshaped into a user/badges/posts/comments document and
    stored as ``{row id: {table name: document}}``. For each input file a
    ``<table name>.json`` file is written with the list of those per-id
    mappings, and one summary line per table is printed.

    Args:
        csv_files: Iterable of CSV file paths. Every file needs an ``id``
            column.
        json_folder: Output directory; created if it does not exist.

    Raises:
        KeyError: If a row has no ``id`` column.
        OSError: If a CSV file cannot be read or a JSON file cannot be
            written.
    """
    os.makedirs(json_folder, exist_ok=True)

    for csv_path in csv_files:
        # The table name is the file name without directory and extension.
        table_name = os.path.splitext(os.path.basename(csv_path))[0]

        # Documents of this table only: {row id: {table name: document}}.
        aggregated_data = {}

        # newline='' hands line splitting to the csv module, so line breaks
        # inside quoted fields reach the output unchanged.
        with open(csv_path, 'r', encoding='utf-8', newline='') as csv_handle:
            for row in csv.DictReader(csv_handle):
                unique_id = row['id']

                document = {
                    'id': unique_id,
                    "username": _read_mapped(row, 'user_name_column', 'display_name'),
                    "email": _read_mapped(row, 'email_address_column', 'email'),
                    "reputation": _read_mapped(row, 'reputation_column', 'reputation', 0),
                    "creation_date": _read_mapped(row, 'creation_date_column', 'creation_date'),
                    "last_access_date": _read_mapped(row, 'last_access_date_column', 'last_access_date'),
                    "location": _read_mapped(row, 'location_column', 'location'),
                    "badges": {
                        "gold": _read_mapped(row, 'gold_badges_column', 'gold_badges', 0),
                        "silver": _read_mapped(row, 'silver_badges_column', 'silver_badges', 0),
                        "bronze": _read_mapped(row, 'bronze_badges_column', 'bronze_badges', 0),
                    },
                    "posts": [
                        {
                            "post_id": _read_mapped(row, f'post_id_column_{slot}', f'post_id_{slot}'),
                            "title": _read_mapped(row, f'post_title_column_{slot}', f'post_title_{slot}'),
                            "creation_date": _read_mapped(
                                row, f'post_creation_date_column_{slot}', f'post_creation_date_{slot}'),
                        }
                        for slot in (1, 2)
                    ],
                    "comments": [
                        {
                            "comment_id": _read_mapped(row, f'comment_id_column_{slot}', f'comment_id_{slot}'),
                            "text": _read_mapped(row, f'comment_text_column_{slot}', f'comment_text_{slot}'),
                            "creation_date": _read_mapped(
                                row, f'comment_creation_date_column_{slot}', f'comment_creation_date_{slot}'),
                        }
                        for slot in (1, 2)
                    ],
                }
                # A later row with the same id replaces the earlier one.
                aggregated_data.setdefault(unique_id, {})[table_name] = document

            # The reported figure is the number of distinct ids collected,
            # which equals the row count only when ids are unique.
            print(f"Table: {table_name}, Rows processed: {len(aggregated_data)}")

        # One output file per input table.
        json_file_path = os.path.join(json_folder, f'{table_name}.json')
        with open(json_file_path, 'w', encoding='utf-8') as json_file:
            json.dump(list(aggregated_data.values()), json_file, indent=2)

# Input tables (one CSV per table) and the directory receiving the JSON output
csv_files = [
    'badges.csv',
    'post_history.csv',
    'post_links.csv',
    'posts_answers.csv',
    'posts_moderator_nomination.csv',
    'posts_orphaned_tag_wiki.csv',
    'posts_privilege_wiki.csv',
    'posts_questions.csv',
    'posts_tag_wiki.csv',
    'stackoverflow_posts.csv',
    'tags.csv',
    'users.csv',
    'votes.csv',
]
json_folder = 'path/to/json_output'

# Run the conversion over every listed CSV file
aggregate_and_convert_to_nosql(csv_files=csv_files, json_folder=json_folder)


Table: badges, Rows processed: 500
Table: post_history, Rows processed: 500
Table: post_links, Rows processed: 500
Table: posts_answers, Rows processed: 500
Table: posts_moderator_nomination, Rows processed: 342
Table: posts_orphaned_tag_wiki, Rows processed: 167
Table: posts_privilege_wiki, Rows processed: 2
Table: posts_questions, Rows processed: 500
Table: posts_tag_wiki, Rows processed: 500
Table: stackoverflow_posts, Rows processed: 500
Table: tags, Rows processed: 500
Table: users, Rows processed: 500
Table: votes, Rows processed: 500


In [28]:
import csv
import json
import os

def aggregate_and_convert_to_nosql(csv_files, json_folder):
    """Aggregate CSV rows by ``user_id``/``post_id`` into four JSON files.

    Each row is keyed by its ``user_id`` column, or by ``post_id`` when there
    is no ``user_id`` column; rows with neither are skipped. Every keyed row
    is merged into ``users.json``. In addition, rows of the ``badges`` table
    are merged into ``badges.json``, rows of ``posts_questions`` into
    ``questions.json``, and rows of every other table are stored under the
    table name in ``other_data.json``.

    Args:
        csv_files: Iterable of CSV file paths.
        json_folder: Output directory; created if it does not exist.

    Raises:
        OSError: If a CSV file cannot be read or a JSON file cannot be
            written.
    """
    os.makedirs(json_folder, exist_ok=True)

    # Each mapping is {unique id: merged record}.
    users_data = {}
    badges_data = {}
    questions_data = {}
    other_data = {}

    for csv_path in csv_files:
        # The table name is the file name without directory and extension.
        table_name = os.path.splitext(os.path.basename(csv_path))[0]

        # newline='' hands line splitting to the csv module, so line breaks
        # inside quoted fields reach the output unchanged.
        with open(csv_path, 'r', encoding='utf-8', newline='') as csv_handle:
            for row in csv.DictReader(csv_handle):
                unique_id = row.get('user_id', row.get('post_id', None))
                if unique_id is None:
                    continue

                # Merge every row, including the first one seen for an id;
                # creating the entry must not discard that row's columns.
                # A CSV 'id' column, if present, overwrites the seeded 'id'.
                users_data.setdefault(unique_id, {'id': unique_id}).update(row)

                if table_name == 'badges':
                    badges_data.setdefault(unique_id, {'id': unique_id}).update(row)
                elif table_name == 'posts_questions':
                    questions_data.setdefault(unique_id, {'id': unique_id}).update(row)
                else:
                    # Other tables are nested under their table name; a later
                    # row of the same table replaces the earlier one.
                    other_data.setdefault(unique_id, {'id': unique_id})[table_name] = row

    outputs = (
        ('users.json', users_data),
        ('badges.json', badges_data),
        ('questions.json', questions_data),
        ('other_data.json', other_data),
    )
    for file_name, records in outputs:
        with open(os.path.join(json_folder, file_name), 'w', encoding='utf-8') as json_file:
            json.dump(list(records.values()), json_file, indent=2)

# Input tables (one CSV per table) and the directory receiving the JSON output
csv_files = [
    'badges.csv',
    'post_history.csv',
    'post_links.csv',
    'posts_answers.csv',
    'posts_moderator_nomination.csv',
    'posts_orphaned_tag_wiki.csv',
    'posts_privilege_wiki.csv',
    'posts_questions.csv',
    'posts_tag_wiki.csv',
    'stackoverflow_posts.csv',
    'tags.csv',
    'users.csv',
    'votes.csv',
]
json_folder = 'path/to/json_output'

# Run the conversion over every listed CSV file
aggregate_and_convert_to_nosql(csv_files=csv_files, json_folder=json_folder)


In [2]:
import csv
import json
import os

def aggregate_and_convert_to_nosql(csv_files, json_folder):
    """Aggregate CSV rows by ``user_id``/``post_id`` into three JSON files.

    Each row is keyed by its ``user_id`` column, or by ``post_id`` when there
    is no ``user_id`` column; rows with neither are skipped. Every keyed row
    is merged into ``users.json``; rows of the ``badges`` table are also
    merged into ``badges.json`` and rows of ``posts_questions`` into
    ``questions.json``. When several rows share a key, later column values
    overwrite earlier ones.

    Args:
        csv_files: Iterable of CSV file paths.
        json_folder: Output directory; created if it does not exist.

    Raises:
        OSError: If a CSV file cannot be read or a JSON file cannot be
            written.
    """
    os.makedirs(json_folder, exist_ok=True)

    # Each mapping is {unique id: merged record}.
    users_data = {}
    badges_data = {}
    questions_data = {}

    for csv_path in csv_files:
        # The table name is the file name without directory and extension.
        table_name = os.path.splitext(os.path.basename(csv_path))[0]

        # newline='' hands line splitting to the csv module, so line breaks
        # inside quoted fields reach the output unchanged.
        with open(csv_path, 'r', encoding='utf-8', newline='') as csv_handle:
            for row in csv.DictReader(csv_handle):
                unique_id = row.get('user_id', row.get('post_id', None))
                if unique_id is None:
                    continue

                # A CSV 'id' column, if present, overwrites the seeded 'id'.
                users_data.setdefault(unique_id, {'id': unique_id}).update(row)

                if table_name == 'badges':
                    badges_data.setdefault(unique_id, {'id': unique_id}).update(row)
                elif table_name == 'posts_questions':
                    questions_data.setdefault(unique_id, {'id': unique_id}).update(row)

    outputs = (
        ('users.json', users_data),
        ('badges.json', badges_data),
        ('questions.json', questions_data),
    )
    for file_name, records in outputs:
        with open(os.path.join(json_folder, file_name), 'w', encoding='utf-8') as json_file:
            json.dump(list(records.values()), json_file, indent=2)

# Input tables (one CSV per table) and the directory receiving the JSON output
csv_files = [
    'badges.csv',
    'post_history.csv',
    'post_links.csv',
    'posts_answers.csv',
    'posts_moderator_nomination.csv',
    'posts_orphaned_tag_wiki.csv',
    'posts_privilege_wiki.csv',
    'posts_questions.csv',
    'posts_tag_wiki.csv',
    'stackoverflow_posts.csv',
    'tags.csv',
    'users.csv',
    'votes.csv',
]
json_folder = 'path/to/json_output'

# Run the conversion over every listed CSV file
aggregate_and_convert_to_nosql(csv_files=csv_files, json_folder=json_folder)


In [4]:
import csv
import json
import os

def aggregate_and_convert_to_nosql(csv_files, json_folder):
    """Merge every CSV row that has a ``display_name`` into ``users.json``.

    Rows are grouped by the value of their ``display_name`` column; rows from
    files without that column are skipped. All rows sharing a display name
    are merged into one record, with later column values overwriting earlier
    ones. The list of records is written to ``users.json``.

    Args:
        csv_files: Iterable of CSV file paths.
        json_folder: Output directory; created if it does not exist.

    Raises:
        OSError: If a CSV file cannot be read or the JSON file cannot be
            written.
    """
    os.makedirs(json_folder, exist_ok=True)

    # {display name: merged record}
    users_data = {}

    for csv_path in csv_files:
        # newline='' hands line splitting to the csv module, so line breaks
        # inside quoted fields reach the output unchanged.
        with open(csv_path, 'r', encoding='utf-8', newline='') as csv_handle:
            for row in csv.DictReader(csv_handle):
                user_id = row.get('display_name', None)
                if user_id is None:
                    continue
                users_data.setdefault(user_id, {'display_name': user_id}).update(row)

    users_json_path = os.path.join(json_folder, 'users.json')
    with open(users_json_path, 'w', encoding='utf-8') as users_json_file:
        json.dump(list(users_data.values()), users_json_file, indent=2)

# Input tables (one CSV per table) and the directory receiving the JSON output
csv_files = [
    'badges.csv',
    'post_history.csv',
    'post_links.csv',
    'posts_answers.csv',
    'posts_moderator_nomination.csv',
    'posts_orphaned_tag_wiki.csv',
    'posts_privilege_wiki.csv',
    'posts_questions.csv',
    'posts_tag_wiki.csv',
    'stackoverflow_posts.csv',
    'tags.csv',
    'users.csv',
    'votes.csv',
]
json_folder = 'path/to/json_output'

# Run the conversion over every listed CSV file
aggregate_and_convert_to_nosql(csv_files=csv_files, json_folder=json_folder)


In [26]:
import csv
import json
import os

def aggregate_and_convert_to_nosql(csv_files, json_folder):
    """Merge every CSV row that has a ``display_name`` into ``users.json``.

    Rows are grouped by the value of their ``display_name`` column; rows from
    files without that column are skipped. All rows sharing a display name
    are merged into one record, with later column values overwriting earlier
    ones. The list of records is written to ``users.json``.

    Args:
        csv_files: Iterable of CSV file paths.
        json_folder: Output directory; created if it does not exist.

    Raises:
        OSError: If a CSV file cannot be read or the JSON file cannot be
            written.
    """
    os.makedirs(json_folder, exist_ok=True)

    # {display name: merged record}
    users_data = {}

    for csv_path in csv_files:
        # newline='' hands line splitting to the csv module, so line breaks
        # inside quoted fields reach the output unchanged.
        with open(csv_path, 'r', encoding='utf-8', newline='') as csv_handle:
            for row in csv.DictReader(csv_handle):
                user_id = row.get('display_name', None)
                if user_id is None:
                    continue
                users_data.setdefault(user_id, {'display_name': user_id}).update(row)

    users_json_path = os.path.join(json_folder, 'users.json')
    with open(users_json_path, 'w', encoding='utf-8') as users_json_file:
        json.dump(list(users_data.values()), users_json_file, indent=2)

# Input tables (one CSV per table) and the directory receiving the JSON output
csv_files = [
    'badges.csv',
    'post_history.csv',
    'post_links.csv',
    'posts_answers.csv',
    'posts_moderator_nomination.csv',
    'posts_orphaned_tag_wiki.csv',
    'posts_privilege_wiki.csv',
    'posts_questions.csv',
    'posts_tag_wiki.csv',
    'stackoverflow_posts.csv',
    'tags.csv',
    'users.csv',
    'votes.csv',
]
json_folder = 'path/to/json_output'

# Run the conversion over every listed CSV file
aggregate_and_convert_to_nosql(csv_files=csv_files, json_folder=json_folder)


In [25]:
#add badges.csv info into users.json

import csv
import json
import os

def aggregate_and_convert_to_nosql(csv_files, json_folder):
    """Merge rows by ``user_id`` into ``users.json``, embedding badge rows.

    Rows from files without a ``user_id`` column are skipped. Every other
    row is merged into the record of its user, with later column values
    overwriting earlier ones. Rows of the ``badges`` table are additionally
    appended, in full, to the user's ``badges`` list.

    Args:
        csv_files: Iterable of CSV file paths.
        json_folder: Output directory; created if it does not exist.

    Raises:
        OSError: If a CSV file cannot be read or the JSON file cannot be
            written.
    """
    os.makedirs(json_folder, exist_ok=True)

    # {user id: merged record}
    users_data = {}

    for csv_path in csv_files:
        # The table name is the file name without directory and extension.
        table_name = os.path.splitext(os.path.basename(csv_path))[0]

        # newline='' hands line splitting to the csv module, so line breaks
        # inside quoted fields reach the output unchanged.
        with open(csv_path, 'r', encoding='utf-8', newline='') as csv_handle:
            for row in csv.DictReader(csv_handle):
                user_id = row.get('user_id', None)
                if user_id is None:
                    continue

                user_record = users_data.setdefault(user_id, {})
                user_record.update(row)

                # Badge rows are kept individually as well as merged above.
                if table_name == 'badges':
                    user_record.setdefault('badges', []).append(row)

    users_json_path = os.path.join(json_folder, 'users.json')
    with open(users_json_path, 'w', encoding='utf-8') as users_json_file:
        json.dump(list(users_data.values()), users_json_file, indent=2)

# Input tables (one CSV per table) and the directory receiving the JSON output
csv_files = [
    'badges.csv',
    'post_history.csv',
    'post_links.csv',
    'posts_answers.csv',
    'posts_moderator_nomination.csv',
    'posts_orphaned_tag_wiki.csv',
    'posts_privilege_wiki.csv',
    'posts_questions.csv',
    'posts_tag_wiki.csv',
    'stackoverflow_posts.csv',
    'tags.csv',
    'users.csv',
    'votes.csv',
]
json_folder = 'path/to/json_output'

# Run the conversion over every listed CSV file
aggregate_and_convert_to_nosql(csv_files=csv_files, json_folder=json_folder)


In [19]:
import csv
import json
import os

def aggregate_and_convert_to_nosql(csv_files, json_folder):
    """Aggregate CSV rows into ``users.json`` and ``posts.json``.

    A row with a ``user_id`` column is merged into that user's record, and a
    row with a ``post_id`` column is merged into that post's record; a row
    can contribute to both. Later column values overwrite earlier ones.
    Rows of the ``posts_answers`` table that have a ``post_id`` are also
    appended to the post's ``answers`` list as ``user_id``/``answer_body``
    pairs. The ``posts`` list of a user record is created empty and is not
    filled by this function.

    Args:
        csv_files: Iterable of CSV file paths.
        json_folder: Output directory; created if it does not exist.

    Raises:
        OSError: If a CSV file cannot be read or a JSON file cannot be
            written.
    """
    os.makedirs(json_folder, exist_ok=True)

    users_data = {}  # {user id: merged record}
    posts_data = {}  # {post id: merged record}

    for csv_path in csv_files:
        # The table name is the file name without directory and extension.
        table_name = os.path.splitext(os.path.basename(csv_path))[0]

        # newline='' hands line splitting to the csv module, so line breaks
        # inside quoted fields reach the output unchanged.
        with open(csv_path, 'r', encoding='utf-8', newline='') as csv_handle:
            for row in csv.DictReader(csv_handle):
                user_id = row.get('user_id', None)
                post_id = row.get('post_id', None)

                if user_id is not None:
                    user_record = users_data.setdefault(
                        user_id, {'user_id': user_id, 'posts': []})
                    user_record.update(row)

                if post_id is not None:
                    post_record = posts_data.setdefault(
                        post_id, {'post_id': post_id, 'answers': []})
                    post_record.update(row)

                    if table_name == 'posts_answers':
                        post_record['answers'].append({
                            'user_id': row.get('user_id', ''),
                            'answer_body': row.get('body', ''),
                        })

    users_json_path = os.path.join(json_folder, 'users.json')
    with open(users_json_path, 'w', encoding='utf-8') as users_json_file:
        json.dump(list(users_data.values()), users_json_file, indent=2)

    posts_json_path = os.path.join(json_folder, 'posts.json')
    with open(posts_json_path, 'w', encoding='utf-8') as posts_json_file:
        json.dump(list(posts_data.values()), posts_json_file, indent=2)

# Input tables (one CSV per table) and the directory receiving the JSON output
csv_files = [
    'badges.csv',
    'post_history.csv',
    'post_links.csv',
    'posts_answers.csv',
    'posts_moderator_nomination.csv',
    'posts_orphaned_tag_wiki.csv',
    'posts_privilege_wiki.csv',
    'posts_questions.csv',
    'posts_tag_wiki.csv',
    'stackoverflow_posts.csv',
    'tags.csv',
    'users.csv',
    'votes.csv',
]
json_folder = 'path/to/json_output'

# Run the conversion over every listed CSV file
aggregate_and_convert_to_nosql(csv_files=csv_files, json_folder=json_folder)


In [8]:
import csv
import json
import os

def _to_int(value, default=0):
    """Return ``value`` as an int, or ``default`` if it is missing or not numeric."""
    try:
        return int(value)
    except (TypeError, ValueError):
        return default


def aggregate_and_convert_to_nosql(csv_files, json_folder):
    """Aggregate post rows from the CSV files into ``posts.json``.

    Only rows with a non-empty ``id`` and a non-empty ``post_type_id`` are
    used; all others are skipped. The first row seen for an id creates the
    post document; later rows with the same id do not change its fields.
    Numeric fields that are missing, empty, or not parseable as integers are
    stored as ``0``. Rows of the ``posts_answers`` table are additionally
    appended to the ``answers`` list of the document with the same id.

    Args:
        csv_files: Iterable of CSV file paths.
        json_folder: Output directory; created if it does not exist.

    Raises:
        OSError: If a CSV file cannot be read or the JSON file cannot be
            written.
    """
    os.makedirs(json_folder, exist_ok=True)

    # {post id: post document}
    aggregated_posts = {}

    for csv_path in csv_files:
        # The table name is the file name without directory and extension.
        table_name = os.path.splitext(os.path.basename(csv_path))[0]

        # newline='' hands line splitting to the csv module, so line breaks
        # inside quoted fields (such as post bodies) stay unchanged.
        with open(csv_path, 'r', encoding='utf-8', newline='') as csv_handle:
            for row in csv.DictReader(csv_handle):
                post_id = row.get('id', '')

                # Skip rows that cannot be identified as a post.
                if not post_id or not row.get('post_type_id'):
                    continue

                if post_id not in aggregated_posts:
                    aggregated_posts[post_id] = {
                        'post_id': post_id,
                        'title': row.get('title', ''),
                        'body': row.get('body', ''),
                        'accepted_answer_id': row.get('accepted_answer_id', ''),
                        'answer_count': _to_int(row.get('answer_count')),
                        'comment_count': _to_int(row.get('comment_count')),
                        'community_owned_date': row.get('community_owned_date', ''),
                        'creation_date': row.get('creation_date', ''),
                        'favorite_count': _to_int(row.get('favorite_count')),
                        'last_activity_date': row.get('last_activity_date', ''),
                        'last_edit_date': row.get('last_edit_date', ''),
                        'last_editor_display_name': row.get('last_editor_display_name', ''),
                        'last_editor_user_id': row.get('last_editor_user_id', ''),
                        'owner_display_name': row.get('owner_display_name', ''),
                        'owner_user_id': row.get('owner_user_id', ''),
                        'parent_id': row.get('parent_id', ''),
                        'post_type_id': _to_int(row.get('post_type_id')),
                        'score': _to_int(row.get('score')),
                        'tags': row.get('tags', ''),
                        'view_count': _to_int(row.get('view_count')),
                        'answers': [],
                    }

                # NOTE(review): the answer is filed under the row's own 'id',
                # not under 'parent_id' - confirm this is the intended link.
                if table_name == 'posts_answers':
                    aggregated_posts[post_id]['answers'].append({
                        'answer_id': row.get('answer_id', ''),
                        'answer_body': row.get('body', ''),
                        'answer_user_id': row.get('owner_user_id', ''),
                    })

    json_file_path = os.path.join(json_folder, 'posts.json')
    with open(json_file_path, 'w', encoding='utf-8') as json_file:
        json.dump(list(aggregated_posts.values()), json_file, indent=2)

# Input tables (one CSV per table) and the directory receiving the JSON output
csv_files = [
    'badges.csv',
    'post_history.csv',
    'post_links.csv',
    'posts_answers.csv',
    'posts_moderator_nomination.csv',
    'posts_orphaned_tag_wiki.csv',
    'posts_privilege_wiki.csv',
    'posts_questions.csv',
    'posts_tag_wiki.csv',
    'stackoverflow_posts.csv',
    'tags.csv',
    'users.csv',
    'votes.csv',
]
json_folder = 'path/to/json_output2'

# Run the conversion over every listed CSV file
aggregate_and_convert_to_nosql(csv_files=csv_files, json_folder=json_folder)
