In [5]:
import os
import json
import csv

# Root folder of the dataset; the extractor expects one sub-directory per event
dataset_path = r'phemernrdataset/pheme-rnr-dataset'

# Output locations: one CSV for source tweets, one for the reactions to them
source_csv_path, reaction_csv_path = 'source_tweets.csv', 'reaction_tweets.csv'

# Helper: read one tweet JSON file from disk
def _load_tweet(path):
    """Parse the UTF-8 JSON file at *path* and return the decoded object."""
    with open(path, 'r', encoding='utf-8') as tweet_file:
        return json.load(tweet_file)


# Helper: columns shared by the source and reaction CSV schemas
def _tweet_fields(tweet):
    """Return the tweet/user columns common to both CSV row types.

    A missing (or null) ``user`` object is treated as empty, so the user
    columns fall back to their defaults instead of raising ``KeyError``.
    """
    user = tweet.get('user') or {}
    return {
        'text': tweet.get('text'),
        'retweet_count': tweet.get('retweet_count', 0),
        'favorite_count': tweet.get('favorite_count', 0),
        'user_id': user.get('id'),
        'user_screen_name': user.get('screen_name'),
        'user_followers_count': user.get('followers_count', 0),
        'user_verified': user.get('verified', False),
        'user_statuses_count': user.get('statuses_count', 0),
        'created_at': tweet.get('created_at'),
    }


# Function to load and extract data
def extract_pheme_data_to_csv(dataset_path, source_csv_path, reaction_csv_path):
    """Flatten a PHEME-style dataset directory into two CSV files.

    The directory layout walked is::

        <dataset_path>/<event>/{rumours,non-rumours}/<source_id>/
            source-tweet/<source_id>.json
            reactions/<reaction_id>.json

    Args:
        dataset_path: Root directory containing one sub-directory per event.
        source_csv_path: Destination CSV for source tweets (overwritten).
        reaction_csv_path: Destination CSV for reaction tweets (overwritten).

    Raises:
        OSError: If ``dataset_path`` cannot be listed or a file cannot be
            opened.
        json.JSONDecodeError: If a ``.json`` tweet file is malformed.

    Directory listings are sorted so the row order is reproducible between
    runs; ``os.listdir`` itself returns entries in arbitrary order.
    """
    # Define headers for the CSV files
    source_headers = [
        'event', 'rumor_type', 'tweet_id', 'text', 'retweet_count', 'favorite_count',
        'user_id', 'user_screen_name', 'user_followers_count', 'user_verified', 'user_statuses_count',
        'created_at'
    ]
    reaction_headers = [
        'event', 'rumor_type', 'source_tweet_id', 'reaction_tweet_id', 'text', 'retweet_count',
        'favorite_count', 'user_id', 'user_screen_name', 'user_followers_count', 'user_verified',
        'user_statuses_count', 'created_at'
    ]

    # Open the CSV files for writing
    with open(source_csv_path, 'w', newline='', encoding='utf-8') as source_csv, \
         open(reaction_csv_path, 'w', newline='', encoding='utf-8') as reaction_csv:

        source_writer = csv.DictWriter(source_csv, fieldnames=source_headers)
        reaction_writer = csv.DictWriter(reaction_csv, fieldnames=reaction_headers)
        source_writer.writeheader()
        reaction_writer.writeheader()

        # Traverse the dataset directory structure; guard clauses keep the
        # walk flat and skip anything that is not a directory at each level.
        for event in sorted(os.listdir(dataset_path)):
            event_path = os.path.join(dataset_path, event)
            if not os.path.isdir(event_path):
                continue

            for rumor_type in ('rumours', 'non-rumours'):
                rumor_path = os.path.join(event_path, rumor_type)
                if not os.path.isdir(rumor_path):
                    continue

                for source_id in sorted(os.listdir(rumor_path)):
                    source_path = os.path.join(rumor_path, source_id)
                    if not os.path.isdir(source_path):
                        continue

                    # Source tweet: a thread without one is still scanned
                    # for reactions, exactly as before.
                    source_tweet_path = os.path.join(source_path, 'source-tweet', f"{source_id}.json")
                    if os.path.isfile(source_tweet_path):
                        source_data = _load_tweet(source_tweet_path)
                        source_writer.writerow({
                            'event': event,
                            'rumor_type': rumor_type,
                            'tweet_id': source_data.get('id'),
                            **_tweet_fields(source_data),
                        })

                    # Reactions
                    reactions_path = os.path.join(source_path, 'reactions')
                    if not os.path.isdir(reactions_path):
                        continue

                    for reaction_file in sorted(os.listdir(reactions_path)):
                        # Only regular, non-hidden *.json files are tweets;
                        # hidden files (e.g. `.DS_Store`, `._*.json`) and
                        # stray sub-directories would otherwise crash the
                        # JSON parser or open().
                        if reaction_file.startswith('.') or not reaction_file.endswith('.json'):
                            continue
                        reaction_path = os.path.join(reactions_path, reaction_file)
                        if not os.path.isfile(reaction_path):
                            continue

                        reaction_data = _load_tweet(reaction_path)
                        reaction_writer.writerow({
                            'event': event,
                            'rumor_type': rumor_type,
                            'source_tweet_id': source_id,
                            'reaction_tweet_id': reaction_data.get('id'),
                            **_tweet_fields(reaction_data),
                        })

# Execute the export with the paths configured above, then report the outputs
extract_pheme_data_to_csv(
    dataset_path=dataset_path,
    source_csv_path=source_csv_path,
    reaction_csv_path=reaction_csv_path,
)
print(f"Data extraction complete. CSV files saved as '{source_csv_path}' and '{reaction_csv_path}'.")


Data extraction complete. CSV files saved as 'source_tweets.csv' and 'reaction_tweets.csv'.
