# Notes
- input
    - original file: `ct_platform/data/01-posts.zip`
- output    
    - 8 csv files saved at `ct_platform/data/extract_tem3` (where twitter has 2 csv files as it is too big)

quick summary of the extracted data:
|platform |  number of rows| percentage of total|
|---------|---------------|-------------------|
|tiktok | 15,310         | 0.58%             |
|bluesky |90,169          | 3.42%             |
|truthsocial |70,445      | 2.68%             |
|linkedIn | 11,385        | 0.43%             |
|twitter | 1,974,714      | 75.00%          |
|mastodon | 20,099        | 0.76%             |
|gab | 25,246             | 0.96%             |
|gettr |435,175           | 16.50%          |
|Total | 2,637,719      | 100%             |

In [None]:
import zipfile
import os 
import json
import pandas as pd
import re
from tqdm import tqdm

SERVER = "VData/scro4316/"

# 1. unzip the data

In [None]:
###### 1.1 unzip the original main file ######

# Path to the zip file
zip_file_path = "ct_platform/data/01-posts.zip"
# Path to extract the contents folder
zip_extract_path = 'ct_platform/data/extract_tem'

# Open the zip file
with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    # Extract all the contents
    zip_ref.extractall(zip_extract_path)

print("Extraction complete.")

Extraction complete.


Notes:
1. Platforms: The unzipped data include multiple platforms, they are bluesky, truthsocial, linkedin, twitter, mastodon, gab, gettr, and tiktok. 
2. Each platform has 3-4 zipped files, named as a, b, c, d. I renamed all the files to `number-a/b/c/d-platform.zip` for better understanding.
3. under each platform zip file, there are multiple json files, each json file is named after the conspiracy keywords. 
4. after extraction, the unzipped file folder may share the same name as existing folders, so we have to handle it. 



In [None]:
# Improved extraction function that preserves zip file names
def unzip_all_zip_files(src_dir, dst_dir):
    """
    Unzips all .zip files from src_dir into separate subdirectories in dst_dir,
    with each subdirectory named after the zip file.
    """
    os.makedirs(dst_dir, exist_ok=True)  # Make the destination folder if it doesn't exist
    
    for filename in os.listdir(src_dir):  # Go through every file in the source folder
        if filename.endswith('.zip'):     # Check if the file is a .zip file
            # Create a subdirectory with the zip filename (without .zip extension)
            # because some zip files have the same name after unzipping
            zip_name = os.path.splitext(filename)[0]
            extract_subdir = os.path.join(dst_dir, zip_name)
            os.makedirs(extract_subdir, exist_ok=True)
            
            # Get the full path to the zip file
            zip_path = os.path.join(src_dir, filename)
            
            # Open the zip file for reading and extract to the subdirectory
            with zipfile.ZipFile(zip_path, 'r') as zip_ref:
                zip_ref.extractall(extract_subdir)
                
            # Remove __MACOSX directory if it exists
            macosx_path = os.path.join(extract_subdir, '__MACOSX')
            if os.path.exists(macosx_path) and os.path.isdir(macosx_path):
                import shutil
                shutil.rmtree(macosx_path)
                
            print(f'Extracted {filename} to {extract_subdir}')

# Set your source and destination directories
src_dir = 'ct_platform/data/extract_tem'
dst_dir = 'ct_platform/data/extract_tem2'  # second time extract zip file

# Make sure the destination directory exists
os.makedirs(dst_dir, exist_ok=True)
# Use the improved unzip function
unzip_all_zip_files(src_dir, dst_dir)


Extracted 07b-gettr.zip to /VData/scro4316/ct_platform/data/extract_tem2/07b-gettr
Extracted 03b-linkedin.zip to /VData/scro4316/ct_platform/data/extract_tem2/03b-linkedin
Extracted 07-gettr.zip to /VData/scro4316/ct_platform/data/extract_tem2/07-gettr
Extracted 01c-bluesky.zip to /VData/scro4316/ct_platform/data/extract_tem2/01c-bluesky
Extracted 03c-linkedin.zip to /VData/scro4316/ct_platform/data/extract_tem2/03c-linkedin
Extracted 06-gab.zip to /VData/scro4316/ct_platform/data/extract_tem2/06-gab
Extracted 06c-gab.zip to /VData/scro4316/ct_platform/data/extract_tem2/06c-gab
Extracted 05b-mastodon.zip to /VData/scro4316/ct_platform/data/extract_tem2/05b-mastodon
Extracted 03-linkedin.zip to /VData/scro4316/ct_platform/data/extract_tem2/03-linkedin
Extracted 01-bluesky.zip to /VData/scro4316/ct_platform/data/extract_tem2/01-bluesky
Extracted 08b-tiktok.zip to /VData/scro4316/ct_platform/data/extract_tem2/08b-tiktok
Extracted 06b-gab.zip to /VData/scro4316/ct_platform/data/extract_tem

Notes: 
1. each platform has multiple folders, and each folder has multiple json files after unzipping.
2. next we will loop through json files in each folder and save them into a single file based on platform name

In [None]:
# redefine source folder 
src_dir = 'ct_platform/data/extract_tem2'
dst_dir = 'ct_platform/data/extract_tem3'  # third time extract json file
for filename in os.listdir(src_dir):
    print(filename)

03-linkedin
02-truthsocial
03b-linkedin
04d-twitter
04a-twitter
04b-twitter
02c-truthsocial
07-gettr
03c-linkedin
07c-gettr
04c-twitter
05c-mastodon
08-tiktok.jsonl
05b-mastodon
08c-tiktok
06c-gab
06-gab
01c-bluesky
01b-bluesky
02b-truthsocial
07b-gettr
01-bluesky
05-mastodon
08b-tiktok
06b-gab


# 2. extract JSON file to CSV by platform

## 2.1 Tiktok
- the dictionary key names and structures are slightly across 3 JSON files, not sure why, but the b and c JSON structure are the same while the first one is different
- we show two example item in the file in the appendix in this notebook


In [5]:
def tiktok_json(data_path):
    """
    data_path is path of the json file
    read the data line by line, return list of dictionaries, this function is used to read the json file
    we did not read music relevant info
    """
    tiktok_list = []
    
    # Helper function to safely get nested values
    def get_nested(data, keys, default=''):
        """Safely get nested values from a dictionary using a list of keys"""
        current = data
        for key in keys:
            if isinstance(current, dict) and key in current:
                current = current[key]
            else:
                return default
        return current
    
    with open(data_path, 'r', encoding='utf-8') as f:
        for line in f:
            try:
                data = json.loads(line)
                
                # Common fields regardless of structure
                video_id = ''
                created_time = str(data.get('createTime', ''))
                is_ad = str(data.get('isAd', ''))
                is_pinned = str(data.get('isPinned', ''))
                
                if "nextCursor" in data:
                    # 08b and 08c file structure
                    keyword = str(data.get('keyword', ''))
                    video_id = str(get_nested(data, ['video', 'id'], ''))
                    is_mute = str(data.get('itemMute', ''))
                    video_len = str(get_nested(data, ['video', 'duration'], ''))
                    video_play = str(get_nested(data, ['stats', 'playCount'], ''))
                    video_like = str(get_nested(data, ['stats', 'diggCount'], ''))
                    video_share = str(get_nested(data, ['stats', 'shareCount'], ''))
                    video_comment = str(get_nested(data, ['stats', 'commentCount'], ''))
                    video_collect = str(get_nested(data, ['stats', 'collectCount'], ''))
                    
                    # Description info
                    desc = str(data.get('desc', ''))
                    mention = str(data.get('mentions', ''))
                    hashtag = [str(item.get('name', '')) for item in data.get('hashtags', [])]
                    
                    # Author info
                    user_id = str(get_nested(data, ['author', 'id'], ''))
                    user_name = str(get_nested(data, ['author', 'nickname'], ''))
                    user_veri = str(get_nested(data, ['author', 'verified'], ''))
                    user_private = str(get_nested(data, ['author', 'privateAccount'], ''))
                    user_sig = str(get_nested(data, ['author', 'signature'], ''))
                    user_follow = str(get_nested(data, ['authorStats', 'followingCount'], ''))
                    user_fans = str(get_nested(data, ['authorStats', 'followerCount'], ''))
                    user_heart = str(get_nested(data, ['authorStats', 'heartCount'], ''))
                    user_video = str(get_nested(data, ['authorStats', 'videoCount'], ''))
                    user_like = str(get_nested(data, ['authorStats', 'diggCount'], ''))
                    
                else:
                    # 08-tiktok.jsonl structure
                    keyword = str(data.get('searchQuery', ''))
                    video_id = str(data.get('id', ''))
                    is_mute = str(data.get('isMuted', ''))
                    video_len = str(get_nested(data, ['videoMeta', 'duration'], ''))
                    video_play = str(data.get('playCount', ''))
                    video_like = str(data.get('diggCount', ''))
                    video_share = str(data.get('shareCount', ''))
                    video_comment = str(data.get('commentCount', ''))
                    video_collect = str(data.get('collectCount', ''))
                    
                    # Description info
                    desc = str(data.get('text', ''))
                    mention = str(data.get('mentions', ''))
                    hashtag = [str(item.get('name', '')) for item in data.get('hashtags', [])]
                    
                    # User info
                    user_id = str(get_nested(data, ['authorMeta', 'id'], ''))
                    user_name = str(get_nested(data, ['authorMeta', 'nickName'], ''))
                    user_veri = str(get_nested(data, ['authorMeta', 'verified'], ''))
                    user_private = str(get_nested(data, ['authorMeta', 'privateAccount'], ''))
                    user_sig = str(get_nested(data, ['authorMeta', 'signature'], ''))
                    user_follow = str(get_nested(data, ['authorMeta', 'following'], ''))
                    user_fans = str(get_nested(data, ['authorMeta', 'fans'], ''))
                    user_heart = str(get_nested(data, ['authorMeta', 'heart'], ''))
                    user_video = str(get_nested(data, ['authorMeta', 'video'], ''))
                    user_like = str(get_nested(data, ['authorMeta', 'digg'], ''))
                
                # Create dictionary with extracted data
                dict_tem = {
                    'keyword': keyword,
                    'post_id': video_id,
                    'post_time': created_time,
                    'is_ad': is_ad,
                    'is_mute': is_mute,
                    'is_pinned': is_pinned,
                    'video_len': video_len,
                    'view': video_play,
                    'like': video_like,
                    'repost': video_share,
                    'reply': video_comment,
                    'collect': video_collect,
                    'post_text': desc,
                    'mention': mention,
                    'hashtag': hashtag,
                    'user_id': user_id,
                    'user_name': user_name,
                    'user_veri': user_veri,
                    'user_private': user_private,
                    'user_sig': user_sig,
                    'user_follow': user_follow,
                    'user_fans': user_fans,
                    'user_heart': user_heart,
                    'user_video': user_video,
                    'user_like': user_like
                }
                tiktok_list.append(dict_tem)
                
            except json.JSONDecodeError:
                print(f"Error parsing JSON line in {data_path}")
                continue
            except Exception as e:
                print(f"Error processing data: {str(e)} in {data_path}")
                continue
                
    return tiktok_list

In [None]:
# redefine source folder 
src_dir = 'ct_platform/data/extract_tem2'
dst_dir = 'ct_platform/data/extract_tem3' 
### change 1 
platform = 'tiktok'

platform_df = pd.DataFrame()
for filename in os.listdir(src_dir):
    if filename.endswith(platform):
        json_path_1 = os.path.join(src_dir, filename)
        print(json_path_1)
        
        for filename_2 in os.listdir(json_path_1):
            if filename_2.endswith(platform):
                json_path_2 = os.path.join(json_path_1, filename_2)
            elif filename_2.endswith('.jsonl'):
                json_path_2 = os.path.join(json_path_1, filename_2)

            json_path = os.path.join(json_path_2)
            print(f'processing {json_path}')
            # create a new dataframe to save the csv file
            ####  change 2 ##### 
            data = tiktok_json(json_path)
            # save the data into a csv file
            df = pd.DataFrame(data)
            platform_df = pd.concat([platform_df, df], ignore_index=True)
            print(f'Extracted {filename} to {dst_dir}')
            print(f'Extracted {len(df)} records')
            print(f'Extracted {len(platform_df)} records to the data file')

platform_df.to_csv(f'ct_platform/data/extract_tem3/{platform}.csv', index=False)
print(f'Extracted {len(platform_df)} records to the data file')

/VData/scro4316/ct_platform/data/extract_tem2/08c-tiktok
processing /VData/scro4316/ct_platform/data/extract_tem2/08c-tiktok/08c-tiktok.jsonl
Extracted 08c-tiktok to /VData/scro4316/ct_platform/data/extract_tem3
Extracted 2128 records
Extracted 2128 records to the data file
/VData/scro4316/ct_platform/data/extract_tem2/08b-tiktok
processing /VData/scro4316/ct_platform/data/extract_tem2/08b-tiktok/08b-tiktok.jsonl
Extracted 08b-tiktok to /VData/scro4316/ct_platform/data/extract_tem3
Extracted 12613 records
Extracted 14741 records to the data file
Extracted 14741 records to the data file


In [None]:
# check if the content column is html format to decide whether to further process the data 
tiktok = pd.read_csv(f'ct_platform/data/extract_tem3/{platform}.csv')
tiktok.head()
# no need to further process the data, just save it to the csv file

  tiktok = pd.read_csv(f'/VData/scro4316/ct_platform/data/extract_tem3/{platform}.csv')


## 2.2 Bluesky
- Bluesky has 3 folders, each folder has multiple JSON files, but luckily the JSON structure is the same across all files

In [7]:
def bluesky_json(data_path):
    """
    data_path is path of the json file
    read the data line by line, return list of dictionaries, this function is used to read the json file
    we did not read music relevant info
    """
    result_list = []  # Changed from 'list' to avoid shadowing built-in
    
    # Helper function to safely get nested values
    def get_nested(data, keys, default=''):
        """Safely get nested values from a dictionary using a list of keys"""
        current = data
        for key in keys:
            if isinstance(current, dict) and key in current:
                current = current[key]
            elif isinstance(current, list) and isinstance(key, int) and 0 <= key < len(current):
                current = current[key]  # Added list index access capability
            else:
                return default
        return current
    
    with open(data_path, 'r', encoding='utf-8') as f:
        for line in f:
            try:
                data = json.loads(line)
                
                # Common fields regardless of structure                
                keyword = str(data.get('keyword', ''))

                # post info
                post_id = str(data.get('cid', '')) 
                post_type = str(get_nested(data, ['record', '$type'], ''))
                post_time = str(get_nested(data, ['record', 'createdAt'], ''))
                post_lan = str(get_nested(data, ['record', 'langs'], ''))
                post_text = str(get_nested(data, ['record', 'text'], ''))
                reply = str(data.get('replyCount', ''))
                repost = str(data.get('repostCount', ''))
                like = str(data.get('likeCount', ''))
                
                # User info
                user_id = str(get_nested(data, ['author', 'did'], ''))
                user_name = str(get_nested(data, ['author', 'displayName'], ''))
                
                # Handle array or complex nested structures properly
                user_labels = get_nested(data, ['author', 'labels'], [])
                user_label = str(user_labels) if isinstance(user_labels, list) else str(user_labels)
                
                user_time = str(get_nested(data, ['author', 'createdAt'], ''))
                user_view_mute = str(get_nested(data, ['author', 'viewer', 'viewMute'], ''))
                user_block = str(get_nested(data, ['author', 'viewer', 'blockedBy'], ''))

                # Create dictionary with extracted data
                dict_tem = {
                    'keyword': keyword,
                    'post_id': post_id,
                    'post_type': post_type,
                    'post_time': post_time,
                    'post_lan': post_lan,
                    'post_text': post_text,
                    'reply': reply,
                    'repost': repost,
                    'like': like,
                    'user_id': user_id,
                    'user_name': user_name,
                    'user_label': user_label,
                    'user_time': user_time,
                    'user_view_mute': user_view_mute,
                    'user_block': user_block
                }
                result_list.append(dict_tem)
                
            except json.JSONDecodeError:
                print(f"Error parsing JSON line in {data_path}")
                continue
            except Exception as e:
                print(f"Error processing data: {str(e)} in {data_path}")
                continue
                
    return result_list

In [None]:
src_dir = 'ct_platform/data/extract_tem2'
dst_dir = 'ct_platform/data/extract_tem3' 

bluesky_df = pd.DataFrame()
for filename in os.listdir(src_dir):
    if filename.endswith('bluesky'):
        json_path_1 = os.path.join(src_dir, filename)
        print(json_path_1)
        for filename_2 in os.listdir(json_path_1):
            if filename_2.endswith('bluesky'):
                json_path_2 = os.path.join(json_path_1, filename_2)
                for filename_3 in os.listdir(json_path_2):
                    if filename_3.endswith('.jsonl'):
                        json_path = os.path.join(json_path_2, filename_3)
                        print(f'processing {json_path}')
                        # create a new dataframe to save the csv file
                        data = bluesky_json(json_path)
                        # save the data into a csv file
                        df = pd.DataFrame(data)
                        bluesky_df = pd.concat([bluesky_df, df], ignore_index=True)
                        print(f'Extracted {filename} to {dst_dir}')
                        print(f'Extracted {len(df)} records')
                        print(f'Extracted {len(bluesky_df)} records to the data file')

bluesky_df.to_csv('ct_platform/data/extract_tem3/bluesky.csv', index=False)
print(f'Extracted {len(bluesky_df)} records to the data file')

/VData/scro4316/ct_platform/data/extract_tem2/01c-bluesky
processing /VData/scro4316/ct_platform/data/extract_tem2/01c-bluesky/01c-bluesky/jeffrey epstein.jsonl
Extracted 01c-bluesky to /VData/scro4316/ct_platform/data/extract_tem3
Extracted 657 records
Extracted 657 records to the data file
processing /VData/scro4316/ct_platform/data/extract_tem2/01c-bluesky/01c-bluesky/Roswell.jsonl
Extracted 01c-bluesky to /VData/scro4316/ct_platform/data/extract_tem3
Extracted 327 records
Extracted 984 records to the data file
processing /VData/scro4316/ct_platform/data/extract_tem2/01c-bluesky/01c-bluesky/princess diana.jsonl
Extracted 01c-bluesky to /VData/scro4316/ct_platform/data/extract_tem3
Extracted 260 records
Extracted 1244 records to the data file
processing /VData/scro4316/ct_platform/data/extract_tem2/01c-bluesky/01c-bluesky/Bill Gates  Vaccines.jsonl
Extracted 01c-bluesky to /VData/scro4316/ct_platform/data/extract_tem3
Extracted 16 records
Extracted 1260 records to the data file
proce

In [None]:
# check if the content column is html format to decide whether to further process the data 
bs = pd.read_csv('ct_platform/data/extract_tem3/bluesky.csv')
bs['post_text'].sample(5)
# no need to further process the data, just save it to the csv file

79210    Ternyata Pemprov DKI pernah bikin seabrek dash...
54461    Thailand: Prime Minister Srettha Thavisin is t...
81656    So far, mankind has responded to Covid-19 by c...
12998    Status Atualizado das Linhas\n\nNormal:\n1-Azu...
40807    One of my favourite fun facts is that there we...
Name: post_text, dtype: object

## 2.3 Truthsocial
- Truthsocial has 3 folders, each folder has multiple JSON files, but luckily the JSON structure is the same across all files

In [10]:
## gpt revised code
def truthsocial_json(data_path):
    """
    data_path is path of the json file
    read the data line by line, return list of dictionaries, this function is used to read the json file
    we did not read music relevant info
    """
    result_list = []  # Changed from 'list' to avoid shadowing built-in
  
    # Helper function to safely get nested values
    def get_nested(data, keys, default=''):
        """Safely get nested values from a dictionary using a list of keys"""
        current = data
        for key in keys:
            if isinstance(current, dict) and key in current:
                current = current[key]
            elif isinstance(current, list) and isinstance(key, int) and 0 <= key < len(current):
                current = current[key]  # Added list index access capability
            else:
                return default
        return current
    
    # Helper function to create post dictionary
    def create_post_dict(data_obj, keyword, reblogged, quoted_id, reply_id):
        post_id = str(data_obj.get('id', '')) 
        post_time = str(data_obj.get('created_at', ''))
        post_lan = str(data_obj.get('language', ''))
        post_sensitive = str(data_obj.get('sensitive', ''))
        post_visibility = str(data_obj.get('visibility', ''))
        post_text = str(data_obj.get('text', ''))
        #NOTE. content is the text of the post, text is null most of the time, not sure what it is. 
        post_content = str(data_obj.get('content', ''))
        post_spoiler = str(data_obj.get('spoiler_text', ''))
        post_pin = str(data_obj.get('pinned', ''))
        reply = str(data_obj.get('replies_count', ''))
        repost = str(data_obj.get('reblogs_count', ''))
        like = str(data_obj.get('favourites_count', ''))
        mention = [str(item.get('username', '')) for item in data_obj.get('mentions', [])]

        # user info
        user_id = str(get_nested(data_obj, ['account', 'id'], ''))
        user_name = str(get_nested(data_obj, ['account', 'username'], ''))
        user_verified = str(get_nested(data_obj, ['account', 'verified'], ''))
        user_time = str(get_nested(data_obj, ['account', 'created_at'], ''))
        user_last_status = str(get_nested(data_obj, ['account', 'last_status_at'], ''))
        user_follow = str(get_nested(data_obj, ['account', 'following_count'], ''))
        user_fans = str(get_nested(data_obj, ['account', 'follower_count'], ''))
        user_status = str(get_nested(data_obj, ['account', 'statuses_count'], ''))
        user_bot = str(get_nested(data_obj, ['account', 'bot'], ''))
        user_note = str(get_nested(data_obj, ['account', 'note'], ''))

        return {
            'keyword': keyword,
            'reblogged': reblogged,
            'quoted_id': quoted_id,
            'reply_id': reply_id,
            'post_id': post_id,
            'post_time': post_time,
            'post_lan': post_lan,
            'post_sensitive': post_sensitive,
            'post_visibility': post_visibility,
            # 'post_text': post_text,
            'post_text': post_content,  
            'post_spoiler': post_spoiler,
            'post_pin': post_pin,
            'reply': reply,
            'repost': repost,
            'like': like,
            'mention': mention,
            'user_id': user_id,
            'user_name': user_name,
            'user_veri': user_verified,
            'user_time': user_time,
            'user_last_status': user_last_status,
            'user_follow': user_follow,
            'user_fans': user_fans,
            'user_status': user_status,
            'user_bot': user_bot,
            'user_note': user_note
        }
    
    with open(data_path, 'r', encoding='utf-8') as f:
        for line in f:
            try:
                data = json.loads(line)
                
                # Common fields regardless of structure                
                keyword = str(data.get('keyword', ''))
                
                # Post links info
                reblogged = str(data.get('reblogged', ''))
                quoted_id = str(data.get('quoted_id', ''))   
                reply_id = str(data.get('in_reply_to_id', ''))   
                
                # Add main post
                dict_tem = create_post_dict(data, keyword, reblogged, quoted_id, reply_id)
                result_list.append(dict_tem)
                
                # Add quoted post if exists
                if quoted_id and quoted_id != 'None':
                    data_quoted = data.get('quoted', {})
                    if data_quoted:  # Check if quoted post data exists
                        dict_tem = create_post_dict(data_quoted, keyword, reblogged, quoted_id, reply_id)
                        result_list.append(dict_tem)
                
                # Add reply post if exists
                if reply_id and reply_id != 'None':
                    data_reply = data.get('in_reply_to', {})
                    if data_reply:  # Check if reply post data exists
                        dict_tem = create_post_dict(data_reply, keyword, reblogged, quoted_id, reply_id)
                        result_list.append(dict_tem)
                
                # Add reblog post if exists
                if reblogged and reblogged != 'None':
                    data_reblog = data.get('reblog', {})
                    if data_reblog:  # Check if reblog post data exists
                        dict_tem = create_post_dict(data_reblog, keyword, reblogged, quoted_id, reply_id)
                        result_list.append(dict_tem)
                
            except json.JSONDecodeError:
                print(f"Error parsing JSON line in {data_path}")
                continue
            except Exception as e:
                print(f"Error processing data: {str(e)} in {data_path}")
                continue
            
    return result_list

In [None]:
src_dir = 'ct_platform/data/extract_tem2'
dst_dir = 'ct_platform/data/extract_tem3' 
### change 1 
platform = 'truthsocial'

platform_df = pd.DataFrame()
for filename in os.listdir(src_dir):
    if filename.endswith('truthsocial'):
        json_path_1 = os.path.join(src_dir, filename)
        print(json_path_1)
        for filename_2 in os.listdir(json_path_1):
            if filename_2.endswith(platform):
                json_path_2 = os.path.join(json_path_1, filename_2)
                for filename_3 in os.listdir(json_path_2):
                    if filename_3.endswith('.jsonl'):
                        json_path = os.path.join(json_path_2, filename_3)
                        print(f'processing {json_path}')
                        # create a new dataframe to save the csv file
                        ####  change 2 ##### 
                        data = truthsocial_json(json_path)
                        # save the data into a csv file
                        df = pd.DataFrame(data)
                        platform_df = pd.concat([platform_df, df], ignore_index=True)
                        print(f'Extracted {filename} to {dst_dir}')
                        print(f'Extracted {len(df)} records')
                        print(f'Extracted {len(platform_df)} records to the data file')

platform_df.to_csv(f'ct_platform/data/extract_tem3/{platform}.csv', index=False)
print(f'Extracted {len(platform_df)} records to the data file')

/VData/scro4316/ct_platform/data/extract_tem2/02-truthsocial
processing /VData/scro4316/ct_platform/data/extract_tem2/02-truthsocial/02-truthsocial/jeffrey epstein.jsonl
Extracted 02-truthsocial to /VData/scro4316/ct_platform/data/extract_tem3
Extracted 81 records
Extracted 81 records to the data file
processing /VData/scro4316/ct_platform/data/extract_tem2/02-truthsocial/02-truthsocial/princess diana.jsonl
Extracted 02-truthsocial to /VData/scro4316/ct_platform/data/extract_tem3
Extracted 0 records
Extracted 81 records to the data file
processing /VData/scro4316/ct_platform/data/extract_tem2/02-truthsocial/02-truthsocial/jfk.jsonl
Extracted 02-truthsocial to /VData/scro4316/ct_platform/data/extract_tem3
Extracted 125 records
Extracted 206 records to the data file
processing /VData/scro4316/ct_platform/data/extract_tem2/02-truthsocial/02-truthsocial/deep-state.jsonl
Extracted 02-truthsocial to /VData/scro4316/ct_platform/data/extract_tem3
Extracted 126 records
Extracted 332 records to 

In [None]:
# check if the content column is html format to decide whether to further process the data 
ts = pd.read_csv('ct_platform/data/extract_tem3/truthsocial.csv')
ts['post_text'].sample(5)
# NEED TO FURTHER PROCESS THE DATA



37901    <p>So you’ve got constant raping by priests of...
61513    <p>Prayer for a Godly President</p><p>WHO Upda...
46242                           <p>I won&apos;t say it</p>
17961    <p>People&apos;s who became bankrupt between 2...
2257     <p>Several Republican senators confronted Secr...
Name: post_text, dtype: object

In [30]:
import re
from bs4 import BeautifulSoup

def extract_html_content(post_string):
    """
    Extract meaningful content from Truth Social API post strings.
    
    Args:
        post_string (str): Raw post string from Truth Social API
    
    Returns:
        str: Cleaned post content
    """
    try:
        # Remove HTML tags
        soup = BeautifulSoup(post_string, 'html.parser')
        
        # Extract text from paragraphs
        paragraphs = soup.find_all('p')
        
        # Combine paragraph texts, filtering out link-related content
        content_parts = []
        for p in paragraphs:
            # Remove link and hashtag elements
            for a in p.find_all('a'):
                a.decompose()
            
            # Clean up the text
            clean_text = p.get_text(strip=True)
            
            # Skip empty strings and purely link-related content
            if clean_text and not clean_text.startswith('http'):
                content_parts.append(clean_text)
        
        # Join the content parts
        full_content = ' '.join(content_parts)
        
        # Remove extra whitespaces
        full_content = re.sub(r'\s+', ' ', full_content).strip()
        
        return full_content
    
    except Exception as e:
        print(f"Error processing post: {e}")
        return ""

# Example usage
# posts = [
#     '''<p>Live CNN Report Interrupted by Man Wearing 'Where's Jeffrey Epstein Client List' Shirt<br />  <a href="https://links.truthsocial.com/link/112801012990205769" rel="nofollow noopener noreferrer" target="_blank"><span class="invisible">https://www.</span><span class="ellipsis">dailyfetched.com/live-cnn-repo</span><span class="invisible">rt-interrupted-by-man-wearing-wheres-jeffrey-epstein-client-list-shirt/</span></a></p>''',
    
#     '''<p>🚨 <a href="https://truthsocial.com/tags/BREAKINGNEWS" class="mention hashtag" rel="tag">#<span>BREAKINGNEWS</span></a> Can we just make this post go viral that Republican party has nominated a close friend of Jeffrey Epstein. </p><p>- Was accused of raping a 13 year old girl named Katie Johnson with Jeffrey Epstein. </p><p>- In Epstein court documents accusing Maxwell of running a sex trafficking recruitment center out of Mar-a-Lago. </p><p>- Was on Jeffrey Epstein&apos;s flight logs. </p><p>- Was on Jeffrey Epstein&apos;s messages and call-back requests for massages. </p><p>- Visited Jeffrey Epstein&apos;s New York estate. </p><p>- Jeffrey Epstein visited Trump. </p><p>- Found liable for Fraud 4 times. </p><p>- Found liable of sexual assault that the judge later clarified it as rape. </p><p>- Found liable for defamation after he verbally attacked the woman he raped. </p><p>- Found guilty of Falsifying business records New York Penal Code 175.10 and 175.05. </p><p>- Trump Charities ordered too shutdown. </p><p>-Trump University ordered too shutdown. </p><p>- Can&apos;t personally do business in New York for 3 years. </p><p>- Lied during his administration over 30,000 times. <span class="quote-inline"><br/>RT: <a href="https://truthsocial.com/users/Nobull_1/statuses/112823603918546351" rel="nofollow noopener noreferrer" target="_blank"><span class="invisible">https://</span><span class="ellipsis">truthsocial.com/users/Nobull_1</span><span class="invisible">/statuses/112823603918546351</span></a></span></p>''',
    
#     '''<p>Joshua Shapiro Charged for Attempted Assassination of Donald J. Trump <a href="https://links.truthsocial.com/link/112844303182284821" rel="nofollow noopener noreferrer" target="_blank"><span class="invisible">https://www.</span><span class="ellipsis">youtube.com/watch?v=E4v7A99nCm</span><span class="invisible">s</span></a> {{{ You better get to her before she gets Jeffrey Epstein-ed }}}</p>'''
# ]

# # Demonstrate usage
# for post in posts:
#     print(extract_html_content(post))
#     print("\n---\n")

In [None]:
ts['post_text_raw'] = ts['post_text'].astype(str)
ts['post_text'] = ts['post_text_raw'].apply(extract_html_content)
ts['post_text'].sample(5)
ts.to_csv('ct_platform/data/extract_tem3/truthsocial.csv', index=False)

## 2.4 Linkedin
- unzipped linkedin folders JSON files are not named with `.jsonl` affix, but they are actually jsonl files, so we need to rename them first

In [None]:

folder_path = 'ct_platform/data/extract_tem2/03-linkedin/03-linkedin'

# List of your folder paths
folder_paths = [
    'ct_platform/data/extract_tem2/03-linkedin/03-linkedin',
    'ct_platform/data/extract_tem2/03c-linkedin/03c-linkedin',
    # Add more folders here
]
for folder_path in folder_paths:
    for filename in os.listdir(folder_path):
        old_path = os.path.join(folder_path, filename)
        if os.path.isfile(old_path):
            # Skip if file already ends with .jsonl
            if filename.lower().endswith('.jsonl'):
                continue
            base_name = os.path.splitext(filename)[0]
            new_filename = base_name + '.jsonl'
            new_path = os.path.join(folder_path, new_filename)
            os.rename(old_path, new_path)
            print(f'Renamed {old_path} to {new_path}')

Renamed /VData/scro4316/ct_platform/data/extract_tem2/03-linkedin/03-linkedin/mlk to /VData/scro4316/ct_platform/data/extract_tem2/03-linkedin/03-linkedin/mlk.jsonl
Renamed /VData/scro4316/ct_platform/data/extract_tem2/03-linkedin/03-linkedin/illuminati to /VData/scro4316/ct_platform/data/extract_tem2/03-linkedin/03-linkedin/illuminati.jsonl
Renamed /VData/scro4316/ct_platform/data/extract_tem2/03-linkedin/03-linkedin/2020 election fraud to /VData/scro4316/ct_platform/data/extract_tem2/03-linkedin/03-linkedin/2020 election fraud.jsonl
Renamed /VData/scro4316/ct_platform/data/extract_tem2/03-linkedin/03-linkedin/deep-state to /VData/scro4316/ct_platform/data/extract_tem2/03-linkedin/03-linkedin/deep-state.jsonl
Renamed /VData/scro4316/ct_platform/data/extract_tem2/03-linkedin/03-linkedin/deep state to /VData/scro4316/ct_platform/data/extract_tem2/03-linkedin/03-linkedin/deep state.jsonl
Renamed /VData/scro4316/ct_platform/data/extract_tem2/03-linkedin/03-linkedin/aliens to /VData/scro43

In [13]:
def linkedin_json(data_path):
    """
    Extracts data from a LinkedIn JSON file and returns a list of structured dictionaries.
    
    Parameters:
        data_path (str): Path to the JSON file containing LinkedIn data
        
    Returns:
        list: List of dictionaries with extracted LinkedIn post data
    """
    result_list = []
    
    # Helper function to safely get nested values
    def get_nested(data, keys, default=''):
        """Safely get nested values from a dictionary using a list of keys"""
        current = data
        for key in keys:
            if isinstance(current, dict) and key in current:
                current = current[key]
            elif isinstance(current, list) and isinstance(key, int) and 0 <= key < len(current):
                current = current[key]
            else:
                return default
        return current
    
    with open(data_path, 'r', encoding='utf-8') as f:
        for line in f:
            try:
                data = json.loads(line)
                
                # Common fields regardless of structure                
                keyword = str(data.get('keyword', ''))

                # Post info
                post_id = str(data.get('urn', '')) 
                post_reposted = str(data.get('isRepost', ''))
                post_time = str(data.get('postedAtTimestamp', ''))
                post_text = str(data.get('text', ''))
                post_type = str(data.get('type', ''))
                repost = str(data.get('numShares', ''))
                reply = str(data.get('numComments', ''))
                like = str(data.get('numLikes', ''))
                post_visibility = str(data.get('shareAudience', ''))

                # Engagement info - keeping complete objects for potential future use
                post_mention = data.get('attributes', [])
                post_reactions = data.get('reactions', [])
                post_comments = data.get('comments', [])

                # Author info
                user_id = str(data.get('authorProfileId', ''))
                user_name = str(data.get('authorName', ''))
                user_name_full = str(data.get('authorFullName', ''))
                user_type = str(data.get('authorType', ''))
                user_follower = str(data.get('authorFollowersCount', ''))
                user_desc = str(data.get('authorTitle', ''))

                # Get author details if exists
                user_occupation = str(get_nested(data, ['author', 'occupation'], '')) 
                
                # Create result dictionary
                dict_tem = {
                    'keyword': keyword,
                    'post_id': post_id,
                    'post_reposted': post_reposted,
                    'post_time': post_time,
                    'post_text': post_text,
                    'post_type': post_type,
                    'post_visibility': post_visibility,
                    'post_mention': post_mention,
                    'post_reactions': post_reactions,
                    'post_comments': post_comments,
                    'reply': reply,
                    'repost': repost,
                    'like': like,
                    'user_id': user_id,
                    'user_name': user_name,
                    'user_name_full': user_name_full,
                    'user_type': user_type,
                    'user_fans': user_follower,
                    'user_desc': user_desc,
                    'user_occupation': user_occupation
                }
                result_list.append(dict_tem)
                
            except json.JSONDecodeError:
                print(f"Error parsing JSON line in {data_path}")
                continue
            except Exception as e:
                print(f"Error processing data: {str(e)} in {data_path}")
                continue
                
    return result_list

In [None]:
src_dir = 'ct_platform/data/extract_tem2'
dst_dir = 'ct_platform/data/extract_tem3' 
### change 1 
platform = 'linkedin'

platform_df = pd.DataFrame()
for filename in os.listdir(src_dir):
    if filename.endswith(platform):
        json_path_1 = os.path.join(src_dir, filename)
        print(json_path_1)
        for filename_2 in os.listdir(json_path_1):
            if filename_2.endswith(platform):
                json_path_2 = os.path.join(json_path_1, filename_2)

                for filename_3 in os.listdir(json_path_2):
                    if filename_3.endswith('.jsonl'):
                        json_path = os.path.join(json_path_2, filename_3)
                        print(f'processing {json_path}')
                        # create a new dataframe to save the csv file
                        ####  change 2 ##### 
                        data = linkedin_json(json_path)
                        # save the data into a csv file
                        df = pd.DataFrame(data)
                        platform_df = pd.concat([platform_df, df], ignore_index=True)
                        print(f'Extracted {filename} to {dst_dir}')
                        print(f'Extracted {len(df)} records')
                        print(f'Extracted {len(platform_df)} records to the data file')

platform_df.to_csv(f'ct_platform/data/extract_tem3/{platform}.csv', index=False)
print(f'Extracted {len(platform_df)} records to the data file')

/VData/scro4316/ct_platform/data/extract_tem2/03-linkedin
processing /VData/scro4316/ct_platform/data/extract_tem2/03-linkedin/03-linkedin/jeffrey epstein.jsonl
Extracted 03-linkedin to /VData/scro4316/ct_platform/data/extract_tem3
Extracted 20 records
Extracted 20 records to the data file
processing /VData/scro4316/ct_platform/data/extract_tem2/03-linkedin/03-linkedin/princess diana.jsonl
Extracted 03-linkedin to /VData/scro4316/ct_platform/data/extract_tem3
Extracted 19 records
Extracted 39 records to the data file
processing /VData/scro4316/ct_platform/data/extract_tem2/03-linkedin/03-linkedin/jfk.jsonl
Extracted 03-linkedin to /VData/scro4316/ct_platform/data/extract_tem3
Extracted 40 records
Extracted 79 records to the data file
processing /VData/scro4316/ct_platform/data/extract_tem2/03-linkedin/03-linkedin/deep-state.jsonl
Extracted 03-linkedin to /VData/scro4316/ct_platform/data/extract_tem3
Extracted 20 records
Extracted 99 records to the data file
processing /VData/scro4316/c

In [None]:
li = pd.read_csv('ct_platform/data/extract_tem3/linkedin.csv')
li['post_text'].sample(5)
# no need to further process the data, just save it to the csv file

179     🚀 Passive Income Gen Z: Create Viral Historica...
4825    Dual strategy of chemokine modulation followed...
8647    Given the developments in the U.K., I am not t...
5496    CONGRATULATIONS TO OUR #D7RN'S JULY'S D7RN  “H...
9719    #DARPA defense science office (DSO) is hosting...
Name: post_text, dtype: object

## 2.5 Twitter
- so annoying, `04a-twitter/04-twitter` file is folder with multiple subfolders, and json files are stored in subfolders, so we need to loop through the subfolders and extract the json files for this specfic folder, other 04b, 04c, 04d are normal folders with json files
- the data structure is a bit different across json files, for example, some files include user info in `user` key, while some files include user info in `author` key, so we need to handle this
- this block took about 1 hour to run, we have in total 1,974,714 tweets, saved in to two batches of csv files

In [15]:
def twitter_json(data_path):
    """
    data_path is path of the json file
    read the data line by line, return list of dictionaries, this function is used to read the json file
    we did not read music relevant info
    """
    
    result_list = []  # Changed from 'list' to avoid shadowing built-in
  
    # Helper function to safely get nested values
    def get_nested(data, keys, default=''):
        """Safely get nested values from a dictionary using a list of keys"""
        current = data
        for key in keys:
            if isinstance(current, dict) and key in current:
                current = current[key]
            elif isinstance(current, list) and isinstance(key, int) and 0 <= key < len(current):
                current = current[key]  # Added list index access capability
            else:
                return default
        return current if current is not None else default
    
    # Helper function to create post dictionary
    def create_post_dict(data_obj, keyword, is_quote='', is_reply='', is_repost=''):
        post_id = str(data_obj.get('id', '')) 
        post_time = str(data_obj.get('createdAt', ''))
        post_lan = str(data_obj.get('lang', ''))
        
        if 'text' in data_obj:
            post_text = str(data_obj.get('text', ''))
        else:
            post_text = str(data_obj.get('rawContent', ''))

        post_source = str(data_obj.get('source', ''))
        reply = str(data_obj.get('replyCount', ''))
        repost = str(data_obj.get('retweetCount', ''))
        like = str(data_obj.get('likeCount', ''))
        quote = str(data_obj.get('quoteCount', ''))
        view = str(data_obj.get('viewCount', ''))

        media = data_obj.get('media', [])
        hashtag = data_obj.get('hashtags', [])
        mention = data_obj.get('mentionedUsers', [])

        post_type = str(data_obj.get('type', ''))
        
        # Use data_obj instead of data for quoted_id, reply_id, and retweet_id
        quoted_id = str(data_obj.get('quotedId', ''))
        reply_id = str(data_obj.get('conversationId', '')) 
        retweet_id = str(data_obj.get('retweetId', ''))

        # user info, it might be in user or author key
        if 'user' in data_obj:
            user_id = str(get_nested(data_obj, ['user','id'], ''))
            user_name = str(get_nested(data_obj, ['user', 'username'], ''))
            user_verified = str(get_nested(data_obj, ['user', 'verified'], ''))
            user_blue = str(get_nested(data_obj, ['user', 'blueType'], ''))
            user_time = str(get_nested(data_obj, ['user', 'created'], ''))
            user_follow = str(get_nested(data_obj, ['user', 'friendsCount'], ''))
            user_fans = str(get_nested(data_obj, ['user', 'followersCount'], ''))
            user_status = str(get_nested(data_obj, ['user', 'statusesCount'], ''))
            user_fav = str(get_nested(data_obj, ['user', 'favouritesCount'], ''))
            user_loc = str(get_nested(data_obj, ['user', 'location'], ''))
            user_desc = str(get_nested(data_obj, ['user', 'rawDescription'], ''))
        else:
            user_id = str(get_nested(data_obj, ['author','id'], ''))
            user_name = str(get_nested(data_obj, ['author', 'userName'], ''))
            user_verified = str(get_nested(data_obj, ['author', 'isVerified'], ''))
            user_blue = str(get_nested(data_obj, ['author', 'blueType'], ''))
            user_time = str(get_nested(data_obj, ['author', 'createdAt'], ''))
            user_follow = str(get_nested(data_obj, ['author', 'following'], ''))
            user_fans = str(get_nested(data_obj, ['author', 'followers'], ''))
            user_status = str(get_nested(data_obj, ['author', 'statusesCount'], ''))
            user_fav = str(get_nested(data_obj, ['author', 'favouritesCount'], ''))
            user_loc = str(get_nested(data_obj, ['author', 'location'], ''))
            user_desc = str(get_nested(data_obj, ['author', 'description'], ''))

        return {
            'keyword': keyword,
            'post_id': post_id,
            'post_time': post_time,
            'post_lan': post_lan,
            'post_text': post_text,
            'post_source': post_source,
            'post_type': post_type,
            'reply': reply,
            'repost': repost,
            'like': like,
            'quote': quote,
            'view': view,
            'media': media,
            'hashtag': hashtag,
            'mention': mention,
            'quoted_id': quoted_id,
            'reply_id': reply_id,
            'retweet_id': retweet_id,
            'is_quote': is_quote,
            'is_reply': is_reply,
            'is_repost': is_repost,
            'user_id': user_id,
            'user_name': user_name,
            'user_verified': user_verified,
            'user_blue': user_blue,
            'user_time': user_time, 
            'user_follow': user_follow,
            'user_fans': user_fans,
            'user_status': user_status,
            'user_fav': user_fav,
            'user_loc': user_loc,
            'user_desc': user_desc
        }
    
    with open(data_path, 'r', encoding='utf-8') as f:
        for line in f:
            try:
                data = json.loads(line)
                
                # Common fields regardless of structure
                if 'keyword' in data:              
                    keyword = str(data.get('keyword', ''))
                else:
                    keyword = str(data.get('searchTerm', '')) 
                    # extract the regex, match everything before string 'lang'
                    keyword = re.search(r'^(.*?)\s+lang', keyword).group(1) if re.search(r'^(.*?)\s+lang', keyword) else keyword
                
                # Post links info
                is_quote = str(data.get('isQuote', 'false')).lower() == 'true'
                is_repost = str(data.get('isRetweet', 'false')).lower() == 'true'
                is_reply = str(data.get('isReply', 'false')).lower() == 'true'
                
                # Add main post
                main_post = create_post_dict(
                    data, 
                    keyword, 
                    'true' if is_quote else 'false',
                    'true' if is_reply else 'false',
                    'true' if is_repost else 'false'
                )
                result_list.append(main_post)
                
                # Add quoted post if exists
                if is_quote:
                    quoted_data = data.get('quote', {})
                    if quoted_data:  # Check if quoted post data exists
                        quoted_post = create_post_dict(quoted_data, keyword)
                        # Ensure linking between posts
                        if main_post['post_id'] and quoted_post['post_id']:
                            quoted_post['quoted_by_id'] = main_post['post_id']
                        result_list.append(quoted_post)
                
                # Add reply post if exists
                if is_reply:
                    reply_data = data.get('reply', {})
                    if reply_data:  # Check if reply post data exists
                        reply_post = create_post_dict(reply_data, keyword)
                        # Ensure linking between posts
                        if main_post['post_id'] and reply_post['post_id']:
                            main_post['in_reply_to_id'] = reply_post['post_id']
                            # Make sure both posts share the same conversation ID
                            if main_post['reply_id']:
                                reply_post['reply_id'] = main_post['reply_id']
                        result_list.append(reply_post)
                
                # Add retweet post if exists
                if is_repost:
                    retweet_data = data.get('retweet', {})
                    if retweet_data:  # Check if retweet post data exists
                        retweet_post = create_post_dict(retweet_data, keyword)
                        # Ensure linking between posts
                        if main_post['post_id'] and retweet_post['post_id']:
                            main_post['retweet_of_id'] = retweet_post['post_id']
                        result_list.append(retweet_post)
                
            except json.JSONDecodeError:
                print(f"Error parsing JSON line in {data_path}")
                continue
            except Exception as e:
                print(f"Error processing data: {str(e)} in {data_path}")
                continue
            
    return result_list

In [None]:
src_dir = 'ct_platform/data/extract_tem2'
dst_dir = 'ct_platform/data/extract_tem3' 
platform = 'twitter'

# Set batch size for saving files
BATCH_SIZE = 1000000  # 1 million records per file
batch_number = 1
total_processed = 0

# Initialize an empty DataFrame for the current batch
platform_df = pd.DataFrame()

def process_jsonl_file(json_path):
    """Process a single JSONL file and return the extracted data"""
    # Make sure we're using twitter_json instead of truthsocial_json
    return twitter_json(json_path)

def find_jsonl_files(directory):
    """Recursively find all .jsonl files in a directory"""
    jsonl_files = []
    
    for item in os.listdir(directory):
        item_path = os.path.join(directory, item)
        
        if os.path.isfile(item_path) and item.endswith('.jsonl'):
            jsonl_files.append(item_path)
        elif os.path.isdir(item_path):
            # Recursively search subdirectories
            jsonl_files.extend(find_jsonl_files(item_path))
            
    return jsonl_files

def save_batch(df, batch_num):
    """Save the current batch of data to a CSV file"""
    if len(df) == 0:
        return  # Skip saving if DataFrame is empty
    
    # Create the output file path with batch number
    batch_file = f'{dst_dir}/{platform}_batch_{batch_num:03d}.csv'
    df.to_csv(batch_file, index=False)
    print(f'Saved batch #{batch_num} with {len(df)} records to {batch_file}')
    
    # Return an empty DataFrame for the next batch
    return pd.DataFrame()

# Make sure the destination directory exists
os.makedirs(dst_dir, exist_ok=True)

# Get the directories containing the platform name
matching_dirs = [d for d in os.listdir(src_dir) if platform in d]
print(f"Found {len(matching_dirs)} directories containing '{platform}'")

# Use tqdm for progress tracking if possible
try:
    from tqdm import tqdm
    matching_dirs_iter = tqdm(matching_dirs, desc="Processing directories")
except ImportError:
    matching_dirs_iter = matching_dirs
    print("Note: Install tqdm for progress bars")

# Iterate through directories with the platform name
for filename in matching_dirs_iter:
    json_path_1 = os.path.join(src_dir, filename)
    print(f"Processing directory: {json_path_1}")
    
    # Find all jsonl files recursively in this directory
    jsonl_files = find_jsonl_files(json_path_1)
    print(f"Found {len(jsonl_files)} JSONL files in {filename}")
    
    # Process each file
    for json_path in jsonl_files:
        print(f'Processing file: {os.path.basename(json_path)}')
        try:
            # Process the JSONL file
            data = process_jsonl_file(json_path)
            
            # Convert to DataFrame and append to main DataFrame
            if data:
                df = pd.DataFrame(data)
                current_batch_size = len(platform_df)
                platform_df = pd.concat([platform_df, df], ignore_index=True)
                total_processed += len(df)
                
                print(f'Extracted {len(df)} records from {os.path.basename(json_path)}')
                print(f'Current batch size: {len(platform_df)}, Total processed: {total_processed}')
                
                # Check if we've reached the batch size limit
                while len(platform_df) >= BATCH_SIZE:
                    # Save the first BATCH_SIZE rows
                    batch_to_save = platform_df.iloc[:BATCH_SIZE]
                    remaining_rows = platform_df.iloc[BATCH_SIZE:]
                    
                    # Save the current batch
                    save_batch(batch_to_save, batch_number)
                    batch_number += 1
                    
                    # Keep the remaining rows for the next batch
                    platform_df = remaining_rows.reset_index(drop=True)
            else:
                print(f'No data extracted from {os.path.basename(json_path)}')
                
        except Exception as e:
            print(f'Error processing {os.path.basename(json_path)}: {str(e)}')

# Save any remaining data in the final batch
if len(platform_df) > 0:
    save_batch(platform_df, batch_number)
    
print(f'Processing complete. Saved {total_processed} total records in {batch_number} batch files.')
# 1,974,714 rows in the end

In [None]:
platform_df.shape 

(1971765, 33)

In [None]:
twitter = pd.read_csv('ct_platform/data/extract_tem3/twitter_batch_001.csv')
twitter['post_text'].sample(5)
# no need to further process the data, just save it to the csv file

  twitter = pd.read_csv('/VData/scro4316/ct_platform/data/extract_tem3/twitter_batch_001.csv')


604918    H/T BioClandestine\nThe Dems have been crying ...
276536    @Emyrus17 @NirvanaM1nd It's humanist of us to ...
565935    @MMoritsen @BCOYS88 @EuroExpert_ There’s 2 way...
678876    "we choose to go to the moon,"\n    JFK\n\nbac...
582882    Admito q por causa de fã desse sujeito eu tinh...
Name: post_text, dtype: object

## 2.6 mastodon
- there is no keyword for mastodon. i don't see the keyword information in json files

In [17]:
def mastodon_json(data_path):
    """
    Extracts data from a LinkedIn JSON file and returns a list of structured dictionaries.
    
    Parameters:
        data_path (str): Path to the JSON file containing LinkedIn data
        
    Returns:
        list: List of dictionaries with extracted LinkedIn post data
    """
    result_list = []
    
    # Helper function to safely get nested values
    def get_nested(data, keys, default=''):
        """Safely get nested values from a dictionary using a list of keys"""
        current = data
        for key in keys:
            if isinstance(current, dict) and key in current:
                current = current[key]
            elif isinstance(current, list) and isinstance(key, int) and 0 <= key < len(current):
                current = current[key]
            else:
                return default
        return current
    
    with open(data_path, 'r', encoding='utf-8') as f:
        for line in f:
            try:
                data = json.loads(line)
                
                # Common fields regardless of structure               
                # Post info
                post_id = str(data.get('id', ''))
                post_time = str(data.get('created_at', ''))
                post_visibility = str(data.get('visibility', ''))
                post_sensitive = str(data.get('sensitive', ''))
                post_spoiler = str(data.get('spoiler_text', ''))
                post_lan = str(data.get('language', ''))
                post_text = str(data.get('content', ''))
                repost = str(data.get('reblogs_count', ''))
                reply = str(data.get('replies_count', ''))
                like = str(data.get('favourites_count', ''))

                # post links
                replyto_id = str(data.get('in_reply_to_id', ''))
                replyto_user = str(data.get('in_reply_to_user_id', ''))
                conversation_id = str(data.get('conversation_id', ''))

                # Engagement info - keeping complete objects for potential future use
                post_mention = data.get('mentions', [])
                post_tags = [str(item.get('name', '')) for item in data.get('tags', [])]
                post_reactions = data.get('reactions', [])

                # Author info
                user_id = str(get_nested(data, ['account', 'id'], ''))
                user_name = str(get_nested(data, ['account', 'username'], ''))
                user_locked = str(get_nested(data, ['account', 'locked'], ''))
                user_bot = str(get_nested(data, ['account', 'bot'], ''))
                user_discoverable = str(get_nested(data, ['account', 'discoverable'], ''))
                user_indexable = str(get_nested(data, ['account', 'indexable'], ''))
                user_group = str(get_nested(data, ['account', 'group'], ''))
                user_time = str(get_nested(data, ['account', 'created_at'], ''))
                user_note = str(get_nested(data, ['account', 'note'], ''))
                user_follow = str(get_nested(data, ['account', 'following_count'], ''))
                user_fans = str(get_nested(data, ['account', 'followers_count'], ''))
                user_status = str(get_nested(data, ['account', 'statuses_count'], ''))
                user_last_status = str(get_nested(data, ['account', 'last_status_at'], ''))
                user_hide_collection = str(get_nested(data, ['account', 'hide_collections'], ''))
                
                
                # Create result dictionary
                dict_tem = {
                    'keyword': pd.NA,
                    'post_id': post_id,
                    'post_time': post_time,
                    'post_lan': post_lan,
                    'post_text': post_text,
                    'post_visibility': post_visibility,
                    'post_sensitive': post_sensitive,
                    'post_spoiler': post_spoiler,
                    'reply': reply,
                    'repost': repost,
                    'like': like,
                    'replyto_id': replyto_id,
                    'replyto_user': replyto_user,
                    'conversation_id': conversation_id,
                    'post_mention': post_mention,
                    'post_tags': post_tags,
                    'post_reactions': post_reactions,
                    'user_id': user_id,
                    'user_name': user_name,
                    'user_locked': user_locked,
                    'user_bot': user_bot,
                    'user_discoverable': user_discoverable,
                    'user_indexable': user_indexable,
                    'user_group': user_group,
                    'user_time': user_time,
                    'user_note': user_note,
                    'user_follow': user_follow,
                    'user_fans': user_fans,
                    'user_status': user_status,
                    'user_last_status': user_last_status,
                    'user_hide_collection': user_hide_collection
            }
                result_list.append(dict_tem)
                
            except json.JSONDecodeError:
                print(f"Error parsing JSON line in {data_path}")
                continue
            except Exception as e:
                print(f"Error processing data: {str(e)} in {data_path}")
                continue
                
    return result_list

In [None]:
src_dir = 'ct_platform/data/extract_tem2'
dst_dir = 'ct_platform/data/extract_tem3' 
### change 1 
platform = 'mastodon'

platform_df = pd.DataFrame()
for filename in os.listdir(src_dir):
    if filename.endswith(platform):
        json_path_1 = os.path.join(src_dir, filename)
        print(json_path_1)
        
        for filename_2 in os.listdir(json_path_1):
            if filename_2.endswith(platform):
                json_path_2 = os.path.join(json_path_1, filename_2)

                for filename_3 in os.listdir(json_path_2):
                    if filename_3.endswith('.jsonl'):
                        json_path = os.path.join(json_path_2, filename_3)
                        print(f'processing {json_path}')
                        # create a new dataframe to save the csv file
                        ####  change 2 ##### 
                        data = mastodon_json(json_path)
                        # save the data into a csv file
                        df = pd.DataFrame(data)
                        platform_df = pd.concat([platform_df, df], ignore_index=True)
                        print(f'Extracted {filename} to {dst_dir}')
                        print(f'Extracted {len(df)} records')
                        print(f'Extracted {len(platform_df)} records to the data file')

platform_df.to_csv(f'ct_platform/data/extract_tem3/{platform}.csv', index=False)
print(f'Extracted {len(platform_df)} records to the data file')

/VData/scro4316/ct_platform/data/extract_tem2/05c-mastodon
processing /VData/scro4316/ct_platform/data/extract_tem2/05c-mastodon/05c-mastodon/kinky.business.jsonl
Extracted 05c-mastodon to /VData/scro4316/ct_platform/data/extract_tem3
Extracted 0 records
Extracted 0 records to the data file
processing /VData/scro4316/ct_platform/data/extract_tem2/05c-mastodon/05c-mastodon/mastodon.art.jsonl
Extracted 05c-mastodon to /VData/scro4316/ct_platform/data/extract_tem3
Extracted 23 records
Extracted 23 records to the data file
processing /VData/scro4316/ct_platform/data/extract_tem2/05c-mastodon/05c-mastodon/mastodon.librelabucm.org.jsonl
Extracted 05c-mastodon to /VData/scro4316/ct_platform/data/extract_tem3
Extracted 0 records
Extracted 23 records to the data file
processing /VData/scro4316/ct_platform/data/extract_tem2/05c-mastodon/05c-mastodon/mastodon.com.br.jsonl
Extracted 05c-mastodon to /VData/scro4316/ct_platform/data/extract_tem3
Extracted 1 records
Extracted 24 records to the data f

In [None]:
md = pd.read_csv('ct_platform/data/extract_tem3/mastodon.csv')
md['post_text'].sample(5)   

10084    <p>Flight: <a href="https://mastodon.social/ta...
6736     <p>Bangladesh 🇧🇩 COVID-19 current stats for We...
3836     <p><a href="https://mastodon.social/tags/paran...
17306    <p>Google News</p><p>Trump rally shooter searc...
926      <p>Scientists Propose New Way to Find Aliens: ...
Name: post_text, dtype: object

In [None]:
md['post_text_raw'] = md['post_text'].astype(str)
md['post_text'] = md['post_text_raw'].apply(extract_html_content)
md['post_text'].sample(5)


In [None]:
md.to_csv('ct_platform/data/extract_tem3/mastodon.csv', index=False)

## 2.7 gab
- not sure what are the `card` and `poll` keys in the json files, just kept them for now

In [19]:
def gab_json(data_path):
    """
    Extracts data from a Gab JSON file and returns a list of structured dictionaries.
    
    Parameters:
        data_path (str): Path to the JSON file containing Gab data
        
    Returns:
        list: List of dictionaries with extracted Gab post data
    """
    result_list = []
    
    # Helper function to safely get nested values
    def get_nested(data, keys, default=''):
        """Safely get nested values from a dictionary using a list of keys"""
        current = data
        for key in keys:
            if isinstance(current, dict) and key in current:
                current = current[key]
            elif isinstance(current, list) and isinstance(key, int) and 0 <= key < len(current):
                current = current[key]
            else:
                return default
        return current if current is not None else default
    
    def create_post_dict(data_obj, keyword, is_quote='', is_reply='', is_repost=''):
        # Common fields regardless of structure               
        # Post info
        post_id = str(data_obj.get('id', ''))
        post_time = str(data_obj.get('created_at', ''))
        post_revise = str(data_obj.get('revised_at', ''))
        post_visibility = str(data_obj.get('visibility', ''))
        post_sensitive = str(data_obj.get('sensitive', ''))
        post_spoiler = str(data_obj.get('spoiler_text', ''))
        post_lan = str(data_obj.get('language', ''))
        post_text = str(data_obj.get('content', ''))
        post_pinnabel = str(data_obj.get('pinnable', ''))
        repost = str(data_obj.get('reblogs_count', ''))
        reply = str(data_obj.get('replies_count', ''))
        reply_direct = str(data_obj.get('direct_replies_count', ''))
        like = str(data_obj.get('favourites_count', ''))
        quote = str(data_obj.get('quotes_count', ''))
        view = str(get_nested(data_obj, ['analytics', 'count'], ''))
        
        # post links
        conversation_id = str(data_obj.get('conversation_id', ''))
        replyto_id = str(data_obj.get('in_reply_to_id', ''))
        replyto_user = str(data_obj.get('in_reply_to_user_id', ''))
        quoted_id = str(data_obj.get('quote_of_id', ''))
        reblog_id = str(data_obj.get('reblog_of_id', ''))
        group_id = str(data_obj.get('group_id', ''))
        status_context_id = str(data_obj.get('status_context_id', ''))

        # Engagement info - keeping complete objects for potential future use
        post_mention = data_obj.get('mentions', [])
        post_tags = [str(item.get('name', '')) for item in data_obj.get('tags', [])]
        post_reactions = data_obj.get('reactions_counts', [])# with reaction type number and counts

        # Author info
        user_id = str(get_nested(data_obj, ['account', 'id'], ''))
        user_name = str(get_nested(data_obj, ['account', 'username'], ''))
        user_locked = str(get_nested(data_obj, ['account', 'locked'], ''))
        user_note = str(get_nested(data_obj, ['account', 'note'], ''))
        user_time = str(get_nested(data_obj, ['account', 'created_at'], ''))
        user_spam = str(get_nested(data_obj, ['account', 'is_spam'], ''))
        user_follow = str(get_nested(data_obj, ['account', 'following_count'], ''))
        user_fans = str(get_nested(data_obj, ['account', 'followers_count'], ''))
        user_status = str(get_nested(data_obj, ['account', 'statuses_count'], ''))
        user_pro = str(get_nested(data_obj, ['account', 'is_pro'], ''))
        user_verified = str(get_nested(data_obj, ['account', 'is_verified'], ''))
        user_donor = str(get_nested(data_obj, ['account', 'is_donor'], ''))
        user_investor = str(get_nested(data_obj, ['account', 'is_investor'], ''))
        user_parody = str(get_nested(data_obj, ['account', 'is_parody'], ''))
        
        # Create result dictionary
        return {
            'keyword': keyword,
            'post_id': post_id,
            'post_time': post_time,
            'post_revise': post_revise,
            'post_lan': post_lan,
            'post_text': post_text,
            'post_visibility': post_visibility,
            'post_sensitive': post_sensitive,
            'post_spoiler': post_spoiler,
            'post_pinnable': post_pinnabel,
            'reply': reply,
            'reply_direct': reply_direct,
            'repost': repost,
            'like': like,
            'quote': quote,
            'view': view,
            'is_quote': is_quote,
            'is_reply': is_reply,
            'is_repost': is_repost,
            'replyto_id': replyto_id,
            'replyto_user': replyto_user,
            'conversation_id': conversation_id,
            'quoted_id': quoted_id,
            'reblog_id': reblog_id,
            'group_id': group_id,
            'status_context_id': status_context_id,
            'post_mention': post_mention,
            'post_tags': post_tags,
            'post_reactions': post_reactions,
            'user_id': user_id,
            'user_name': user_name,
            'user_locked': user_locked,
            'user_note': user_note,
            'user_time': user_time,
            'user_spam': user_spam,
            'user_follow': user_follow,
            'user_fans': user_fans,
            'user_status': user_status,
            'user_pro': user_pro,
            'user_verified': user_verified,
            'user_donor': user_donor,
            'user_investor': user_investor,
            'user_parody': user_parody
        }
    
    with open(data_path, 'r', encoding='utf-8') as f:
        for line in f:
            try:
                data = json.loads(line)
    
                # Common fields regardless of structure
                keyword = str(data.get('keyword', ''))

                # Post links info
                is_quote = str(data.get('has_quote', 'false')).lower() == 'true'
                is_repost = str(data.get('reblog_of_id', 'false')).lower() != 'false' and str(data.get('reblog_of_id', '')) != ''
                is_reply = str(data.get('in_reply_to_id', 'false')).lower() != 'false' and str(data.get('in_reply_to_id', '')) != ''
                
                # Add main post
                main_post = create_post_dict(
                    data, 
                    keyword, 
                    'true' if is_quote else 'false',
                    'true' if is_reply else 'false',
                    'true' if is_repost else 'false'
                )
                result_list.append(main_post)
                
                # Add quoted post if exists
                if is_quote:
                    quoted_data = data.get('quote', {})
                    if quoted_data:  # Check if quoted post data exists
                        quoted_post = create_post_dict(quoted_data, keyword)
                        # Ensure linking between posts
                        if main_post['post_id'] and quoted_post['post_id']:
                            quoted_post['quoted_by_id'] = main_post['post_id']
                        result_list.append(quoted_post)
                
                # Add retweet post if exists
                if is_repost:
                    retweet_data = data.get('reblog', {})
                    if retweet_data:  # Check if retweet post data exists
                        retweet_post = create_post_dict(retweet_data, keyword)
                        # Ensure linking between posts
                        if main_post['post_id'] and retweet_post['post_id']:
                            main_post['reblog_of_id'] = retweet_post['post_id']
                        result_list.append(retweet_post)
                
            except json.JSONDecodeError:
                print(f"Error parsing JSON line in {data_path}")
                continue
            except Exception as e:
                print(f"Error processing data: {str(e)} in {data_path}")
                continue
            
    return result_list

In [None]:
src_dir = 'ct_platform/data/extract_tem2'
dst_dir = 'ct_platform/data/extract_tem3' 
### change 1 
platform = 'gab'

platform_df = pd.DataFrame()
for filename in os.listdir(src_dir):
    if filename.endswith(platform):
        json_path_1 = os.path.join(src_dir, filename)
        print(json_path_1)
        
        for filename_2 in os.listdir(json_path_1):
            if filename_2.endswith(platform):
                json_path_2 = os.path.join(json_path_1, filename_2)

                for filename_3 in os.listdir(json_path_2):
                    if filename_3.endswith('.jsonl'):
                        json_path = os.path.join(json_path_2, filename_3)
                        print(f'processing {json_path}')
                        # create a new dataframe to save the csv file
                        ####  change 2 ##### 
                        data = gab_json(json_path)
                        # save the data into a csv file
                        df = pd.DataFrame(data)
                        platform_df = pd.concat([platform_df, df], ignore_index=True)
                        print(f'Extracted {filename} to {dst_dir}')
                        print(f'Extracted {len(df)} records')
                        print(f'Extracted {len(platform_df)} records to the data file')

platform_df.to_csv(f'ct_platform/data/extract_tem3/{platform}.csv', index=False)
print(f'Extracted {len(platform_df)} records to the data file')

/VData/scro4316/ct_platform/data/extract_tem2/06c-gab
processing /VData/scro4316/ct_platform/data/extract_tem2/06c-gab/06c-gab/jeffrey epstein.jsonl
Extracted 06c-gab to /VData/scro4316/ct_platform/data/extract_tem3
Extracted 545 records
Extracted 545 records to the data file
processing /VData/scro4316/ct_platform/data/extract_tem2/06c-gab/06c-gab/Roswell.jsonl
Extracted 06c-gab to /VData/scro4316/ct_platform/data/extract_tem3
Extracted 178 records
Extracted 723 records to the data file
processing /VData/scro4316/ct_platform/data/extract_tem2/06c-gab/06c-gab/princess diana.jsonl
Extracted 06c-gab to /VData/scro4316/ct_platform/data/extract_tem3
Extracted 204 records
Extracted 927 records to the data file
processing /VData/scro4316/ct_platform/data/extract_tem2/06c-gab/06c-gab/Bill Gates  Vaccines.jsonl
Extracted 06c-gab to /VData/scro4316/ct_platform/data/extract_tem3
Extracted 530 records
Extracted 1457 records to the data file
processing /VData/scro4316/ct_platform/data/extract_tem2/

In [None]:
gab = pd.read_csv('ct_platform/data/extract_tem3/gab.csv')
gab['post_text'].sample(5)   
# need to further process the data, just save it to the csv file

6936     "You will own nothing"<br /><br />UN seizing l...
24816    ABSOLUTE TRUTH<br />The Cost of Illegal Immigr...
18172    <a href="https://gab.com/MikeF2021" class="men...
20108                                       Trump to JFK -
19671    Caitlin Clark Beats Three Olympians in Their H...
Name: post_text, dtype: object

In [51]:
## need to define a new function, gab data is too messy, different from previous platforms
def extract_gab_text(post_string):
    """
    Extract only the relevant text content from Gab posts, removing all URLs, mentions, and links.
    
    Args:
        post_string (str): Raw post string from Gab API
    
    Returns:
        str: Clean, relevant text content only
    """
    try:
        # Create BeautifulSoup object
        soup = BeautifulSoup(post_string, 'html.parser')
        
        # Remove all links and mentions completely
        for a in soup.find_all('a'):
            a.decompose()
        
        # Get the remaining text
        text = soup.get_text(separator=' ', strip=True)
        
        # Replace HTML entities
        text = text.replace('&apos;', "'").replace('&quot;', '"').replace('&amp;', '&')
        
        # Remove any URLs that might be in plain text
        text = re.sub(r'https?://\S+', '', text)
        text = re.sub(r'www\.\S+', '', text)
        
        # Remove mentions (@username)
        text = re.sub(r'@\w+', '', text)
        
        # Remove hashtags
        text = re.sub(r'#\w+', '', text)
        
        # Clean up extra whitespace and punctuation artifacts
        text = re.sub(r'\s+', ' ', text)  # Normalize spaces
        text = re.sub(r'\s([,.!?;:])', r'\1', text)  # Fix spacing before punctuation
        text = re.sub(r'(\s)\1+', r'\1', text)  # Remove duplicate spaces
        
        # Clean up any bracket artifacts left from removing links
        text = re.sub(r'\[\s*\]', '', text)
        text = re.sub(r'\(\s*\)', '', text)
        text = re.sub(r'\{\s*\}', '', text)
        text = re.sub(r'\{\{\{\s*\}\}\}', '', text)
        
        return text.strip()
    
    except Exception as e:
        print(f"Error processing post: {e}")
        return ""

# Example usage with pandas DataFrame:
# df['clean_text'] = df['gab_post_column'].apply(extract_gab_text)

# Example Gab posts for testing
gab_posts = [
    '''We know Bill Barr's father hired Epstein out of college.  Two peas in a pod?  Is Epstein really dead? <a href="https://www.zerohedge.com/political/did-jeffrey-epstein-william-barr-attend-interlochen-1967" rel="nofollow noopener" target="_blank"><span class="invisible">https://www.</span>zerohedge.com/political/did-jeffrey-epstein-w<span class="invisible">illiam-barr-attend-interlochen-1967</span><span class="ellipsis"></span></a>''',
    
    '''<a href="https://www.zerohedge.com/political/did-jeffrey-epstein-william-barr-attend-interlochen-1967" rel="nofollow noopener" target="_blank"><span class="invisible">https://www.</span>zerohedge.com/political/did-jeffrey-epstein-w<span class="invisible">illiam-barr-attend-interlochen-1967</span><span class="ellipsis"></span></a><br /><br />It does look like him<br /><br />Interlochen is a prestigious fine arts preparatory school in northern Michigan, and Jeffrey Epstein attended Interlochen "camp" in 1967 as a teenager. But the school disavows that two-time Attorney General William Barr also attended the camp in 1967, despite pictorial evidence that appears to tell a different story. <br /><br />At the top of the page, the picture shows teenage Epstein standing in front of his respective lodge at Interlochen in 1967 and a boy who bears an uncanny resemblance to a miniature William Barr kneeling before his respective lodge. In fact, he could be a doppelganger for William Barr if he isn't William Barr.<br /><br />Interlochen cannot possibly deny that Epstein attended the camp, because he became a major booster for the school, donating $500,000 that financed construction of the Jeffrey Epstein Scholarship Lodge on the school's campus. He also held soirees for Interlochen alumni at his New York townhouse, and, of course, he preyed on Interlochen minors. "Jane," an Interlochen camp alumni, was one of four women who testified at Ghislaine Maxwell's trial. She testified that she met Maxwell and Epstein at Interlochen's summer camp in the summer of 1994, when she was 13, and they groomed her for sexual abuse that lasted more than five years. <br /><br />On July 6, 2019, Epstein was arrested at the Teterboro Airport in New Jersey. Two days later Barr seemingly recused himself from all things Epstein. "I'm recused from that matter because one of the law firms that represented Epstein long ago was a firm I subsequently joined for a period of time," Barr told reporters. Barr was referring to his tenure at Kirkland and Ellis whose Jay Leftkowitz colluded with Assistant US Attorney Ann Marie Villlafana to work out Epstein's corrupt, "sweetheart" deal in 2007, which landed Epstein in a county jail for 13 months, even though the Justice Department was aware of more than 30 underage victims of Epstein. <br /><br />Barr had additional reasons to recuse himself from all things Epstein in addition to the conflict of interest engendered by his''',
    
    '''<a href="https://gab.com/USA_1" class="mention" rel="nofollow noopener" target="_blank">@USA_1</a> <a href="https://gab.com/Frankperrewar" class="mention" rel="nofollow noopener" target="_blank">@Frankperrewar</a> what are you people on?!<br /><br />'Don't get me wrong, I am not exposing the Trickster Trumpster's deviancies in defense of Demophile pedophiles.'<br /><br /><a href="https://m.youtube.com/watch?v=tTp5spO4acE" rel="nofollow noopener" target="_blank"><span class="invisible">https://</span>m.youtube.com/watch?v=tTp5spO4acE<span class="invisible"></span></a>'''
]

# Test the function
for i, post in enumerate(gab_posts):
    print(f"=== Example {i+1} ===")
    print(extract_gab_text(post))
    print("\n---\n")

=== Example 1 ===
We know Bill Barr's father hired Epstein out of college. Two peas in a pod? Is Epstein really dead?

---

=== Example 2 ===
It does look like him Interlochen is a prestigious fine arts preparatory school in northern Michigan, and Jeffrey Epstein attended Interlochen "camp" in 1967 as a teenager. But the school disavows that two-time Attorney General William Barr also attended the camp in 1967, despite pictorial evidence that appears to tell a different story. At the top of the page, the picture shows teenage Epstein standing in front of his respective lodge at Interlochen in 1967 and a boy who bears an uncanny resemblance to a miniature William Barr kneeling before his respective lodge. In fact, he could be a doppelganger for William Barr if he isn't William Barr. Interlochen cannot possibly deny that Epstein attended the camp, because he became a major booster for the school, donating $500,000 that financed construction of the Jeffrey Epstein Scholarship Lodge on the

In [52]:
# gab['post_text_raw'] = gab['post_text'].astype(str)
gab['post_text'] = gab['post_text_raw'].apply(extract_gab_text)
gab['post_text'].sample(5)

9589     ==== ==== PLEASE SHARE: CORRECTING YOUR LIMITE...
19918                              Vegan Ragoona Princess!
10062    death to google, creating new ways to be evil....
10099    ⚕ THE LATEST NEWS About Med Beds! NESARA GESAR...
3823     Only the one who is in his absolute and avoids...
Name: post_text, dtype: object

In [None]:
gab.to_csv('ct_platform/data/extract_tem3/gab.csv', index=False)

## 2.8 gettr
- Gettr's nested structure is a bit annoying, where post content and user information are stored in auxiliary sections rather than directly with each post item.
- it collected posts based on users' news feed, `receiver_id` refers to the user who received the news feed

In [21]:
def gettr_json(data_path):
    """
    Parse Gettr JSON file line by line and extract structured information.
    
    Args:
        data_path: Path to the JSON file containing Gettr data
        
    Returns:
        List of dictionaries containing structured post information
    """
    import json
    
    result_list = []
    
    # Helper function to safely get nested values
    def get_nested(data, keys, default=''):
        """Safely get nested values from a dictionary using a list of keys"""
        current = data
        for key in keys:
            # check if "current" is a dict and if the current key exists in that dict
            # if both true, update the current to the value of that key
            if isinstance(current, dict) and key in current:
                current = current[key]
            # Handles list indexing: checks if current is a list AND the key is an integer AND the index is valid
            # If all these conditions are true, updates current to the element at that index
            elif isinstance(current, list) and isinstance(key, int) and 0 <= key < len(current):
                current = current[key]
            # If we can't navigate further (key doesn't exist or current is not a dictionary/list), return the default value
            else:
                return default
        return current
    
    # Helper function to extract post information
    def create_post_dict(post_data, user_data, activity_data, keyword):
        # Post info
        post_id = str(post_data.get('_id', ''))
        post_text = str(post_data.get('txt', ''))
        post_time = str(post_data.get('cdate', ''))
        post_udate = str(post_data.get('udate', ''))
        post_lan = str(post_data.get('txt_lang', ''))
        post_visibility = str(get_nested(post_data, ['vis'], ''))
        
        # Media info
        has_media = str(bool(post_data.get('main', '')))
        media_url = str(post_data.get('main', ''))
        video_url = str(post_data.get('vid', ''))
        video_duration = str(post_data.get('vid_dur', ''))
        
        # Engagement metrics
        like = str(post_data.get('lkbpst', '0'))
        reply = str(post_data.get('cm', '0'))
        repost = str(post_data.get('shbpst', '0'))
        views = str(post_data.get('vfpst', '0'))
        
        # Tags
        user_tags = post_data.get('utgs', [])
        hashtags = post_data.get('vtgs', [])
        
        # User info
        user_id = str(user_data.get('username', '')) # username is user id in gettr
        user_nickname = str(user_data.get('nickname', ''))
        user_name = str(user_data.get('ousername', ''))
        user_note = str(user_data.get('dsc', ''))
        user_location = str(user_data.get('location', ''))
        user_fans = str(user_data.get('flg', '0'))
        user_follow = str(user_data.get('flw', '0'))
        user_time = str(user_data.get('cdate', ''))
        # user_bgimg = str(user_data.get('bgimg', '')) # profile background image
        user_status = str(user_data.get('status', ''))
        
        # Post links and relationship info
        # receiver_id is the user who sees the feed
        receiver_id = str(activity_data.get('receiver_id', '')) 
        action_type = str(activity_data.get('action', ''))
        
        # Get references to related posts/users from activity
        # Each post has activity object with:
        initiator_id = str(get_nested(activity_data, ['activity','init_id'], ''))
        related_post_ids = [str(id) for id in activity_data.get('activity', {}).get('rpstIds', [])]
        related_user_ids = [str(id) for id in activity_data.get('activity', {}).get('rusrIds', [])]
        source_type = str(get_nested(activity_data, ['activity', 'src_type'], ''))
        source_id = str(get_nested(activity_data, ['activity', 'src_id'], ''))
        target_type = str(get_nested(activity_data, ['activity', 'tgt_type'], ''))
        target_id = str(get_nested(activity_data, ['activity', 'tgt_id'], ''))
        
        return {
            # Search metadata
            'keyword': keyword,
            
            # Post info
            'post_id': post_id,
            'post_text': post_text,
            'post_time': post_time,
            'post_udate': post_udate,
            'post_lan': post_lan,
            'post_visibility': post_visibility,
            
            # Media info
            'has_media': has_media,
            'media_url': media_url,
            'video_url': video_url,
            'video_duration': video_duration,
            
            # Engagement metrics
            'like': like,
            'reply': reply,
            'repost': repost,
            'views': views,
            
            # Tags
            'user_tags': user_tags,
            'hashtags': hashtags,
            
            # User info
            'user_id': user_id,
            'user_nickname': user_nickname,
            'user_name': user_name,
            'user_note': user_note,
            'user_location': user_location,
            'user_fans': user_fans,
            'user_follow': user_follow,
            'user_time': user_time,
            'user_status': user_status,
            
            # Post links
            'receiver_id': receiver_id,
            'action_type': action_type,
            'initiator_id': initiator_id,
            'related_post_ids': related_post_ids,
            'related_user_ids': related_user_ids,
            'source_type': source_type,
            'source_id': source_id,
            'target_type': target_type,
            'target_id': target_id
        }
    
    with open(data_path, 'r', encoding='utf-8') as f:
        for line in f:
            try:
                data = json.loads(line)
                
                # Extract the search keyword
                keyword = str(data.get('keword', ''))
                
                # Extract posts information from the response
                post_feed = get_nested(data, ['result', 'data'], {})
                post_items = get_nested(post_feed, ['list'], [])
                
                # Get auxiliary data containing post content and user info
                aux_data = get_nested(data, ['result', 'aux'], {})
                posts_data = get_nested(aux_data, ['post'], {})
                users_data = get_nested(aux_data, ['uinf'], {})
                
                # Process each post item
                for post_item in post_items:
                    # Get the post ID from activity
                    post_id = get_nested(post_item, ['activity', 'tgt_id'], '')
                    
                    # Skip if no post ID
                    if not post_id:
                        continue
                    
                    # Get post data from auxiliary information
                    post_data = posts_data.get(post_id, {})
                    
                    # Skip if no post data found
                    if not post_data:
                        continue
                    
                    # Get user ID and data
                    user_id = post_data.get('uid', '')
                    user_data = users_data.get(user_id, {})
                    
                    # Skip if no user data found
                    if not user_data:
                        continue
                    
                    # Create post dictionary
                    dict_tem = create_post_dict(post_data, user_data, post_item, keyword)
                    result_list.append(dict_tem)
                    
                    # Check for any related posts (reposts)
                    related_post_ids = post_data.get('rpstIds', [])
                    for related_id in related_post_ids:
                        if related_id in posts_data:
                            related_post = posts_data.get(related_id, {})
                            related_user_id = related_post.get('uid', '')
                            related_user = users_data.get(related_user_id, {})
                            
                            if related_post and related_user:
                                # Create related post dictionary with relationship info
                                dict_tem = create_post_dict(related_post, related_user, post_item, keyword)
                                dict_tem['is_related'] = 'True'
                                dict_tem['related_to'] = post_id
                                result_list.append(dict_tem)
                
            except json.JSONDecodeError:
                print(f"Error parsing JSON line in {data_path}")
                continue
            except Exception as e:
                print(f"Error processing data: {str(e)} in {data_path}")
                continue
            
    return result_list

In [None]:
src_dir = 'ct_platform/data/extract_tem2'
dst_dir = 'ct_platform/data/extract_tem3' 
### change 1 
platform = 'gettr'

platform_df = pd.DataFrame()
for filename in os.listdir(src_dir):
    if filename.endswith(platform):
        json_path_1 = os.path.join(src_dir, filename)
        print(json_path_1)
        
        for filename_2 in os.listdir(json_path_1):
            if filename_2.endswith(platform):
                json_path_2 = os.path.join(json_path_1, filename_2)

                for filename_3 in os.listdir(json_path_2):
                    if filename_3.endswith('.jsonl'):
                        json_path = os.path.join(json_path_2, filename_3)
                        print(f'processing {json_path}')
                        # create a new dataframe to save the csv file
                        ####  change 2 ##### 
                        data = gettr_json(json_path)
                        # save the data into a csv file
                        df = pd.DataFrame(data)
                        platform_df = pd.concat([platform_df, df], ignore_index=True)
                        print(f'Extracted {filename} to {dst_dir}')
                        print(f'Extracted {len(df)} records')
                        print(f'Extracted {len(platform_df)} records to the data file')

platform_df.to_csv(f'ct_platform/data/extract_tem3/{platform}.csv', index=False)
print(f'Extracted {len(platform_df)} records to the data file')

/VData/scro4316/ct_platform/data/extract_tem2/07-gettr
processing /VData/scro4316/ct_platform/data/extract_tem2/07-gettr/07-gettr/jeffrey epstein.jsonl
Extracted 07-gettr to /VData/scro4316/ct_platform/data/extract_tem3
Extracted 10010 records
Extracted 10010 records to the data file
processing /VData/scro4316/ct_platform/data/extract_tem2/07-gettr/07-gettr/princess diana.jsonl
Extracted 07-gettr to /VData/scro4316/ct_platform/data/extract_tem3
Extracted 20 records
Extracted 10030 records to the data file
processing /VData/scro4316/ct_platform/data/extract_tem2/07-gettr/07-gettr/jfk.jsonl
Extracted 07-gettr to /VData/scro4316/ct_platform/data/extract_tem3
Extracted 10008 records
Extracted 20038 records to the data file
processing /VData/scro4316/ct_platform/data/extract_tem2/07-gettr/07-gettr/deep-state.jsonl
Extracted 07-gettr to /VData/scro4316/ct_platform/data/extract_tem3
Extracted 10005 records
Extracted 30043 records to the data file
processing /VData/scro4316/ct_platform/data/ex

In [None]:
gt = pd.read_csv('ct_platform/data/extract_tem3/gettr.csv')
gt['post_text'].sample(5)   
# NO need to further process the data, just save it to the csv file, gettr is cool 

  gt = pd.read_csv('/VData/scro4316/ct_platform/data/extract_tem3/gettr.csv')


208243    Bilderberg Expert Daniel Estulin Exposes What'...
307672    U.S. Air Force veteran Kristen Meghan blows th...
278589    FLAT EARTH DAVE! https://rumble.com/register/R...
424855                      Illuminati du monde entier \n\n
103714    By Rafapal 🇺🇸🧨\nUm juiz ordena que 150 nomes r...
Name: post_text, dtype: object

# 3. Appendix 
example JSON file structure for different platforms, i did not include examples for all platforms. only included significant ones especially Gettr and Tiktok whose structure are very odd. feel free to check the json files in the unzipped folder for more details.

## 3.1 Tiktok 

The below is tiktok `08-tiktok.jsonl`
```json
{
  "id": "7334860866690501930",
  "text": "Dr Steven Greer reveals the secrets of an alien dissection #fyp #aliens #disclosure #stevengreer #ufo #area51 #governmentsecrets #aliensighting ",
  "createTime": 1707780398,
  "createTimeISO": "2024-02-12T23:26:38.000Z",
  "isAd": false,
  "isMuted": false,
  "authorMeta": {
    "id": "6943229016070194181",
    "name": "ofanotherworld",
    "nickName": "OfAnotherWorld",
    "verified": false,
    "signature": "An extraterrestrial reality is becoming more evident. Let me prove it to you...",
    "bioLink": null,
    "avatar": "https://p16-sign-va.tiktokcdn.com/tos-maliva-avt-0068/c5b745b7b3e99eb6bed6ca7a8f3874a6~c5_720x720.jpeg?lk3s=a5d48078&nonce=65228&refresh_token=71a2809e04d05a5c24d9304072499a54&x-expires=1722157200&x-signature=5aua%2BWLotQLGmPuMShij7i7in%2F0%3D&shp=a5d48078&shcp=b59d6b55",
    "privateAccount": false,
    "following": 5,
    "fans": 175000,
    "heart": 2600000,
    "video": 230,
    "digg": 254
  },
  "musicMeta": {
    "musicName": "TRANSGENDER (PEARL WHITE VIP)",
    "musicAuthor": "PEARL WHITE",
    "musicOriginal": false,
    "musicAlbum": "TRANSGENDER (PEARL WHITE VIP)",
    "playUrl": "https://p16-va-default.akamaized.net/obj/tos-useast2a-ve-2774/7fa5e099ab4645f28a7cfe73f62aea84",
    "coverMediumUrl": "https://p16-va-default.akamaized.net/img/tos-useast2a-v-2774/6b3f27cd0eb24b2184c05100ebea73aa~c5_200x200.jpeg",
    "musicId": "6756319481322014722"
  },
  "webVideoUrl": "https://www.tiktok.com/@ofanotherworld/video/7334860866690501930",
  "mediaUrls": [
    "https://v16-webapp-prime.tiktok.com/video/tos/maliva/tos-maliva-ve-0068c799-us/oIMsQTeEAGIvSAfJ0c9CeLnKSHkdhIQ4ruJNlg/?a=1988&bti=NDU3ZjAwOg%3D%3D&ch=0&cr=3&dr=0&lr=all&cd=0%7C0%7C0%7C&br=1360&bt=680&cs=0&ds=6&ft=4fUEKMzD8Zmo0Qa7--4jVvcXDpWrKsd.&mime_type=video_mp4&qs=0&rc=aDo2ODc8ZDc2NDNnZmk5NUBpM2lnNGw5cm5xcjMzZzgzNEA1MmNiLzAyXjIxYzRjXy8wYSNhNi1rMmQ0MTZgLS1kLy9zcw%3D%3D&btag=e00090000&expire=1722158309&l=20240726091728732D29F760B4AF0A4BA0&ply_type=2&policy=2&signature=1c46f9764f07a9765e37e0c1ff53323f&tk=tt_chain_token"
  ],
  "videoMeta": {
    "height": 1024,
    "width": 576,
    "duration": 60,
    "coverUrl": "https://p19-sign.tiktokcdn-us.com/obj/tos-useast5-p-0068-tx/4708eb871715422cbdb694d22a8a051a_1712550486?lk3s=b59d6b55&nonce=20175&refresh_token=b58efbce463fef28123f5784a13a3938&x-expires=1722157200&x-signature=WWXYmqygn7t3w4HPBlUFE2PVIjU%3D&shp=b59d6b55&shcp=-",
    "originalCoverUrl": "https://p19-sign.tiktokcdn-us.com/obj/tos-useast5-p-0068-tx/4708eb871715422cbdb694d22a8a051a_1712550486?lk3s=b59d6b55&nonce=20175&refresh_token=b58efbce463fef28123f5784a13a3938&x-expires=1722157200&x-signature=WWXYmqygn7t3w4HPBlUFE2PVIjU%3D&shp=b59d6b55&shcp=-",
    "format": "mp4",
    "originalDownloadAddr": "https://v16-webapp-prime.tiktok.com/video/tos/maliva/tos-maliva-ve-0068c799-us/oIMsQTeEAGIvSAfJ0c9CeLnKSHkdhIQ4ruJNlg/?a=1988&bti=NDU3ZjAwOg%3D%3D&ch=0&cr=3&dr=0&lr=all&cd=0%7C0%7C0%7C&br=1360&bt=680&cs=0&ds=6&ft=4fUEKMzD8Zmo0Qa7--4jVvcXDpWrKsd.&mime_type=video_mp4&qs=0&rc=aDo2ODc8ZDc2NDNnZmk5NUBpM2lnNGw5cm5xcjMzZzgzNEA1MmNiLzAyXjIxYzRjXy8wYSNhNi1rMmQ0MTZgLS1kLy9zcw%3D%3D&btag=e00090000&expire=1722158309&l=20240726091728732D29F760B4AF0A4BA0&ply_type=2&policy=2&signature=1c46f9764f07a9765e37e0c1ff53323f&tk=tt_chain_token",
    "downloadAddr": "https://v16-webapp-prime.tiktok.com/video/tos/maliva/tos-maliva-ve-0068c799-us/oIMsQTeEAGIvSAfJ0c9CeLnKSHkdhIQ4ruJNlg/?a=1988&bti=NDU3ZjAwOg%3D%3D&ch=0&cr=3&dr=0&lr=all&cd=0%7C0%7C0%7C&br=1360&bt=680&cs=0&ds=6&ft=4fUEKMzD8Zmo0Qa7--4jVvcXDpWrKsd.&mime_type=video_mp4&qs=0&rc=aDo2ODc8ZDc2NDNnZmk5NUBpM2lnNGw5cm5xcjMzZzgzNEA1MmNiLzAyXjIxYzRjXy8wYSNhNi1rMmQ0MTZgLS1kLy9zcw%3D%3D&btag=e00090000&expire=1722158309&l=20240726091728732D29F760B4AF0A4BA0&ply_type=2&policy=2&signature=1c46f9764f07a9765e37e0c1ff53323f&tk=tt_chain_token"
  },
  "diggCount": 113600,
  "shareCount": 2503,
  "playCount": 2300000,
  "collectCount": 13400,
  "commentCount": 616,
  "mentions": [],
  "hashtags": [
    {
      "id": "229207",
      "name": "fyp",
      "title": "",
      "cover": ""
    },
    {
      "id": "6310",
      "name": "aliens",
      "title": "",
      "cover": ""
    },
    {
      "id": "718",
      "name": "disclosure",
      "title": "",
      "cover": ""
    },
    {
      "id": "1651247064613893",
      "name": "stevengreer",
      "title": "",
      "cover": ""
    },
    {
      "id": "154930",
      "name": "ufo",
      "title": "",
      "cover": ""
    },
    {
      "id": "162682",
      "name": "area51",
      "title": "The truth is out there.",
      "cover": "https://p16-amd-va.tiktokcdn.com/obj/musically-maliva-obj/c1db886596dc5bd4c7780950293f54a8?nonce=56879&refresh_token=ce3fa4a03f75853f25a81264e420c315&shp=b59d6b55&shcp=-"
    },
    {
      "id": "1626660341054469",
      "name": "governmentsecrets",
      "title": "",
      "cover": ""
    },
    {
      "id": "39826749",
      "name": "aliensighting",
      "title": "",
      "cover": ""
    }
  ],
  "effectStickers": [],
  "isSlideshow": false,
  "isPinned": false,
  "searchQuery": "aliens"
}
```

The below is `08b-tiktok.jsonl` and `08c-tiktok.jsonl`, they are the same in structure. 
``` JSON
{
  "id": "7175685519475641646",
{
  "keyword": "jfk",
  "nextCursor": "MjQtLS1zcGxpdF9jdXNyb3ItLS0yMDI0MDgwNzEzMzQzNDgyODk0MUU3OEFGOTkyMDVBNzkz",
  "author": {
    "avatarLarger": "https://p19-sign.tiktokcdn-us.com/tos-useast5-avt-0068-tx/7336452379141668910~c5_1080x1080.jpeg?lk3s=a5d48078&nonce=26404&refresh_token=a2aac78ae1d60d8a4ab16619b2c5cca8&x-expires=1723208400&x-signature=hqcfuCuYYZ4t4He5w257b76VyAc%3D&shp=a5d48078&shcp=b59d6b55",
    "avatarMedium": "https://p19-sign.tiktokcdn-us.com/tos-useast5-avt-0068-tx/7336452379141668910~c5_720x720.jpeg?lk3s=a5d48078&nonce=16506&refresh_token=74d3efb5ab9ecd635d3d260cbbeccb60&x-expires=1723208400&x-signature=CK8%2FmCdmHt%2BCDrKt50cMvekIcOQ%3D&shp=a5d48078&shcp=b59d6b55",
    "avatarThumb": "https://p16-sign.tiktokcdn-us.com/tos-useast5-avt-0068-tx/7336452379141668910~c5_100x100.jpeg?lk3s=a5d48078&nonce=31698&refresh_token=e067d76b9df1b46662c37e0078a2e439&x-expires=1723208400&x-signature=sKKXwMXbu3mJvX01yewQb8tiFG8%3D&shp=a5d48078&shcp=b59d6b55",
    "commentSetting": 0,
    "downloadSetting": 0,
    "duetSetting": 0,
    "ftc": false,
    "id": "6800869635085878278",
    "nickname": "DigitalLuke",
    "openFavorite": false,
    "privateAccount": false,
    "relation": 0,
    "secUid": "MS4wLjABAAAA69BGgglmv65UbDF4i12elnbp84vY5nf3DYl_pvSmwU1QeLdd5BlkrSabQC1TrLx6",
    "secret": false,
    "signature": "I animate sometimes\n17",
    "stitchSetting": 0,
    "uniqueId": "digitalluke22",
    "verified": false
  },
  "authorStats": {
    "diggCount": 2162,
    "followerCount": 136600,
    "followingCount": 2056,
    "heart": 13000000,
    "heartCount": 13000000,
    "videoCount": 78
  },
  "collected": false,
  "createTime": 1670719489,
  "desc": "Lego JFK Assassination but it's PG",
  "digged": false,
  "duetEnabled": true,
  "duetInfo": {
    "duetFromId": "0"
  },
  "forFriend": false,
  "id": "7175685519475641646",
  "isAd": false,
  "itemCommentStatus": 0,
  "itemMute": false,
  "music": {
    "album": "",
    "authorName": "DigitalLuke",
    "coverLarge": "https://p19-sign.tiktokcdn-us.com/tos-useast5-avt-0068-tx/7336452379141668910~c5_1080x1080.jpeg?lk3s=a5d48078&nonce=26404&refresh_token=a2aac78ae1d60d8a4ab16619b2c5cca8&x-expires=1723208400&x-signature=hqcfuCuYYZ4t4He5w257b76VyAc%3D&shp=a5d48078&shcp=b59d6b55",
    "coverMedium": "https://p19-sign.tiktokcdn-us.com/tos-useast5-avt-0068-tx/7336452379141668910~c5_720x720.jpeg?lk3s=a5d48078&nonce=16506&refresh_token=74d3efb5ab9ecd635d3d260cbbeccb60&x-expires=1723208400&x-signature=CK8%2FmCdmHt%2BCDrKt50cMvekIcOQ%3D&shp=a5d48078&shcp=b59d6b55",
    "coverThumb": "https://p16-sign.tiktokcdn-us.com/tos-useast5-avt-0068-tx/7336452379141668910~c5_100x100.jpeg?lk3s=a5d48078&nonce=31698&refresh_token=e067d76b9df1b46662c37e0078a2e439&x-expires=1723208400&x-signature=sKKXwMXbu3mJvX01yewQb8tiFG8%3D&shp=a5d48078&shcp=b59d6b55",
    "duration": 14,
    "id": "7175685509711285034",
    "original": true,
    "playUrl": "",
    "title": "original sound"
  },
  "officalItem": false,
  "originalItem": false,
  "privateItem": false,
  "secret": false,
  "shareEnabled": true,
  "showNotPass": false,
  "stats": {
    "collectCount": 17900,
    "commentCount": 289,
    "diggCount": 104000,
    "playCount": 2100000,
    "shareCount": 6254
  },
  "stitchEnabled": true,
  "video": {
    "bitrate": 322456,
    "cover": "https://p16-sign.tiktokcdn-us.com/obj/tos-useast5-p-0068-tx/867ec0248daa412ea37a9033c928b05c?lk3s=b59d6b55&nonce=82009&refresh_token=b9d5bdd582b0897365e795228bd70b73&x-expires=1723208400&x-signature=nR659hwAiceD0WbSjYG%2F2mt%2FNU0%3D&shp=b59d6b55&shcp=-",
    "downloadAddr": "https://v16-webapp-prime.us.tiktok.com/video/tos/useast5/tos-useast5-pve-0068-tx/7b72d3cf557f42258ac7e18af90f0e7a/?a=1988&bti=NDU3ZjAwOg%3D%3D&ch=0&cr=3&dr=0&lr=all&cd=0%7C0%7C0%7C&br=728&bt=364&cs=0&ds=1&ft=4KJMyMvt8Zmo0Z8p0-4jVZ-bdpWrKsd.&mime_type=video_mp4&qs=0&rc=NDdoNzs3M2ZmaDxoNTo2N0BpMzk5bzg6Zm9saDMzZzczNEBiMzI2M2JfNV4xYTNhLTIxYSMwMjYvcjRnY15gLS1kMS9zcw%3D%3D&btag=e00088000&expire=1723210490&l=2024080713343586478F63727D70052560&ply_type=2&policy=2&signature=87b10dce3db3dc4f4f468ce68f7ff8f8&tk=tt_chain_token",
    "duration": 14,
    "dynamicCover": "https://p16-sign.tiktokcdn-us.com/obj/tos-useast5-p-0068-tx/baf5bb8014ee44219343ad099977079f_1670719490?lk3s=b59d6b55&nonce=90853&refresh_token=d6ac39e01599c7daf563c7fbf0599ef8&x-expires=1723208400&x-signature=UqHGSb0Q2BrP2n9MQ2yq4i375YM%3D&shp=b59d6b55&shcp=-",
    "encodeUserTag": "",
    "encodedType": "normal",
    "format": "mp4",
    "height": 360,
    "id": "7175685519475641646",
    "originCover": "https://p19-pu-sign-useast8.tiktokcdn-us.com/obj/tos-useast5-p-0068-tx/179a89e3abf040fdb74173729039402a_1670719490?lk3s=b59d6b55&nonce=92675&refresh_token=28ce2e9226dca72f48d4e3d0d90d0ff2&x-expires=1723208400&x-signature=ZGI%2BkfExKLMWlLMeYhId4VLNYsI%3D&shp=b59d6b55&shcp=-",
    "playAddr": "https://v16-webapp-prime.us.tiktok.com/video/tos/useast5/tos-useast5-ve-0068c001-tx/3c024930f91944c5ab60168d3468642b/?a=1988&bti=NDU3ZjAwOg%3D%3D&ch=0&cr=3&dr=0&lr=all&cd=0%7C0%7C0%7C&br=628&bt=314&cs=0&ds=1&ft=4KJMyMvt8Zmo0Z8p0-4jVZ-bdpWrKsd.&mime_type=video_mp4&qs=0&rc=ZTZnZjpnaTQ2ZzMzaDU4NEBpMzk5bzg6Zm9saDMzZzczNEAwMmA0NTNhX18xYWNjLjE0YSMwMjYvcjRnY15gLS1kMS9zcw%3D%3D&btag=e00088000&expire=1723210490&l=2024080713343586478F63727D70052560&ply_type=2&policy=2&signature=5e739822adc04a77b885f189aedda236&tk=tt_chain_token",
    "ratio": "360p",
    "reflowCover": "https://p16-sign.tiktokcdn-us.com/tos-useast5-p-0068-tx/867ec0248daa412ea37a9033c928b05c~tplv-photomode-video-cover:480:480.jpeg?lk3s=b59d6b55&nonce=88650&refresh_token=418867d145eea29f2a6d2b11a010d9fb&x-expires=1723208400&x-signature=rHn4%2BKom6lu3bQWn%2BtXHdiHVHZE%3D&shp=b59d6b55&shcp=-",
    "shareCover": [
      "",
      "https://p16-pu-sign-useast8.tiktokcdn-us.com/tos-useast5-p-0068-tx/179a89e3abf040fdb74173729039402a_1670719490~tplv-photomode-tiktok-play.jpeg?lk3s=b59d6b55&nonce=21873&refresh_token=f2147ad7ab7da0cd3e02a04e42f9d647&x-expires=1723640400&x-signature=r9wZWIQRuPHr8q%2FfaIlhFFEliT8%3D&shp=b59d6b55&shcp=-",
      "https://p19-sign.tiktokcdn-us.com/tos-useast5-p-0068-tx/179a89e3abf040fdb74173729039402a_1670719490~tplv-photomode-share-play.jpeg?lk3s=b59d6b55&nonce=49447&refresh_token=e3d498fa08e16205b18a40021b3851bd&x-expires=1723640400&x-signature=tyoco6ZZKxy29yAlChpp%2BYcbMok%3D&shp=b59d6b55&shcp=-"
    ],
    "videoQuality": "normal",
    "width": 640
  },
  "vl1": false
}
```

## 3.7 Gettr


This is a JSON response from Gettr's API (a social media platform). Let me break down the structure and key meanings:

__Top-Level Structure__

`_t`: "xresp" - Indicates this is an API response object
`rc`: "OK" - Response code indicating success
`result`: Main data container
`keword`: "9/11" - The search keyword used for this query

__Main Data Section (result.data)__

`udate` & `cdate`: Timestamps in milliseconds (Unix time) for update and creation dates
`_t`: "pstfd" - Object type identifier ("post feed")
`list`: Array of post items/activities in the feed
`_id`: Unique identifier for this feed ("estl_saghiyeganeh_1721999778754")

__Post Items (list array elements)__
Each item represents an activity in the feed with:

`_t`: "psti" - Object type ("post item")
`receiver_id`: User receiving the post (e.g., "saghiyeganeh")
``cdate``/udate: Creation/update timestamps
`_id`: Unique identifier for this activity
`action`: Activity type (e.g., "pub_pst" for published post)

__Activity Details__
Each post item contains an activity object with:

`init_id`: User who initiated the activity
`src_type`: Source type ("u" for user)
`action`: Action performed (e.g., "pub_pst")
`tgt_type`: Target type (usually "post")
`tgt_id`: Unique ID of the target post
`rpstIds`: Array of reposted post IDs
`rusrIds`: Array of referenced user IDs

__Auxiliary Data (result.aux)__
Contains additional information to supplement the feed:

__Post Content (aux.post)__
Detailed information about each post referenced in the feed:

`txt`: The post text content
`main`: Media URL (if present)
`vid`/`ovid`: Video URLs
`utgs`: User tags
`vis`: Visibility ("p" for public)
`lkbpst`: Number of likes
`cm`: Number of comments
`shbpst`: Number of shares
`vfpst`: Number of video views (when applicable)

__User Information (aux.uinf)__
Contains profile information for all users mentioned:

`username`: User's handle
`ousername`: Original username
`nickname`: Display name
`dsc`: User bio/description
`flw`: Following count
`flg`: Followers count
`location`: User location
`ico`: Profile picture URL
`bgimg`: Profile background image URL

__Stats Information (aux.s_pst)__
Provides engagement metrics for posts:

`lkbpst`: Likes count
`cm`: Comments count
`shbpst`: Shares count

__Pagination__
`cursor`: Encoded string for pagination or next page retrieval

In [None]:
## my code ###
# def truthsocial_json(data_path):
#     """
#     data_path is path of the json file
#     read the data line by line, return list of dictionaries, this function is used to read the json file
#     we did not read music relevant info
#     """
#     result_list = []  # Changed from 'list' to avoid shadowing built-in
  
#     # Helper function to safely get nested values
#     def get_nested(data, keys, default=''):
#         """Safely get nested values from a dictionary using a list of keys"""
#         current = data
#         for key in keys:
#             if isinstance(current, dict) and key in current:
#                 current = current[key]
#             elif isinstance(current, list) and isinstance(key, int) and 0 <= key < len(current):
#                 current = current[key]  # Added list index access capability
#             else:
#                 return default
#         return current
    
#     with open(data_path, 'r', encoding='utf-8') as f:
#         for line in f:
#             try:
#                 data = json.loads(line)
                
#                 # Common fields regardless of structure                
#                 keyword = str(data.get('keyword', ''))
                
#                 ###### here we save posts individually #####
#                 # NOTE. each record is all meta info of a post, including repost, reply, and quoted links
#                 # NOTE. the nested/original post, reply_to_post, and quoted post are saved as an individual record

#                 #### from here we loop, that is, if there are any rebloy, reply, or quoted nested posts, 
#                 #### we save them as a new record
#                 # post links info
#                 reblogged = str(data.get('reblogged', ''))
#                 quoted_id = str(data.get('quoted_id', ''))   #  the post is a repost if not null
#                 reply_id = str(data.get('in_reply_to_id', ''))   #  the post is a reply if not null
                
#                 # post info
#                 post_id = str(data.get('id', '')) 
#                 post_time = str(data.get('created_at', ''))
#                 post_lan = str(data.get('language', ''))
#                 post_sensitive = str(data.get('sensitive', ''))
#                 post_visibility = str(data.get('visibility', ''))
#                 post_text = str(data.get('text', ''))
#                 post_content = str(data.get('content', ''))
#                 post_spoiler = str(data.get('spoiler_text', ''))
#                 post_pin = str(data.get('pinned', ''))
#                 reply = str(data.get('replies_count', ''))
#                 repost = str(data.get('reblogs_count', ''))
#                 like = str(data.get('favourites_count', ''))
#                 mention = [str(item.get('username', '')) for item in data.get('mentions', [])] # get mentioned user_name

#                 # user info
#                 user_id = str(get_nested(data, ['account', 'id'], ''))
#                 user_name = str(get_nested(data, ['account', 'username'], ''))
#                 user_verified = str(get_nested(data, ['account', 'verified'], ''))
#                 user_time = str(get_nested(data, ['account', 'created_at'], ''))
#                 user_last_status = str(get_nested(data, ['account', 'last_status_at'], ''))
#                 user_follow = str(get_nested(data, ['account', 'following_count'], ''))
#                 user_fans = str(get_nested(data, ['account', 'follower_count'], ''))
#                 user_status = str(get_nested(data, ['account', 'statuses_count'], ''))
#                 user_bot = str(get_nested(data, ['account', 'bot'], ''))
#                 user_note = str(get_nested(data, ['account', 'note'], ''))

#                 # Create dictionary with extracted data
#                 dict_tem = {
#                     'keyword': keyword,
#                     'reblogged': reblogged,
#                     'quoted_id': quoted_id,
#                     'reply_id': reply_id,
#                     'post_id': post_id,
#                     'post_time': post_time,
#                     'post_lan': post_lan,
#                     'post_sensitive': post_sensitive,
#                     'post_visibility': post_visibility,
#                     'post_text': post_text,
#                     'post_content': post_content,  
#                     'post_spoiler': post_spoiler,
#                     'post_pin': post_pin,
#                     'reply': reply,
#                     'repost': repost,
#                     'like': like,
#                     'mention': mention,
#                     'user_id': user_id,
#                     'user_name': user_name,
#                     'user_verified': user_verified,
#                     'user_time': user_time,
#                     'user_last_status': user_last_status,
#                     'user_follow': user_follow,
#                     'user_fans': user_fans,
#                     'user_status': user_status,
#                     'user_bot': user_bot,
#                     'user_note': user_note
#                 }
#                 result_list.append(dict_tem)
                
#                 ## add nested original quoted post 
#                 if quoted_id != 'None':
#                      # post links info (should be null as they are already nested posts)
#                     reblogged = str(data.get('reblogged', ''))
#                     quoted_id = str(data.get('quoted_id', ''))   
#                     reply_id = str(data.get('in_reply_to_id', ''))   
                    
#                     # Save the quoted post info
#                     data_quoted = data.get('quoted', {})
#                     post_id = str(data_quoted.get('id', ''))
#                     post_time = str(data_quoted.get('created_at', ''))
#                     post_lan = str(data_quoted.get('language', ''))
#                     post_sensitive = str(data_quoted.get('sensitive', ''))
#                     post_visibility = str(data_quoted.get('visibility', ''))
#                     post_text = str(data_quoted.get('text', ''))
#                     post_content = str(data_quoted.get('content', ''))
#                     post_spoiler = str(data_quoted.get('spoiler_text', ''))
#                     post_pin = str(data_quoted.get('pinned', ''))
#                     reply = str(data_quoted.get('replies_count', ''))
#                     repost = str(data_quoted.get('reblogs_count', ''))
#                     like = str(data_quoted.get('favourites_count', ''))
#                     mention = [str(item.get('username', '')) for item in data_quoted.get('mentions', [])]

#                     # user info
#                     user_id = str(get_nested(data_quoted, ['account', 'id'], ''))
#                     user_name = str(get_nested(data_quoted, ['account', 'username'], ''))
#                     user_verified = str(get_nested(data_quoted, ['account', 'verified'], ''))
#                     user_time = str(get_nested(data_quoted, ['account', 'created_at'], ''))
#                     user_last_status = str(get_nested(data_quoted, ['account', 'last_status_at'], ''))
#                     user_follow = str(get_nested(data_quoted, ['account', 'following_count'], ''))
#                     user_fans = str(get_nested(data_quoted, ['account', 'follower_count'], ''))
#                     user_status = str(get_nested(data_quoted, ['account', 'statuses_count'], ''))
#                     user_bot = str(get_nested(data_quoted, ['account', 'bot'], ''))
#                     user_note = str(get_nested(data_quoted, ['account', 'note'], ''))

#                     # Create dictionary with extracted data
#                     # Create dictionary with extracted data
#                     dict_tem = {
#                         'keyword': keyword,
#                         'reblogged': reblogged,
#                         'quoted_id': quoted_id,
#                         'reply_id': reply_id,
#                         'post_id': post_id,
#                         'post_time': post_time,
#                         'post_lan': post_lan,
#                         'post_sensitive': post_sensitive,
#                         'post_visibility': post_visibility,
#                         'post_text': post_text,
#                         'post_content': post_content,  
#                         'post_spoiler': post_spoiler,
#                         'post_pin': post_pin,
#                         'reply': reply,
#                         'repost': repost,
#                         'like': like,
#                         'mention': mention,
#                         'user_id': user_id,
#                         'user_name': user_name,
#                         'user_verified': user_verified,
#                         'user_time': user_time,
#                         'user_last_status': user_last_status,
#                         'user_follow': user_follow,
#                         'user_fans': user_fans,
#                         'user_status': user_status,
#                         'user_bot': user_bot,
#                         'user_note': user_note
#                     }
#                     result_list.append(dict_tem)
                
#                 ## add nested original reply post 
#                 if reply_id != 'None':
#                      # post links info (should be null as they are already nested posts)
#                     reblogged = str(data.get('reblogged', ''))
#                     quoted_id = str(data.get('quoted_id', ''))   
#                     reply_id = str(data.get('in_reply_to_id', ''))   
                    
#                     # Save the quoted post info
#                     data_reply = data.get('in_reply_to', {})

#                     post_id = str(data_reply.get('id', ''))
#                     post_time = str(data_reply.get('created_at', ''))
#                     post_lan = str(data_reply.get('language', ''))
#                     post_sensitive = str(data_reply.get('sensitive', ''))
#                     post_visibility = str(data_reply.get('visibility', ''))
#                     post_text = str(data_reply.get('text', ''))
#                     post_content = str(data_reply.get('content', ''))
#                     post_spoiler = str(data_reply.get('spoiler_text', ''))
#                     post_pin = str(data_reply.get('pinned', ''))
#                     reply = str(data_reply.get('replies_count', ''))
#                     repost = str(data_reply.get('reblogs_count', ''))
#                     like = str(data_reply.get('favourites_count', ''))
#                     mention = [str(item.get('username', '')) for item in data_reply.get('mentions', [])]

#                     # user info
#                     user_id = str(get_nested(data_reply, ['account', 'id'], ''))
#                     user_name = str(get_nested(data_reply, ['account', 'username'], ''))
#                     user_verified = str(get_nested(data_reply, ['account', 'verified'], ''))
#                     user_time = str(get_nested(data_reply, ['account', 'created_at'], ''))
#                     user_last_status = str(get_nested(data_reply, ['account', 'last_status_at'], ''))
#                     user_follow = str(get_nested(data_reply, ['account', 'following_count'], ''))
#                     user_fans = str(get_nested(data_reply, ['account', 'follower_count'], ''))
#                     user_status = str(get_nested(data_reply, ['account', 'statuses_count'], ''))
#                     user_bot = str(get_nested(data_reply, ['account', 'bot'], ''))
#                     user_note = str(get_nested(data_reply, ['account', 'note'], ''))

#                     # Create dictionary with extracted data
#                     # Create dictionary with extracted data
#                     dict_tem = {
#                         'keyword': keyword,
#                         'reblogged': reblogged,
#                         'quoted_id': quoted_id,
#                         'reply_id': reply_id,
#                         'post_id': post_id,
#                         'post_time': post_time,
#                         'post_lan': post_lan,
#                         'post_sensitive': post_sensitive,
#                         'post_visibility': post_visibility,
#                         'post_text': post_text,
#                         'post_content': post_content,  
#                         'post_spoiler': post_spoiler,
#                         'post_pin': post_pin,
#                         'reply': reply,
#                         'repost': repost,
#                         'like': like,
#                         'mention': mention,
#                         'user_id': user_id,
#                         'user_name': user_name,
#                         'user_verified': user_verified,
#                         'user_time': user_time,
#                         'user_last_status': user_last_status,
#                         'user_follow': user_follow,
#                         'user_fans': user_fans,
#                         'user_status': user_status,
#                         'user_bot': user_bot,
#                         'user_note': user_note
#                     }
#                     result_list.append(dict_tem)
            
#             ## add nested original reblog post 
#             if reblogged != 'None':
#                 # post links info (should be null as they are already nested posts)
#                 reblogged = str(data.get('reblogged', ''))
#                 quoted_id = str(data.get('quoted_id', ''))   
#                 reply_id = str(data.get('in_reply_to_id', ''))   
                
#                 # Save the quoted post info
#                 data_reblog = data.get('reblog', {})

#                 post_id = str(data_reblog.get('id', ''))
#                 post_time = str(data_reblog.get('created_at', ''))
#                 post_lan = str(data_reblog.get('language', ''))
#                 post_sensitive = str(data_reblog.get('sensitive', ''))
#                 post_visibility = str(data_reblog.get('visibility', ''))
#                 post_text = str(data_reblog.get('text', ''))
#                 post_content = str(data_reblog.get('content', ''))
#                 post_spoiler = str(data_reblog.get('spoiler_text', ''))
#                 post_pin = str(data_reblog.get('pinned', ''))
#                 reply = str(data_reblog.get('replies_count', ''))
#                 repost = str(data_reblog.get('reblogs_count', ''))
#                 like = str(data_reblog.get('favourites_count', ''))
#                 mention = [str(item.get('username', '')) for item in data_reblog.get('mentions', [])]

#                 # user info
#                 user_id = str(get_nested(data_reblog, ['account', 'id'], ''))
#                 user_name = str(get_nested(data_reblog, ['account', 'username'], ''))
#                 user_verified = str(get_nested(data_reblog, ['account', 'verified'], ''))
#                 user_time = str(get_nested(data_reblog, ['account', 'created_at'], ''))
#                 user_last_status = str(get_nested(data_reblog, ['account', 'last_status_at'], ''))
#                 user_follow = str(get_nested(data_reblog, ['account', 'following_count'], ''))
#                 user_fans = str(get_nested(data_reblog, ['account', 'follower_count'], ''))
#                 user_status = str(get_nested(data_reblog, ['account', 'statuses_count'], ''))
#                 user_bot = str(get_nested(data_reblog, ['account', 'bot'], ''))
#                 user_note = str(get_nested(data_reblog, ['account', 'note'], ''))

#                 # Create dictionary with extracted data
#                 # Create dictionary with extracted data
#                 dict_tem = {
#                     'keyword': keyword,
#                     'reblogged': reblogged,
#                     'quoted_id': quoted_id,
#                     'reply_id': reply_id,
#                     'post_id': post_id,
#                     'post_time': post_time,
#                     'post_lan': post_lan,
#                     'post_sensitive': post_sensitive,
#                     'post_visibility': post_visibility,
#                     'post_text': post_text,
#                     'post_content': post_content,  
#                     'post_spoiler': post_spoiler,
#                     'post_pin': post_pin,
#                     'reply': reply,
#                     'repost': repost,
#                     'like': like,
#                     'mention': mention,
#                     'user_id': user_id,
#                     'user_name': user_name,
#                     'user_verified': user_verified,
#                     'user_time': user_time,
#                     'user_last_status': user_last_status,
#                     'user_follow': user_follow,
#                     'user_fans': user_fans,
#                     'user_status': user_status,
#                     'user_bot': user_bot,
#                     'user_note': user_note
#                 }
#                 result_list.append(dict_tem)    
            
#             except json.JSONDecodeError:
#                 print(f"Error parsing JSON line in {data_path}")
#                 continue
#             except Exception as e:
#                 print(f"Error processing data: {str(e)} in {data_path}")
#                 continue
            
#         return result_list