In [1]:
# Merge all user JSON files in a directory into a single CSV file

import os
import json
import pandas as pd

def read_json_file(file_path):
    """Load and return the parsed contents of a JSON file.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The decoded JSON document, exactly as produced by ``json.load``.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # Without an explicit encoding, open() falls back to the locale default
    # (e.g. cp1252 on Windows), which corrupts or rejects non-ASCII text such
    # as names and descriptions. JSON files are UTF-8, so say so explicitly.
    with open(file_path, 'r', encoding='utf-8') as file:
        return json.load(file)

def extract_user_data(json_data):
    """Flatten one user JSON document into a single-level dict.

    Args:
        json_data: Parsed JSON whose user object lives at
            ``json_data['data']['user']['result']``.

    Returns:
        dict with a fixed set of profile columns (missing source fields fall
        back to the defaults listed below), followed by ``url_<i>_display`` /
        ``url_<i>_expanded`` / ``url_<i>_url`` for every entry found under
        ``legacy.entities.url.urls``.

    Raises:
        KeyError: If ``data``, ``user`` or ``result`` is missing.
    """
    profile = json_data['data']['user']['result']
    legacy = profile.get('legacy', {})
    professional = profile.get('professional', {})

    # (column, fallback) pairs copied straight from the "legacy" sub-object.
    # Built inside the function so every call gets its own fresh [] defaults.
    legacy_columns = (
        ('created_at', ''),
        ('description', ''),
        ('favourites_count', 0),
        ('followers_count', 0),
        ('friends_count', 0),
        ('listed_count', 0),
        ('location', ''),
        ('media_count', 0),
        ('name', ''),
        ('normal_followers_count', 0),
        ('pinned_tweet_ids_str', []),
        ('possibly_sensitive', False),
        ('profile_banner_url', ''),
        ('profile_image_url_https', ''),
        ('screen_name', ''),
        ('statuses_count', 0),
        ('verified', False),
        ('want_retweets', False),
        ('withheld_in_countries', []),
    )
    # Top-level flags that all default to False.
    flag_columns = (
        'smart_blocked_by',
        'smart_blocking',
        'has_hidden_likes_on_profile',
        'has_hidden_subscriptions_on_profile',
    )

    # Insertion order matters: it becomes the column order of the DataFrame.
    row = {'is_blue_verified': profile.get('is_blue_verified', False)}
    for column, fallback in legacy_columns:
        row[column] = legacy.get(column, fallback)

    row['professional_type'] = professional.get('professional_type', '')

    for column in flag_columns:
        row[column] = profile.get(column, False)

    verification = profile.get('verification_info', {})
    highlights = profile.get('highlights_info', {})
    row['is_identity_verified'] = verification.get('is_identity_verified', False)
    row['verified_since_msec'] = verification.get('reason', {}).get('verified_since_msec')
    row['can_highlight_tweets'] = highlights.get('can_highlight_tweets', False)
    row['highlighted_tweets'] = highlights.get('highlighted_tweets', '0')
    row['creator_subscriptions_count'] = profile.get('creator_subscriptions_count', 0)

    # One triple of columns per profile link.
    links = legacy.get('entities', {}).get('url', {}).get('urls', [])
    for i, link in enumerate(links):
        row[f'url_{i}_display'] = link.get('display_url', '')
        row[f'url_{i}_expanded'] = link.get('expanded_url', '')
        row[f'url_{i}_url'] = link.get('url', '')

    return row

def process_json_files(directory):
    """Build a DataFrame with one row per ``.json`` file found in ``directory``.

    Args:
        directory: Path of the folder to scan (sub-folders are not visited).

    Returns:
        pandas.DataFrame made from the dicts returned by
        ``extract_user_data`` for each file, in file-name order. Empty when
        the folder holds no ``.json`` files.

    Raises:
        OSError: If ``directory`` cannot be listed or a file cannot be read.
    """
    all_data = []

    # os.listdir() returns entries in arbitrary order; sorting makes the row
    # order of the resulting CSV reproducible from run to run.
    for file_name in sorted(os.listdir(directory)):
        if not file_name.endswith('.json'):
            continue
        file_path = os.path.join(directory, file_name)
        all_data.append(extract_user_data(read_json_file(file_path)))

    return pd.DataFrame(all_data)


# Folder scanned for user JSON files (path is relative to the working directory)
directory = './twitter/users/'

# Process the JSON files and create a DataFrame (one row per .json file)
df = process_json_files(directory)

# Save the DataFrame to users.csv; index=False keeps the row index out of the file
df.to_csv('users.csv', index=False)

In [5]:
# Build the user -> following edge list (user_following.csv) used for the network graph of followers and following

import os
import json
import pandas as pd

def read_json_file(file_path):
    """Load and return the parsed contents of a JSON file.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The decoded JSON document, exactly as produced by ``json.load``.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # Without an explicit encoding, open() falls back to the locale default
    # (e.g. cp1252 on Windows), which corrupts or rejects non-ASCII text.
    # JSON files are UTF-8, so say so explicitly.
    with open(file_path, 'r', encoding='utf-8') as file:
        return json.load(file)

def extract_following_data(user_name, json_data):
    """List the (user, following) pairs found in one timeline document.

    Args:
        user_name: Value stored under ``'user'`` in every returned record.
        json_data: Parsed JSON with the instruction list at
            ``data.user.result.timeline.timeline.instructions``.

    Returns:
        list of ``{'user': user_name, 'following': screen_name}`` dicts, one
        per entry of every ``TimelineAddEntries`` instruction that carries a
        non-empty ``user_results.result.legacy.screen_name``.

    Raises:
        KeyError: If the path to the instruction list, an instruction's
            ``type``, or a matching instruction's ``entries`` is missing.
    """
    timeline = json_data['data']['user']['result']['timeline']['timeline']
    edges = []

    for step in timeline['instructions']:
        if step['type'] != 'TimelineAddEntries':
            continue
        for item in step['entries']:
            # Entries without content.itemContent carry no user; skip them.
            if 'content' not in item or 'itemContent' not in item['content']:
                continue
            account = item['content']['itemContent'].get('user_results', {})
            handle = account.get('result', {}).get('legacy', {}).get('screen_name', '')
            if not handle:
                continue
            edges.append({'user': user_name, 'following': handle})

    return edges

def process_json_files(directory):
    """Build the (user, following) edge table from every ``.json`` file in ``directory``.

    The user name for a file is the part of its name before the first ``-``.

    Args:
        directory: Path of the folder to scan (sub-folders are not visited).

    Returns:
        pandas.DataFrame holding the records returned by
        ``extract_following_data`` for each file, concatenated in file-name
        order. Empty when the folder holds no ``.json`` files.

    Raises:
        OSError: If ``directory`` cannot be listed or a file cannot be read.
    """
    all_following_data = []

    # os.listdir() returns entries in arbitrary order. Sorting makes the row
    # order reproducible, which matters to any consumer that reads only the
    # first N rows of the resulting CSV.
    for file_name in sorted(os.listdir(directory)):
        if not file_name.endswith('.json'):
            continue
        json_data = read_json_file(os.path.join(directory, file_name))
        user_name = file_name.split('-')[0]
        all_following_data.extend(extract_following_data(user_name, json_data))

    return pd.DataFrame(all_following_data)

# Folder scanned for .json files; the text before the first '-' in each
# file name is used as the user name of that file's records.
directory = './twitter/followers/'

# One row per (user, following) pair collected from all files
df = process_json_files(directory)

# Write the edge list to user_following.csv without the row index
df.to_csv('user_following.csv', index=False)


In [22]:
# Merge the profiles of all accounts listed in the following files into a single CSV file

import os
import json
import pandas as pd

def read_json_file(file_path):
    """Load and return the parsed contents of a JSON file.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The decoded JSON document, exactly as produced by ``json.load``.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # Without an explicit encoding, open() falls back to the locale default
    # (e.g. cp1252 on Windows), which corrupts or rejects non-ASCII text.
    # JSON files are UTF-8, so say so explicitly.
    with open(file_path, 'r', encoding='utf-8') as file:
        return json.load(file)

def extract_user_data(json_data):
    """Flatten one ``user_results``-style object into a single-level dict.

    Args:
        json_data: Mapping whose user object is stored under ``'result'``.

    Returns:
        dict with a fixed set of profile columns (missing source fields fall
        back to the defaults listed below), followed by ``url_<i>_display`` /
        ``url_<i>_expanded`` / ``url_<i>_url`` for every entry found under
        ``legacy.entities.url.urls``.

    Raises:
        KeyError: If ``json_data`` has no ``'result'`` key.
    """
    account = json_data['result']
    legacy = account.get('legacy', {})
    professional = account.get('professional', {})
    verification = account.get('verification_info', {})
    highlights = account.get('highlights_info', {})

    # Fallbacks for the columns copied from "legacy"; the dict is rebuilt on
    # every call so the [] defaults are never shared between rows.
    legacy_defaults = {
        'created_at': '',
        'description': '',
        'favourites_count': 0,
        'followers_count': 0,
        'friends_count': 0,
        'listed_count': 0,
        'location': '',
        'media_count': 0,
        'name': '',
        'normal_followers_count': 0,
        'pinned_tweet_ids_str': [],
        'possibly_sensitive': False,
        'profile_banner_url': '',
        'profile_image_url_https': '',
        'screen_name': '',
        'statuses_count': 0,
        'verified': False,
        'want_retweets': False,
        'withheld_in_countries': [],
    }

    # Insertion order below is the column order of the resulting DataFrame.
    flat = {'is_blue_verified': account.get('is_blue_verified', False)}
    flat.update(
        (column, legacy.get(column, fallback))
        for column, fallback in legacy_defaults.items()
    )
    flat['professional_type'] = professional.get('professional_type', '')
    flat.update(
        (flag, account.get(flag, False))
        for flag in (
            'smart_blocked_by',
            'smart_blocking',
            'has_hidden_likes_on_profile',
            'has_hidden_subscriptions_on_profile',
        )
    )
    flat['is_identity_verified'] = verification.get('is_identity_verified', False)
    flat['verified_since_msec'] = verification.get('reason', {}).get('verified_since_msec')
    flat['can_highlight_tweets'] = highlights.get('can_highlight_tweets', False)
    flat['highlighted_tweets'] = highlights.get('highlighted_tweets', '0')
    flat['creator_subscriptions_count'] = account.get('creator_subscriptions_count', 0)

    # One triple of columns per profile link.
    url_entries = legacy.get('entities', {}).get('url', {}).get('urls', [])
    for i, url_entry in enumerate(url_entries):
        flat[f'url_{i}_display'] = url_entry.get('display_url', '')
        flat[f'url_{i}_expanded'] = url_entry.get('expanded_url', '')
        flat[f'url_{i}_url'] = url_entry.get('url', '')

    return flat

def extract_following_data(user_name, json_data):
    """Collect the flattened profile of every account listed in one timeline document.

    Args:
        user_name: Not used by this function; kept so the signature matches
            the call made by ``process_json_files``.
        json_data: Parsed JSON with the instruction list at
            ``data.user.result.timeline.timeline.instructions``.

    Returns:
        list of dicts produced by ``extract_user_data``, one per entry of
        every ``TimelineAddEntries`` instruction that carries a usable
        ``user_results`` object.

    Raises:
        KeyError: If the path to the instruction list, an instruction's
            ``type``, or a matching instruction's ``entries`` is missing.
    """
    all_data = []
    instructions = json_data['data']['user']['result']['timeline']['timeline']['instructions']
    for instruction in instructions:
        if instruction['type'] != 'TimelineAddEntries':
            continue
        for entry in instruction['entries']:
            # Entries without content.itemContent carry no user; skip them.
            if 'content' not in entry or 'itemContent' not in entry['content']:
                continue
            following_user = entry['content']['itemContent'].get('user_results', {})
            # extract_user_data() indexes ['result'] directly, so a non-empty
            # user_results object without a 'result' would raise KeyError and
            # abort the whole export. Skip such entries instead.
            if not following_user or not following_user.get('result'):
                continue
            user_data = extract_user_data(following_user)
            # A profile without a screen name cannot be matched against the
            # edge list, which also ignores entries lacking one.
            if not user_data['screen_name']:
                continue
            all_data.append(user_data)
    return all_data

def process_json_files(directory):
    """Build a table of followed-account profiles from every ``.json`` file in ``directory``.

    Args:
        directory: Path of the folder to scan (sub-folders are not visited).

    Returns:
        pandas.DataFrame holding the records returned by
        ``extract_following_data`` for each file, concatenated in file-name
        order. Empty when the folder holds no ``.json`` files.

    Raises:
        OSError: If ``directory`` cannot be listed or a file cannot be read.
    """
    all_following_data = []

    # os.listdir() returns entries in arbitrary order; sorting makes the row
    # order of the resulting CSV reproducible from run to run.
    for file_name in sorted(os.listdir(directory)):
        if not file_name.endswith('.json'):
            continue
        json_data = read_json_file(os.path.join(directory, file_name))
        # The text before the first '-' in the file name is passed as the user name.
        user_name = file_name.split('-')[0]
        all_following_data.extend(extract_following_data(user_name, json_data))

    return pd.DataFrame(all_following_data)

# Folder scanned for .json files (path is relative to the working directory)
directory = './twitter/followers/'

# One row per flattened profile collected from the entries of all files
df = process_json_files(directory)

# Write the profiles to users_other.csv without the row index
df.to_csv('users_other.csv', index=False)

In [55]:
from pyvis.network import Network
import pandas as pd

# users.csv is loaded here but not used by the filtering in this cell.
users_df = pd.read_csv('users.csv')
# The saved output of this cell shows a warning pointing at this read_csv call
# (presumably pandas' mixed-type DtypeWarning -- TODO confirm). low_memory=False
# makes pandas infer each column's dtype from the whole file instead of per chunk.
users_others_df = pd.read_csv('users_other.csv', low_memory=False)
print('users_others_df',users_others_df.shape)
# Keep only the profiles flagged is_blue_verified.
users_others_df = users_others_df[users_others_df['is_blue_verified'] == True]
print('users_others_df',users_others_df.shape)

# Only the first 10,000 edges are loaded.
network_df = pd.read_csv('user_following.csv', nrows=10000)
print('network_df',network_df.shape)
# Keep edges that point at a blue-verified account; .copy() gives the filtered
# frame its own data before a column is added to it.
network_df = network_df[network_df['following'].isin(users_others_df['screen_name'])].copy()
# In-degree = number of remaining edges that point at the same account.
network_df['following_in_degree'] = network_df['following'].map(network_df['following'].value_counts())
print('network_df',network_df.shape)
print(network_df.head())
# Drop edges whose target is followed 10 times or fewer.
network_df = network_df[network_df['following_in_degree'] > 10]
print('network_df',network_df.shape)

# Initialize the Network
net = Network(height="750px", width="100%", directed=True)

# Pass 1 - followed accounts, sized by in-degree. pyvis only creates a node the
# first time it sees an id, so these must be added before the follower nodes;
# otherwise an account that shows up as `user` first stays at value=1.
for index, row in network_df.iterrows():
    net.add_node(row['following'], label=row['following'], value=row['following_in_degree'])

# Pass 2 - follower nodes (ignored by pyvis when the id already exists) and edges.
for index, row in network_df.iterrows():
    net.add_node(row['user'], label=row['user'], value=1)
    net.add_edge(row['user'], row['following'])

# Generate the graph
net.show("twitter_network.html", notebook=False)

users_others_df (28891, 33)
users_others_df (5784, 33)
network_df (10000, 2)
network_df (2845, 3)
               user    following  following_in_degree
5   Chinamission2un       nypost                    1
6   Chinamission2un      thehill                    1
9   Chinamission2un          NPR                    2
10  Chinamission2un         VP45                    3
13  Chinamission2un  LeoDiCaprio                    2
network_df (214, 3)
twitter_network.html


  users_others_df = pd.read_csv('users_other.csv')


In [66]:
from pyvis.network import Network
import pandas as pd

# Accounts from users.csv; their screen names decide which edges are kept below.
users_df = pd.read_csv('users.csv')
# The saved output of this cell shows a warning pointing at this read_csv call
# (presumably pandas' mixed-type DtypeWarning -- TODO confirm). low_memory=False
# makes pandas infer each column's dtype from the whole file instead of per chunk.
users_others_df = pd.read_csv('users_other.csv', low_memory=False)
print('users_others_df',users_others_df.shape)
# NOTE: users_others_df is only filtered and printed here; the edge filter in
# this cell uses users_df instead.
users_others_df = users_others_df[users_others_df['is_blue_verified'] == True]
print('users_others_df',users_others_df.shape)

# Only the first 1,000 edges are loaded.
network_df = pd.read_csv('user_following.csv', nrows=1000)
print('network_df',network_df.shape)
# Keep edges that point at an account present in users.csv; .copy() gives the
# filtered frame its own data before a column is added to it.
network_df = network_df[network_df['following'].isin(users_df['screen_name'])].copy()
# In-degree = number of remaining edges that point at the same account.
network_df['following_in_degree'] = network_df['following'].map(network_df['following'].value_counts())
print('network_df',network_df.shape)
print(network_df.head())
# Drop edges whose target is followed 2 times or fewer.
network_df = network_df[network_df['following_in_degree'] > 2]
print('network_df',network_df.shape)

# Initialize the Network
net = Network(height="750px", width="100%", directed=True)

added_nodes = set()
# Pass 1 - add the followed accounts as nodes, sized by in-degree
for index, row in network_df.iterrows():
    net.add_node(row['following'], label=row['following'], value=row['following_in_degree'])
    added_nodes.add(row['following'])

# Pass 2 - add follower nodes that are not in the graph yet, then the edges
for index, row in network_df.iterrows():
    if row['user'] not in added_nodes:
        net.add_node(row['user'], label=row['user'], value=1)
        # Record the id so added_nodes stays an accurate list of graph nodes.
        added_nodes.add(row['user'])
    net.add_edge(row['user'], row['following'])

# Generate the graph
net.show("twitter_network.html", notebook=False)

users_others_df (28891, 33)
users_others_df (5784, 33)
network_df (1000, 2)
network_df (139, 3)
               user        following  following_in_degree
20  Chinamission2un      WanmingYang                    2
22  Chinamission2un  China_Amb_India                    1
23  Chinamission2un   Chinaembmanila                    2
27  Chinamission2un       chenweihua                    1
30  Chinamission2un    BeijingReview                    1
network_df (24, 3)
twitter_network.html


  users_others_df = pd.read_csv('users_other.csv')
