In [6]:
# Merge all user JSON files in a directory into a single CSV file

import os
import json
import pandas as pd

def read_json_file(file_path):
    with open(file_path, 'r') as file:
        return json.load(file)

def extract_user_data(json_data):
    user_data = json_data['data']['user']['result']
    legacy_data = user_data.get('legacy', {})
    professional_data = user_data.get('professional', {})

    # Flatten the nested fields in legacy_data and professional_data
    combined_data = {
        'is_blue_verified': user_data.get('is_blue_verified', False),
        'created_at': legacy_data.get('created_at', ''),
        'description': legacy_data.get('description', ''),
        'favourites_count': legacy_data.get('favourites_count', 0),
        'followers_count': legacy_data.get('followers_count', 0),
        'friends_count': legacy_data.get('friends_count', 0),
        'listed_count': legacy_data.get('listed_count', 0),
        'location': legacy_data.get('location', ''),
        'media_count': legacy_data.get('media_count', 0),
        'name': legacy_data.get('name', ''),
        'normal_followers_count': legacy_data.get('normal_followers_count', 0),
        'pinned_tweet_ids_str': legacy_data.get('pinned_tweet_ids_str', []),
        'possibly_sensitive': legacy_data.get('possibly_sensitive', False),
        'profile_banner_url': legacy_data.get('profile_banner_url', ''),
        'profile_image_url_https': legacy_data.get('profile_image_url_https', ''),
        'screen_name': legacy_data.get('screen_name', ''),
        'statuses_count': legacy_data.get('statuses_count', 0),
        'verified': legacy_data.get('verified', False),
        'want_retweets': legacy_data.get('want_retweets', False),
        'withheld_in_countries': legacy_data.get('withheld_in_countries', []),
        # Professional data
        'professional_type': professional_data.get('professional_type', ''),
        # Add more fields from professional_data if needed
        # Other user data fields
        'smart_blocked_by': user_data.get('smart_blocked_by', False),
        'smart_blocking': user_data.get('smart_blocking', False),
        'has_hidden_likes_on_profile': user_data.get('has_hidden_likes_on_profile', False),
        'has_hidden_subscriptions_on_profile': user_data.get('has_hidden_subscriptions_on_profile', False),
        'is_identity_verified': user_data.get('verification_info', {}).get('is_identity_verified', False),
        'verified_since_msec': user_data.get('verification_info', {}).get('reason', {}).get('verified_since_msec'),
        'can_highlight_tweets': user_data.get('highlights_info', {}).get('can_highlight_tweets', False),
        'highlighted_tweets': user_data.get('highlights_info', {}).get('highlighted_tweets', '0'),
        'creator_subscriptions_count': user_data.get('creator_subscriptions_count', 0)
    }

    # Flatten entities' URLs
    urls = legacy_data.get('entities', {}).get('url', {}).get('urls', [])
    for i, url_info in enumerate(urls):
        combined_data[f'url_{i}_display'] = url_info.get('display_url', '')
        combined_data[f'url_{i}_expanded'] = url_info.get('expanded_url', '')
        combined_data[f'url_{i}_url'] = url_info.get('url', '')

    return combined_data

def process_json_files(directory):
    all_data = []

    for file in os.listdir(directory):
        if file.endswith('.json'):
            file_path = os.path.join(directory, file)
            json_data = read_json_file(file_path)
            user_data = extract_user_data(json_data)
            all_data.append(user_data)

    return pd.DataFrame(all_data)


directory = '../twitter/users/'

# Process the JSON files and create a DataFrame
df = process_json_files(directory)

# Save the DataFrame to a CSV file
df.to_csv('./clean_csvs/users.csv', index=False)

In [9]:
# Create Network Graph of Followers and Following

import os
import json
import pandas as pd

def read_json_file(file_path):
    with open(file_path, 'r') as file:
        return json.load(file)

def extract_following_data(user_name, json_data):
    print(user_name)
    user_following_list = []
    instructions = json_data['data']['user']['result']['timeline']['timeline']['instructions']
    for instruction in instructions:
        if instruction['type'] == 'TimelineAddEntries':
            following_entries = instruction['entries']
            for entry in following_entries:
                if 'content' in entry and 'itemContent' in entry['content']:
                    following_user = entry['content']['itemContent'].get('user_results', {}).get('result', {}).get('legacy', {}).get('screen_name', '')
                    if following_user:
                        user_following_list.append({'user': user_name, 'following': following_user})

    return user_following_list

def process_json_files(directory):
    all_following_data = []

    for file in os.listdir(directory):
        if file.endswith('.json'):
            file_path = os.path.join(directory, file)
            json_data = read_json_file(file_path)
            following_data = extract_following_data(file.split('-')[0], json_data)
            all_following_data.extend(following_data)

    return pd.DataFrame(all_following_data)

directory = '../twitter/followers/'

df = process_json_files(directory)

df.to_csv('./clean_csvs/user_following.csv', index=False)


zhu_jingyang
Chinamission2un
AmbassadorHuang
CHN_EGY
Amazing_Yunnan
xinhua_africa
ChineseEmb_FJ
ChinaEmbassy_MV
xinjiangchannel
CRIsinhala
chineseembassy1
AmbChineBurundi
afghanchina
LiyingZHU1
thouse_opinions
shanghaidaily
ChinaInDenmark
CGTNEurope
yangliuxh
MahuiChina
Lin_Nan1
haiwaiwangUS
GuanchaNewMedia
AmbQinGang
peijin_zhang
libijian2
MiaoXiaojuan
shen_shiwei
CGChinaInZurich
Li_Yang_China
Amb_Yiming
DiscoverJilin
MFA_China
China_Amb_Mdv
ChineseEmbKenya
ChinainVan
MMMKKK56665147
yawenxu17
ChinaEmbajada
LiuHongyang4
iChongqing_CIMC
EmbChinaGE
EmbChinaMex
izhejiang
CDchinawatch
Chinaemb_Hellas
fayhaaChina
LiuHongyang4
muyi_dima
WangXianfeng8
ChinaEmbajadaRD
Chinacultureorg
YangZha42341102
CD_visual
ZhaoYing_CRI
PenicilinPan
WangJialei4
discoverbinhai
CDAfricaNews
assignasia
XHNews
JessicaZ1018
JianhuaLi4
YayingRUAN
Hongchencuxian
ConsulateSan
izhejiang
ModernExpressEN
shao_zheng
RadioEjani
ChinaEmbEritrea
pd_northamerica
CaoYi_MFA
CDHKedition
huanxinzhao
XinhuanetNews
ShaanxiMoments

In [10]:
# Merge all other users from following file into a single CSV file

import os
import json
import pandas as pd

def read_json_file(file_path):
    with open(file_path, 'r') as file:
        return json.load(file)

def extract_user_data(json_data):
    user_data = json_data['result']
    legacy_data = user_data.get('legacy', {})
    professional_data = user_data.get('professional', {})

    # Flatten the nested fields in legacy_data and professional_data
    combined_data = {
        'is_blue_verified': user_data.get('is_blue_verified', False),
        'created_at': legacy_data.get('created_at', ''),
        'description': legacy_data.get('description', ''),
        'favourites_count': legacy_data.get('favourites_count', 0),
        'followers_count': legacy_data.get('followers_count', 0),
        'friends_count': legacy_data.get('friends_count', 0),
        'listed_count': legacy_data.get('listed_count', 0),
        'location': legacy_data.get('location', ''),
        'media_count': legacy_data.get('media_count', 0),
        'name': legacy_data.get('name', ''),
        'normal_followers_count': legacy_data.get('normal_followers_count', 0),
        'pinned_tweet_ids_str': legacy_data.get('pinned_tweet_ids_str', []),
        'possibly_sensitive': legacy_data.get('possibly_sensitive', False),
        'profile_banner_url': legacy_data.get('profile_banner_url', ''),
        'profile_image_url_https': legacy_data.get('profile_image_url_https', ''),
        'screen_name': legacy_data.get('screen_name', ''),
        'statuses_count': legacy_data.get('statuses_count', 0),
        'verified': legacy_data.get('verified', False),
        'want_retweets': legacy_data.get('want_retweets', False),
        'withheld_in_countries': legacy_data.get('withheld_in_countries', []),
        # Professional data
        'professional_type': professional_data.get('professional_type', ''),
        # Add more fields from professional_data if needed
        # Other user data fields
        'smart_blocked_by': user_data.get('smart_blocked_by', False),
        'smart_blocking': user_data.get('smart_blocking', False),
        'has_hidden_likes_on_profile': user_data.get('has_hidden_likes_on_profile', False),
        'has_hidden_subscriptions_on_profile': user_data.get('has_hidden_subscriptions_on_profile', False),
        'is_identity_verified': user_data.get('verification_info', {}).get('is_identity_verified', False),
        'verified_since_msec': user_data.get('verification_info', {}).get('reason', {}).get('verified_since_msec'),
        'can_highlight_tweets': user_data.get('highlights_info', {}).get('can_highlight_tweets', False),
        'highlighted_tweets': user_data.get('highlights_info', {}).get('highlighted_tweets', '0'),
        'creator_subscriptions_count': user_data.get('creator_subscriptions_count', 0)
    }

    # Flatten entities' URLs
    urls = legacy_data.get('entities', {}).get('url', {}).get('urls', [])
    for i, url_info in enumerate(urls):
        combined_data[f'url_{i}_display'] = url_info.get('display_url', '')
        combined_data[f'url_{i}_expanded'] = url_info.get('expanded_url', '')
        combined_data[f'url_{i}_url'] = url_info.get('url', '')

    return combined_data

def extract_following_data(user_name, json_data):
    all_data = []
    instructions = json_data['data']['user']['result']['timeline']['timeline']['instructions']
    for instruction in instructions:
        if instruction['type'] == 'TimelineAddEntries':
            following_entries = instruction['entries']
            for entry in following_entries:
                if 'content' in entry and 'itemContent' in entry['content']:
                    following_user = entry['content']['itemContent'].get('user_results', {})
                    if following_user:
                        user_data = extract_user_data(following_user)
                        all_data.append(user_data)
    return all_data

def process_json_files(directory):
    all_following_data = []

    for file in os.listdir(directory):
        if file.endswith('.json'):
            file_path = os.path.join(directory, file)
            json_data = read_json_file(file_path)
            following_data = extract_following_data(file.split('-')[0], json_data)
            all_following_data.extend(following_data)

    return pd.DataFrame(all_following_data)

directory = '../twitter/followers/'

df = process_json_files(directory)

df.to_csv('./clean_csvs/users_other.csv', index=False)

In [55]:
from pyvis.network import Network
import pandas as pd

users_df = pd.read_csv('users.csv')
users_others_df = pd.read_csv('users_other.csv')
print('users_others_df',users_others_df.shape)
users_others_df = users_others_df[users_others_df['is_blue_verified'] == True]
print('users_others_df',users_others_df.shape)

network_df = pd.read_csv('user_following.csv', nrows=10000)
print('network_df',network_df.shape)
network_df = network_df[network_df['following'].isin(users_others_df['screen_name'])]
network_df['following_in_degree'] = network_df['following'].map(network_df['following'].value_counts())
print('network_df',network_df.shape)
print(network_df.head())
network_df = network_df[network_df['following_in_degree'] > 10]
print('network_df',network_df.shape)

# Initialize the Network
net = Network(height="750px", width="100%", directed=True)

# Add nodes and edges
for index, row in network_df.iterrows():
    net.add_node(row['user'], label=row['user'], value=1)
    net.add_node(row['following'], label=row['following'], value=row['following_in_degree'])
    net.add_edge(row['user'], row['following'])

# Generate the graph
net.show("twitter_network.html", notebook=False)

users_others_df (28891, 33)
users_others_df (5784, 33)
network_df (10000, 2)
network_df (2845, 3)
               user    following  following_in_degree
5   Chinamission2un       nypost                    1
6   Chinamission2un      thehill                    1
9   Chinamission2un          NPR                    2
10  Chinamission2un         VP45                    3
13  Chinamission2un  LeoDiCaprio                    2
network_df (214, 3)
twitter_network.html


  users_others_df = pd.read_csv('users_other.csv')


In [66]:
from pyvis.network import Network
import pandas as pd

users_df = pd.read_csv('users.csv')
users_others_df = pd.read_csv('users_other.csv')
print('users_others_df',users_others_df.shape)
users_others_df = users_others_df[users_others_df['is_blue_verified'] == True]
print('users_others_df',users_others_df.shape)

network_df = pd.read_csv('user_following.csv', nrows=1000)
print('network_df',network_df.shape)
network_df = network_df[network_df['following'].isin(users_df['screen_name'])]
network_df['following_in_degree'] = network_df['following'].map(network_df['following'].value_counts())
print('network_df',network_df.shape)
print(network_df.head())
network_df = network_df[network_df['following_in_degree'] > 2]
print('network_df',network_df.shape)

# Initialize the Network
net = Network(height="750px", width="100%", directed=True)

added_nodes = set()
# Add nodes and edges
for index, row in network_df.iterrows():
    net.add_node(row['following'], label=row['following'], value=row['following_in_degree'])
    added_nodes.add(row['following'])

# Add nodes and edges
for index, row in network_df.iterrows():
    if row['user'] not in added_nodes:
        net.add_node(row['user'], label=row['user'], value=1)
    net.add_edge(row['user'], row['following'])

# Generate the graph
net.show("twitter_network.html", notebook=False)

users_others_df (28891, 33)
users_others_df (5784, 33)
network_df (1000, 2)
network_df (139, 3)
               user        following  following_in_degree
20  Chinamission2un      WanmingYang                    2
22  Chinamission2un  China_Amb_India                    1
23  Chinamission2un   Chinaembmanila                    2
27  Chinamission2un       chenweihua                    1
30  Chinamission2un    BeijingReview                    1
network_df (24, 3)
twitter_network.html


  users_others_df = pd.read_csv('users_other.csv')


In [34]:
import pandas as pd

users_df = pd.read_csv('./clean_csvs/users.csv')
network_df = pd.read_csv('./clean_csvs/user_following.csv')
network_df = network_df[network_df['following'].isin(users_df['screen_name'])]
network_df['following_in_degree'] = network_df['following'].map(network_df['following'].value_counts())
canis_df = pd.read_csv('../data/CANIS_PRC_state_media_on_social_media_platforms-2023-11-03.csv')
canis_df['user'] = canis_df['X (Twitter) handle']
canis_df = canis_df[['user', 'Entity owner (English)']]
network_df = network_df.merge(canis_df, on='user')
network_df['user_entity'] = network_df['Entity owner (English)']
network_df = network_df[['user', 'following', 'following_in_degree', 'user_entity']]
canis_df['following'] = canis_df['user']
canis_df = canis_df[['following', 'Entity owner (English)']]
network_df = network_df.merge(canis_df, on='following')
network_df['following_entity'] = network_df['Entity owner (English)']
network_df = network_df[['user', 'following', 'following_in_degree', 'user_entity', 'following_entity']]
print('network_df',network_df.shape)
print(network_df.head())

network_df (27057, 5)
              user   following  following_in_degree  \
0     zhu_jingyang  IXiangshan                   35   
1    DiscoverJilin  IXiangshan                   35   
2        izhejiang  IXiangshan                   35   
3  ChinaEmbEritrea  IXiangshan                   35   
4        CaoYi_MFA  IXiangshan                   35   

                                         user_entity  \
0                        Ministry of Foreign Affairs   
1  Department of Culture and Tourism of Jilin Pro...   
2                                  China Daily Press   
3                        Ministry of Foreign Affairs   
4                        Ministry of Foreign Affairs   

             following_entity  
0  Zhejiang Media Group (ZMG)  
1  Zhejiang Media Group (ZMG)  
2  Zhejiang Media Group (ZMG)  
3  Zhejiang Media Group (ZMG)  
4  Zhejiang Media Group (ZMG)  


In [9]:
!pip3 install --upgrade nbformat

Defaulting to user installation because normal site-packages is not writeable
Collecting nbformat
  Downloading nbformat-5.9.2-py3-none-any.whl.metadata (3.4 kB)
Collecting fastjsonschema (from nbformat)
  Downloading fastjsonschema-2.19.0-py3-none-any.whl.metadata (2.0 kB)
Downloading nbformat-5.9.2-py3-none-any.whl (77 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.6/77.6 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fastjsonschema-2.19.0-py3-none-any.whl (23 kB)
Installing collected packages: fastjsonschema, nbformat
Successfully installed fastjsonschema-2.19.0 nbformat-5.9.2


In [59]:
import pandas as pd
import networkx as nx
import plotly.graph_objs as go
import random
import numpy as np


df = network_df.groupby(['user_entity', 'following_entity']).agg({'following_in_degree': 'sum'}).reset_index()
df = df[df['following_in_degree'] > 2500]
print(df['following_in_degree'].describe())

def get_pleasing_random_color():
    pleasing_colors = [
        '#FF5733',  # Vibrant Red
        '#33FF57',  # Vivid Green
        '#3357FF',  # Bright Blue
        '#FF33FF',  # Magenta
        '#FFFF33',  # Yellow
        '#33FFFF',  # Cyan
        '#FF7F50',  # Coral
        '#9370DB',  # Medium Purple
        '#3CB371',  # Medium Sea Green
        '#FFD700',  # Gold
        '#FF69B4',  # Hot Pink
        '#87CEEB',  # Sky Blue
        '#FF6347',  # Tomato
        '#40E0D0',  # Turquoise
        '#EE82EE',  # Violet
        '#DA70D6',  # Orchid
        '#6495ED',  # Cornflower Blue
        '#FFB6C1',  # Light Pink
        '#BC8F8F',  # Rosy Brown
        '#F08080',  # Light Coral
        '#7B68EE',  # Medium Slate Blue
        '#6B8E23',  # Olive Drab
    ]

    return random.choice(pleasing_colors)

color_dict = {}
def get_entity_color(entiry):
    if entiry not in color_dict:
        color_dict[entiry] = get_pleasing_random_color()
    return color_dict[entiry]

# Create a NetworkX graph from edge list
G = nx.from_pandas_edgelist(df, 'user_entity', 'following_entity')

# Manually set the 'group' attribute for each node
for _, row in df.iterrows():
    G.nodes[row['user_entity']]['color'] = get_entity_color(row['user_entity'])
    G.nodes[row['user_entity']]['size'] = np.log10(row['following_in_degree'])
    G.nodes[row['following_entity']]['color'] = get_entity_color(row['following_entity'])
    G.nodes[row['following_entity']]['size'] = np.log10(row['following_in_degree'])

# Compute 3D layout
spring_3D = nx.spring_layout(G, dim=3, seed=18)

# Node coordinates and colors
x_nodes = [spring_3D[node][0] for node in G.nodes()]  # x-coordinates
y_nodes = [spring_3D[node][1] for node in G.nodes()]  # y-coordinates
z_nodes = [spring_3D[node][2] for node in G.nodes()]  # z-coordinates
node_color = [G.nodes[node]['color'] for node in G.nodes()]  # Node colors

# Edge coordinates
x_edges = []
y_edges = []
z_edges = []
for edge in G.edges():
    x_coords = [spring_3D[edge[0]][0], spring_3D[edge[1]][0], None]
    x_edges += x_coords
    y_coords = [spring_3D[edge[0]][1], spring_3D[edge[1]][1], None]
    y_edges += y_coords
    z_coords = [spring_3D[edge[0]][2], spring_3D[edge[1]][2], None]
    z_edges += z_coords

traces = []
# Trace for edges
trace_edges = go.Scatter3d(x=x_edges, y=y_edges, z=z_edges, mode='lines',
                           line=dict(color='rgb(125,125,125)', width=1), hoverinfo='none')
traces.append(trace_edges)

# Group nodes by entity owner
grouped_nodes = df.groupby('user_entity')

# Create a trace for each group
for entity_owner, group in grouped_nodes:
    # Extract the nodes for the current entity owner
    group_nodes = [node for node in G.nodes() if G.nodes[node]['color'] == color_dict[entity_owner]]

    # Get the coordinates and colors for the nodes in the current group
    x = [spring_3D[node][0] for node in group_nodes]
    y = [spring_3D[node][1] for node in group_nodes]
    z = [spring_3D[node][2] for node in group_nodes]
    node_color = [G.nodes[node]['color'] for node in group_nodes]

    # Create a trace for the current group
    trace = go.Scatter3d(
        x=x, y=y, z=z, 
        mode='markers',
        marker=dict(
            symbol='circle', 
            size=5,  # Adjust size based on in-degree
            color=node_color, 
            line=dict(color='black', width=0.5)
        ),
        name=entity_owner,  # Name the trace as the entity owner
        hoverinfo='text'
    )
    traces.append(trace)


# Layout
layout = go.Layout(
    title="Network Graph",
    scene=dict(
        xaxis=dict(
            showbackground=True,
            showline=True,
            zeroline=True,
            showgrid=True,
            showticklabels=True,
            title='X-axis'
        ),
        yaxis=dict(
            showbackground=True,
            showline=True,
            zeroline=True,
            showgrid=True,
            showticklabels=True,
            title='Y-axis'
        ),
        zaxis=dict(
            showbackground=True,
            showline=True,
            zeroline=True,
            showgrid=True,
            showticklabels=True,
            title='Z-axis'
        )
    ),
    margin=dict(t=100),
    hovermode='closest'
)

# Figure
fig = go.Figure(data=traces, layout=layout)

# Show plot
fig.show()

count    1.138000e+03
mean     2.550460e+03
std      4.021138e+04
min      1.100000e+01
25%      4.600000e+01
50%      9.800000e+01
75%      3.070000e+02
max      1.326431e+06
Name: following_in_degree, dtype: float64


In [58]:
from pyvis.network import Network
import pandas as pd

# Initialize the Network
net = Network(height="750px", width="100%", directed=True)

df = network_df.groupby(['user_entity', 'following_entity']).agg({'following_in_degree': 'sum'}).reset_index()
print(df.shape)
df = df[:100]
print(df.shape)

added_nodes = set()
# Add nodes and edges
for index, row in df.iterrows():
    net.add_node(row['following_entity'],
                 label=row['following_entity'],
                 value=row['following_in_degree'],
                 color=get_entity_color(row['following_entity'])
                 )
    added_nodes.add(row['following_entity'])

# Add nodes and edges
for index, row in df.iterrows():
    if row['user_entity'] not in added_nodes:
        net.add_node(row['user_entity'], 
                     label=row['user_entity'],
                     value=1,
                     color=get_entity_color(row['user_entity']))
    net.add_edge(row['user_entity'], row['following_entity'])

# Generate the graph
net.show("twitter_network.html", notebook=False)

(1158, 3)
(100, 3)
twitter_network.html
