In [8]:
import pandas as pd
# turn off scientific notation for large id numbers
pd.set_option('display.float_format', lambda x: '%.0f' % x)

sampled = pd.read_csv("sampled_dataset_full_column.csv", lineterminator='\n')
print("Number of all sampled tweets", len(sampled))

Number of all sampled tweets 163204


In [9]:
# Get number of tweets with repeated text (retweets)
retweets_all = sampled["text"].duplicated()
number_all = len(retweets_all[retweets_all == True])
print("Total retweets number",number_all)

Total retweets number 65575


In [10]:
# Filter tweets by time period: all until March 13th
time_filtered = sampled[(sampled['tweetcreatedts'] < '2022-03-13 24:59:59+00:00')]
time_filtered.shape

(53158, 14)

In [47]:
# Get number of tweets with repeated text (retweets) in filtered dataset
retweets = time_filtered["text"].duplicated()
number = len(retweets[retweets == True])
print("Total retweets number until March 13th:", number)

Total retweets number until March 13th: 17255


In [12]:
# Get only duplicates (removing tweets without retweets for easier network analysis)
duplicates = time_filtered.loc[retweets]
duplicates.shape

(17255, 14)

In [13]:
# Dictionary for text id for easier work with nodes in network
unique = duplicates["text"].drop_duplicates(keep="first")
unique_hash = {text: i for i, text in unique.items()}
unique_keys = unique_hash.keys() #texts
unique_values = unique_hash.values() #indexes

In [14]:
# Insert text id column to the dataset 
duplicates.insert(loc=2, column="textid", value=[unique_hash[x] for x in duplicates["text"]])

In [15]:
# all_columns = ["userid",'username', 'location', 'tweetid','tweetcreatedts', 'retweetcount', 'hashtags', 'text', 'usercreatedts','following', 'followers', 'totaltweets']
# columns = ["userid",'tweetid','tweetcreatedts', 'hashtags', 'text', 'usercreatedts','following', 'followers', 'totaltweets']

In [22]:
# Get nodes for bipartite network
users = duplicates["userid"]
tweets = duplicates["textid"]

# Create edge list for bipartite network
column_e = ["userid", "textid"]
edgelist = [(user, textid) for user, textid in duplicates[columns].values]

edges = pd.DataFrame(edgelist,columns = column_e)

edges.to_csv("edgelist_UtoT.csv",sep = ",", header = True, encoding = "UTF-8",index=False)

In [24]:
import networkx as nx
import matplotlib.pyplot as plt

B = nx.Graph()
B.add_nodes_from(users, bipartite=0)
B.add_nodes_from(tweets, bipartite=1)
B.add_edges_from(edgelist)

In [36]:
from networkx.algorithms import bipartite

users_nodes = {n for n, a in B.nodes(data=True) if a["bipartite"] == 0}
tweets_nodes = set(B) - users_nodes

B_users = bipartite.projected_graph(B, users_nodes)
B_tweets = bipartite.projected_graph(B, tweets_nodes)
# 16315 256226
#5244 1114

B_users_w = bipartite.weighted_projected_graph(B, users_nodes)
B_tweets_w = bipartite.weighted_projected_graph(B, tweets_nodes)

print(nx.info(B_users_w))
print(nx.info(B_tweets_w))


Graph with 16315 nodes and 256226 edges
Graph with 5244 nodes and 1114 edges


In [37]:
# undirected, unweighted
nx.is_directed(B_users)
nx.is_weighted(B_users_w)

True

In [57]:
# write edgelist of projected network
nx.write_weighted_edgelist(B_users, "projected_w_user_edgelist.csv", delimiter=',', encoding='utf-8')


## Compare two timeframes

In [50]:
# Filter tweets by time period: before and after the tweeter ban 
time_before = duplicates[(duplicates['tweetcreatedts'] < '2022-03-04 24:59:59+00:00')]
time_after = duplicates[(duplicates['tweetcreatedts'] > '2022-03-04 24:59:59+00:00')]
time_before.shape #23156 tweets -> 6505 replicated retweets
time_after.shape  #30002 tweets -> 10750 replicated retweets

(6505, 15)

### (1) timeframe1: before

In [55]:
# Dictionary for text id for easier work with nodes in network
unique = time_before["text"].drop_duplicates(keep="first")
unique_hash = {text: i for i, text in unique.items()}
unique_keys = unique_hash.keys() #texts
unique_values = unique_hash.values() #indexes

# Get nodes for bipartite network
users = time_before["userid"]
tweets = time_before["textid"]

# Create edge list for bipartite network
column_e = ["userid", "textid"]
edgelist = [(user, textid) for user, textid in time_before[columns].values]

edges = pd.DataFrame(edgelist,columns = column_e)

edges.to_csv("edgelist_UtoT_t1.csv",sep = ",", header = True, encoding = "UTF-8",index=False)

import networkx as nx
import matplotlib.pyplot as plt

B1 = nx.Graph()
B1.add_nodes_from(users, bipartite=0)
B1.add_nodes_from(tweets, bipartite=1)
B1.add_edges_from(edgelist)
from networkx.algorithms import bipartite

users_nodes = {n for n, a in B1.nodes(data=True) if a["bipartite"] == 0}
tweets_nodes = set(B1) - users_nodes

B1_users = bipartite.projected_graph(B1, users_nodes)
B1_tweets = bipartite.projected_graph(B1, tweets_nodes)
# 16315 256226
#5244 1114

B1_users_w = bipartite.weighted_projected_graph(B1, users_nodes)
B1_tweets_w = bipartite.weighted_projected_graph(B1, tweets_nodes)

print(nx.info(B1_users_w))
print(nx.info(B1_tweets_w))

# write edgelist of projected network
nx.write_weighted_edgelist(B1_users, "projected_w_user_edgelist_1.csv", delimiter=',', encoding='utf-8')


Graph with 6398 nodes and 68226 edges
Graph with 2267 nodes and 107 edges


### (2) timeframe2: after 

In [56]:
# Dictionary for text id for easier work with nodes in network
unique = time_after["text"].drop_duplicates(keep="first")
unique_hash = {text: i for i, text in unique.items()}
unique_keys = unique_hash.keys() #texts
unique_values = unique_hash.values() #indexes

# Get nodes for bipartite network
users = time_after["userid"]
tweets = time_after["textid"]

# Create edge list for bipartite network
column_e = ["userid", "textid"]
edgelist = [(user, textid) for user, textid in time_after[columns].values]

edges = pd.DataFrame(edgelist,columns = column_e)

edges.to_csv("edgelist_UtoT_t2.csv",sep = ",", header = True, encoding = "UTF-8",index=False)

import networkx as nx
import matplotlib.pyplot as plt

B2 = nx.Graph()
B2.add_nodes_from(users, bipartite=0)
B2.add_nodes_from(tweets, bipartite=1)
B2.add_edges_from(edgelist)
from networkx.algorithms import bipartite

users_nodes = {n for n, a in B2.nodes(data=True) if a["bipartite"] == 0}
tweets_nodes = set(B2) - users_nodes

B2_users = bipartite.projected_graph(B2, users_nodes)
B2_tweets = bipartite.projected_graph(B2, tweets_nodes)
# 16315 256226
#5244 1114

B2_users_w = bipartite.weighted_projected_graph(B2, users_nodes)
B2_tweets_w = bipartite.weighted_projected_graph(B2, tweets_nodes)

print(nx.info(B2_users_w))
print(nx.info(B2_tweets_w))

# write edgelist of projected network
nx.write_weighted_edgelist(B2_users, "projected_w_user_edgelist_2.csv", delimiter=',', encoding='utf-8')


Graph with 10164 nodes and 173330 edges
Graph with 3190 nodes and 670 edges


In [None]:
# Hierarchial clustering 

# from collections import defaultdict
# import numpy as np
# from scipy.cluster import hierarchy
# from scipy.spatial import distance
# from builtins import next

# def create_hc(G, t):
#     """Creates hierarchical cluster of graph G from distance matrix"""
#     path_length = nx.all_pairs_shortest_path_length(G)
#     distances = np.zeros((len(G), len(G)))
#     for u, p in path_length:
#         for v, d in p.items():
#             distances[u][v] = d
#     # Create hierarchical cluster
#     Y = distance.squareform(distances)
#     Z = hierarchy.complete(Y)  # Creates HC using farthest point linkage
    
#     membership = list(hierarchy.fcluster(Z, t = t))
    
#     # Create collection of lists for blockmodel
#     partition = defaultdict(list)
#     for n, p in zip(list(range(len(G))), membership):
#         partition[p].append(n)
#     return list(partition.values())

# # Users network
# G = B_users.copy()

# x = nx.connected_components(G)
# sub = next(x)

# # Extract largest connected component into graph H
# H = G.subgraph(sub)
# # Makes life easier to have consecutively labeled integer nodes
# H = nx.convert_node_labels_to_integers(H)
# # Create parititions with hierarchical clustering
# partitions = create_hc(H, 0.5)
# # Build blockmodel graph
# BM = nx.quotient_graph(H, partitions, relabel=True)

# # Draw block model with nodes sized by number of internal nodes
# # node_size = [H.nodes[x]["nnodes"] * 100 for x in H.nodes()]
# nx.draw(B_users, width=0.2, with_labels=False)
# # node_size = [BM.nodes[x]["nnodes"] * 100 for x in BM.nodes()]
# # nx.draw(BM, node_size=node_size, width=0.2, with_labels=False)
# plt.title("One-mode network of users")
# plt.savefig("users_net.png")
# plt.show()

In [73]:
# Users network
fig,ax = plt.subplots(1,1,figsize=(20,15))
node_size = [x for x in nx.degree_centrality(B_users)]
pos = nx.spring_layout(B_users, k=0.3)
nx.draw_networkx(B_users, pos, node_size=node_size, width=0.1, with_labels = False,alpha=0.7,edgecolors='w', edge_color='#999990')
ax.set_title('One-mode users network', fontsize=16)
ax.set_axis_off()
plt.savefig("users_net.png")
plt.show()