In [1]:
import json
import os

import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import normalize

In [2]:
# Load the user-timeline dump into a DataFrame.
retweet_df = pd.read_json(path_or_buf="dataset/user_timeline.json")

In [3]:
def _parse_json_field(value):
    """Decode one JSON-encoded cell into a Python object.

    Parameters
    ----------
    value : str, bytes, bytearray, or any already-decoded / missing value
        Raw cell content taken from the timeline DataFrame.

    Returns
    -------
    object or None
        The decoded object for JSON text (decoded with ``strict=False`` so
        control characters inside JSON strings are accepted); ``None`` for
        empty or missing values (``None``, NaN, ``""``, empty containers);
        the value itself when it is not JSON text, which makes re-running
        this cell on already-decoded columns safe.

    Raises
    ------
    json.JSONDecodeError
        If the text is not valid JSON.
    """
    # NaN is truthy, so it has to be caught before the generic emptiness test.
    if isinstance(value, float) and np.isnan(value):
        return None
    if value is None or not value:
        return None
    if isinstance(value, (str, bytes, bytearray)):
        return json.loads(value, strict=False)
    return value


# Decode the JSON-encoded user and original-tweet columns.
retweet_df['author'] = retweet_df['author'].apply(_parse_json_field)
retweet_df['origin_tweet'] = retweet_df['origin_tweet'].apply(_parse_json_field)

# Keep only rows whose text starts with 'RT' and that carry an original tweet.
# `na=False` treats missing content as "not a retweet" instead of raising.
is_retweet = retweet_df['content'].str.startswith('RT', na=False)
has_origin_tweet = retweet_df['origin_tweet'].notna()
retweet_df = retweet_df.loc[is_retweet & has_origin_tweet].reset_index(drop=True)

In [4]:
# Flatten every retweet into one row.
#   author_id / author_name   -> user who wrote the original tweet
#   retweet_id / retweet_name -> user who retweeted it (the row's own `author`)
origin_authors = [origin_tweet['author'] for origin_tweet in retweet_df['origin_tweet']]
retweeters = list(retweet_df['author'])

df = pd.DataFrame({
    'tweet_id': list(retweet_df['tweet_id']),
    'publish_time': list(retweet_df['publish_time']),
    'author_id': [origin_author['user_id'] for origin_author in origin_authors],
    'author_name': [origin_author['name'] for origin_author in origin_authors],
    'retweet_id': [retweeter['user_id'] for retweeter in retweeters],
    'retweet_name': [retweeter['name'] for retweeter in retweeters],
})

# Pin the width explicitly: plain 'int' is the platform C long under NumPy < 2
# (32-bit on Windows), which is too narrow for the user IDs in this dataset.
df = df.astype({'author_id': 'int64', 'retweet_id': 'int64'})

In [5]:
# Keep only the retweets made by users whose total retweet count is strictly
# greater than the threshold.
retweet_count_threshold = 10

# One row per retweeting user with the number of retweets they made.
retweet_distribution_df = (
    df.groupby('retweet_id')['retweet_id']
    .count()
    .to_frame('count')
    .reset_index()
)

above_threshold = retweet_distribution_df['count'] > retweet_count_threshold
active_retweet_ids = retweet_distribution_df.loc[above_threshold, 'retweet_id'].tolist()
target_df = df.loc[df['retweet_id'].isin(active_retweet_ids)]

In [6]:
# Count how often each retweeting user retweeted each original author, then
# assign every distinct user ID a dense integer code (used as matrix indices).
retweet_author_df = (
    target_df.groupby('retweet_id')['author_id']
    .value_counts()
    .to_frame('count')
    .reset_index()
)

unique_author_id_list = list(set(retweet_author_df['author_id'].tolist()))
unique_retweet_id_list = list(set(retweet_author_df['retweet_id'].tolist()))

# Forward lookups (user ID -> code) for both roles, plus the reverse lookup
# (code -> user ID) for retweeting users.
author_to_code = {author_id: code for code, author_id in enumerate(unique_author_id_list)}
retweet_to_code = {retweet_id: code for code, retweet_id in enumerate(unique_retweet_id_list)}
code_to_retweet = dict(enumerate(unique_retweet_id_list))

retweet_author_df['author_code'] = retweet_author_df['author_id'].apply(author_to_code.__getitem__)
retweet_author_df['retweet_code'] = retweet_author_df['retweet_id'].apply(retweet_to_code.__getitem__)

In [7]:
# Build the count matrix: rows are retweeting users, columns are original
# authors, and each cell holds how many times that user retweeted that author.
n_retweet_users = retweet_author_df['retweet_code'].nunique()
n_authors = retweet_author_df['author_code'].nunique()
retweet_author_array = np.zeros((n_retweet_users, n_authors))

# Each (retweet_code, author_code) pair occurs once in retweet_author_df, so a
# single fancy-index assignment fills the same cells as a row-by-row loop.
row_codes = retweet_author_df['retweet_code'].to_numpy()
col_codes = retweet_author_df['author_code'].to_numpy()
retweet_author_array[row_codes, col_codes] = retweet_author_df['count'].to_numpy()

In [8]:
# L1-normalize each row so a user's counts become proportions over authors.
norm_retweet_author_array = normalize(
    retweet_author_array,
    norm='l1',
    axis=1,
)

In [9]:
# Pairwise cosine similarity between the rows (retweeting users) of the
# normalized matrix; the result is a square user-by-user array.
retweet_author_sim = cosine_similarity(X=norm_retweet_author_array)

In [13]:
# Flatten the similarity matrix into a long-form table with one row per
# ordered pair of distinct users (both (i, j) and (j, i) are kept; the
# diagonal is skipped). No similarity threshold is applied here.
user_count = len(retweet_author_sim)
ordered_pairs = [
    (row_code, col_code)
    for row_code in range(user_count)
    for col_code in range(user_count)
    if row_code != col_code
]

retweet_author_sim_df = pd.DataFrame({
    'user_i': [code_to_retweet[row_code] for row_code, _ in ordered_pairs],
    'user_j': [code_to_retweet[col_code] for _, col_code in ordered_pairs],
    'sim': [retweet_author_sim[row_code, col_code] for row_code, col_code in ordered_pairs],
})

In [11]:
# Preview the first rows of the pairwise similarity table.
retweet_author_sim_df.head(n=5)

Unnamed: 0,user_i,user_j,sim
0,702433901394993152,1463549953,0.0
1,702433901394993152,705766540952657922,0.0
2,702433901394993152,706172170301407233,0.077968
3,702433901394993152,707910187248525312,0.0
4,702433901394993152,710179147956559873,0.0


In [14]:
# Persist the pairwise similarity table. `to_csv` does not create missing
# directories, so make sure the target folder exists first.
os.makedirs("outputs", exist_ok=True)
retweet_author_sim_df.to_csv("outputs/retweet_author_sim_df.csv", index=False)