# Setup

In [1]:
import pandas as pd
import numpy as np
import networkx as nx
from sklearn.preprocessing import MinMaxScaler
from pylab import rcParams

In [2]:
users_df = pd.read_excel(r'./old_data/userdata.xlsx')
network_df = pd.read_excel(r'./old_data/network.xlsx')

# If index column remains
#users_df.drop(users_df.columns[0], axis=1, inplace=True)
#network_df.drop(network_df.columns[0], axis=1, inplace=True)

In [3]:
users_df.head(10)

Unnamed: 0,user,listed,followers
0,SportChic_,0,137
1,HumbleAttitude1,28,3422
2,kjerbellis,0,11
3,InfogolApp,511,32357
4,DukeDFS,48,2363
5,the12gameparlay,0,5
6,b0br0vsky,0,126
7,lookn4aFULLRIDE,3,3406
8,Mateusz74968284,0,0
9,perilofafrica,17,1070


In [4]:
network_df.head(10)

Unnamed: 0,Source,Sourceid,Target,Targetid,type_of_content
0,ArijitFan1,1355121831372615683,ArijitFan1,1355121831372615683,Tweet
1,USMNTCORNER,1172274702049583105,USMNTCORNER,1172274702049583105,Tweet
2,ber2reh,1207348737258987520,Benzema,1964571728,Retweet
3,KB9_GOAT,1339101169042518017,Benzema,1964571728,Retweet
4,HfutbolW,1274615075723268097,HfutbolW,1274615075723268097,Tweet
5,miles__69,1181660258118389760,Royaltycfc,1050069385094811648,Retweet
6,JudyLazo,410740338,nclarkrd,103933553,Retweet
7,BanglaMostbet,1287653327082315777,BanglaMostbet,1287653327082315777,Tweet
8,FootAggregator,1204015945422909445,Genting_Bet,4201010939,Retweet
9,FootAggregator,1204015945422909445,GuillemBalague,26759984,Retweet


In [5]:
network_df.rename(columns={'Source': 'source', 'Sourceid': 'source_id', 'Target': 'target',
                          'Targetid': 'target_id'}, inplace=True)

In [6]:
network_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5816 entries, 0 to 5815
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   source           5816 non-null   object
 1   source_id        5816 non-null   int64 
 2   target           5816 non-null   object
 3   target_id        5816 non-null   int64 
 4   type_of_content  5816 non-null   object
dtypes: int64(2), object(3)
memory usage: 227.3+ KB


In [7]:
network_df['type_of_content'].value_counts()

Retweet    4475
Tweet       905
Mention     353
Reply        83
Name: type_of_content, dtype: int64

In [8]:
network_df['type_of_content'].replace(['Retweet', 'Mention', 'Reply'], 'Non-Tweet', inplace=True)

In [9]:
network_df['type_of_content'].value_counts()

Non-Tweet    4911
Tweet         905
Name: type_of_content, dtype: int64

In [10]:
network_df.head(10)

Unnamed: 0,source,source_id,target,target_id,type_of_content
0,ArijitFan1,1355121831372615683,ArijitFan1,1355121831372615683,Tweet
1,USMNTCORNER,1172274702049583105,USMNTCORNER,1172274702049583105,Tweet
2,ber2reh,1207348737258987520,Benzema,1964571728,Non-Tweet
3,KB9_GOAT,1339101169042518017,Benzema,1964571728,Non-Tweet
4,HfutbolW,1274615075723268097,HfutbolW,1274615075723268097,Tweet
5,miles__69,1181660258118389760,Royaltycfc,1050069385094811648,Non-Tweet
6,JudyLazo,410740338,nclarkrd,103933553,Non-Tweet
7,BanglaMostbet,1287653327082315777,BanglaMostbet,1287653327082315777,Tweet
8,FootAggregator,1204015945422909445,Genting_Bet,4201010939,Non-Tweet
9,FootAggregator,1204015945422909445,GuillemBalague,26759984,Non-Tweet


In [11]:
non_tweet_df = network_df[network_df['type_of_content'] == 'Non-Tweet']
tweet_df = network_df[network_df['type_of_content'] == 'Tweet']

In [12]:
# Calculate number of tweets
# user_df['tweets'] = 
users_df['tweets'] = pd.to_numeric(tweet_df.groupby('source')['type_of_content'].transform('count'))

In [13]:
# Calculate non-tweets sent and received
users_df['non_tweets_sent'] = pd.to_numeric(non_tweet_df.groupby('source')['type_of_content'].transform('count'))
users_df['non_tweets_received'] = pd.to_numeric(non_tweet_df.groupby('target')['type_of_content'].transform('count'))

# Graph

In [14]:
users_df.fillna(0, inplace=True)
users_df

Unnamed: 0,user,listed,followers,tweets,non_tweets_sent,non_tweets_received
0,SportChic_,0,137,2.0,0.0,0.0
1,HumbleAttitude1,28,3422,1.0,0.0,0.0
2,kjerbellis,0,11,0.0,1.0,2764.0
3,InfogolApp,511,32357,0.0,1.0,2764.0
4,DukeDFS,48,2363,3.0,0.0,0.0
...,...,...,...,...,...,...
4734,Sidneyyyyy__,0,78,0.0,13.0,1.0
4735,adityatrd,4,513,0.0,1.0,2764.0
4736,santiagos83,10,882,0.0,1.0,2764.0
4737,brtop7,1,76,0.0,1.0,2764.0


In [15]:
G = nx.DiGraph()

edges = list()
for (a,b) in zip(non_tweet_df['source'], non_tweet_df['target']):
    edges.append((a,b))

G.add_edges_from(edges)
G

<networkx.classes.digraph.DiGraph at 0x1f47b1245c8>

In [16]:
# Plot graph
'''
rcParams['figure.figsize'] = 14, 10
pos = nx.spring_layout(G, scale=20, k=3/np.sqrt(G.order()))
nx.draw(G, pos, node_color='lightblue')
'''

"\nrcParams['figure.figsize'] = 14, 10\npos = nx.spring_layout(G, scale=20, k=3/np.sqrt(G.order()))\nnx.draw(G, pos, node_color='lightblue')\n"

In [17]:
degree_centrality=pd.DataFrame.from_dict(nx.degree_centrality(G), orient='index').reset_index()
degree_centrality

Unnamed: 0,index,0
0,ber2reh,0.000232
1,Benzema,0.641746
2,KB9_GOAT,0.000232
3,miles__69,0.000232
4,Royaltycfc,0.009984
...,...,...
4303,Sidneyyyyy__,0.000232
4304,adityatrd,0.000232
4305,santiagos83,0.000232
4306,brtop7,0.000232


In [18]:
closeness_centrality=pd.DataFrame.from_dict(nx.closeness_centrality(G), orient='index').reset_index()
closeness_centrality

Unnamed: 0,index,0
0,ber2reh,0.000000
1,Benzema,0.641746
2,KB9_GOAT,0.000000
3,miles__69,0.000000
4,Royaltycfc,0.009519
...,...,...
4303,Sidneyyyyy__,0.000000
4304,adityatrd,0.000000
4305,santiagos83,0.000000
4306,brtop7,0.000000


In [19]:
betweenness_centrality=pd.DataFrame.from_dict(nx.betweenness_centrality(G), orient='index').reset_index()
betweenness_centrality

Unnamed: 0,index,0
0,ber2reh,0.0
1,Benzema,0.0
2,KB9_GOAT,0.0
3,miles__69,0.0
4,Royaltycfc,0.0
...,...,...
4303,Sidneyyyyy__,0.0
4304,adityatrd,0.0
4305,santiagos83,0.0
4306,brtop7,0.0


In [20]:
network_measures = degree_centrality.merge(closeness_centrality, on='index').merge(betweenness_centrality, on='index')
network_measures

Unnamed: 0,index,0_x,0_y,0
0,ber2reh,0.000232,0.000000,0.0
1,Benzema,0.641746,0.641746,0.0
2,KB9_GOAT,0.000232,0.000000,0.0
3,miles__69,0.000232,0.000000,0.0
4,Royaltycfc,0.009984,0.009519,0.0
...,...,...,...,...
4303,Sidneyyyyy__,0.000232,0.000000,0.0
4304,adityatrd,0.000232,0.000000,0.0
4305,santiagos83,0.000232,0.000000,0.0
4306,brtop7,0.000232,0.000000,0.0


In [21]:
network_measures.columns = ['user', 'degree', 'closeness', 'betweenness']

In [22]:
network_measures = network_measures.merge(users_df, on='user')
network_measures

Unnamed: 0,user,degree,closeness,betweenness,listed,followers,tweets,non_tweets_sent,non_tweets_received
0,ber2reh,0.000232,0.000000,0.0,0,41,0.0,2.0,12.0
1,Benzema,0.641746,0.641746,0.0,5247,12130878,0.0,38.0,1.0
2,KB9_GOAT,0.000232,0.000000,0.0,0,3,0.0,2.0,2.0
3,miles__69,0.000232,0.000000,0.0,0,4879,2.0,0.0,0.0
4,Royaltycfc,0.009984,0.009519,0.0,6,4623,14.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...
4302,Sidneyyyyy__,0.000232,0.000000,0.0,0,78,0.0,13.0,1.0
4303,adityatrd,0.000232,0.000000,0.0,4,513,0.0,1.0,2764.0
4304,santiagos83,0.000232,0.000000,0.0,10,882,0.0,1.0,2764.0
4305,brtop7,0.000232,0.000000,0.0,1,76,0.0,1.0,2764.0


In [23]:
network_measures.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4307 entries, 0 to 4306
Data columns (total 9 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   user                 4307 non-null   object 
 1   degree               4307 non-null   float64
 2   closeness            4307 non-null   float64
 3   betweenness          4307 non-null   float64
 4   listed               4307 non-null   int64  
 5   followers            4307 non-null   int64  
 6   tweets               4307 non-null   float64
 7   non_tweets_sent      4307 non-null   float64
 8   non_tweets_received  4307 non-null   float64
dtypes: float64(6), int64(2), object(1)
memory usage: 336.5+ KB


In [24]:
network_measures.set_index('user', inplace=True)

In [25]:
'''
d_network_measures = pd.DataFrame(columns=['A_user', 'B_user', 'dAB_degree', 'dAB_closeness', 'dAB_betweenness', 'dAB_listed', 
                                          'dAB_followers', 'dAB_tweets', 'dAB_non_tweets_sent', 'dAB_non_tweets_received'])
for user_a, row_a in network_measures.iterrows():
    for user_b, row_b in network_measures.iterrows():
        d_network_measures.append({
            'A_user': user_a,
            'B_user': user_b,
            'dAB_degree': row_a['degree'] - row_b['degree'],
            'dAB_closeness': row_a['closeness'] - row_b['closeness'],
            'dAB_betweenness': row_a['betweenness'] - row_b['betweenness'],
            'dAB_listed': row_a['listed'] - row_b['listed'],
            'dAB_followers': row_a['followers'] - row_b['followers'],
            'dAB_tweets': row_a['tweets'] - row_b['tweets'],
            'dAB_non_tweets_sent': row_a['non_tweets_sent'] - row_b['non_tweets_sent'],
            'dAB_non_tweets_received': row_a['non_tweets_received'] - row_b['non_tweets_received'],
        }, ignore_index=True)
d_network_measures
'''

"\nd_network_measures = pd.DataFrame(columns=['A_user', 'B_user', 'dAB_degree', 'dAB_closeness', 'dAB_betweenness', 'dAB_listed', \n                                          'dAB_followers', 'dAB_tweets', 'dAB_non_tweets_sent', 'dAB_non_tweets_received'])\nfor user_a, row_a in network_measures.iterrows():\n    for user_b, row_b in network_measures.iterrows():\n        d_network_measures.append({\n            'A_user': user_a,\n            'B_user': user_b,\n            'dAB_degree': row_a['degree'] - row_b['degree'],\n            'dAB_closeness': row_a['closeness'] - row_b['closeness'],\n            'dAB_betweenness': row_a['betweenness'] - row_b['betweenness'],\n            'dAB_listed': row_a['listed'] - row_b['listed'],\n            'dAB_followers': row_a['followers'] - row_b['followers'],\n            'dAB_tweets': row_a['tweets'] - row_b['tweets'],\n            'dAB_non_tweets_sent': row_a['non_tweets_sent'] - row_b['non_tweets_sent'],\n            'dAB_non_tweets_received': row_a

# Score 

In [26]:
feature_importance = pd.read_csv('.\data\\feature_importance.csv')
feature_importance = feature_importance.T
feature_importance.columns = feature_importance.iloc[0,:]
feature_importance.drop(feature_importance.index[0], inplace=True)
feature_importance

Unnamed: 0,dAB_follower,dAB_listed,dAB_tweets,dAB_degree,dAB_non_tweets_sent,dAB_non_tweets_received
Importance,0.09368,0.478216,0.077184,0.148541,0.07966,0.122718


In [27]:
feature_importance['dAB_follower'].values[0]

0.0936803743243217

**Normalize the dataframe**

In [28]:
scaled_features = MinMaxScaler().fit_transform(network_measures.values)

In [29]:
normalized_network_measures = pd.DataFrame(scaled_features, index=network_measures.index, columns=network_measures.columns)
normalized_network_measures

Unnamed: 0_level_0,degree,closeness,betweenness,listed,followers,tweets,non_tweets_sent,non_tweets_received
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
ber2reh,0.000000,0.000000,0.0,0.000000,4.476422e-07,0.000000,0.052632,0.004342
Benzema,1.000000,1.000000,0.0,0.035562,1.324462e-01,0.000000,1.000000,0.000362
KB9_GOAT,0.000000,0.000000,0.0,0.000000,3.275431e-08,0.000000,0.052632,0.000724
miles__69,0.000000,0.000000,0.0,0.000000,5.326942e-05,0.076923,0.000000,0.000000
Royaltycfc,0.015201,0.014834,0.0,0.000041,5.047439e-05,0.538462,0.000000,0.000000
...,...,...,...,...,...,...,...,...
Sidneyyyyy__,0.000000,0.000000,0.0,0.000000,8.516120e-07,0.000000,0.342105,0.000362
adityatrd,0.000000,0.000000,0.0,0.000027,5.600987e-06,0.000000,0.026316,1.000000
santiagos83,0.000000,0.000000,0.0,0.000068,9.629766e-06,0.000000,0.026316,1.000000
brtop7,0.000000,0.000000,0.0,0.000007,8.297758e-07,0.000000,0.026316,1.000000


In [30]:
normalized_network_measures['score'] = (
    normalized_network_measures['followers']*feature_importance['dAB_follower'].values +
    normalized_network_measures['listed']*feature_importance['dAB_listed'].values + 
    normalized_network_measures['tweets']*feature_importance['dAB_tweets'].values + 
    normalized_network_measures['degree']*feature_importance['dAB_degree'].values + 
    normalized_network_measures['non_tweets_sent']*feature_importance['dAB_non_tweets_sent'].values + 
    normalized_network_measures['non_tweets_received']*feature_importance['dAB_non_tweets_received'].values)                 

normalized_network_measures['score'] = normalized_network_measures['score'].astype(np.float32)
normalized_network_measures

Unnamed: 0_level_0,degree,closeness,betweenness,listed,followers,tweets,non_tweets_sent,non_tweets_received,score
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
ber2reh,0.000000,0.000000,0.0,0.000000,4.476422e-07,0.000000,0.052632,0.004342,0.004725
Benzema,1.000000,1.000000,0.0,0.035562,1.324462e-01,0.000000,1.000000,0.000362,0.257660
KB9_GOAT,0.000000,0.000000,0.0,0.000000,3.275431e-08,0.000000,0.052632,0.000724,0.004281
miles__69,0.000000,0.000000,0.0,0.000000,5.326942e-05,0.076923,0.000000,0.000000,0.005942
Royaltycfc,0.015201,0.014834,0.0,0.000041,5.047439e-05,0.538462,0.000000,0.000000,0.043843
...,...,...,...,...,...,...,...,...,...
Sidneyyyyy__,0.000000,0.000000,0.0,0.000000,8.516120e-07,0.000000,0.342105,0.000362,0.027297
adityatrd,0.000000,0.000000,0.0,0.000027,5.600987e-06,0.000000,0.026316,1.000000,0.124827
santiagos83,0.000000,0.000000,0.0,0.000068,9.629766e-06,0.000000,0.026316,1.000000,0.124847
brtop7,0.000000,0.000000,0.0,0.000007,8.297758e-07,0.000000,0.026316,1.000000,0.124817


**Get top 100**

In [31]:
results = normalized_network_measures.nlargest(100, 'score', keep='all')
results

Unnamed: 0_level_0,degree,closeness,betweenness,listed,followers,tweets,non_tweets_sent,non_tweets_received,score
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
CNN,0.000000,0.000362,0.0,1.000000,0.581337,0.038462,0.000000,0.000000,0.535645
Reuters,0.000000,0.000362,0.0,0.855842,0.253763,0.653846,0.000000,0.000000,0.483517
YouTube,0.003981,0.004342,0.0,0.538246,0.797753,0.000000,0.026316,1.000000,0.457537
Cristiano,0.000362,0.000724,0.0,0.567403,1.000000,0.000000,0.026316,0.001809,0.367394
FCBarcelona,0.000000,0.000362,0.0,0.207373,0.392083,0.000000,0.026316,1.000000,0.260713
...,...,...,...,...,...,...,...,...,...
fentywhore,0.000000,0.000000,0.0,0.000454,0.000051,0.000000,0.026316,1.000000,0.125036
mr_jospinchrist,0.000000,0.000000,0.0,0.000447,0.000016,0.000000,0.026316,1.000000,0.125029
Lucy03075703,0.001448,0.000000,0.0,0.000000,0.000001,0.000000,0.026316,1.000000,0.125029
firebrnd13,0.001086,0.000000,0.0,0.000108,0.000003,0.000000,0.026316,1.000000,0.125027


# Save Results

In [32]:
results.to_csv(r'.\data\soccer_results.csv')

In [33]:
top_non_tweet_df = non_tweet_df.drop(columns=['source_id', 'target_id', 'type_of_content'])
top_non_tweet_df

Unnamed: 0,source,target
2,ber2reh,Benzema
3,KB9_GOAT,Benzema
5,miles__69,Royaltycfc
6,JudyLazo,nclarkrd
8,FootAggregator,Genting_Bet
...,...,...
5811,Sidneyyyyy__,Benzema
5812,adityatrd,Benzema
5813,santiagos83,Benzema
5814,brtop7,Benzema


In [34]:
top_non_tweet_df.to_csv(r'.\data\edges.csv')