# Setup

In [1]:
import pandas as pd
import numpy as np
import networkx as nx
from sklearn.preprocessing import MinMaxScaler
from pylab import rcParams

In [2]:
users_df = pd.read_excel(r'./data/userdata.xlsx') # Per user data
network_df = pd.read_excel(r'./data/network2.xlsx') # Twitter network data

# Delete index column
users_df.drop(users_df.columns[0], axis=1, inplace=True)
network_df.drop(network_df.columns[0], axis=1, inplace=True)

In [3]:
users_df.head(10)

Unnamed: 0,user,listed,followers
0,cloth_couture,0,1
1,Britany82060809,0,97
2,fifakitcreator,3,1476
3,DiscountTreasu1,0,133
4,Scarlet2Gray,0,42
5,salepricejo,0,12
6,IsabelBarreir12,0,9
7,MeSnooty,1,230
8,CFlirtwear,0,0
9,DreamsofaOwl,6,301


In [4]:
network_df.head(10)

Unnamed: 0,Source,Sourceid,Target,Targetid,type_of_content
0,mrsmcpuffin,60245403,FAMU_MBB,2726221854,Retweet
1,mrsmcpuffin,60245403,KingJames,23083404,Retweet
2,somenailstrips,1094314346593861633,WandaCo87259801,1117986510551302144,Retweet
3,somenailstrips,1094314346593861633,Poshmarkapp,357211620,Retweet
4,ja_corey14,3774742874,originelllly,1663859550,Retweet
5,BIGGMIKE904,66279549,HCWillieSimmons,29780461,Retweet
6,BIGGMIKE904,66279549,FAMU_MBB,2726221854,Retweet
7,BIGGMIKE904,66279549,KingJames,23083404,Retweet
8,_kingfree,1541895800,FAMU_MBB,2726221854,Retweet
9,_kingfree,1541895800,KingJames,23083404,Retweet


In [5]:
# Format names
network_df.rename(columns={'Source': 'source', 'Sourceid': 'source_id', 'Target': 'target',
                          'Targetid': 'target_id'}, inplace=True)

In [6]:
network_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6012 entries, 0 to 6011
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   source           6012 non-null   object
 1   source_id        6012 non-null   int64 
 2   target           6012 non-null   object
 3   target_id        6012 non-null   int64 
 4   type_of_content  6012 non-null   object
dtypes: int64(2), object(3)
memory usage: 235.0+ KB


In [7]:
network_df['type_of_content'].value_counts()

Retweet    2281
Mention    1902
Tweet      1694
Reply       135
Name: type_of_content, dtype: int64

In [8]:
# Merge retweet, mentions, and reply into non-tweets
network_df['type_of_content'].replace(['Retweet', 'Mention', 'Reply'], 'Non-Tweet', inplace=True)

In [9]:
network_df['type_of_content'].value_counts()

Non-Tweet    4318
Tweet        1694
Name: type_of_content, dtype: int64

In [10]:
network_df.head(10)

Unnamed: 0,source,source_id,target,target_id,type_of_content
0,mrsmcpuffin,60245403,FAMU_MBB,2726221854,Non-Tweet
1,mrsmcpuffin,60245403,KingJames,23083404,Non-Tweet
2,somenailstrips,1094314346593861633,WandaCo87259801,1117986510551302144,Non-Tweet
3,somenailstrips,1094314346593861633,Poshmarkapp,357211620,Non-Tweet
4,ja_corey14,3774742874,originelllly,1663859550,Non-Tweet
5,BIGGMIKE904,66279549,HCWillieSimmons,29780461,Non-Tweet
6,BIGGMIKE904,66279549,FAMU_MBB,2726221854,Non-Tweet
7,BIGGMIKE904,66279549,KingJames,23083404,Non-Tweet
8,_kingfree,1541895800,FAMU_MBB,2726221854,Non-Tweet
9,_kingfree,1541895800,KingJames,23083404,Non-Tweet


In [11]:
# Separate tweets and non-tweets
# Tweets used to calculate per user information
# Non-tweets used for graph edges
non_tweet_df = network_df[network_df['type_of_content'] == 'Non-Tweet']
tweet_df = network_df[network_df['type_of_content'] == 'Tweet']

In [12]:
# Calculate number of tweets
# user_df['tweets'] = 
users_df['tweets'] = pd.to_numeric(tweet_df.groupby('source')['type_of_content'].transform('count'))

In [13]:
# Calculate non-tweets sent and received
users_df['non_tweets_sent'] = pd.to_numeric(non_tweet_df.groupby('source')['type_of_content'].transform('count'))
users_df['non_tweets_received'] = pd.to_numeric(non_tweet_df.groupby('target')['type_of_content'].transform('count'))

# Graph

In [14]:
users_df.fillna(0, inplace=True) # Consitency
users_df

Unnamed: 0,user,listed,followers,tweets,non_tweets_sent,non_tweets_received
0,cloth_couture,0,1,0.0,2.0,110.0
1,Britany82060809,0,97,0.0,2.0,124.0
2,fifakitcreator,3,1476,0.0,2.0,4.0
3,DiscountTreasu1,0,133,0.0,2.0,1207.0
4,Scarlet2Gray,0,42,0.0,1.0,1.0
...,...,...,...,...,...,...
3972,PickerTesla,0,29,0.0,3.0,1.0
3973,Cop_Market,1,1327,0.0,1.0,1207.0
3974,Gluxia1,0,4,0.0,2.0,1207.0
3975,rgikpttn,11,56,0.0,2.0,3.0


In [15]:
# Create network
G = nx.DiGraph()

edges = list()
for (a,b) in zip(non_tweet_df['source'], non_tweet_df['target']):
    edges.append((a,b))

G.add_edges_from(edges)
G

<networkx.classes.digraph.DiGraph at 0x15fe9e29d08>

In [16]:
# Plot graph
'''
rcParams['figure.figsize'] = 14, 10
pos = nx.spring_layout(G, scale=20, k=3/np.sqrt(G.order()))
nx.draw(G, pos, node_color='lightblue')
'''

"\nrcParams['figure.figsize'] = 14, 10\npos = nx.spring_layout(G, scale=20, k=3/np.sqrt(G.order()))\nnx.draw(G, pos, node_color='lightblue')\n"

In [17]:
degree_centrality=pd.DataFrame.from_dict(nx.degree_centrality(G), orient='index').reset_index()
degree_centrality

Unnamed: 0,index,0
0,mrsmcpuffin,0.000576
1,FAMU_MBB,0.031970
2,KingJames,0.034562
3,somenailstrips,0.000576
4,WandaCo87259801,0.001440
...,...,...
3468,SteakxEggs,0.000288
3469,KaysUniverse11,0.000288
3470,Gluxia1,0.000288
3471,KingSize199619,0.000288


In [18]:
closeness_centrality=pd.DataFrame.from_dict(nx.closeness_centrality(G), orient='index').reset_index()
closeness_centrality

Unnamed: 0,index,0
0,mrsmcpuffin,0.000000
1,FAMU_MBB,0.031685
2,KingJames,0.034562
3,somenailstrips,0.000000
4,WandaCo87259801,0.001152
...,...,...
3468,SteakxEggs,0.000288
3469,KaysUniverse11,0.000288
3470,Gluxia1,0.000000
3471,KingSize199619,0.000288


In [19]:
betweenness_centrality=pd.DataFrame.from_dict(nx.betweenness_centrality(G), orient='index').reset_index()
betweenness_centrality

Unnamed: 0,index,0
0,mrsmcpuffin,0.0
1,FAMU_MBB,0.0
2,KingJames,0.0
3,somenailstrips,0.0
4,WandaCo87259801,0.0
...,...,...
3468,SteakxEggs,0.0
3469,KaysUniverse11,0.0
3470,Gluxia1,0.0
3471,KingSize199619,0.0


In [20]:
network_measures = degree_centrality.merge(closeness_centrality, on='index').merge(betweenness_centrality, on='index')
network_measures

Unnamed: 0,index,0_x,0_y,0
0,mrsmcpuffin,0.000576,0.000000,0.0
1,FAMU_MBB,0.031970,0.031685,0.0
2,KingJames,0.034562,0.034562,0.0
3,somenailstrips,0.000576,0.000000,0.0
4,WandaCo87259801,0.001440,0.001152,0.0
...,...,...,...,...
3468,SteakxEggs,0.000288,0.000288,0.0
3469,KaysUniverse11,0.000288,0.000288,0.0
3470,Gluxia1,0.000288,0.000000,0.0
3471,KingSize199619,0.000288,0.000288,0.0


In [21]:
network_measures.columns = ['user', 'degree', 'closeness', 'betweenness'] # Consistent formatting

In [22]:
network_measures = network_measures.merge(users_df, on='user') # Merge on unique user name
network_measures

Unnamed: 0,user,degree,closeness,betweenness,listed,followers,tweets,non_tweets_sent,non_tweets_received
0,mrsmcpuffin,0.000576,0.000000,0.0,37,1626,0.0,2.0,124.0
1,FAMU_MBB,0.031970,0.031685,0.0,43,2744,0.0,1.0,144.0
2,KingJames,0.034562,0.034562,0.0,44542,49401456,0.0,2.0,110.0
3,somenailstrips,0.000576,0.000000,0.0,7,2506,27.0,0.0,0.0
4,WandaCo87259801,0.001440,0.001152,0.0,2,280,0.0,2.0,1207.0
...,...,...,...,...,...,...,...,...,...
3359,SteakxEggs,0.000288,0.000288,0.0,0,275,0.0,1.0,144.0
3360,KaysUniverse11,0.000288,0.000288,0.0,0,47,4.0,0.0,0.0
3361,Gluxia1,0.000288,0.000000,0.0,0,4,0.0,2.0,1207.0
3362,KingSize199619,0.000288,0.000288,0.0,2,1453,1.0,0.0,0.0


In [23]:
network_measures.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3364 entries, 0 to 3363
Data columns (total 9 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   user                 3364 non-null   object 
 1   degree               3364 non-null   float64
 2   closeness            3364 non-null   float64
 3   betweenness          3364 non-null   float64
 4   listed               3364 non-null   int64  
 5   followers            3364 non-null   int64  
 6   tweets               3364 non-null   float64
 7   non_tweets_sent      3364 non-null   float64
 8   non_tweets_received  3364 non-null   float64
dtypes: float64(6), int64(2), object(1)
memory usage: 262.8+ KB


In [24]:
network_measures.set_index('user', inplace=True)

In [25]:
# No need to calculate differences, look at a per user basis instead
'''
d_network_measures = pd.DataFrame(columns=['A_user', 'B_user', 'dAB_degree', 'dAB_closeness', 'dAB_betweenness', 'dAB_listed', 
                                          'dAB_followers', 'dAB_tweets', 'dAB_non_tweets_sent', 'dAB_non_tweets_received'])
for user_a, row_a in network_measures.iterrows():
    for user_b, row_b in network_measures.iterrows():
        d_network_measures.append({
            'A_user': user_a,
            'B_user': user_b,
            'dAB_degree': row_a['degree'] - row_b['degree'],
            'dAB_closeness': row_a['closeness'] - row_b['closeness'],
            'dAB_betweenness': row_a['betweenness'] - row_b['betweenness'],
            'dAB_listed': row_a['listed'] - row_b['listed'],
            'dAB_followers': row_a['followers'] - row_b['followers'],
            'dAB_tweets': row_a['tweets'] - row_b['tweets'],
            'dAB_non_tweets_sent': row_a['non_tweets_sent'] - row_b['non_tweets_sent'],
            'dAB_non_tweets_received': row_a['non_tweets_received'] - row_b['non_tweets_received'],
        }, ignore_index=True)
d_network_measures
'''

"\nd_network_measures = pd.DataFrame(columns=['A_user', 'B_user', 'dAB_degree', 'dAB_closeness', 'dAB_betweenness', 'dAB_listed', \n                                          'dAB_followers', 'dAB_tweets', 'dAB_non_tweets_sent', 'dAB_non_tweets_received'])\nfor user_a, row_a in network_measures.iterrows():\n    for user_b, row_b in network_measures.iterrows():\n        d_network_measures.append({\n            'A_user': user_a,\n            'B_user': user_b,\n            'dAB_degree': row_a['degree'] - row_b['degree'],\n            'dAB_closeness': row_a['closeness'] - row_b['closeness'],\n            'dAB_betweenness': row_a['betweenness'] - row_b['betweenness'],\n            'dAB_listed': row_a['listed'] - row_b['listed'],\n            'dAB_followers': row_a['followers'] - row_b['followers'],\n            'dAB_tweets': row_a['tweets'] - row_b['tweets'],\n            'dAB_non_tweets_sent': row_a['non_tweets_sent'] - row_b['non_tweets_sent'],\n            'dAB_non_tweets_received': row_a

# Score 

In [26]:
feature_importance = pd.read_csv('.\data\\feature_importance.csv') # Get weights
feature_importance = feature_importance.T
feature_importance.columns = feature_importance.iloc[0,:]
feature_importance.drop(feature_importance.index[0], inplace=True)
feature_importance

Unnamed: 0,dAB_follower,dAB_listed,dAB_tweets,dAB_degree,dAB_non_tweets_sent,dAB_non_tweets_received
Importance,0.09368,0.478216,0.077184,0.148541,0.07966,0.122718


In [27]:
feature_importance['dAB_follower'].values[0]

0.0936803743243217

**Normalize the dataframe**

In [28]:
scaled_features = MinMaxScaler().fit_transform(network_measures.values)

In [29]:
normalized_network_measures = pd.DataFrame(scaled_features, index=network_measures.index, columns=network_measures.columns)
normalized_network_measures

Unnamed: 0_level_0,degree,closeness,betweenness,listed,followers,tweets,non_tweets_sent,non_tweets_received
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
mrsmcpuffin,0.001015,0.000000,0.0,0.000318,1.775017e-05,0.000000,0.035088,0.102734
FAMU_MBB,0.111675,0.111570,0.0,0.000370,2.995478e-05,0.000000,0.017544,0.119304
KingJames,0.120812,0.121703,0.0,0.383213,5.392892e-01,0.000000,0.035088,0.091135
somenailstrips,0.001015,0.000000,0.0,0.000060,2.735666e-05,0.287234,0.000000,0.000000
WandaCo87259801,0.004061,0.004057,0.0,0.000017,3.056610e-06,0.000000,0.035088,1.000000
...,...,...,...,...,...,...,...,...
SteakxEggs,0.000000,0.001014,0.0,0.000000,3.002028e-06,0.000000,0.017544,0.119304
KaysUniverse11,0.000000,0.001014,0.0,0.000000,5.130738e-07,0.042553,0.000000,0.000000
Gluxia1,0.000000,0.000000,0.0,0.000000,4.366586e-08,0.000000,0.035088,1.000000
KingSize199619,0.000000,0.001014,0.0,0.000017,1.586162e-05,0.010638,0.000000,0.000000


In [30]:
# Calculated normalized per user score
normalized_network_measures['score'] = (
    normalized_network_measures['followers']*feature_importance['dAB_follower'].values +
    normalized_network_measures['listed']*feature_importance['dAB_listed'].values + 
    normalized_network_measures['tweets']*feature_importance['dAB_tweets'].values + 
    normalized_network_measures['degree']*feature_importance['dAB_degree'].values + 
    normalized_network_measures['non_tweets_sent']*feature_importance['dAB_non_tweets_sent'].values + 
    normalized_network_measures['non_tweets_received']*feature_importance['dAB_non_tweets_received'].values)                 

normalized_network_measures['score'] = normalized_network_measures['score'].astype(np.float32)
normalized_network_measures

Unnamed: 0_level_0,degree,closeness,betweenness,listed,followers,tweets,non_tweets_sent,non_tweets_received,score
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
mrsmcpuffin,0.001015,0.000000,0.0,0.000318,1.775017e-05,0.000000,0.035088,0.102734,0.015707
FAMU_MBB,0.111675,0.111570,0.0,0.000370,2.995478e-05,0.000000,0.017544,0.119304,0.032806
KingJames,0.120812,0.121703,0.0,0.383213,5.392892e-01,0.000000,0.035088,0.091135,0.265704
somenailstrips,0.001015,0.000000,0.0,0.000060,2.735666e-05,0.287234,0.000000,0.000000,0.022352
WandaCo87259801,0.004061,0.004057,0.0,0.000017,3.056610e-06,0.000000,0.035088,1.000000,0.126124
...,...,...,...,...,...,...,...,...,...
SteakxEggs,0.000000,0.001014,0.0,0.000000,3.002028e-06,0.000000,0.017544,0.119304,0.016039
KaysUniverse11,0.000000,0.001014,0.0,0.000000,5.130738e-07,0.042553,0.000000,0.000000,0.003284
Gluxia1,0.000000,0.000000,0.0,0.000000,4.366586e-08,0.000000,0.035088,1.000000,0.125513
KingSize199619,0.000000,0.001014,0.0,0.000017,1.586162e-05,0.010638,0.000000,0.000000,0.000831


**Get top 100**

In [31]:
results = normalized_network_measures.nlargest(100, 'score', keep='all')
results

Unnamed: 0_level_0,degree,closeness,betweenness,listed,followers,tweets,non_tweets_sent,non_tweets_received,score
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
TheEconomist,0.000000,0.001014,0.000000,1.000000,0.279040,0.000000,0.035088,0.009942,0.508372
Cristiano,0.000000,0.001014,0.000000,0.720295,1.000000,0.276596,0.000000,0.000000,0.459486
YouTube,0.013198,0.014199,0.000000,0.680065,0.797726,0.000000,0.035088,0.091135,0.415889
Poshmarkapp,1.000000,1.000000,0.000000,0.003897,0.001356,0.000000,0.052632,1.000000,0.277442
KingJames,0.120812,0.121703,0.000000,0.383213,0.539289,0.000000,0.035088,0.091135,0.265704
...,...,...,...,...,...,...,...,...,...
ac_dzn,0.006091,0.000000,0.000000,0.000129,0.000021,0.000000,0.052632,1.000000,0.127879
THEDOPESOLE,0.006091,0.001014,0.116279,0.000043,0.000009,0.000000,0.052632,1.000000,0.127836
SouthernGemGal,0.002030,0.000000,0.000000,0.003880,0.000032,0.000000,0.035088,1.000000,0.127673
SU2CUK,0.000000,0.001014,0.000000,0.001385,0.000794,0.000000,0.052632,1.000000,0.127647


# Save Results

In [32]:
results.to_csv(r'.\data\nike_results.csv')

In [33]:
top_non_tweet_df = non_tweet_df.drop(columns=['source_id', 'target_id', 'type_of_content'])
top_non_tweet_df

Unnamed: 0,source,target
0,mrsmcpuffin,FAMU_MBB
1,mrsmcpuffin,KingJames
2,somenailstrips,WandaCo87259801
3,somenailstrips,Poshmarkapp
4,ja_corey14,originelllly
...,...,...
6004,products_hot,KaysUniverse11
6005,products_hot,Poshmarkapp
6008,Gluxia1,KingSize199619
6009,rgikpttn,Poshmarkapp


In [34]:
top_non_tweet_df.to_csv(r'.\data\edges.csv')