In [2]:
from google.colab import drive

# Mount Google Drive so the data files under /content/gdrive are readable.
drive.mount(
    '/content/gdrive',
)

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [3]:
import pandas as pd
import numpy as np
import networkx as nx
import networkx.algorithms.community as nx_comm
# from networkx.algorithms.community.quality import modularity, to_partition
# from networkx.algorithms.community import utils as nx_comm_utils

import warnings
warnings.filterwarnings("ignore")

In [4]:
# Load the tweet table; later cells read its 'user_id' and 'Toxicity' columns.
df = pd.read_csv(
    "/content/gdrive/My Drive/tweetsConvereted2017_toxicPerspectiveScore.csv"
)

In [5]:
# Build the user graph from the edge-list file on Drive.
G = nx.read_edgelist(
    '/content/gdrive/My Drive/users.edgelist'
)

In [6]:
# Keep only the two columns the analysis needs.
groupDf = df.loc[:, ['user_id', 'Toxicity']]
# Cast user ids to strings so they can be compared with the graph's node labels.
groupDf2 = groupDf.astype({'user_id': str})
# One row per user: the mean Toxicity over all of that user's rows.
groupedDf = groupDf2.groupby('user_id')['Toxicity'].mean().reset_index()

In [7]:
# Distinct user ids present in groupedDf, as strings.
users_list = [*groupedDf['user_id'].unique().astype(str)]
# View of G restricted to those users.
G2 = nx.subgraph(G, users_list)

In [8]:
# Number of edges in the induced subgraph.
nx.number_of_edges(G2)

1161596

In [9]:
# Map user_id -> that user's mean Toxicity.
toxicity_series = groupedDf.set_index('user_id')['Toxicity']
user_toxicity = toxicity_series.to_dict()

In [10]:
# Index the per-user mean toxicity by user_id once, so each node only looks up
# its own neighbours instead of re-scanning the whole groupedDf frame.
toxicity_by_id = groupedDf.set_index('user_id')['Toxicity']

# neighbor_toxicity maps user -> mean Toxicity over the user's neighbours in G2.
# Users with no neighbours other than themselves get no entry.
neighbor_toxicity = {}
for user in G2.nodes():
    # Drop a self-loop so a user's own score never counts as a neighbour's.
    neighbors = [n for n in G2.neighbors(user) if n != user]
    if neighbors:
        # reindex() tolerates ids missing from groupedDf (they become NaN and
        # mean() skips them), matching the previous isin()-based filtering.
        # Results agree with the old scan up to floating-point summation order.
        neighbor_toxicity[user] = toxicity_by_id.reindex(neighbors).mean()

In [11]:
# x: each user's own mean toxicity; y: the mean toxicity of that user's
# neighbours. Both follow the key order of neighbor_toxicity.
x = list(map(user_toxicity.__getitem__, neighbor_toxicity))
y = [*neighbor_toxicity.values()]

In [12]:
# Element-wise difference: own toxicity minus neighbours' mean toxicity.
diff = [own - nbr for own, nbr in zip(x, y)]

In [13]:
# user_id -> (own mean toxicity) - (neighbours' mean toxicity).
diff_dict = {
    uid: user_toxicity[uid] - nbr_mean
    for uid, nbr_mean in neighbor_toxicity.items()
}

In [14]:
# Same mapping as diff_dict, re-inserted in ascending order of the difference.
sorted_diff_dict = dict(sorted(diff_dict.items(), key=lambda pair: pair[1]))

In [15]:
# Differences in ascending order (sorted_diff_dict is already sorted by value).
values = list(sorted_diff_dict.values())

# Quartiles with the 'midpoint' rule. `method=` replaces the deprecated
# `interpolation=` keyword of np.percentile; the computed values are the same.
Q1, Q2, Q3 = np.percentile(values, [25, 50, 75], method='midpoint')

# Interquartile range, used for the outlier limits.
iqr = Q3 - Q1

In [16]:
# Outlier limits: 1.5 * IQR below the first and above the third quartile.
low_lim, up_lim = Q1 - 1.5 * iqr, Q3 + 1.5 * iqr

In [17]:
# Differences lying strictly outside [low_lim, up_lim], in ascending order.
outlier = [v for v in values if v > up_lim or v < low_lim]

In [18]:
# user_id -> difference, for every user whose difference lies strictly outside
# [low_lim, up_lim].
#
# Filtering the (user_id, value) pairs directly keeps each user even when
# several users share the same difference value. Looking the user up by value
# position (values.index(...)) always lands on the first user with that value
# and silently drops the others; it is also quadratic in the number of users.
outlier_dict = {
    user_id: value
    for user_id, value in sorted_diff_dict.items()
    if value > up_lim or value < low_lim
}

In [19]:
# Outliers with a negative difference (own toxicity below the neighbours' mean).
attenuator_list = [uid for uid, delta in outlier_dict.items() if delta < 0]
# Outliers with a zero or positive difference.
amplifier_list = [uid for uid, delta in outlier_dict.items() if delta >= 0]

In [20]:
# Every user in users_list that is neither an attenuator nor an amplifier.
copyCat_list = list(set(users_list).difference(attenuator_list, amplifier_list))

In [21]:
# Group sizes, one per line: attenuators, amplifiers, copycats.
print(len(attenuator_list), len(amplifier_list), len(copyCat_list), sep="\n")

1237
3679
54932


In [22]:
# Modularity of G2 when its nodes are partitioned into the three role groups.
role_partition = [attenuator_list, amplifier_list, copyCat_list]
mod = nx_comm.modularity(G2, role_partition)
print(mod)

0.019838843258746963


In [23]:
# Louvain community detection on G2. The algorithm is randomised, so a fixed
# seed is passed to make the partition (and the count printed below)
# reproducible across kernel restarts.
com = nx_comm.louvain_communities(G2, seed=0)
print(len(com))

66


In [24]:
# Second Louvain run on G2. It uses its own fixed seed so that it is an
# independent but reproducible realisation of the randomised algorithm.
com2 = nx_comm.louvain_communities(G2, seed=1)
print(len(com2))

82


In [25]:
# Third Louvain run on G2. It uses its own fixed seed so that it is an
# independent but reproducible realisation of the randomised algorithm.
com3 = nx_comm.louvain_communities(G2, seed=2)
print(len(com3))

78


In [28]:
# Modularity of G2 under the first Louvain partition (com).
lc_mod = nx_comm.modularity(G2, communities=com)
print(lc_mod)

0.6532553780588601
