In [1]:
# Mount Google Drive into the Colab runtime so that files under
# /content/gdrive can be read by the cells below.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [2]:
import pandas as pd
import numpy as np
import networkx as nx
import networkx.algorithms.community as nx_comm

import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the libraries
# used below — consider narrowing the filter to specific categories.
warnings.filterwarnings("ignore")

In [3]:
# Load the tweets table; later cells read its 'user_id' and 'Toxicity' columns.
df = pd.read_csv("/content/gdrive/My Drive/tweetsConvereted2017_toxicPerspectiveScore.csv")

In [4]:
# Load the user graph from an edge-list file.
# NOTE(review): node labels are presumably strings, since user_id is cast to str
# before being matched against the graph — confirm.
G = nx.read_edgelist('/content/gdrive/My Drive/users.edgelist')

In [5]:
# Per-user mean toxicity.
# Keep only the two columns this analysis needs.
groupDf = df[['user_id', 'Toxicity']]

# Same two columns with user_id cast to str
# (presumably to line up with the graph's node labels — verify).
groupDf2 = groupDf.astype({'user_id': str})

# One row per user: columns 'user_id' and the mean of that user's 'Toxicity'.
groupedDf = (
    groupDf2
    .groupby('user_id')['Toxicity']
    .mean()
    .reset_index()
)

In [6]:
# Every distinct user id that has a toxicity score, as strings.
users_list = list(groupedDf['user_id'].unique().astype(str))
# Subgraph of G restricted to those users.
G2 = G.subgraph(users_list)

In [7]:
# Lookup table: user_id -> that user's mean Toxicity.
user_toxicity = groupedDf.set_index('user_id')['Toxicity'].to_dict()

In [8]:
# Mean toxicity of each user's neighbours in G2. Users whose only neighbour is
# themselves (or who have none) get no entry.
#
# The per-user scores are indexed once so that each neighbourhood is a direct
# lookup, rather than a full `isin` scan of groupedDf for every node.
toxicity_by_user = groupedDf.set_index('user_id')['Toxicity']

neighbor_toxicity = {}
for user in G2.nodes():
    # A self-loop would make the user count as their own neighbour; drop it.
    neighbors = [nbr for nbr in G2.neighbors(user) if nbr != user]
    if neighbors:
        # reindex yields NaN for ids without a score and mean() skips NaN, so
        # the result matches filtering groupedDf by membership.
        neighbor_toxicity[user] = toxicity_by_user.reindex(neighbors).mean()

In [9]:
# Paired samples aligned by user (both follow neighbor_toxicity's key order):
#   y -> mean toxicity of the user's neighbours
#   x -> the user's own mean toxicity
y = list(neighbor_toxicity.values())
x = [user_toxicity[uid] for uid in neighbor_toxicity]

In [10]:
# Own toxicity divided by neighbours' toxicity, pairwise over x and y.
ratio = [own / nbr_mean for own, nbr_mean in zip(x, y)]

In [11]:
# user_id -> (own mean toxicity) / (neighbours' mean toxicity),
# for every user that has a neighbour-toxicity entry.
ratio_dict = {
    uid: user_toxicity[uid] / nbr_mean
    for uid, nbr_mean in neighbor_toxicity.items()
}

In [13]:
# Smallest user-to-neighbour toxicity ratio.
min(ratio_dict.values())

0.022785761717238426

In [14]:
# ratio_dict reordered by ascending ratio (dicts keep insertion order).
sorted_diff_dict = dict(sorted(ratio_dict.items(), key=lambda pair: pair[1]))

In [15]:
# Quartiles of the user-to-neighbour toxicity ratios.
values = list(sorted_diff_dict.values())

# `method=` replaces the `interpolation=` keyword, which NumPy deprecated in 1.22.
# With 'midpoint', a percentile that falls between two data points is taken as
# their average. All three quartiles come from a single call.
Q1, Q2, Q3 = np.percentile(values, [25, 50, 75], method='midpoint')

# Interquartile range: spread of the middle half of the ratios.
iqr = Q3 - Q1

In [30]:
# Show the interquartile range of the ratios.
print(iqr)

0.5718821123508927


In [31]:
# Outlier fences: 1.5 * IQR below the first quartile and above the third.
low_lim, up_lim = Q1 - 1.5 * iqr, Q3 + 1.5 * iqr
print(low_lim, up_lim, sep="\n")

-0.24710411940646637
2.0404243299971045


In [17]:
# Ratios lying strictly outside the [low_lim, up_lim] fences, in the order of `values`.
outlier = [v for v in values if v < low_lim or v > up_lim]

In [32]:
# Number of ratios that fall outside the fences.
len(outlier)

1825

In [18]:
# Map every outlying ratio back to the user(s) it belongs to: user_id -> ratio.
#
# Locating positions with values.index() only ever finds the first occurrence
# of a value, so users sharing the same ratio were collapsed onto a single key
# (and each lookup rescanned the whole list). Walking sorted_diff_dict once
# keeps every such user and preserves its ascending-ratio order.
outlier_values = set(outlier)
outlier_dict = {
    user_id: ratio
    for user_id, ratio in sorted_diff_dict.items()
    if ratio in outlier_values
}

In [26]:
# Split the outlying users by the side of 1 their ratio falls on:
#   ratio <  1 -> attenuator (less toxic than their neighbours on average)
#   ratio >= 1 -> amplifier  (at least as toxic as their neighbours)
# NOTE(review): if toxicity scores are non-negative and low_lim is negative, no
# ratio can be a low-side outlier, so attenuator_list stays empty — confirm
# this is the intended definition.
attenuator_list = [uid for uid, r in outlier_dict.items() if r < 1]
amplifier_list = [uid for uid, r in outlier_dict.items() if r >= 1]

In [29]:
# Inspect the users classified as attenuators.
attenuator_list

[]

In [27]:
# Everyone in users_list who is neither an attenuator nor an amplifier.
# Built through sets, so the order of the resulting list is not meaningful.
copyCat_list = list(set(users_list) - set(attenuator_list) - set(amplifier_list))

In [28]:
# Group sizes, one per line: attenuators, amplifiers, copycats.
print(
    len(attenuator_list),
    len(amplifier_list),
    len(copyCat_list),
    sep="\n",
)

0
1825
58023


In [33]:
# Modularity of the attenuator / amplifier / copycat grouping on the user subgraph.
#
# copyCat_list is derived from users_list, which can contain ids that have no
# node in G2, and modularity() rejects groupings that are not an exact partition
# of the graph's nodes. Each group is therefore restricted to nodes actually
# present in G2; when every listed user is already a node, nothing changes.
node_groups = [
    [user for user in group if user in G2]
    for group in (attenuator_list, amplifier_list, copyCat_list)
]
mod = nx_comm.modularity(G2, node_groups)
print(mod)

0.0029075721786772056


In [34]:
# Louvain community detection on the user subgraph.
# The algorithm is randomised, so a fixed seed is passed to make the resulting
# partition (and the community count reported below) reproducible across runs.
communities = nx_comm.louvain_communities(G2, seed=42)

In [35]:
# Number of communities found by Louvain.
len(communities)

88