In [1]:
# Mount Google Drive (Colab only) so the data paths under /content/gdrive used below resolve.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import numpy as np
from datetime import datetime
import networkx as nx
import math
from collections import Counter, OrderedDict
from scipy.stats import shapiro
from scipy.stats import ks_2samp
import scipy.stats as stats
from scipy.stats import lognorm
import statsmodels.api as sm
from scipy.stats import norm
import pylab
from scipy.stats import yeojohnson

import warnings
warnings.filterwarnings("ignore")

In [3]:
# Load the tweet-level table; the cells below read its 'user_id' and 'Toxicity' columns.
df = pd.read_csv("/content/gdrive/My Drive/tweetsConvereted2017_toxicPerspectiveScore.csv")

In [4]:
# Load the user graph from an edge list.
# NOTE(review): with no `nodetype` given, node labels are presumably read as strings,
# which would be why user_id is cast to str before matching against the graph — confirm.
G = nx.read_edgelist('/content/gdrive/My Drive/users.edgelist')

In [5]:
# Collapse the tweet-level scores into one mean Toxicity per user.
# user_id is converted to str before grouping, so the group keys are strings.
groupDf = df[['user_id', 'Toxicity']]
groupDf2 = groupDf.astype({'user_id': str})
groupedDf = groupDf2.groupby('user_id')['Toxicity'].mean().reset_index()

In [6]:
# Restrict the graph to users that have a toxicity score: G2 is the subgraph of G
# induced by the user ids present in groupedDf.
users_list = list(groupedDf['user_id'].unique().astype(str))
G2 = G.subgraph(users_list)

In [7]:
# Lookup table: user_id -> that user's mean Toxicity.
user_toxicity = dict(zip(groupedDf['user_id'], groupedDf['Toxicity']))

In [8]:
# Mean toxicity of each user's neighbours in the induced subgraph G2.
#
# Scores are fetched through a user_id-indexed Series, so each node costs
# O(degree) rather than an `isin` scan over every row of groupedDf.
# Results match the scan-based computation up to floating-point summation order.
toxicity_by_user = groupedDf.set_index('user_id')['Toxicity']

neighbor_toxicity = {}
for user in G2.nodes():
    # A self-loop must not let the user's own score count as a neighbour's score.
    neighbors = [n for n in G2.neighbors(user) if n != user]
    if neighbors:
        # reindex() produces NaN for any label without a score and mean() skips NaN,
        # so a neighbour missing from groupedDf is simply ignored.
        neighbor_toxicity[user] = toxicity_by_user.reindex(neighbors).mean()

In [9]:
# x: each user's own mean toxicity; y: the mean toxicity of that user's neighbours.
# Both lists follow the insertion order of neighbor_toxicity, so x[i] and y[i]
# describe the same user.
x = [user_toxicity[uid] for uid in neighbor_toxicity]
y = [*neighbor_toxicity.values()]

In [21]:
# Element-wise gap between a user's own score and their neighbours' mean score.
diff = [own - nbr for own, nbr in zip(x, y)]

In [22]:
# Same gap as `diff`, but keyed by user id so each value can be traced to its user.
diff_dict = {
    uid: user_toxicity[uid] - nbr_mean
    for uid, nbr_mean in neighbor_toxicity.items()
}

In [23]:
# Number of users with a computed difference (one entry per key of neighbor_toxicity).
len(diff_dict)

59837

In [24]:
# Sanity check: diff_dict and neighbor_toxicity must list the same users in the same order.
diff_dict_keys = [*diff_dict]
neighbor_toxicity_keys = [*neighbor_toxicity]

print(
    "The keys in toxicity_diff and neighbor_toxicity are the same"
    if diff_dict_keys == neighbor_toxicity_keys
    else "The keys in toxicity_diff and neighbor_toxicity are different"
)


The keys in toxicity_diff and neighbor_toxicity are the same


In [44]:
# Sanity check: the list-based and dict-based differences must agree value for value.
print('same' if diff == list(diff_dict.values()) else 'not same')

same


In [28]:
# Users ordered by ascending difference (dicts preserve insertion order).
sorted_diff_dict = dict(sorted(diff_dict.items(), key=lambda pair: pair[1]))

In [30]:
# Sorting only reorders entries, so this should equal len(diff_dict).
len(sorted_diff_dict)

59837

In [37]:
# Quartiles and interquartile range of the user-minus-neighbours toxicity differences.
#
# `method=` is the current name of the keyword formerly called `interpolation=`
# (deprecated since NumPy 1.22); 'midpoint' averages the two nearest data points.
# All three percentiles are requested in one call so the data is processed once.
values = list(sorted_diff_dict.values())
Q1, Q2, Q3 = np.percentile(values, [25, 50, 75], method='midpoint')

print('Q1 25 percentile of the given data is, ', Q1)
print('Q2 50 percentile of the given data is, ', Q2)
print('Q3 75 percentile of the given data is, ', Q3)

iqr = Q3 - Q1
print('Interquartile range is', iqr)

Q1 25 percentile of the given data is,  -0.027702269315666092
Q2 50 percentile of the given data is,  -0.008956206727245425
Q3 75 percentile of the given data is,  0.0159364772955487
Interquartile range is 0.04363874661121479


In [38]:
# Outlier fences: 1.5 interquartile ranges below Q1 and above Q3.
low_lim, up_lim = Q1 - 1.5 * iqr, Q3 + 1.5 * iqr
print('low_limit is', low_lim)
print('up_limit is', up_lim)

low_limit is -0.09316038923248827
up_limit is 0.08139459721237088


In [41]:
# Collect every difference lying strictly outside the [low_lim, up_lim] fences.
# `values` is already sorted ascending, so `outlier` is too.
outlier = [v for v in values if v > up_lim or v < low_lim]
print(outlier)

[-0.3684331612787879, -0.35286040406987623, -0.31184739884954166, -0.31132010631157897, -0.29191923409420284, -0.2918100766722222, -0.2710583881035476, -0.2602960711253876, -0.25910483986823735, -0.2585312053607018, -0.24958612497466465, -0.24888972915357144, -0.23581135154141486, -0.23222917680610028, -0.23045069875398422, -0.2296212406995307, -0.22607422577399588, -0.2200075808241916, -0.2118543263437985, -0.21078495665854413, -0.2103158038872093, -0.20581877317616734, -0.20577103060000002, -0.20472380606060606, -0.20407122596983507, -0.19888214527777776, -0.19730042424600458, -0.1971885751619658, -0.19633611458665048, -0.19595406394711878, -0.19565144891290473, -0.19450423961151236, -0.19399829053162393, -0.19362248028840512, -0.19352438817311804, -0.1915857595041958, -0.19149225182375895, -0.18826488657710844, -0.18823788725702129, -0.18809786335532758, -0.1875972673570497, -0.18741280524586668, -0.1873052062608786, -0.185927867800619, -0.1836266967630811, -0.18183843510113779, -0.

In [42]:
# How many differences fall outside the fences.
len(outlier)

4916

In [46]:
# Map each outlying difference back to the user it belongs to.
#
# The (user_id, value) pairs are filtered directly, which keeps users with
# identical difference values distinct; looking a user up by value would map
# every tie to the first matching user. It is also a single pass over the data.
# Entries stay in ascending order of difference, following sorted_diff_dict.
outlier_dict = {
    user_id: value
    for user_id, value in sorted_diff_dict.items()
    if value > up_lim or value < low_lim
}

In [48]:
# Number of users flagged as outliers; expected to match len(outlier).
len(outlier_dict)

4916

In [53]:
# Split the outlier users by the sign of their difference:
#   negative     -> attenuator (own score below the neighbours' mean)
#   zero or more -> amplifier  (own score at or above the neighbours' mean)
attenuator_list = [uid for uid, delta in outlier_dict.items() if delta < 0]
amplifier_list = [uid for uid, delta in outlier_dict.items() if delta >= 0]

In [54]:
# Count of outlier users with a negative difference.
len(attenuator_list)

1237

In [55]:
# Count of outlier users with a non-negative difference.
len(amplifier_list)

3679

In [43]:
# Inactive copy of the pipeline from the cells above, kept as a bare string literal.
# Nothing inside it executes; the cell only echoes the string back as its output.
# TODO(review): remove this cell, or turn the pipeline into a function, once it is
# confirmed that nothing relies on it.
'''
df = pd.read_csv("/content/gdrive/My Drive/tweetsConvereted2017_toxicPerspectiveScore.csv")
G = nx.read_edgelist('/content/gdrive/My Drive/users.edgelist')
groupDf = df[['user_id', 'Toxicity']]
groupDf2 = groupDf[['user_id', 'Toxicity']].astype({'user_id': str})
groupedDf = groupDf2.groupby(['user_id']).agg({'Toxicity': 'mean'}).reset_index()
users_list = list(groupedDf['user_id'].unique().astype(str))
G2 = G.subgraph(users_list)
user_toxicity = groupedDf.set_index('user_id')['Toxicity'].to_dict()
neighbor_toxicity = {}
for user in G2.nodes():
  neighbors = list(G2.neighbors(user))
  if user in neighbors:
      neighbors.remove(user)
  if len(neighbors) > 0:
      neighbor_toxicity[user] = groupedDf[groupedDf['user_id'].isin(neighbors)]['Toxicity'].mean()
x = [user_toxicity[user] for user in neighbor_toxicity.keys()]
y = list(neighbor_toxicity.values())
diff = [x[i] - y[i] for i in range(len(x))]
diff_dict = {user_id: user_toxicity[user_id] - neighbor_toxicity[user_id] for user_id in neighbor_toxicity}
sorted_diff_dict = {k: v for k, v in sorted(diff_dict.items(), key=lambda item: item[1])}
values = list(sorted_diff_dict.values())
Q1 = np.percentile(values, 25, interpolation='midpoint')
Q2 = np.percentile(values, 50, interpolation='midpoint')
Q3 = np.percentile(values, 75, interpolation='midpoint')

print('Q1 25 percentile of the given data is, ', Q1)
print('Q2 50 percentile of the given data is, ', Q2)
print('Q3 75 percentile of the given data is, ', Q3)

iqr = Q3 - Q1
print('Interquartile range is', iqr)
low_lim = Q1 - 1.5 * iqr
up_lim = Q3 + 1.5 * iqr
print('low_limit is', low_lim)
print('up_limit is', up_lim)
outlier = []
for i in values:
    if ((i> up_lim) or (i<low_lim)):
         outlier.append(i)
print(outlier)
'''

'\ndf = pd.read_csv("/content/gdrive/My Drive/tweetsConvereted2017_toxicPerspectiveScore.csv")\nG = nx.read_edgelist(\'/content/gdrive/My Drive/users.edgelist\')\ngroupDf = df[[\'user_id\', \'Toxicity\']]\ngroupDf2 = groupDf[[\'user_id\', \'Toxicity\']].astype({\'user_id\': str})\ngroupedDf = groupDf2.groupby([\'user_id\']).agg({\'Toxicity\': \'mean\'}).reset_index()\nusers_list = list(groupedDf[\'user_id\'].unique().astype(str))\nG2 = G.subgraph(users_list)\nuser_toxicity = groupedDf.set_index(\'user_id\')[\'Toxicity\'].to_dict()\nneighbor_toxicity = {}\nfor user in G2.nodes():\n  neighbors = list(G2.neighbors(user))\n  if user in neighbors:\n      neighbors.remove(user)\n  if len(neighbors) > 0:\n      neighbor_toxicity[user] = groupedDf[groupedDf[\'user_id\'].isin(neighbors)][\'Toxicity\'].mean()\nx = [user_toxicity[user] for user in neighbor_toxicity.keys()]\ny = list(neighbor_toxicity.values())\ndiff = [x[i] - y[i] for i in range(len(x))]\ndiff_dict = {user_id: user_toxicity[user_