In [1]:
# Mount Google Drive under /content/gdrive; the CSV and edge list read below live there.
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import numpy as np
from datetime import datetime
import networkx as nx
import warnings
# Silences every warning for the rest of the session: no category or module
# filter is given, so nothing emitted by pandas/numpy below will be shown.
warnings.filterwarnings("ignore")

In [3]:
# Tweet-level table (one row per tweet); judging by the file name it carries
# Perspective toxicity scores -- the Toxicity column is what is used below.
tweets_csv_path = "/content/gdrive/My Drive/tweetsConvereted2017_toxicPerspectiveScore.csv"
df = pd.read_csv(tweets_csv_path)

In [4]:
# User-to-user graph loaded from an edge list on the mounted Drive.
# NOTE(review): no nodetype is passed, so node labels are presumably strings --
# the later astype(str) on user_id is consistent with that; confirm.
edgelist_path = '/content/gdrive/My Drive/users.edgelist'
G = nx.read_edgelist(edgelist_path)

In [5]:
# Parse the timestamp column once (vectorised) instead of calling strptime three
# times per row, then derive the calendar fields from the parsed values.
parsed_dates = pd.to_datetime(df["NewDateFormat"], format="%Y-%m-%d %H:%M:%S")
df["week"] = parsed_dates.dt.strftime("%U")      # week of year as a zero-padded string (%U: weeks start on Sunday)
df["month"] = parsed_dates.dt.strftime("%m")     # month as a zero-padded string
df["day"] = parsed_dates.dt.day.astype("int64")  # day of month as an integer; int64 to match the previous dtype

In [6]:
# Preview the first rows, including the week/month/day columns just added.
df.head()

Unnamed: 0,user_id,screen_name,tweet_id,tweet_text,tweet_creation,tweet_fav,tweet_rt,rp_flag,rp_status,rp_user,...,rt_text,rt_creation,rt_fav,rt_rt,Toxicity,Severe Toxicity,NewDateFormat,week,month,day
0,4746709454,just_jusss,911477065328631809,"Having a strict dad, if I ask to go out one we...",1506159000.0,1.0,0.0,False,,,...,,,,,0.052305,0.002346,2017-09-23 09:27:21,38,9,23
1,4746709454,just_jusss,911399501264384001,Lookin at all these couples like MBN🤧 but then...,1506140000.0,2.0,0.0,False,,,...,,,,,0.068171,0.003433,2017-09-23 04:19:08,38,9,23
2,4746709454,just_jusss,911343372282900480,Being a senior kinda sucks when most of your f...,1506127000.0,3.0,0.0,False,,,...,,,,,0.518691,0.023787,2017-09-23 00:36:06,38,9,23
3,773711528398848000,CassandraCohan,923729371235586049,@periwinkledink So pretty,1509076000.0,1.0,0.0,True,9.237051062640271e+17,3434322000.0,...,,,,,0.010681,0.002117,2017-10-27 03:53:38,43,10,27
4,773711528398848000,CassandraCohan,923689995763843072,@readyforramos It’s 2017 we go to church and pray,1509067000.0,3.0,0.0,True,9.236899364060447e+17,2633570000.0,...,,,,,0.018975,0.001316,2017-10-27 01:17:10,43,10,27


In [7]:
# df.loc[df['user_id']==2283058711]

In [8]:
# Keep only the columns needed for the toxicity analysis. The explicit .copy()
# makes groupDf an independent frame, so modifying it later never targets a
# slice of df (which pandas reports as SettingWithCopyWarning).
groupDf = df[['user_id', 'tweet_id', 'Toxicity', 'week','month','day']].copy()

In [9]:
# Show the dtype of each calendar column, one per line (day, week, month).
for calendar_column in ('day', 'week', 'month'):
    print(groupDf[calendar_column].dtypes)

int64
object
object


In [10]:
# Cast the three calendar columns to strings in a single call. astype returns a
# new frame, so nothing is assigned column-by-column into a frame derived from
# df, and re-running this cell leaves the result unchanged.
groupDf = groupDf.astype({'day': str, 'week': str, 'month': str})

In [11]:
# Re-check the dtypes after the cast: all three columns should now be object.
for calendar_column in ('day', 'week', 'month'):
    print(groupDf[calendar_column].dtypes)

object
object
object


In [12]:
# Slices used downstream: week "43" and month "10". Both columns hold strings,
# so the comparison values are strings as well.
weekDf = groupDf.loc[groupDf['week'].eq("43")]
monthDf = groupDf.loc[groupDf['month'].eq("10")]

In [13]:
# Preview the rows that fall in week "43".
weekDf.head() 

Unnamed: 0,user_id,tweet_id,Toxicity,week,month,day
3,773711528398848000,923729371235586049,0.010681,43,10,27
4,773711528398848000,923689995763843072,0.018975,43,10,27
5,773711528398848000,923688749896294400,0.440214,43,10,27
6,773711528398848000,923688494379216896,0.230907,43,10,27
7,773711528398848000,923687534848299008,0.272364,43,10,27


In [14]:
# Distinct day values present in the month slice, sorted. The values are strings,
# so the order is lexicographic ('1', '10', '11', ...), not numeric.
dayList = np.sort(monthDf['day'].unique())
print(dayList)

['1' '10' '11' '12' '13' '14' '15' '16' '17' '18' '19' '2' '20' '21' '22'
 '23' '24' '25' '26' '27' '28' '29' '3' '30' '31' '4' '5' '6' '7' '8' '9']


In [15]:
# Tweets from day "1" of the selected month (day is stored as a string).
dayDf = monthDf.loc[monthDf['day'].eq("1")]

In [16]:
# Preview the single-day slice.
dayDf.head()

Unnamed: 0,user_id,tweet_id,Toxicity,week,month,day
677,711391730470227968,914589821565153280,0.010618,40,10,1
678,711391730470227968,914588529165926400,0.032392,40,10,1
679,711391730470227968,914382087490113536,0.080834,40,10,1
680,711391730470227968,914304370925223936,0.020106,40,10,1
681,711391730470227968,914277851892932608,0.074503,40,10,1


In [17]:
# Distinct users who tweeted on the selected day, as strings so they can be
# matched against the graph's node labels.
unique_user_ids = dayDf['user_id'].unique()
users_list = list(unique_user_ids.astype(str))

In [18]:
# Number of distinct users active on the selected day.
len(users_list)

11649

In [19]:
# Restrict the full graph to the users in users_list. Note the name: users_list
# is built from dayDf (a single day), not from weekDf.
G_week = G.subgraph(users_list)

In [20]:
# Sanity check: how many of the day's users are present in the graph.
node_count = G_week.number_of_nodes()
print(node_count)

11649


In [21]:
# Mean toxicity per user for the selected day, keyed by the *string* form of
# user_id. G_week's nodes are the string ids from users_list and the lookup
# below uses str(user), so integer keys would raise KeyError.
user_toxicity_dict = dayDf.groupby(dayDf['user_id'].astype(str))['Toxicity'].mean().to_dict()

In [None]:
# Map each graph node to {neighbour id: that neighbour's mean toxicity for the day}.
#
# The per-user means are computed once and keyed by string id so they line up
# with G_week's node labels (users_list is built with astype(str)). Neighbours
# are taken from the graph's adjacency rather than "every other user of the
# day", which removes one full groupby per user and matches what the consuming
# loop expects: a dict keyed by node whose values are averaged.
mean_toxicity_lookup = (
    dayDf.groupby(dayDf['user_id'].astype(str))['Toxicity'].mean().to_dict()
)

neighbour_toxicity_dict = {
    user: {
        neighbour: mean_toxicity_lookup[neighbour]
        for neighbour in G_week.neighbors(user)
        # Skip self-loops (a user is not their own neighbour) and any node
        # without a score for this day.
        if neighbour != user and neighbour in mean_toxicity_lookup
    }
    for user in G_week.nodes()
}

In [None]:
# Parallel lists filled below: one entry per graph node, in node order.
user_toxicity_means, neighbor_toxicity_means = [], []

In [None]:
# Rebuild both lists from scratch so that re-running this cell does not append
# a second copy of every point.
# NOTE(review): both lookups require the dicts to be keyed the same way as
# G_week's node labels (strings, since users_list is built with astype(str)).
user_toxicity_means = []
neighbor_toxicity_means = []
for user in G_week.nodes():
    neighbour_scores = list(neighbour_toxicity_dict[user].values())
    user_toxicity_means.append(user_toxicity_dict[str(user)])
    # A user with no neighbour scores has no mean; record NaN explicitly rather
    # than relying on np.mean([]), which returns NaN with a RuntimeWarning.
    neighbor_toxicity_means.append(np.mean(neighbour_scores) if neighbour_scores else np.nan)

In [None]:
# Scatter each user's mean toxicity against the mean toxicity of their neighbours.
fig, ax = plt.subplots(figsize=(15, 10))
ax.scatter(user_toxicity_means, neighbor_toxicity_means)
ax.set_xlabel('Mean toxicity score of user')
ax.set_ylabel("Mean toxicity score of user's neighbors")

# Optional: fixed ticks every 0.1 on both axes.
# ax.set_xticks([i/10 for i in range(0, 11)])
# ax.set_yticks([i/10 for i in range(0, 11)])
plt.show()

In [None]:
# for node in G_week.nodes():
#     neighbors = list(G_week.neighbors(node))
#     neighbor_toxicity_mean = weekDf.loc[weekDf['user_id'].isin(neighbors), 'Toxicity'].mean()
#     if pd.isna(neighbor_toxicity_mean):
#       neighbor_toxicity_mean = 0
#     user_toxicity_mean = weekDf.loc[weekDf['user_id'] == node, 'Toxicity'].mean()
#     if pd.isna(user_toxicity_mean):
#       user_toxicity_mean = 0
  
#     user_toxicity_means.append(user_toxicity_mean)
#     neighbor_toxicity_means.append(neighbor_toxicity_mean)


In [None]:
# user_id = '20430045'
# neighbors = list(G_week.neighbors(user_id))
# neighbor_toxicity_mean = weekDf.loc[weekDf['user_id'].isin(neighbors), 'Toxicity'].mean()
# if pd.isna(neighbor_toxicity_mean):
#     neighbor_toxicity_mean = 0
# user_toxicity_mean = weekDf.loc[weekDf['user_id'] == user_id, 'Toxicity'].mean()
# if pd.isna(user_toxicity_mean):
#     user_toxicity_mean = 0

In [None]:
'''
groupDf = df[['user_id', 'tweet_id', 'Toxicity', 'week','month','day']]
weekDf = groupDf[groupDf['week']=="43"]
monthDf = groupDf[groupDf['month']=="10"]
dayDf = monthDf[monthDf['day']=="1"]
users_list = list(dayDf['user_id'].unique().astype(str))
G_week = G.subgraph(users_list)
user_toxicity_means = []
neighbor_toxicity_means = []
def get_neighbour_toxicity_mean(user):
  neighbours = list(G_week.neighbors(user))
  temp_list = []
  for neigh in neighbours:
    neighbourDf = dayDf[dayDf['user_id'].astype(str) ==neigh]
    mean_toxicity = neighbourDf['Toxicity'].mean()
    temp_list.append(mean_toxicity)
  return sum(temp_list)/len(temp_list)
def get_user_toxicity_mean(user):
  userDf = dayDf[dayDf['user_id'].astype(str) ==user]
  return userDf['Toxicity'].mean()
for user in G_week.nodes():
  user_mean = get_user_toxicity_mean(user)
  neighbour_mean = get_neighbour_toxicity_mean(user)
  user_toxicity_means.append(user_mean)
  neighbor_toxicity_means.append(neighbour_mean)
plt.figure(figsize=(15,10))
plt.scatter(user_toxicity_means, neighbor_toxicity_means)
plt.xlabel('Mean toxicity score of user')
plt.ylabel('Mean toxicity score of user\'s neighbors')

# plt.xticks([i/10 for i in range(0, 11)])
# plt.yticks([i/10 for i in range(0, 11)])
plt.show()

In the code above, the following snippet takes a long time to run:

for user in G_week.nodes():
  user_mean = get_user_toxicity_mean(user)
  neighbour_mean = get_neighbour_toxicity_mean(user)
  user_toxicity_means.append(user_mean)
  neighbor_toxicity_means.append(neighbour_mean)

how can I optimize it to make it run faster?
'''