## Mount al drive, instalación NetworkX e importaciones

In [None]:
import pandas as pd
import warnings
import itertools
import json
import os

warnings.simplefilter(action='ignore', category=FutureWarning)

In [None]:
!pip install -U networkx==2.8.4

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
# Mount Google Drive under /content/drive; the tweet folders read by the loading
# cells live below this mount point. force_remount=True re-mounts even if the
# drive is already mounted.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


## Funciones de extracción de nodos y aristas del tweet y creación de filas

In [None]:
def _collect_tweet_usernames(tweet: dict, usernames: list, target_names: list) -> None:
  """Append every handle referenced by one tweet object to both output lists.

  Nothing is collected unless ``tweet`` has an 'entities' key. When it does,
  handles come from two places, in this order:

  1. ``tweet['entities']['user_mentions'][i]['screen_name']``
  2. whitespace-separated words of the tweet text ('full_text' is preferred
     over 'text') that start with '@'.

  A handle present in both places is appended twice.
  """
  if 'entities' not in tweet:
    return

  entities = tweet['entities']
  if 'user_mentions' in entities:
    for mention in entities['user_mentions']:
      username = mention['screen_name']
      usernames.append(username)
      target_names.append(username)

  # Fall back to an empty text so a tweet without any text field contributes no
  # text mentions, instead of raising or reusing a previously parsed text.
  if 'full_text' in tweet:
    text = tweet['full_text']
  elif 'text' in tweet:
    text = tweet['text']
  else:
    text = ''

  for word in text.split():
    # A bare '@' carries no handle.
    if word.startswith('@') and len(word) > 1:
      username = word[1:]
      # Handles ending in ':' are skipped (presumably the "RT @user:" prefix --
      # TODO confirm). Other trailing punctuation is kept as part of the handle.
      if username[-1] != ':':
        usernames.append(username)
        target_names.append(username)


def source_usernames_and_target_extractor(data: dict, source_name: list, usernames: list, target_names: list) -> None:
  """Extract the author and every referenced account from a tweet, in place.

  Args:
    data: tweet object; must contain ``data['user']['screen_name']``.
    source_name: list that receives the author's screen name.
    usernames: list that receives the author plus every referenced handle.
    target_names: list that receives every referenced handle (author excluded
      unless the author is itself mentioned).

  Returns:
    None. The three lists are mutated.

  Raises:
    KeyError: if ``data`` has no 'user' / 'screen_name' entry.

  Referenced handles are taken from the tweet itself and, when present, from
  ``data['retweeted_status']`` (its author, its entity mentions and its text).
  """
  # The author of the tweet is the single source of every edge it produces.
  main_user = data['user']['screen_name']
  source_name.append(main_user)
  usernames.append(main_user)

  _collect_tweet_usernames(data, usernames, target_names)

  if 'retweeted_status' in data:
    retweeted_data = data['retweeted_status']

    # The author of the retweeted tweet is a target as well.
    if 'user' in retweeted_data and 'screen_name' in retweeted_data['user']:
      retweeted_user = retweeted_data['user']['screen_name']
      usernames.append(retweeted_user)
      target_names.append(retweeted_user)

    _collect_tweet_usernames(retweeted_data, usernames, target_names)


In [None]:
def create_row(df, target, source_name, hashtag):
  """Build one edge record (a dict) for the pair ``source_name[0]`` -> ``target``.

  If ``df`` already holds a row going the opposite way (source == target and
  target == source_name[0]), the new record reuses that existing orientation;
  otherwise it points from the source to the target. The weight is always 1 and
  ``hashtag`` is stored under the 'hashtags' key.
  """
  origin = source_name[0]
  reverse_present = ((df['source'] == target) & (df['target'] == origin)).any()

  if reverse_present:
    first, second = target, origin
  else:
    first, second = origin, target

  return {'source': first,
          'target': second,
          'weight': 1,
          'hashtags': hashtag}

## Función para obtener dataframe a partir de json_objects

In [None]:
def get_dataframe_from_tweets(json_objects, hashtag=None):
  """Turn a list of tweet objects into an edge-list DataFrame.

  Args:
    json_objects: iterable of tweet dicts, as accepted by
      ``source_usernames_and_target_extractor``.
    hashtag: label stored in the 'hashtags' column of every row. When omitted,
      the module-level variable ``hashtag`` is used, which keeps the existing
      one-argument calls (made inside a ``for hashtag in ...`` loop) working.

  Returns:
    DataFrame with columns 'source', 'target', 'weight', 'hashtags', one row per
    extracted (author, referenced handle) occurrence, each with weight 1.

  Raises:
    NameError: if ``hashtag`` is omitted and no global ``hashtag`` exists.

  Edge orientation: if an edge between the same two accounts was already
  recorded in the opposite direction, the new row reuses that earlier direction
  (the same rule ``create_row`` applies by scanning the frame).
  """
  columns = ['source', 'target', 'weight', 'hashtags']

  if hashtag is None:
    if 'hashtag' not in globals():
      raise NameError("name 'hashtag' is not defined; pass hashtag=... explicitly")
    hashtag = globals()['hashtag']

  # Rows are collected in a list and the frame is built once at the end:
  # DataFrame.append no longer exists in pandas >= 2.0, and growing a frame row
  # by row (plus re-scanning it for every new edge) is quadratic.
  records = []
  seen_edges = set()  # directed (source, target) pairs already emitted

  # Filled by the extractor with every handle seen; not used further here.
  usernames = []

  for json_obj in json_objects:
    source_name = []
    target_names = []
    source_usernames_and_target_extractor(json_obj, source_name, usernames, target_names)

    for target in target_names:
      origin = source_name[0]
      # Reuse the direction of an already recorded reverse edge, if any.
      if (target, origin) in seen_edges:
        edge = (target, origin)
      else:
        edge = (origin, target)
      seen_edges.add(edge)
      records.append({'source': edge[0],
                      'target': edge[1],
                      'weight': 1,
                      'hashtags': hashtag})

  # NOTE: an earlier, disabled variant additionally linked every pair of seen
  # usernames (itertools.combinations); it is intentionally not reproduced here.
  return pd.DataFrame(records, columns=columns)

## Dataframes extracción

In [None]:
# MENTAL HEALTH

In [None]:
# Build the raw edge list for the mental-health hashtag folders.
path = '/content/drive/MyDrive/Computación/tweets/'
hashtags = ['#shtwt_tweets', '#depression_tweets']
df_mental_health = pd.DataFrame(columns=['source', 'target', 'weight', 'hashtags'])
# NOTE: the loop variable must stay named `hashtag`; get_dataframe_from_tweets
# uses a global of that name to label its rows when called with one argument.
for hashtag in hashtags:
  # Only folders whose name starts with '#' are processed; anything else is
  # skipped entirely, so a stale df_aux can never be appended a second time.
  if not hashtag.startswith('#'):
    continue

  # Sorted for a reproducible tweet order (os.listdir order is arbitrary and the
  # orientation of reciprocal edges depends on which tweet is seen first).
  tweets = sorted(os.listdir(path+hashtag))
  json_objects = []
  for tweet in tweets:
    with open(path + hashtag + '/' + tweet, 'r', encoding='utf-8') as f:
      data = json.load(f)
      json_objects.append(data)

  df_aux = get_dataframe_from_tweets(json_objects)
  print(hashtag , len(json_objects))
  df_mental_health = pd.concat([df_mental_health, df_aux])

#shtwt_tweets 697
#depression_tweets 2867


In [None]:
# FITNESS

In [None]:
# Build the raw edge list for the first group of fitness hashtag folders.
path = '/content/drive/MyDrive/Computación/tweets/'
hashtags = ['#healthy_tweets','#fitspo_tweets', '#weightloss_tweets']
df_fitness = pd.DataFrame(columns=['source', 'target', 'weight', 'hashtags'])
# NOTE: the loop variable must stay named `hashtag`; get_dataframe_from_tweets
# uses a global of that name to label its rows when called with one argument.
for hashtag in hashtags:
  # Only folders whose name starts with '#' are processed; anything else is
  # skipped entirely, so a stale df_aux can never be appended a second time.
  if not hashtag.startswith('#'):
    continue

  # Sorted for a reproducible tweet order (os.listdir order is arbitrary and the
  # orientation of reciprocal edges depends on which tweet is seen first).
  tweets = sorted(os.listdir(path+hashtag))
  json_objects = []
  for tweet in tweets:
    with open(path + hashtag + '/' + tweet, 'r', encoding='utf-8') as f:
      data = json.load(f)
      json_objects.append(data)

  df_aux = get_dataframe_from_tweets(json_objects)
  print(hashtag , len(json_objects))
  df_fitness = pd.concat([df_fitness, df_aux])

#healthy_tweets 210
#fitspo_tweets 225
#weightloss_tweets 3309


In [None]:
# EATING DISORDERS VERSION 1

In [None]:
# Build the raw edge list for the first group of eating-disorder hashtag folders.
path = '/content/drive/MyDrive/Computación/tweets/'
hashtags = ['#ricecaketwt_tweets', '#proana_tweets','#thinspo_tweets', '#anorexia_tweets', '#eatingdisorder_tweets', '#anatwt_tweets', '#edtwtdiet_tweets', '#bulimia_tweets', '#meanspo_tweets', '#promia_tweets', '#anamia_tweets','#EdTwitter_tweets']
df_disorders_1 = pd.DataFrame(columns=['source', 'target', 'weight', 'hashtags'])
# NOTE: the loop variable must stay named `hashtag`; get_dataframe_from_tweets
# uses a global of that name to label its rows when called with one argument.
for hashtag in hashtags:
  # Only folders whose name starts with '#' are processed; anything else is
  # skipped entirely, so a stale df_aux can never be appended a second time.
  if not hashtag.startswith('#'):
    continue

  # Sorted for a reproducible tweet order (os.listdir order is arbitrary and the
  # orientation of reciprocal edges depends on which tweet is seen first).
  tweets = sorted(os.listdir(path+hashtag))
  json_objects = []
  for tweet in tweets:
    with open(path + hashtag + '/' + tweet, 'r', encoding='utf-8') as f:
      data = json.load(f)
      json_objects.append(data)

  df_aux = get_dataframe_from_tweets(json_objects)
  print(hashtag , len(json_objects))
  df_disorders_1 = pd.concat([df_disorders_1, df_aux])

#ricecaketwt_tweets 403
#proana_tweets 254
#thinspo_tweets 392
#anorexia_tweets 495
#eatingdisorder_tweets 670
#anatwt_tweets 320
#edtwtdiet_tweets 296
#bulimia_tweets 187
#meanspo_tweets 209
#promia_tweets 46
#anamia_tweets 17
#EdTwitter_tweets 278


In [None]:
# EATING DISORDERS VERSION 2

In [None]:
# Build the raw edge list for the #edtwt folder (kept separate from the other
# eating-disorder hashtags).
path = '/content/drive/MyDrive/Computación/tweets/'
hashtags = ['#edtwt_tweets']
df_disorders_2 = pd.DataFrame(columns=['source', 'target', 'weight', 'hashtags'])
# NOTE: the loop variable must stay named `hashtag`; get_dataframe_from_tweets
# uses a global of that name to label its rows when called with one argument.
for hashtag in hashtags:
  # Only folders whose name starts with '#' are processed; anything else is
  # skipped entirely, so a stale df_aux can never be appended a second time.
  if not hashtag.startswith('#'):
    continue

  # Sorted for a reproducible tweet order (os.listdir order is arbitrary and the
  # orientation of reciprocal edges depends on which tweet is seen first).
  tweets = sorted(os.listdir(path+hashtag))
  json_objects = []
  for tweet in tweets:
    with open(path + hashtag + '/' + tweet, 'r', encoding='utf-8') as f:
      data = json.load(f)
      json_objects.append(data)

  df_aux = get_dataframe_from_tweets(json_objects)
  print(hashtag , len(json_objects))
  df_disorders_2 = pd.concat([df_disorders_2, df_aux])

#edtwt_tweets 4255


In [None]:
# MENTAL HEALTH HASHTAG

In [None]:
# Build the raw edge list for the #mentalhealth folder.
path = '/content/drive/MyDrive/Computación/tweets/'
hashtags = ['#mentalhealth_tweets']
df_mental_health_2 = pd.DataFrame(columns=['source', 'target', 'weight', 'hashtags'])
# NOTE: the loop variable must stay named `hashtag`; get_dataframe_from_tweets
# uses a global of that name to label its rows when called with one argument.
for hashtag in hashtags:
  # Only folders whose name starts with '#' are processed; anything else is
  # skipped entirely, so a stale df_aux can never be appended a second time.
  if not hashtag.startswith('#'):
    continue

  # Sorted for a reproducible tweet order (os.listdir order is arbitrary and the
  # orientation of reciprocal edges depends on which tweet is seen first).
  tweets = sorted(os.listdir(path+hashtag))
  json_objects = []
  for tweet in tweets:
    with open(path + hashtag + '/' + tweet, 'r', encoding='utf-8') as f:
      data = json.load(f)
      json_objects.append(data)

  df_aux = get_dataframe_from_tweets(json_objects)
  print(hashtag , len(json_objects))
  df_mental_health_2 = pd.concat([df_mental_health_2, df_aux])

#mentalhealth_tweets 6526


In [None]:
# WORKOUT HASHTAG

In [None]:
# Build the raw edge list for the #workout folder.
path = '/content/drive/MyDrive/Computación/tweets/'
hashtags = ['#workout_tweets']
df_fitness_2 = pd.DataFrame(columns=['source', 'target', 'weight', 'hashtags'])
# NOTE: the loop variable must stay named `hashtag`; get_dataframe_from_tweets
# uses a global of that name to label its rows when called with one argument.
for hashtag in hashtags:
  # Only folders whose name starts with '#' are processed; anything else is
  # skipped entirely, so a stale df_aux can never be appended a second time.
  if not hashtag.startswith('#'):
    continue

  # Sorted for a reproducible tweet order (os.listdir order is arbitrary and the
  # orientation of reciprocal edges depends on which tweet is seen first).
  tweets = sorted(os.listdir(path+hashtag))
  json_objects = []
  for tweet in tweets:
    with open(path + hashtag + '/' + tweet, 'r', encoding='utf-8') as f:
      data = json.load(f)
      json_objects.append(data)

  df_aux = get_dataframe_from_tweets(json_objects)
  print(hashtag , len(json_objects))
  df_fitness_2 = pd.concat([df_fitness_2, df_aux])

#workout_tweets 5153


In [None]:
# FITNESS REMAINING HASHTAGS

In [None]:
# Build the raw edge list for the remaining fitness hashtag folders.
path = '/content/drive/MyDrive/Computación/tweets/'
hashtags = ['#diet_tweets','#caloriedeficit_tweets', '#fitnessmotivation_tweets', '#lowcal_tweets']
df_fitness_3 = pd.DataFrame(columns=['source', 'target', 'weight', 'hashtags'])
# NOTE: the loop variable must stay named `hashtag`; get_dataframe_from_tweets
# uses a global of that name to label its rows when called with one argument.
for hashtag in hashtags:
  # Only folders whose name starts with '#' are processed; anything else is
  # skipped entirely, so a stale df_aux can never be appended a second time.
  if not hashtag.startswith('#'):
    continue

  # Sorted for a reproducible tweet order (os.listdir order is arbitrary and the
  # orientation of reciprocal edges depends on which tweet is seen first).
  tweets = sorted(os.listdir(path+hashtag))
  json_objects = []
  for tweet in tweets:
    with open(path + hashtag + '/' + tweet, 'r', encoding='utf-8') as f:
      data = json.load(f)
      json_objects.append(data)

  df_aux = get_dataframe_from_tweets(json_objects)
  print(hashtag , len(json_objects))
  df_fitness_3 = pd.concat([df_fitness_3, df_aux])

#diet_tweets 3850
#caloriedeficit_tweets 87
#fitnessmotivation_tweets 3771
#lowcal_tweets 21


In [None]:
# DISORDERS REMAINING HASHTAGS

In [None]:
# Build the raw edge list for the remaining eating-disorder hashtag folders.
path = '/content/drive/MyDrive/Computación/tweets/'
hashtags = ['#bonespo_tweets','#eatingdisorders_tweets', '#edtwtthread_tweets', '#3dtwt_tweets']
df_disorders_3 = pd.DataFrame(columns=['source', 'target', 'weight', 'hashtags'])
# NOTE: the loop variable must stay named `hashtag`; get_dataframe_from_tweets
# uses a global of that name to label its rows when called with one argument.
for hashtag in hashtags:
  # Only folders whose name starts with '#' are processed; anything else is
  # skipped entirely, so a stale df_aux can never be appended a second time.
  if not hashtag.startswith('#'):
    continue

  # Sorted for a reproducible tweet order (os.listdir order is arbitrary and the
  # orientation of reciprocal edges depends on which tweet is seen first).
  tweets = sorted(os.listdir(path+hashtag))
  json_objects = []
  for tweet in tweets:
    with open(path + hashtag + '/' + tweet, 'r', encoding='utf-8') as f:
      data = json.load(f)
      json_objects.append(data)

  df_aux = get_dataframe_from_tweets(json_objects)
  print(hashtag , len(json_objects))
  df_disorders_3 = pd.concat([df_disorders_3, df_aux])

#bonespo_tweets 42
#eatingdisorders_tweets 1319
#edtwtthread_tweets 487
#3dtwt_tweets 69


## Visualización de dataframes y modificaciones

In [None]:
# Preview the raw mental-health edge list: one row per extracted reference,
# each with weight 1; the index restarts for every hashtag folder.
df_mental_health

Unnamed: 0,source,target,weight,hashtags
0,BbyyMor,esquizofrenicaq,1,#shtwt_tweets
1,BbyyMor,esquizofrenicaq,1,#shtwt_tweets
2,skin_decay,rexicelves,1,#shtwt_tweets
3,skin_decay,rexicelves,1,#shtwt_tweets
4,icoffeebee,st4lkergore,1,#shtwt_tweets
...,...,...,...,...
11193,SimarSi36719725,harjotbains,1,#depression_tweets
11194,SimarSi36719725,AAPDelhi,1,#depression_tweets
11195,SimarSi36719725,BhagwantMann,1,#depression_tweets
11196,SimarSi36719725,Dapindr,1,#depression_tweets


In [None]:
# Collapse repeated (source, target) rows into one edge per pair. Every raw row
# has weight 1, so the summed weight is the number of rows for that pair; the
# 'hashtags' column is not carried over.
df_mental_health_weighted = (
    df_mental_health
    .groupby(["source", "target"])["weight"]
    .sum()
    .reset_index()
)
df_mental_health_weighted

Unnamed: 0,source,target,weight
0,001kqwkjnc,eatingmyarms,2
1,001kqwkjnc,miileeww,2
2,01kie_hssh,d3adlili_,2
3,0c4lor1es,168x48,2
4,0l1v4k,icoffeebee,2
...,...,...,...
2498,zeroc4ls_,eatingmyarms,2
2499,zeroc4ls_,some1kls,2
2500,zombiecmorgue,sweetcherrycan1,2
2501,zombiecmorgue,zombiecmorgue,4


In [None]:
# Preview the raw fitness edge list (unaggregated, weight 1 per row).
df_fitness

Unnamed: 0,source,target,weight,hashtags
0,DebilishO369,BiohackingGuild,1,#healthy_tweets
1,DebilishO369,BiohackingGuild,1,#healthy_tweets
2,felipejr1173,MarkandeshwarF,1,#healthy_tweets
3,felipejr1173,MarkandeshwarF,1,#healthy_tweets
4,magutui009,DSPTrainer,1,#healthy_tweets
...,...,...,...,...
2322,odmous5,ESPNUK,1,#weightloss_tweets
2323,DodlyJosy,trade_2022,1,#weightloss_tweets
2324,DodlyJosy,trade_2022,1,#weightloss_tweets
2325,MineYourFitness,GetFitMining,1,#weightloss_tweets


In [None]:
# Collapse repeated (source, target) rows into one edge per pair. Every raw row
# has weight 1, so the summed weight is the number of rows for that pair; the
# 'hashtags' column is not carried over.
df_fitness_weighted = (
    df_fitness
    .groupby(["source", "target"])["weight"]
    .sum()
    .reset_index()
)
df_fitness_weighted

Unnamed: 0,source,target,weight
0,11111o11111lll0,helen_volz,2
1,1234_justme,clickbankunited,2
2,20thCenturyB0I,MarieS2408,2
3,AJAYKUM70347445,ZeeNewsEnglish,2
4,ALHarp55,GetFitMining,4
...,...,...,...
884,yochizzle100,strongest_gamer_alive,1
885,yoyoBita1,dibyamanav,2
886,zahra24323,helen_volz,2
887,zubaidihussain,GP_Update,2


In [None]:
# Preview the raw edge list of the first eating-disorder group (unaggregated,
# weight 1 per row).
df_disorders_1

Unnamed: 0,source,target,weight,hashtags
0,sarcophogusIII,ilovebananaslut,1,#ricecaketwt_tweets
1,sarcophogusIII,ilovebananaslut,1,#ricecaketwt_tweets
2,skinnyl00,ilovebananaslut,1,#ricecaketwt_tweets
3,skinnyl00,ilovebananaslut,1,#ricecaketwt_tweets
4,sarcophogusIII,ilovebananaslut,1,#ricecaketwt_tweets
...,...,...,...,...
660,Maths_Month,YorkshireTAs,1,#EdTwitter_tweets
661,Maths_Month,TwinklCPD,1,#EdTwitter_tweets
662,Maths_Month,EmmaTwinklTA,1,#EdTwitter_tweets
663,Maths_Month,TwinklCPD,1,#EdTwitter_tweets


In [None]:
# Collapse repeated (source, target) rows into one edge per pair. Every raw row
# has weight 1, so the summed weight is the number of rows for that pair; the
# 'hashtags' column is not carried over. Handles are compared as exact strings,
# so spellings that differ only in case remain separate edges.
df_disorders_1_weighted = (
    df_disorders_1
    .groupby(["source", "target"])["weight"]
    .sum()
    .reset_index()
)
df_disorders_1_weighted

Unnamed: 0,source,target,weight
0,05Lepus,h3m1am1,4
1,0CALORI3S,pearl_30more,2
2,0calblood_,0calblood_,2
3,144Health,MedicalMedium,1
4,144Health,medicalmedium,1
...,...,...,...
1804,yakimalalimpia,radio_vitoria,6
1805,yerinin1,_L0v3_myg,4
1806,zitaejackson,DrNicoleC,2
1807,zoe_zoebelle,h3m1am1,4


In [None]:
# Preview the raw #edtwt edge list (unaggregated, weight 1 per row).
df_disorders_2

Unnamed: 0,source,target,weight,hashtags
0,crying_moon123,st4rvedkyle,1,#edtwt_tweets
1,crying_moon123,st4rvedkyle,1,#edtwt_tweets
2,alehadelrey,IVmilan_,1,#edtwt_tweets
3,alehadelrey,IVmilan_,1,#edtwt_tweets
4,ragingmisandry,st4rvingseren4,1,#edtwt_tweets
...,...,...,...,...
5558,bbypmrx,HokaYo4,1,#edtwt_tweets
5559,bbypmrx,yoooilol,1,#edtwt_tweets
5560,bbypmrx,yoooilol,1,#edtwt_tweets
5561,bbypmrx,dittokg,1,#edtwt_tweets


In [None]:
# Collapse repeated (source, target) rows into one edge per pair. Every raw row
# has weight 1, so the summed weight is the number of rows for that pair; the
# 'hashtags' column is not carried over.
df_disorders_2_weighted = (
    df_disorders_2
    .groupby(["source", "target"])["weight"]
    .sum()
    .reset_index()
)
df_disorders_2_weighted

Unnamed: 0,source,target,weight
0,001kqwkjnc,miileeww,2
1,01kie_hssh,d3adlili_,2
2,0HWELLiGUESS,st4rvingseren4,2
3,0c4lor1es,168x48,2
4,0cals2day,redmoqa,2
...,...,...,...
2774,zuhakcals,chaeinkgs,2
2775,zuhakcals,delulucals,2
2776,zuhakcals,ghostisdying,2
2777,zuziakochamkot,s18d37,2


In [None]:
# Preview the raw #mentalhealth edge list (unaggregated, weight 1 per row).
df_mental_health_2

Unnamed: 0,source,target,weight,hashtags
0,cmrosari,cmrosari,1,#mentalhealth_tweets
1,cmrosari,cmrosari,1,#mentalhealth_tweets
2,maristic23,ToshWatters,1,#mentalhealth_tweets
3,maristic23,Eastendwalkers,1,#mentalhealth_tweets
4,maristic23,Eastendwalkers,1,#mentalhealth_tweets
...,...,...,...,...
13185,1980_welsh,AmerMedicalAssn,1,#mentalhealth_tweets
13186,DrDanielGih,PsychiatricNews,1,#mentalhealth_tweets
13187,DrDanielGih,PsychiatricNews,1,#mentalhealth_tweets
13188,hummingtrend,anidelaprida,1,#mentalhealth_tweets


In [None]:
# Collapse repeated (source, target) rows into one edge per pair. Every raw row
# has weight 1, so the summed weight is the number of rows for that pair; the
# 'hashtags' column is not carried over. Handles are compared as exact strings,
# so a handle with trailing punctuation stays a separate target.
df_mental_health_2_weighted = (
    df_mental_health_2
    .groupby(["source", "target"])["weight"]
    .sum()
    .reset_index()
)
df_mental_health_2_weighted

Unnamed: 0,source,target,weight
0,06cardiff,SoccerAM,4
1,06cardiff,speedomick,4
2,078471351aAnnie,JayRuderman,2
3,0xgunm,Entheotech1,1
4,0xgunm,"Entheotech1,",1
...,...,...,...
5110,zidanesboots,ciaranjoneill,2
5111,zietlow_lab,SGarthusNiegel,2
5112,zms782,mindshelp,2
5113,zuzu447,FindDaneElkins,2


In [None]:
# Preview the raw #workout edge list (unaggregated, weight 1 per row).
df_fitness_2

Unnamed: 0,source,target,weight,hashtags
0,ahcin_ntk,X_forceclub,1,#workout_tweets
1,ahcin_ntk,X_forceclub,1,#workout_tweets
2,__skysky28,X_forceclub,1,#workout_tweets
3,__skysky28,X_forceclub,1,#workout_tweets
4,Peemai_Patsi,X_forceclub,1,#workout_tweets
...,...,...,...,...
5647,OleeViktory,GetFitMining,1,#workout_tweets
5648,Princeraheja3,moderndayfreak,1,#workout_tweets
5649,Princeraheja3,moderndayfreak,1,#workout_tweets
5650,Krishanguptaji,LydViciousRich1,1,#workout_tweets


In [None]:
# Collapse repeated (source, target) rows into one edge per pair. Every raw row
# has weight 1, so the summed weight is the number of rows for that pair; the
# 'hashtags' column is not carried over.
df_fitness_2_weighted = (
    df_fitness_2
    .groupby(["source", "target"])["weight"]
    .sum()
    .reset_index()
)
df_fitness_2_weighted

Unnamed: 0,source,target,weight
0,01T800,AmericanGirlsIg,2
1,0805Aay,X_forceclub,4
2,12_supansa,X_forceclub,2
3,18and28,Bug4city,6
4,18and28,SuperWalk_,6
...,...,...,...
1925,ywy_1999,X_forceclub,2
1926,zeepxuzy,X_forceclub,2
1927,zgame3681,X_forceclub,2
1928,zipbolang,MASKEDMANIACXXX,2


In [None]:
# Preview the raw edge list of the remaining fitness hashtags (unaggregated,
# weight 1 per row).
df_fitness_3

Unnamed: 0,source,target,weight,hashtags
0,Omar72105395,Omar72105395,1,#diet_tweets
1,Omar72105395,Omar72105395,1,#diet_tweets
2,DawnBat99781115,OrganicLiveFood,1,#diet_tweets
3,DawnBat99781115,OrganicLiveFood,1,#diet_tweets
4,Escritordefutu1,IsmaelGalancho,1,#diet_tweets
...,...,...,...,...
13,Rim_gon,Susu_lli,1,#lowcal_tweets
14,AshleyParks1,TheReviewWire,1,#lowcal_tweets
15,AshleyParks1,TheReviewWire,1,#lowcal_tweets
16,augstfood,loonismoo,1,#lowcal_tweets


In [None]:
# Collapse repeated (source, target) rows into one edge per pair. Every raw row
# has weight 1, so the summed weight is the number of rows for that pair; the
# 'hashtags' column is not carried over.
df_fitness_3_weighted = (
    df_fitness_3
    .groupby(["source", "target"])["weight"]
    .sum()
    .reset_index()
)
df_fitness_3_weighted

Unnamed: 0,source,target,weight
0,0xBigPickawWin,TheMiddleBorn2,2
1,17Uiet,ASPandit9,2
2,1994Saroblack,OrganicLiveFood,2
3,1Tsegaz,fitnesstips1234,2
4,1toni_leigh,OrganicLiveFood,2
...,...,...,...
2215,zetta9090,_Georgeobeid,2
2216,zhiviaga,zhiviaga,2
2217,zimryfitness,IsmaelGalancho,2
2218,zsoto16,DanSoder,2


In [None]:
# Preview the raw edge list of the remaining eating-disorder hashtags
# (unaggregated, weight 1 per row).
df_disorders_3

Unnamed: 0,source,target,weight,hashtags
0,BJH_1999,skelexbones,1,#bonespo_tweets
1,BJH_1999,skelexbones,1,#bonespo_tweets
2,bawlsin_urmouth,dizzytherapist,1,#bonespo_tweets
3,bawlsin_urmouth,dizzytherapist,1,#bonespo_tweets
4,fishcals21,pastelfairiie,1,#bonespo_tweets
...,...,...,...,...
73,tyong_xe,sickgrv,1,#3dtwt_tweets
74,edtwtpuppy,sickgrv,1,#3dtwt_tweets
75,edtwtpuppy,sickgrv,1,#3dtwt_tweets
76,rockpaperslice,sickgrv,1,#3dtwt_tweets


In [None]:
# Collapse repeated (source, target) rows into one edge per pair. Every raw row
# has weight 1, so the summed weight is the number of rows for that pair; the
# 'hashtags' column is not carried over.
df_disorders_3_weighted = (
    df_disorders_3
    .groupby(["source", "target"])["weight"]
    .sum()
    .reset_index()
)
df_disorders_3_weighted

Unnamed: 0,source,target,weight
0,0325honey,soymiilk_,4
1,035won,soymiilk_,2
2,0KCALFELIX,soymiilk_,2
3,0_natascha,DrNicoleC,2
4,0_natascha,EDCoalition,6
...,...,...,...
1695,yunababydoll,soymiilk_,2
1696,yunjinca1s,soymiilk_,2
1697,zeroelli,leeheexspam,2
1698,zombixgirl,soymiilk_,2


## Guardar los dataframes

In [None]:
# Save the raw (unaggregated) edge list, 'hashtags' column included, without the
# index. The relative path resolves against the kernel's current working
# directory, which is not the mounted Drive unless the directory was changed.
df_mental_health.to_csv('mental_health_graph_dataframe.csv', index=False)

In [None]:
# Save the raw fitness edge list (no index) to the current working directory.
df_fitness.to_csv('fitness_graph_dataframe.csv', index=False)

In [None]:
# Save the raw edge list of the first eating-disorder group (no index) to the
# current working directory.
df_disorders_1.to_csv('disorders_1_graph_dataframe.csv', index=False)

In [None]:
# Save the raw #edtwt edge list (no index) to the current working directory.
df_disorders_2.to_csv('disorders_2_graph_dataframe.csv', index=False)

In [None]:
# Save the raw #mentalhealth edge list (no index) to the current working directory.
df_mental_health_2.to_csv('mental_health_2_graph_dataframe.csv', index=False)

In [None]:
# Save the raw #workout edge list (no index) to the current working directory.
df_fitness_2.to_csv('fitness_2_graph_dataframe.csv', index=False)

In [None]:
# Save the raw edge list of the remaining fitness hashtags (no index) to the
# current working directory.
df_fitness_3.to_csv('fitness_3_graph_dataframe.csv', index=False)

In [None]:
# Save the raw edge list of the remaining eating-disorder hashtags (no index) to
# the current working directory.
df_disorders_3.to_csv('disorders_3_graph_dataframe.csv', index=False)

In [None]:
# Build an undirected graph from the aggregated fitness edge list, storing the
# summed 'weight' as an edge attribute. Only df_fitness_weighted is turned into
# a graph in the cells shown here.
# NOTE(review): with nx.Graph(), rows (a, b) and (b, a) map to the same edge, so
# a later row presumably overwrites the weight of an earlier one -- confirm
# whether reciprocal weights should be added up first.
import networkx as nx
G = nx.from_pandas_edgelist(df_fitness_weighted, source="source", target="target", edge_attr="weight", create_using=nx.Graph())

In [None]:
# Export the graph G built in the previous cell to GEXF; the relative file name
# resolves against the kernel's current working directory.
from networkx.readwrite import gexf
gexf.write_gexf(G, "my_graph.gexf")