Delete all variables in the current environment (if you have already run some cells) - clean state.

In [1]:
%reset

Import all necessary packages.

In [2]:
import os
import shutil
import json
import pandas as pd
import numpy as np
import random
from collections import defaultdict
from collections import Counter
from IPython.core.getipython import get_ipython
import textwrap
from matplotlib import pyplot as plt
import networkx as nx

In [3]:
rootdir_path = os.getcwd()

Replace with the path to the folder where the raw dataset (the initial .csv files) is stored.

In [4]:
dataset_possibilities = ['15_days', '25_days']

In [5]:
number_of_days = dataset_possibilities[1]

In [6]:
data_path = os.path.join(rootdir_path, 'data', f'covaxxy_merged_{number_of_days}.csv')

In [7]:
# data_path = data_path = os.path.join(rootdir_path, 'data', f'covaxxy_merged_test.csv')

In [8]:
files_path = os.path.join(rootdir_path, 'files')

In [9]:
path_to_unique_dates = os.path.join(files_path, f'unique_dates_{number_of_days}.txt')

In [10]:
opinion_changes_path = os.path.join(files_path, f'opinion-changes-{number_of_days}')

In [11]:
graphs_path = os.path.join(rootdir_path, 'graphs')

In [12]:
covaxxy_graphs_path = os.path.join(graphs_path, 'covaxxy')

Create subfolders specific to the different types of analyses performed in the project.

Create 1 subfolder within the graphs/covaxxy/ folder to store graphs referring to differences in opinion changes for the covaxxy dataset. If it already existed (from previous runnings of the project), delete the folder and its contents and create an empty folder to store the current graphs, relevant to the current state of the project.

In [13]:
covaxxy_deltas_OC_graphs_path = os.path.join(covaxxy_graphs_path, f'deltas-OC-{number_of_days}')
if os.path.exists(covaxxy_deltas_OC_graphs_path):
   shutil.rmtree(covaxxy_deltas_OC_graphs_path, ignore_errors=False, onerror=None)
os.makedirs(covaxxy_deltas_OC_graphs_path)

In [14]:
covaxxy_networks_graphs_path = os.path.join(covaxxy_graphs_path, f'networks-{number_of_days}')
if os.path.exists(covaxxy_networks_graphs_path):
   shutil.rmtree(covaxxy_networks_graphs_path, ignore_errors=False, onerror=None)
os.makedirs(covaxxy_networks_graphs_path)

In [15]:
covaxxy_networks_quotes_graphs_path = os.path.join(covaxxy_networks_graphs_path, 'quotes')
if os.path.exists(covaxxy_networks_quotes_graphs_path):
   shutil.rmtree(covaxxy_networks_quotes_graphs_path, ignore_errors=False, onerror=None)
os.makedirs(covaxxy_networks_quotes_graphs_path)

In [16]:
covaxxy_networks_replies_graphs_path = os.path.join(covaxxy_networks_graphs_path, 'replies')
if os.path.exists(covaxxy_networks_replies_graphs_path):
   shutil.rmtree(covaxxy_networks_replies_graphs_path, ignore_errors=False, onerror=None)
os.makedirs(covaxxy_networks_replies_graphs_path)

In [17]:
covaxxy_networks_retweets_graphs_path = os.path.join(covaxxy_networks_graphs_path, 'retweets')
if os.path.exists(covaxxy_networks_retweets_graphs_path):
   shutil.rmtree(covaxxy_networks_retweets_graphs_path, ignore_errors=False, onerror=None)
os.makedirs(covaxxy_networks_retweets_graphs_path)

In [109]:
# covaxxy_longitudinal_analysis_graphs_path = os.path.join(covaxxy_graphs_path, 'longitudinal-analysis')
# if os.path.exists(covaxxy_longitudinal_analysis_graphs_path):
#    shutil.rmtree(covaxxy_longitudinal_analysis_graphs_path, ignore_errors=False, onerror=None)
# os.makedirs(covaxxy_longitudinal_analysis_graphs_path)

In [110]:
merged_days = pd.read_csv(data_path)

In [111]:
def string_to_int(reference_id):
    try:
        return int(reference_id)
    except ValueError:
        return reference_id

In [112]:
merged_days['reference_id'] = merged_days['reference_id'].apply(string_to_int)

In [113]:
# Convert the 'created_at' column to datetime
merged_days['created_at'] = pd.to_datetime(merged_days['created_at'])

In [114]:
merged_days

Unnamed: 0,created_at,tweet_id,credible,author_id,text,urls,name,username,verified,location,...,retweet_author_id,retweet_id,retweeted_screen_name,user_mentions_id,user_mentions_screen_name,in_reply_to_user_id,in_reply_to_tweet_id,in_reply_to_username,reference_type,reference_id
0,2021-02-24 18:00:10+00:00,1364636249852502018,1,107501328,RT @Maricopahealth: At one of our community po...,#,2-1-1 Arizona,211arizona,False,Arizona,...,29816986,1364632754042802176,Maricopahealth,29816986,Maricopahealth,#,#,#,retweeted,1364632754042802176
1,2021-02-24 18:00:18+00:00,1364636282664574978,1,26761523,Ready for DAY 2 of State of the Valley? Join u...,"jointventure.org,twitter.com,",Joint Venture SV,JointVentureSVN,False,"San Jose, CA",...,#,#,#,#,#,#,#,#,#,#
2,2021-02-24 18:00:30+00:00,1364636333596008449,1,1234926105234034689,RT @SteveStaeger: When #COVID19Colorado is ove...,#,Colorado Coronavirus Updates,COVIDinColorado,False,"Denver, Colorado",...,182037688,1364293582157307906,SteveStaeger,182037688,SteveStaeger,#,#,#,retweeted,1364293582157307906
3,2021-02-24 18:03:16+00:00,1364637028948709377,1,1329106574082641920,"#SD37: Starting next week, @OCHealth will star...","bit.ly,www.ocregister.com,",Senator Dave Min,SenDaveMin,True,"Irvine, CA",...,#,#,#,36069538,ochealth,#,#,#,#,#
4,2021-02-24 18:03:35+00:00,1364637110951583746,1,1363750425459970048,RT @jatinde45666597: Vaccination has been star...,#,Reena Sharma,write2reena,False,"Auckland, New Zealand",...,1295748297529884673,1364087633538859008,jatinde45666597,1295748297529884673,jatinde45666597,#,#,#,retweeted,1364087633538859008
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10723864,2021-03-20 23:59:42+00:00,1373424038111023106,1,827349983577796608,RT @altNOAA: PSA: If you're in #Oklahoma and a...,#,Howard Hudson,hwrdhdsn,False,"Medford, OR",...,824126001936474113,1373418517484220418,altNOAA,824126001936474113,altNOAA,#,#,#,retweeted,1373418517484220418
10723865,2021-03-20 23:59:42+00:00,1373424038895321101,1,828337737388285952,RT @maura_resister: Covid rates are trending i...,#,KY R.N. Votes Blue😷🦋🌊,changemustcome7,False,Blue dot in red state,...,1167775229109837825,1373420544830361602,maura_resister,1167775229109837825,maura_resister,#,#,#,retweeted,1373420544830361602
10723866,2021-03-20 23:59:43+00:00,1373424044100440072,1,126514742,RT @Best_of_PT: The tiny island of #Corvo in t...,#,Javier RG,Rxavier23,False,#,...,894905209159331840,1373313858476068869,Best_of_PT,894905209159331840,Best_of_PT,#,#,#,retweeted,1373313858476068869
10723867,2021-03-20 23:59:43+00:00,1373424044318420993,1,2918822815,@sethsliltweeter @EncoreBeachClub That’s a goo...,#,Jason Titus,eezeemonee,False,"San Jose, CA",...,#,#,#,1289682623757918209,sethsliltweeter,1289682623757918209,1373417100472033283,sethsliltweeter,replied_to,1373417100472033283


In [115]:
merged_days.columns

Index(['created_at', 'tweet_id', 'credible', 'author_id', 'text', 'urls',
       'name', 'username', 'verified', 'location', 'followers_count',
       'following_count', 'tweet_count', 'like_count', 'quote_count',
       'reply_count', 'retweet_count', 'retweet_author_id', 'retweet_id',
       'retweeted_screen_name', 'user_mentions_id',
       'user_mentions_screen_name', 'in_reply_to_user_id',
       'in_reply_to_tweet_id', 'in_reply_to_username', 'reference_type',
       'reference_id'],
      dtype='object')

MODEL3

In [116]:
from timezonefinder import TimezoneFinder
from geopy.geocoders import Nominatim
import pytz

def get_timezone_from_location(location_name):
    geolocator = Nominatim(user_agent="timezone_finder")
    location = geolocator.geocode(location_name)

    if location is None:
        return None  # Unable to find location

    # Get latitude and longitude from the location
    latitude, longitude = location.latitude, location.longitude

    # Use timezonefinder to get the timezone from latitude and longitude
    timezone_finder = TimezoneFinder()
    timezone_str = timezone_finder.timezone_at(lng=longitude, lat=latitude)

    if timezone_str is not None:
        timezone = pytz.timezone(timezone_str)
        return timezone
    else:
        return None


In [117]:
counter = 1

In [118]:
def tweet_convert_utc_to_local_timezone(tweet_id, dataset):
    utc_time = dataset.loc[dataset["tweet_id"] == tweet_id, 'created_at'].item()
    location_name = dataset.loc[dataset["tweet_id"] == tweet_id, 'location'].item()
    timezone = get_timezone_from_location(location_name)

    global counter
    # print(f"Tweet {dataset[dataset['tweet_id'] == tweet_id].index[0]} / {len(dataset) - 1}")
    print(f"Tweet {counter} / {len(dataset)}")
    counter+=1

    return utc_time.astimezone(timezone)

In [119]:
def df_convert_utc_to_local_timezone(dataset):
    dataset['created_at'] = dataset.apply(lambda row: tweet_convert_utc_to_local_timezone(row['tweet_id'], dataset), axis=1)

In [120]:
march1 = merged_days[merged_days['created_at'].dt.date == pd.to_datetime('2021-03-01').date()]

In [121]:
# Filter the dataset and keep only the reactions. Remove source tweets.
tweets_with_location = march1[march1['location'] != '#']

In [123]:
# Get the top 3000 rows with the largest 'like_count' values
top_3000_rows = tweets_with_location.nlargest(3000, 'like_count')

In [124]:
top_10_rows = tweets_with_location.nlargest(10, 'like_count')

In [127]:
df_convert_utc_to_local_timezone(top_10_rows)

Tweet 1 / 10
Tweet 2 / 10
Tweet 3 / 10
Tweet 4 / 10
Tweet 5 / 10
Tweet 6 / 10
Tweet 7 / 10
Tweet 8 / 10
Tweet 9 / 10
Tweet 10 / 10


In [143]:
top_10_rows.index.name = 'index'

In [145]:
top_10_rows.to_csv("top_10_most_liked_march_1.csv")

In [146]:
top_10_rows_from_csv = pd.read_csv("top_10_most_liked_march_1.csv")

In [148]:
top_10_rows_from_csv.set_index('index', inplace=True)

In [154]:
top_10_rows_from_csv['created_at'] = pd.to_datetime(top_10_rows_from_csv['created_at'])

In [156]:
top_10_rows_from_csv.loc[30333, 'created_at']

datetime.datetime(2021, 3, 1, 7, 6, 35, tzinfo=tzoffset(None, 19800))

In [None]:
# TODO:
# - iau ca timepoint o zi din cele 20
# - o impart in 3 intervale: 9-17 (8h), 17-24(7h), 0-9 (9h)
# - pe o zi, fac top 3000 cele mai liked tweets
# - convertesc la timezone local toate cele 3000 tweets

# OTHER TASKS:
# - cele 3000 de tweet-uri cu tiemzone local le scriu in fisier, ca dureaza foarte mult rularea
# => trbuie sa vad cum fac sa pastrez informatiile de tiemzone
# => pipeline de scriere + citire in fisier corecta, in proiectul Maven o sa pun doar citirea dintr-un fisier cu 
# top 3000 most liked tweets in fiecare zi

# TODO Java:
# - template si binding pentru model 3

MODEL2

In [25]:
# Filter the dataset and keep only the reactions. Remove source tweets.
all_reactions = merged_days[merged_days['reference_id'] != '#']

In [26]:
all_reactions

Unnamed: 0,created_at,tweet_id,credible,author_id,text,urls,name,username,verified,location,...,retweet_author_id,retweet_id,retweeted_screen_name,user_mentions_id,user_mentions_screen_name,in_reply_to_user_id,in_reply_to_tweet_id,in_reply_to_username,reference_type,reference_id
0,2021-02-24 18:00:10+00:00,1364636249852502018,1,107501328,RT @Maricopahealth: At one of our community po...,#,2-1-1 Arizona,211arizona,False,Arizona,...,29816986,1364632754042802176,Maricopahealth,29816986,Maricopahealth,#,#,#,retweeted,1364632754042802176
2,2021-02-24 18:00:30+00:00,1364636333596008449,1,1234926105234034689,RT @SteveStaeger: When #COVID19Colorado is ove...,#,Colorado Coronavirus Updates,COVIDinColorado,False,"Denver, Colorado",...,182037688,1364293582157307906,SteveStaeger,182037688,SteveStaeger,#,#,#,retweeted,1364293582157307906
4,2021-02-24 18:03:35+00:00,1364637110951583746,1,1363750425459970048,RT @jatinde45666597: Vaccination has been star...,#,Reena Sharma,write2reena,False,"Auckland, New Zealand",...,1295748297529884673,1364087633538859008,jatinde45666597,1295748297529884673,jatinde45666597,#,#,#,retweeted,1364087633538859008
5,2021-02-24 18:03:43+00:00,1364637145361567745,1,961965252,RT @sarahwoodwriter: I know more about Tiger W...,#,Erica,EELzilla,False,Denver,...,191546705,1364355166745612291,sarahwoodwriter,191546705,sarahwoodwriter,#,#,#,retweeted,1364355166745612291
6,2021-02-24 18:04:18+00:00,1364637292531380224,1,1006430633294376960,RT @eleanorbate: how i’ll be walking down the ...,"twitter.com,",Winnie⁷ Military wife,taejinwhore,False,She/her (20+),...,27000222,1364516433938497542,eleanorbate,27000222,eleanorbate,#,#,#,retweeted,1364516433938497542
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10723864,2021-03-20 23:59:42+00:00,1373424038111023106,1,827349983577796608,RT @altNOAA: PSA: If you're in #Oklahoma and a...,#,Howard Hudson,hwrdhdsn,False,"Medford, OR",...,824126001936474113,1373418517484220418,altNOAA,824126001936474113,altNOAA,#,#,#,retweeted,1373418517484220418
10723865,2021-03-20 23:59:42+00:00,1373424038895321101,1,828337737388285952,RT @maura_resister: Covid rates are trending i...,#,KY R.N. Votes Blue😷🦋🌊,changemustcome7,False,Blue dot in red state,...,1167775229109837825,1373420544830361602,maura_resister,1167775229109837825,maura_resister,#,#,#,retweeted,1373420544830361602
10723866,2021-03-20 23:59:43+00:00,1373424044100440072,1,126514742,RT @Best_of_PT: The tiny island of #Corvo in t...,#,Javier RG,Rxavier23,False,#,...,894905209159331840,1373313858476068869,Best_of_PT,894905209159331840,Best_of_PT,#,#,#,retweeted,1373313858476068869
10723867,2021-03-20 23:59:43+00:00,1373424044318420993,1,2918822815,@sethsliltweeter @EncoreBeachClub That’s a goo...,#,Jason Titus,eezeemonee,False,"San Jose, CA",...,#,#,#,1289682623757918209,sethsliltweeter,1289682623757918209,1373417100472033283,sethsliltweeter,replied_to,1373417100472033283


In [27]:
tweet_id_most_reactions = all_reactions.loc[merged_days['tweet_id'].notnull()]['reference_id'].value_counts().idxmax()

In [28]:
all_reactions_for_user = merged_days[(merged_days['reference_id'] != '#') & (merged_days['reference_id'] == tweet_id_most_reactions)]

In [29]:
all_reactions_for_user

Unnamed: 0,created_at,tweet_id,credible,author_id,text,urls,name,username,verified,location,...,retweet_author_id,retweet_id,retweeted_screen_name,user_mentions_id,user_mentions_screen_name,in_reply_to_user_id,in_reply_to_tweet_id,in_reply_to_username,reference_type,reference_id
30388,2021-03-01 01:36:49+00:00,1366200721087991811,1,1088016983126290432,RT @narendramodi: Took my first dose of the CO...,#,Saravanan Sivan,Saravanansivans,False,தமிழ்நாடு,...,18839785,1366200664402006016,narendramodi,18839785,narendramodi,#,#,#,retweeted,1366200664402006016
30395,2021-03-01 01:36:52+00:00,1366200732676804610,1,794310968,RT @narendramodi: Took my first dose of the CO...,#,drpoovendran My PM My Pride!,poovendran123,False,karur,...,18839785,1366200664402006016,narendramodi,18839785,narendramodi,#,#,#,retweeted,1366200664402006016
30397,2021-03-01 01:36:52+00:00,1366200733658271745,1,452147147,RT @narendramodi: Took my first dose of the CO...,#,R.H.Vikram,rhvikram,False,Chennai,...,18839785,1366200664402006016,narendramodi,18839785,narendramodi,#,#,#,retweeted,1366200664402006016
30431,2021-03-01 01:36:59+00:00,1366200762137632768,1,755271973278711808,RT @narendramodi: Took my first dose of the CO...,#,Pema Khandu པདྨ་མཁའ་འགྲོ་།,PemaKhanduBJP,True,"Itanagar, India",...,18839785,1366200664402006016,narendramodi,18839785,narendramodi,#,#,#,retweeted,1366200664402006016
30464,2021-03-01 01:37:08+00:00,1366200802230996997,1,389950297,RT @narendramodi: Took my first dose of the CO...,#,絆 ALL JAPAN 🌐Global,wadakashiho,False,JAPAN,...,18839785,1366200664402006016,narendramodi,18839785,narendramodi,#,#,#,retweeted,1366200664402006016
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10425305,2021-03-20 10:36:00+00:00,1373221782904274946,1,1373094325807153155,RT @narendramodi: Took my first dose of the CO...,#,SANJAY MEENA,jaamimeena,False,"Jaipur, India",...,18839785,1366200664402006016,narendramodi,18839785,narendramodi,#,#,#,retweeted,1366200664402006016
10431115,2021-03-20 10:52:39+00:00,1373225972816703499,1,1350386062187655168,RT @narendramodi: Took my first dose of the CO...,#,ANIRBAN BANERJEE,ANIRBAN60273768,False,#,...,18839785,1366200664402006016,narendramodi,18839785,narendramodi,#,#,#,retweeted,1366200664402006016
10477264,2021-03-20 13:03:11+00:00,1373258821074706435,1,1373257749539168266,RT @narendramodi: Took my first dose of the CO...,#,AtulMobile,AtulMobile1,False,#,...,18839785,1366200664402006016,narendramodi,18839785,narendramodi,#,#,#,retweeted,1366200664402006016
10482816,2021-03-20 13:18:31+00:00,1373262679272353794,1,1359104435784273920,RT @narendramodi: Took my first dose of the CO...,#,Bibek Chakraborty,BibekCh27496834,False,#,...,18839785,1366200664402006016,narendramodi,18839785,narendramodi,#,#,#,retweeted,1366200664402006016


In [44]:
def create_json_original(original_tweet_id, dataset):

    has_duplicates = dataset[dataset['tweet_id'] == original_tweet_id]['tweet_id'].duplicated().any()
    if has_duplicates:
        raise Exception("Duplicate tweet id. Make sure the tweet id is unique.")
    
    def row_value_for(column_name):
        return dataset.loc[dataset['tweet_id'] == original_tweet_id, column_name].item()

    original = {}
    original["original_tweet_id"] = original_tweet_id
    unfiltered_text = row_value_for('text')
    original["original_text"] = unfiltered_text.replace('\n', '')
    original["ORIGINAL_created_at"] = row_value_for('created_at')
    original["ORIGINAL_location"] = row_value_for('location')
    original["post_id"] = f"post_{original_tweet_id}"
    original["original_author_id"] = f"ORIGINAL_TWEET_author_{row_value_for('author_id')}"
    original["ag_o_name"] = row_value_for('name')
    original["original_author_props_id"] = f"original_author_props_{row_value_for('author_id')}"
    original["ORIGINAL_credible"] = row_value_for('credible')
    original["ORIGINAL_username"] = row_value_for('username')
    original["ORIGINAL_verified"] = row_value_for('verified')
    original["ORIGINAL_followers_count"] = row_value_for('followers_count')
    original["ORIGINAL_following_count"] = row_value_for('following_count')

    # Convert all values to strings
    for key in original:
        original[key] = str(original[key])

    return original

In [31]:
create_json_original(tweet_id_most_reactions, merged_days)

{'original_tweet_id': '1366200664402006016',
 'original_text': 'Took my first dose of the COVID-19 vaccine at AIIMS. Remarkable how our doctors and scientists have worked in quick time to strengthen the global fight against COVID-19. I appeal to all those who are eligible to take the vaccine. Together, let us make India COVID-19 free! https://t.co/5z5cvAoMrv',
 'ORIGINAL_created_at': '2021-03-01 01:36:35+00:00',
 'ORIGINAL_location': 'India',
 'post_id': 'post_1366200664402006016',
 'original_author_id': 'ORIGINAL_TWEET_author_18839785',
 'ag_o_name': 'Narendra Modi',
 'original_author_props_id': 'original_author_props_18839785',
 'ORIGINAL_credible': '1',
 'ORIGINAL_username': 'narendramodi',
 'ORIGINAL_verified': 'True',
 'ORIGINAL_followers_count': '82431058',
 'ORIGINAL_following_count': '2448'}

In [45]:
reaction_labels = {
    'quoted': 'QUOTE',
    'replied_to': 'REPLY',
    'retweeted': 'RETWEET'
}

In [46]:
def row_value_for(column_name, original_tweet_id, dataset):
    return dataset.loc[dataset['tweet_id'] == original_tweet_id, column_name].item()

In [47]:
def parse_offset_string(offset_string):
    if offset_string.endswith("min"):  # Offset in minutes
        offset_minutes = int(offset_string[:-3])  # Extract the numeric part of the string (excluding the last 3 characters)
        return pd.Timedelta(minutes=offset_minutes)
    elif offset_string.endswith("h"):  # Offset in hours
        offset_hours = int(offset_string[:-1])  # Extract the numeric part of the string (excluding the last character)
        return pd.Timedelta(hours=offset_hours)
    else:
        raise ValueError("Invalid offset string. It should end with 'min' for minutes or h for hours.")

In [48]:
def get_reactions_to_original_tweet_in_interval(original_tweet_id, start_of_interval, end_of_interval, dataset):
    
    original_posting_time = row_value_for('created_at', original_tweet_id, dataset)

    offset1 = parse_offset_string(start_of_interval)
    start_time = original_posting_time + offset1

    if end_of_interval == "LAST_REACTION":
        end_time = pd.Timestamp.utcnow()
    else:
        offset2 = parse_offset_string(end_of_interval)
        end_time = original_posting_time + offset2

    reactions = dataset[(dataset['reference_id'] == original_tweet_id) &
                        (dataset['reference_type'] != '#') &
                        (dataset['created_at'] >= start_time) & 
                        (dataset['created_at'] < end_time)]

    return reactions

In [49]:
def create_json_group_of_reaction(original_tweet_id, start_of_interval, end_of_interval, dataset, total_nr_of_reactions):
    has_duplicates = dataset[dataset['tweet_id'] == original_tweet_id]['tweet_id'].duplicated().any()
    if has_duplicates:
        raise Exception("Duplicate tweet id. Make sure the tweet id is unique.")
    
    reactions = get_reactions_to_original_tweet_in_interval(original_tweet_id, start_of_interval, end_of_interval, dataset)
    nr_of_unique_author_ids = reactions['author_id'].nunique()

    
    group_of_reactions = {}
    group_of_reactions["react_id"] = f"reacts_for_{start_of_interval}_{end_of_interval}"
    group_of_reactions["reaction_group_of_authors_id"] = f"reactions_authors_for_{start_of_interval}_{end_of_interval}"
    group_of_reactions["nr_of_distinct_authors"] = nr_of_unique_author_ids
    group_of_reactions["reaction_group_of_tweets_id"] = f"reactions_for_{start_of_interval}_{end_of_interval}"
    group_of_reactions["time_interval"] = f"{start_of_interval} - {end_of_interval}"
    group_of_reactions["nr_of_reactions"] = len(reactions)
    group_of_reactions["percentage_out_of_total_reactions"] = f"{round(len(reactions) / total_nr_of_reactions * 100, 2)}%"
    group_of_reactions["nr_of_replies"] = reactions['reference_type'].value_counts().get('replied_to', 0)
    group_of_reactions["nr_of_quotes"] = reactions['reference_type'].value_counts().get('quoted', 0)
    group_of_reactions["nr_of_retweets"] = reactions['reference_type'].value_counts().get('retweeted', 0)

    # Convert all values to strings
    for key in group_of_reactions:
        group_of_reactions[key] = str(group_of_reactions[key])

    return group_of_reactions

In [37]:
create_json_group_of_reaction(tweet_id_most_reactions, "1h", "2h", merged_days)

{'react_id': 'reacts_for_1h_2h',
 'reaction_group_of_authors_id': 'reactions_authors_for_1h_2h',
 'nr_of_distinct_authors': '7285',
 'reaction_group_of_tweets_id': 'reactions_for_1h_2h',
 'time_interval': '1h - 2h',
 'total_nr_of_reactions': '7335',
 '%_of_total_reactions': '17.03%',
 'nr_of_replies': '105',
 'nr_of_quotes': '343',
 'nr_of_retweets': '6887'}

In [36]:
# TODO last point from Mirela mail:
# - 10 timestamps: 15', 1h, 2h, 3h, 6h, 12h, 24h, 48h, 72h, 72h+
# - sterg orig_tweet_props
# - adaug la proprietatile entitatii orig_tweet atributele: created_at, location - DONE
# - o activitate de react pentru toate reactiile din intervalul x de timp
# - entitatea asociata cu activitatea de react va fi: toate reactiile din intervalul x
# - la entitatea asta sterg type, value, adaug proprietatile: time_interval, nr_of_replies, nr_of_quotes, nr_of_retweets
# - o sa am un singur agent pentru grupul de autori ai reactiilor din intervalul x
# - proprietate la agentul grup: nr of distinct authors

In [50]:
reaction_intervals = {
    0: "0h-15min",
    1: "15min-1h",
    2: "1h-2h",
    3: "2h-3h",
    4: "3h-6h",
    5: "6h-12h",
    6: "12h-24h",
    7: "24h-48h",
    8: "48h-72h",
    9: "72h-LAST_REACTION"
}

In [51]:
def create_json_data(original_tweet_id, reaction_interval, dataset, total_nr_of_reactions):
    # Split the string based on the "-"
    intervals_boundaries = reaction_interval.split("-")
    start_of_interval = intervals_boundaries[0]
    end_of_interval = intervals_boundaries[1]

    data = {}
    data["original"] = create_json_original(original_tweet_id, dataset)
    data["group_of_reactions"] = create_json_group_of_reaction(original_tweet_id, start_of_interval, end_of_interval, dataset, total_nr_of_reactions)

    return data

In [41]:
data = create_json_data(tweet_id_most_reactions, "0h-15min", merged_days)

In [42]:
# Write the dictionary to a JSON file
with open("data2_test.json", "w") as json_file:
    json.dump(data, json_file, indent=4)

In [None]:
# TODO:
# - write function to iterate through the dictionary (all intervals) and create 10 (or whatever) different JSON files
# - put the code in a Python script
# - add the Python script to the Maven project
# - define correct paths
# - add execution in pom.xml

In [52]:
def create_all_json_data(original_tweet_id, reaction_intervals, dataset, dirpath):
    if not isinstance(reaction_intervals, dict):
        raise TypeError("The reaction intervals have to be written in a dictionary.")
    
    total_nr_of_reactions = len(dataset[(dataset['reference_id'] == original_tweet_id) &
                               (dataset['reference_type'] != '#')])
    
    for interval in reaction_intervals.values():
        data = create_json_data(original_tweet_id, interval, dataset, total_nr_of_reactions)

        path = os.path.join(dirpath, f"data2_{interval}.json")

        # Write the dictionary to a JSON file
        with open(path, "w") as json_file:
            json.dump(data, json_file, indent=4)

In [53]:
create_all_json_data(tweet_id_most_reactions, reaction_intervals, merged_days, rootdir_path)

In [None]:
# TODO:
# - change the Python Script with the changes to the functions
# - copy the script to the Maven project
# - check the results

MODEL1

In [24]:
# Filter the dataset and keep only the reactions. Remove source tweets.
all_reactions = merged_days[merged_days['reference_id'] != '#']

In [25]:
all_reactions

Unnamed: 0,created_at,tweet_id,credible,author_id,text,urls,name,username,verified,location,...,retweet_author_id,retweet_id,retweeted_screen_name,user_mentions_id,user_mentions_screen_name,in_reply_to_user_id,in_reply_to_tweet_id,in_reply_to_username,reference_type,reference_id
0,2021-02-24 18:00:10+00:00,1364636249852502018,1,107501328,RT @Maricopahealth: At one of our community po...,#,2-1-1 Arizona,211arizona,False,Arizona,...,29816986,1364632754042802176,Maricopahealth,29816986,Maricopahealth,#,#,#,retweeted,1364632754042802176
2,2021-02-24 18:00:30+00:00,1364636333596008449,1,1234926105234034689,RT @SteveStaeger: When #COVID19Colorado is ove...,#,Colorado Coronavirus Updates,COVIDinColorado,False,"Denver, Colorado",...,182037688,1364293582157307906,SteveStaeger,182037688,SteveStaeger,#,#,#,retweeted,1364293582157307906
4,2021-02-24 18:03:35+00:00,1364637110951583746,1,1363750425459970048,RT @jatinde45666597: Vaccination has been star...,#,Reena Sharma,write2reena,False,"Auckland, New Zealand",...,1295748297529884673,1364087633538859008,jatinde45666597,1295748297529884673,jatinde45666597,#,#,#,retweeted,1364087633538859008
5,2021-02-24 18:03:43+00:00,1364637145361567745,1,961965252,RT @sarahwoodwriter: I know more about Tiger W...,#,Erica,EELzilla,False,Denver,...,191546705,1364355166745612291,sarahwoodwriter,191546705,sarahwoodwriter,#,#,#,retweeted,1364355166745612291
6,2021-02-24 18:04:18+00:00,1364637292531380224,1,1006430633294376960,RT @eleanorbate: how i’ll be walking down the ...,"twitter.com,",Winnie⁷ Military wife,taejinwhore,False,She/her (20+),...,27000222,1364516433938497542,eleanorbate,27000222,eleanorbate,#,#,#,retweeted,1364516433938497542
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10723864,2021-03-20 23:59:42+00:00,1373424038111023106,1,827349983577796608,RT @altNOAA: PSA: If you're in #Oklahoma and a...,#,Howard Hudson,hwrdhdsn,False,"Medford, OR",...,824126001936474113,1373418517484220418,altNOAA,824126001936474113,altNOAA,#,#,#,retweeted,1373418517484220418
10723865,2021-03-20 23:59:42+00:00,1373424038895321101,1,828337737388285952,RT @maura_resister: Covid rates are trending i...,#,KY R.N. Votes Blue😷🦋🌊,changemustcome7,False,Blue dot in red state,...,1167775229109837825,1373420544830361602,maura_resister,1167775229109837825,maura_resister,#,#,#,retweeted,1373420544830361602
10723866,2021-03-20 23:59:43+00:00,1373424044100440072,1,126514742,RT @Best_of_PT: The tiny island of #Corvo in t...,#,Javier RG,Rxavier23,False,#,...,894905209159331840,1373313858476068869,Best_of_PT,894905209159331840,Best_of_PT,#,#,#,retweeted,1373313858476068869
10723867,2021-03-20 23:59:43+00:00,1373424044318420993,1,2918822815,@sethsliltweeter @EncoreBeachClub That’s a goo...,#,Jason Titus,eezeemonee,False,"San Jose, CA",...,#,#,#,1289682623757918209,sethsliltweeter,1289682623757918209,1373417100472033283,sethsliltweeter,replied_to,1373417100472033283


I make sure the tweet with the most reaction exists in our dataset.

In [26]:
most_frequent_value = all_reactions.loc[merged_days['tweet_id'].notnull()]['reference_id'].value_counts().idxmax()

In [27]:
most_frequent_value

1366200664402006016

In [28]:
# Find out the most frequent reference_id = the source tweet id which had the most reactions
# BUT this method doesn't check if the tweet exists in the dataset
central_node = all_reactions['reference_id'].mode()[0]

In [29]:
central_node

1366200664402006016

In [42]:
merged_days[merged_days['tweet_id'] == most_frequent_value]

Unnamed: 0,created_at,tweet_id,credible,author_id,text,urls,name,username,verified,location,...,retweet_author_id,retweet_id,retweeted_screen_name,user_mentions_id,user_mentions_screen_name,in_reply_to_user_id,in_reply_to_tweet_id,in_reply_to_username,reference_type,reference_id
30333,2021-03-01 01:36:35+00:00,1366200664402006016,1,18839785,Took my first dose of the COVID-19 vaccine at ...,"twitter.com,",Narendra Modi,narendramodi,True,India,...,#,#,#,#,#,#,#,#,#,#


In [89]:
def create_json_original(original_tweet_id, dataset):

    has_duplicates = dataset[dataset['tweet_id'] == original_tweet_id]['tweet_id'].duplicated().any()
    if has_duplicates:
        raise Exception("Duplicate tweet id. Make sure the tweet id is unique.")
    
    def row_value_for(column_name):
        return dataset.loc[dataset['tweet_id'] == original_tweet_id, column_name].item()

    original = {}
    original["post_id"] = f"post_{original_tweet_id}"
    original["original_author_id"] = f"ORIGINAL_TWEET_author_{row_value_for('author_id')}"
    original["ag_o_name"] = row_value_for('name')
    original["author_props_id"] = f"author_props_{row_value_for('author_id')}"
    original["credible"] = row_value_for('credible')
    original["username"] = row_value_for('username')
    original["verified"] = row_value_for('verified')
    original["followers_count"] = row_value_for('followers_count')
    original["following_count"] = row_value_for('following_count')
    original["original_tweet_id"] = original_tweet_id
    unfiltered_text = row_value_for('text')
    original["original_text"] = unfiltered_text.replace('\n', '')
    original["original_tweet_props_id"] = f"tweet_props_{original_tweet_id}"
    original["ORIGINAL_properties"] = "ORIGINAL TWEET properties"
    original["ORIGINAL_created_at"] = row_value_for('created_at')
    original["ORIGINAL_location"] = row_value_for('location')
    original["ORIGINAL_like_count"] = row_value_for('like_count')
    original["ORIGINAL_quote_count"] = row_value_for('quote_count')
    original["ORIGINAL_reply_count"] = row_value_for('reply_count')
    original["ORIGINAL_retweet_count"] = row_value_for('retweet_count')

    # Convert all values to strings
    for key in original:
        original[key] = str(original[key])

    return original

In [78]:
original = create_json_original(most_frequent_value, merged_days)

In [79]:
original

{'post_id': 'post_1366200664402006016',
 'original_author_id': 'ORIGINAL_TWEET_author_18839785',
 'ag_o_name': 'Narendra Modi',
 'author_props_id': 'author_props_18839785',
 'credible': '1',
 'username': 'narendramodi',
 'verified': 'True',
 'followers_count': '82431058',
 'following_count': '2448',
 'original_tweet_id': '1366200664402006016',
 'original_text': 'Took my first dose of the COVID-19 vaccine at AIIMS. Remarkable how our doctors and scientists have worked in quick time to strengthen the global fight against COVID-19. I appeal to all those who are eligible to take the vaccine. Together, let us make India COVID-19 free! https://t.co/5z5cvAoMrv',
 'original_tweet_props_id': 'tweet_props_1366200664402006016',
 'ORIGINAL_properties': 'ORIGINAL TWEET properties',
 'ORIGINAL_created_at': '2021-03-01 01:36:35+00:00',
 'ORIGINAL_location': 'India',
 'ORIGINAL_like_count': '220543',
 'ORIGINAL_quote_count': '4339',
 'ORIGINAL_reply_count': '11595',
 'ORIGINAL_retweet_count': '43621'}

In [90]:
reaction_labels = {
    'quoted': 'QUOTE',
    'replied_to': 'REPLY',
    'retweeted': 'RETWEET'
}

In [91]:
def create_json_reaction(reaction_tweet_id, dataset):
    has_duplicates = dataset[dataset['tweet_id'] == reaction_tweet_id]['tweet_id'].duplicated().any()
    if has_duplicates:
        raise Exception("Duplicate tweet id. Make sure the tweet id is unique.")
    
    def row_value_for(column_name):
        return dataset.loc[dataset['tweet_id'] == reaction_tweet_id, column_name].item()
    
    reaction_type = reaction_labels[f"{row_value_for('reference_type')}"]

    reaction = {}
    reaction["react_id"] = f"react_{reaction_type}_{reaction_tweet_id}"
    reaction["reaction_author_id"] = f"{reaction_type}_author_{row_value_for('author_id')}"
    reaction["ag_r_name"] = row_value_for('name')
    reaction["author_props_id"] = f"author_props_{row_value_for('author_id')}"
    reaction["credible"] = row_value_for('credible')
    reaction["username"] = row_value_for('username')
    reaction["verified"] = row_value_for('verified')
    reaction["followers_count"] = row_value_for('followers_count')
    reaction["following_count"] = row_value_for('following_count')
    reaction["reaction_tweet_id"] = reaction_tweet_id
    reaction["reply_retweet_quote"] = reaction_type
    unfiltered_text = row_value_for('text')
    reaction["reaction_text"] = unfiltered_text.replace('\n', '')
    reaction["reaction_tweet_props_id"] = f"{reaction_type}_props_{reaction_tweet_id}"
    reaction["REACTION_properties"] = f"{reaction_type} properties"
    reaction["REACTION_created_at"] = row_value_for('created_at')
    reaction["REACTION_location"] = row_value_for('location')
    reaction["REACTION_like_count"] = row_value_for('like_count')
    reaction["REACTION_retweet_count"] = row_value_for('retweet_count')
    reaction["REACTION_reference_id"] = row_value_for('reference_id')

    # Convert all values to strings
    for key in reaction:
        reaction[key] = str(reaction[key])

    return reaction

In [92]:
create_json_reaction(1373424044318420993, merged_days)

{'react_id': 'react_REPLY_1373424044318420993',
 'reaction_author_id': 'REPLY_author_2918822815',
 'ag_r_name': 'Jason Titus',
 'author_props_id': 'author_props_2918822815',
 'credible': '1',
 'username': 'eezeemonee',
 'verified': 'False',
 'followers_count': '1443',
 'following_count': '18',
 'reaction_tweet_id': '1373424044318420993',
 'reply_retweet_quote': 'REPLY',
 'reaction_text': '@sethsliltweeter @EncoreBeachClub That’s a good sign since you’re in the industry. I can’t wait to get this vaccine. Do you see Miami right now?🥴 At least with the vaccine we can kick it in the south. They DGAF',
 'reaction_tweet_props_id': 'REPLY_props_1373424044318420993',
 'REACTION_properties': 'REPLY properties',
 'REACTION_created_at': '2021-03-20 23:59:43+00:00',
 'REACTION_location': 'San Jose, CA',
 'REACTION_like_count': '1',
 'REACTION_retweet_count': '0',
 'REACTION_reference_id': '1373417100472033283'}

I will choose the most liked reaction for each type of reaction: reply, quote and retweet, as they generate the most interest.

Due to the fact that in my dataset, retweets have no likes, as well as no quotes an replies, I chose to select the number of retweets as the factor for choosing the top retweet.

In [93]:
def create_json_reaction_list(original_tweet_id, dataset):
    # retrieve unique types of reactions. Reactions have a value assigned to the 'reference_type' field, different than '#'
    # ('#' represent original tweets)
    tweet_types = dataset['reference_type'].unique()
    reaction_types = tweet_types[tweet_types != '#']

    reactions_of_original_tweet = dataset[(dataset['reference_type'].isin(reaction_types)) & (dataset['reference_id'] == original_tweet_id)]

    top_reactions = []
    for reaction_type in reaction_types:
        factor = 'like_count'
        factor = 'retweet_count' if reaction_type == 'retweeted' else factor

        idx_top_reaction = reactions_of_original_tweet[reactions_of_original_tweet['reference_type'] == reaction_type][factor].idxmax()
        top_reaction_tweet_id = reactions_of_original_tweet.loc[idx_top_reaction, 'tweet_id']

        json_top_reaction = create_json_reaction(top_reaction_tweet_id, reactions_of_original_tweet)

        top_reactions.append(json_top_reaction)


    return top_reactions
    

In [84]:
reaction_list = create_json_reaction_list(most_frequent_value, merged_days)

In [94]:
def create_json_data(original_tweet_id, dataset):
    data = {}
    data["original"] = create_json_original(original_tweet_id, dataset)
    data["reactions"] = create_json_reaction_list(original_tweet_id, dataset)

    return data

In [95]:
data = create_json_data(most_frequent_value, merged_days)

In [96]:
data

{'original': {'post_id': 'post_1366200664402006016',
  'original_author_id': 'ORIGINAL_TWEET_author_18839785',
  'ag_o_name': 'Narendra Modi',
  'author_props_id': 'author_props_18839785',
  'credible': '1',
  'username': 'narendramodi',
  'verified': 'True',
  'followers_count': '82431058',
  'following_count': '2448',
  'original_tweet_id': '1366200664402006016',
  'original_text': 'Took my first dose of the COVID-19 vaccine at AIIMS. Remarkable how our doctors and scientists have worked in quick time to strengthen the global fight against COVID-19. I appeal to all those who are eligible to take the vaccine. Together, let us make India COVID-19 free! https://t.co/5z5cvAoMrv',
  'original_tweet_props_id': 'tweet_props_1366200664402006016',
  'ORIGINAL_properties': 'ORIGINAL TWEET properties',
  'ORIGINAL_created_at': '2021-03-01 01:36:35+00:00',
  'ORIGINAL_location': 'India',
  'ORIGINAL_like_count': '220543',
  'ORIGINAL_quote_count': '4339',
  'ORIGINAL_reply_count': '11595',
  'OR

In [97]:
# Write the dictionary to a JSON file
with open("data.json", "w") as json_file:
    json.dump(data, json_file, indent=4)

In [63]:
count = all_reactions['reference_id'].value_counts().max()

In [64]:
count

43065

In [24]:
# Create a list of the column names
col_names = merged_days.columns.to_list()

# save the list to a file
with open(os.path.join(files_path, 'columns.txt'), 'w') as f:
    for col_name in col_names:
        f.write(col_name + '\n')

EXACT DAYS IN OUR DATASET:

Note: I double checked which days were actually used in the dataset.

In [25]:
exact_days_info = []

with open(path_to_unique_dates, 'r') as f:
    for line in f:
        exact_days_info.append(line.strip())

for day_info in exact_days_info:
    print(day_info)

24-02-2021 - 334 tweets
25-02-2021 - 933 tweets
26-02-2021 - 1110 tweets
27-02-2021 - 871 tweets
28-02-2021 - 792 tweets
01-03-2021 - 608540 tweets
02-03-2021 - 691842 tweets
03-03-2021 - 667152 tweets
04-03-2021 - 516891 tweets
05-03-2021 - 504369 tweets
06-03-2021 - 401596 tweets
07-03-2021 - 362643 tweets
08-03-2021 - 359209 tweets
09-03-2021 - 466151 tweets
10-03-2021 - 541263 tweets
11-03-2021 - 667231 tweets
12-03-2021 - 745042 tweets
13-03-2021 - 431356 tweets
14-03-2021 - 442648 tweets
15-03-2021 - 586684 tweets
16-03-2021 - 674996 tweets
17-03-2021 - 536724 tweets
18-03-2021 - 581944 tweets
19-03-2021 - 501221 tweets
20-03-2021 - 432327 tweets


REACTIONS

There are 3 types of reactions:
- replies ('replied_to')
- quotes ('quoted')
- retweets ('retweeted')

All possible combinations of reactions types you may wish to take into account further down the line are specified in the full list below. 

The reaction_types list should be equal to one of the elements of the full list.

In [26]:
reaction_types_full_list = [['quoted'], 
                            ['quoted', 'retweeted'], 
                            ['replied_to'], 
                            ['replied_to', 'quoted'], 
                            ['replied_to', 'quoted', 'retweeted'],
                            ['replied_to', 'retweeted']]

Here, you can choose what (combination of) reaction types you wish to be included in the analysis.

In [27]:
reaction_types = reaction_types_full_list[4]

In [28]:
reaction_types

['replied_to', 'quoted', 'retweeted']

In [29]:
def create_path_to_opinion_changes(reaction_types):
    """Function to create the path to the opinion changes JSON file, based on the reaction types we took into consideration.

    Args:
        reaction_types (list): list of reaction types

    Returns:
        str: path to the opinion changes file
    """    
    type = "_".join(reaction_types)
    path = os.path.join(opinion_changes_path, f'{type}_OC.json')

    return path

In [30]:
def group_reactions(merged_days, reaction_types):
    """Function to group reactions based on the reaction types list given as an input parameter, by the
    'author_id' and 'reference_id' columns. This means that each group of reactions contains a (set of) reaction(s)
    posted by the user identified by the 'author_id' and the source tweet identified by the 'reference_id'.

    Args:
        merged_days (pandas.core.frame.DataFrame): dataframe with all the data
        reaction_types (list): list of reaction types we want to consider

    Returns:
        dict: dictionary where the key is a tuple of the form (author_id, reference_id)
              and the value is a dataframe with all reactions corresponding to that combination
    """    
    reactions = merged_days[merged_days['reference_type'].isin(reaction_types)]
    multiple_reactions = reactions[reactions.duplicated(subset=['author_id', 'reference_id'], keep=False)]

    # group the rows by the two columns
    grouped_df = multiple_reactions.groupby(['author_id', 'reference_id'])
    groups_of_reactions = grouped_df.groups

    return groups_of_reactions

In [31]:
groups_of_reactions = group_reactions(merged_days, reaction_types)

In [32]:
len(groups_of_reactions)

74314

LOAD DICTIONARY FROM JSON FILE

In [33]:
def load_opinion_changes(path_to_opinion_changes):
    """Function that generates a dictionary based on a JSON file which contains the opinion changes within the reactions of the dataset.

    Args:
        path_to_opinion_changes (str): path to the JSON file associated with the opinion changes within the reactions
                                               (e.g. /your/path/to/research-internship/files/opinion-changes-25_days/quoted_OC.json)

    Returns:
        dict: the original dictionary containing opinion changes from reactions
    """    
    with open(path_to_opinion_changes) as f:
        # Load the JSON data into a Python dictionary
        opinion_changes_from_file = json.load(f)
        # Create a new dictionary with tuple keys
        original_opinion_changes = {}
        for key in opinion_changes_from_file:
            # Convert the string key to a tuple
            new_key = eval(key)
            # Add the key-value pair to the new dictionary
            original_opinion_changes[new_key] = opinion_changes_from_file[key]
            
    return original_opinion_changes

In [34]:
opinion_changes = load_opinion_changes(create_path_to_opinion_changes(reaction_types))

INSIGHTS

In [35]:
reactions_labels = {
    'quoted': 'quotes',
    'replied_to': 'replies',
    'retweeted': 'retweets'
}

In [36]:
print(f'How many people, who reacted ({", ".join(map(lambda x: reactions_labels[x], reaction_types))}) multiple times to a source tweet, changed their opinion over time?')
print(f"{round(len(opinion_changes) / len(groups_of_reactions) * 100, 1)}%.")

How many people, who reacted (replies, quotes, retweets) multiple times to a source tweet, changed their opinion over time?
22.6%.


In [37]:
def biggest_opinion_change(opinion_changes):
    """Function that returns the group (pair of user id - source tweet id) which interacted more than once 
    in the context of a single source tweet, i.e. one user posted more than one reply to the same source tweet, 
    where the user who reacted had the most drastic opinion change,
    based on the previously computed sentiments of the text.

    Args:
        opinion_changes (dict): dictionary with opinion changes

    Returns:
        tuple: pair of user id - source tweet id, where the biggest opinion change occured
        str: type of change that occured, e.g. one user tends to agree with the source tweet after some time, 
             when initially he disagreed or vice-versa
    """    
    change_type = 'negative'
    biggest_change = 0
    target_group = tuple()
    for group, sentiments in opinion_changes.items():
        change = max(biggest_change, max(sentiments) - min(sentiments))
        if change > biggest_change:
            biggest_change = change
            target_group = group
    
    min_sentiment_index = opinion_changes[target_group].index(min(opinion_changes[target_group]))
    max_sentiment_index = opinion_changes[target_group].index(max(opinion_changes[target_group]))
    change_type = 'positive' if min_sentiment_index < max_sentiment_index else change_type

    return target_group, change_type

In [38]:
target_group, change_type = biggest_opinion_change(opinion_changes)

In [39]:
target_group

(7027112, 1366720983525171208)

In [40]:
change_type

'negative'

In [41]:
opinion_changes[target_group]

[4, -4]

In [42]:
def reactions_with_biggest_opinion_change(reactions, target_group):
    """Function that queries the reactions dataset and returns a list of the actual texts that the pair of users
     (the author of the reaction and the author of the source tweet) posted.
     The user id and source tweet id are passed on as input parameters (the target group).

    Args:
        replies (pandas Dataframe): the dataframe with the reactions
        target_group (tuple): pair of user ids - source tweet id, whose posts had the biggest opinion change

    Returns:
        list: list of texts posted by the 2 users
    """    
    condition1 = reactions['author_id'] == target_group[0]
    condition2 = reactions['reference_id'] == target_group[1]

    return reactions[condition1 & condition2].loc[:, 'text'].tolist()

In [43]:
reactions_biggest_change = reactions_with_biggest_opinion_change(merged_days, target_group)

In [44]:
reactions_biggest_change

['RT @girlsreallyrule: After chastising President Biden at CPAC for getting the covid vaccine, we find out that both Donald and Melania Trump…',
 'Must be devastating news to the Qanon anti vaxxers thinking vaccines turn people into gay communist antifa zombies. https://t.co/6UDTy7mxOe']

In [45]:
def biggest_opinion_change_type(opinion_changes, group):
    """Function to detect what type of opinion change occured in the case of a group (pair of user id - source tweet id) 
    which interacted.

    Args:
        opinion_changes (dict): dictionary with opinion changes
        group (tuple): pair of user id - source tweet id that interacted through reactions 
                       and the respondent changed his/her viewpoint w.r.t. a source tweet

    Returns:
        str: either 'positive' (if the respondent now agrees after initially disagreeing) or 'negative'
    """    
    min_sentiment_index = opinion_changes[group].index(min(opinion_changes[group]))
    max_sentiment_index = opinion_changes[group].index(max(opinion_changes[group]))
    
    change_type = 'negative'
    change_type = 'positive' if min_sentiment_index < max_sentiment_index else change_type

    return change_type

In [46]:
# Create a boolean mask indicating what type of opinion change each group has
mask = {group: biggest_opinion_change_type(opinion_changes, group) for group in opinion_changes}

In [47]:
def value_count_in_dict(dict, value_to_count):
    """Function to count the occurences of a certain value in a dictionary.

    Args:
        dict (dict): dictionary where we need to count the occurences of a value
        value_to_count (any): value to be counted

    Returns:
        int: number of occurences of value_to_count
    """    
    # Create a reverse dictionary that maps values to their frequencies
    reverse_dict = defaultdict(int)
    for value in dict.values():
        reverse_dict[value] += 1

    # Count the occurrences of the specific value
    count = reverse_dict.get(value_to_count, 0)

    return count

In [48]:
print(f"Percentage of positive opinion changes out of:")
print(f"- the interactions where one user reacted multiple times to a source tweet and an opinion change was detected => {round(value_count_in_dict(mask, 'positive') / len(mask) * 100, 1)}%")

Percentage of positive opinion changes out of:
- the interactions where one user reacted multiple times to a source tweet and an opinion change was detected => 34.2%


In [49]:
print(f"Percentage of negative opinion changes out of:")
print(f"- the interactions where one user reacted multiple times to a source tweet and an opinion change was detected => {round(value_count_in_dict(mask, 'negative') / len(mask) * 100, 1)}%")

Percentage of negative opinion changes out of:
- the interactions where one user reacted multiple times to a source tweet and an opinion change was detected => 65.8%


In [50]:
def compute_biggest_opinion_changes_deltas(path_to_opinion_changes):
    opinion_changes = load_opinion_changes(path_to_opinion_changes)
    deltas = { key: max(value) - min(value) for key, value in opinion_changes.items() }

    return deltas

In [51]:
deltas_labels = {
    2: 'minimum',
    3: 'slight',
    4: 'considerable',
    5: 'big',
    6: 'very big',
    7: 'huge',
    8: 'maximum'
}

In [52]:
# function to add value labels - adds the value of y
def add_labels_y_value(x,y):
    """Function that takes the x and y-axis to be passed onto a plot function and generates labels,
    such that on top of each y value, it is displayed centrally.

    Args:
        x (list): list of labels for x-axis of a plot
        y (list): list of values for y-axis of a plot
    """    
    for i in range(len(x)):
        plt.text(x[i], y[i], y[i], ha = 'center', va = 'bottom')

In [53]:
def plot_deltas_OC(reaction_types, deltas_labels, reactions_labels, root_path, number_of_days):
    opinion_changes_deltas = compute_biggest_opinion_changes_deltas(create_path_to_opinion_changes(reaction_types))
    # Get the values from the dictionary
    deltas = list(opinion_changes_deltas.values())

    # Use Counter to count the occurrences of each value
    deltas_count = Counter(deltas)
    percentages = { f'{deltas_labels[pair[0]]} ({pair[0]})': round(pair[1] / sum(deltas_count.values()) * 100, 1) 
               for pair in sorted(deltas_count.most_common(), key=lambda x: x[0])}
    
    keys = list(percentages.keys())
    values = list(percentages.values())

    # Create a bar chart of the counts
    plt.bar(keys, values, edgecolor='black')
    plt.xticks(rotation=45)
    plt.subplots_adjust(bottom=0.25)
    # Add labels to the top of each bar
    add_labels_y_value(keys, values)
    plt.xlabel('Intensity of opinion changes')
    plt.ylabel('Percentage of opinion changes')

    long_title = f'Distribution of Opinion Changes: Percentage by Intensity for { ", ".join(map(lambda x: reactions_labels[x], reaction_types)) }'
    # Wrap the title onto multiple lines
    wrapped_title = textwrap.fill(long_title, width=50)
    plt.title(wrapped_title, loc="center", pad=10)

    types = "_".join(reaction_types)
    path = os.path.join(root_path, f'{types}_deltas_OC_{number_of_days}.png')
    plt.savefig(path)
    plt.close()

In [54]:
def plot_all_deltas_OC(reaction_types_full_list, deltas_labels, reactions_labels, root_path, number_of_days):
    for reaction_types in reaction_types_full_list:
        plot_deltas_OC(reaction_types, deltas_labels, reactions_labels, root_path, number_of_days)

In [55]:
plot_all_deltas_OC(reaction_types_full_list, deltas_labels, reactions_labels, covaxxy_deltas_OC_graphs_path, number_of_days)

NetworkX graphs - longitudinal change of quote, reply and retweet subnetworks.

In [56]:
def reactions_for_central_node(reaction_type, central_node, all_reactions):
    reactions = all_reactions[(all_reactions['reference_id'] == central_node) & (all_reactions['reference_type'] == reaction_type)]

    return reactions

In [57]:
def reactions_for_node_by_date(date, reaction_type, central_node, all_reactions):
    reactions = reactions_for_central_node(reaction_type, central_node, all_reactions)
    reactions_by_date = reactions[reactions['created_at'].dt.date == pd.Timestamp(date).date()]

    return reactions_by_date

In [58]:
def plot_network(date, reaction_type, central_node, all_reactions, folder_path):

    if reaction_type != 'retweeted':
        print(f'Started building graph for {reactions_labels[reaction_type]} posted on {date}...')
        # print(f'Started filtering the reactions...')
        reactions_by_date = reactions_for_node_by_date(date, reaction_type, central_node, all_reactions)
        # print(f'Reactions dataframe ready. Starting building the graph...')

        # Create an empty directed graph
        G = nx.DiGraph()

        # Add the source tweet as the central node
        G.add_node(central_node)

        # Add the reply tweets as nodes and edges
        reaction_nodes = reactions_by_date['tweet_id'].to_numpy()
        G.add_nodes_from(reaction_nodes)
        G.add_edges_from(zip(reaction_nodes, [central_node] * len(reaction_nodes)))
        # print(f'Added all {G.number_of_nodes()} nodes. Now grouping the reactions by author_id...')

        # group the original dataframe by 'author_id' and count the number of occurrences
        grouped = reactions_by_date.groupby('author_id').size()

        # keep only the groups where the count is greater than 1
        grouped = grouped[grouped > 1]
        # print('Grouped reactions where author posted more than 1 reaction.')

        # create a new dataframe with the desired columns and 'author_count'
        df_authors_with_multiple_reactions = reactions_by_date[['tweet_id', 'author_id']].loc[reactions_by_date['author_id'].isin(grouped.index)]
        df_authors_with_multiple_reactions['author_count'] = df_authors_with_multiple_reactions['author_id'].apply(lambda x: grouped[x])
        # print('Created helper dataframe.')

        unique_author_counts = df_authors_with_multiple_reactions['author_count'].unique()
        color_dict = {count: "#" + ''.join([random.choice('0123456789ABCDEF') for j in range(6)])
                    for count in unique_author_counts}
        # print('Created node color dictionary.')


        def is_author_with_multiple_reactions(node, df_authors_with_multiple_reactions):
            return node in df_authors_with_multiple_reactions['tweet_id'].values

        def get_author_count(node, df_authors_with_multiple_reactions):
            row = df_authors_with_multiple_reactions.loc[df_authors_with_multiple_reactions['tweet_id'] == node]
            author_count = row['author_count'].iloc[0]

            return author_count

        # print('Adding colors to the node_colors list...')
        node_colors = []
        for node in G.nodes():
            if node == central_node:
                node_colors.append('red')
            else:
                node_colors.append(color_dict[get_author_count(node, df_authors_with_multiple_reactions)] 
                                if is_author_with_multiple_reactions(node, df_authors_with_multiple_reactions) else 'black')
        # print('Added all colors.')

        # Set the node size to 20
        node_size = 20
        # Set the edge color to grey and the opacity to 0.7
        edge_color = 'grey'
        edge_alpha = 0.7
        # print('Creating the layout...')
        # get the spring layout
        pos = nx.spring_layout(G)

        # print('Started drawing the graph...')
        # Draw the graph
        nx.draw(G, pos=pos, with_labels=False, node_size=node_size, node_color=node_colors, edge_color=edge_color, alpha=edge_alpha)

        # print('Finished drawing. Now saving to file...')
        path = os.path.join(folder_path, f'{reactions_labels[reaction_type]}_network_{date}.png')
        plt.savefig(path)
        plt.close()
    else:
        print(f'Appending retweets info on {date} to the summary file...')
        reactions_by_date = reactions_for_node_by_date(date, reaction_type, central_node, all_reactions)
        path = os.path.join(folder_path, f'{reactions_labels[reaction_type]}_summary.txt')
        new_line = f'On {date}, there were {len(reactions_by_date)} retweets.'

        try:
            with open(path, 'x') as f:
                f.write(new_line + '\n')
        except FileExistsError:
            # Open the file in read mode
            with open(path, 'r') as file:
                # Read all the lines in the file and store them in a list
                lines = file.readlines()

            # Open the file in append mode
            with open(path, 'a') as file:
                # If the line is not already in the file, write it to the file
                if new_line + '\n' not in lines:
                    file.write(new_line + '\n')



    print('Done')

In [65]:
reaction_types_list = ['quoted', 'replied_to', 'retweeted']

In [66]:
dates_list = ['2021-03-01', '2021-03-02','2021-03-03', '2021-03-04', '2021-03-05']

In [67]:
root_path = covaxxy_networks_graphs_path

In [68]:
def plot_all_networks(reaction_types_list, dates_list, central_node, all_reactions, root_path):
    for reaction_type in reaction_types_list:
        folder_path = os.path.join(root_path, f'{reactions_labels[reaction_type]}')
        for date in dates_list:
            plot_network(date, reaction_type, central_node, all_reactions, folder_path)

In [69]:
plot_all_networks(reaction_types_list, dates_list, central_node, all_reactions, root_path)

Started building graph for quotes posted on 2021-03-01...
Done
Started building graph for quotes posted on 2021-03-02...
Done
Started building graph for quotes posted on 2021-03-03...
Done
Started building graph for quotes posted on 2021-03-04...
Done
Started building graph for quotes posted on 2021-03-05...
Done
Started building graph for replies posted on 2021-03-01...
Done
Started building graph for replies posted on 2021-03-02...
Done
Started building graph for replies posted on 2021-03-03...
Done
Started building graph for replies posted on 2021-03-04...
Done
Started building graph for replies posted on 2021-03-05...
Done
Appending retweets info on 2021-03-01 to the summary file...
Done
Appending retweets info on 2021-03-02 to the summary file...
Done
Appending retweets info on 2021-03-03 to the summary file...
Done
Appending retweets info on 2021-03-04 to the summary file...
Done
Appending retweets info on 2021-03-05 to the summary file...
Done


NEW COLUMN ADDITION

CREATION OF REPLIES_AND_QUOTES DATAFRAME, FOR WHICH WE WANT TO SEE IF THEY SUPPORT OR NOT THE SOURCE TWEET.

In [69]:
def create_replies_and_quotes(full_dataset):
    condition_1 = full_dataset['reference_type'] == 'replied_to'
    condition_2 = full_dataset['reference_type'] == 'quoted'

    return full_dataset[condition_1 | condition_2].copy()

In [70]:
replies_and_quotes = create_replies_and_quotes(merged_days)

HELPER FUNCTIONS TO ADD A NEW COLUMN TO THE test_replies_and_quotes DATAFRAME IN PARALLEL.

In [71]:
test_replies_and_quotes = replies_and_quotes.head(1000).copy()

In [72]:
counter = 0
progress = 0.001

In [73]:
def print_progress():
    global counter
    global progress
    global test_replies_and_quotes
    ipython = get_ipython()
    if ipython is not None:
        counter = ipython.user_ns['counter']
        progress = ipython.user_ns['progress']
        test_replies_and_quotes = ipython.user_ns['test_replies_and_quotes']

    counter += 1

    if ((counter / len(test_replies_and_quotes)) >= progress):
        print(f"{counter} / {len(test_replies_and_quotes)} replies or quotes processed.\n")
        progress += 0.001
    if counter == len(test_replies_and_quotes):
        print("New column inserted in the replies_and_quotes dataframe.\n")

In [74]:
def supports_source_tweet(text):
    if not isinstance(text, str):
        return '#'
    
    sentiment = senti.getSentiment(text, score='scale')[0]

    # print_progress()
    
    return sentiment > 0

In [75]:
# Define a wrapper function that applies supports_source-tweet to a chunk of data
def apply_function_to_chunk(chunk):
    return chunk.apply(supports_source_tweet)

In [76]:
def add_support_source_tweet_column_parallel(replies_and_quotes):
    # Split the DataFrame into chunks for parallel processing
    num_chunks = multiprocessing.cpu_count()
    chunks = np.array_split(replies_and_quotes['text'], num_chunks)

    # Create a multiprocessing pool and apply the function to each chunk in parallel
    with multiprocessing.Pool(processes=num_chunks) as pool:
        results = pool.map(apply_function_to_chunk, chunks)

    # Concatenate the results back into a single DataFrame
    # replies_and_quotes['supports_source_tweet'] = pd.concat(results)
    replies_and_quotes.insert(replies_and_quotes.columns.get_loc('text') + 1, 'supports_source_tweet', pd.concat(results))

    return replies_and_quotes

PARALLEL EXECUTION. Benchmark tests (my machine): 1000 recordings => 1m11.1s

In [77]:
# test_replies_and_quotes_parallel = add_support_source_tweet_column_parallel(test_replies_and_quotes)

SEQUENTIAL EXECUTION. Benchmark tests (my machine): 1000 recordings => 2m30.0s

In [78]:
test_replies_and_quotes = replies_and_quotes.head(1000).copy()

In [79]:
# test_replies_and_quotes.insert(test_replies_and_quotes.columns.get_loc('text') + 1, 'supports_source_tweet', test_replies_and_quotes['text'].apply(supports_source_tweet))

CHECK IF RESULTS ARE THE SAME FOR BOTH METHODS

In [80]:
# test_replies_and_quotes_parallel.equals(test_replies_and_quotes)

CODE TO MODIFY THE ORIGINAL DATAFRAME (WITH ALL REPLIES AND QUOTES), WITH AN ADDED COLUMN NAMED 'supports_source_tweet', USING PARALLEL COMPUTATION, AS WELL AS SAVE IT TO A .CSV FILE (UNCOMMENT CELLS TO RUN)

NOTE: There are almost 1 million replies and quotes in the original dataframe, so the following statement is extremely time-consuming.

In [81]:
# replies_and_quotes = add_support_source_tweet_column_parallel(replies_and_quotes)

In [82]:
# path_to_replies_and_quotes = files_path + '/replies_and_quotes_modified.csv'

In [83]:
# # save the DataFrame to a CSV file
# replies_and_quotes.to_csv(path_to_replies_and_quotes, index=False)

In order to calculate the distribution of the tweets per hour, I will parse the "created_at" column, extract the hour property and create a separate column in each dataframe. I will place it next to the "created_at" column in order to be easily verifiable. Data originates frmo the Twitter API, so it comes in a standard ISO 8601 format, which can be easily parsed using the parser module from the dateutil package.

Note: the cell below runs for approximately 2m30' on my machine (~25-30 seconds for each file).

In [84]:
# for key, day in days.items():
#     if 'hour' not in day.columns:
#         day.insert(1, 'hour', day['created_at'].apply(lambda date: parser.parse(date).hour))
#         print(f"New 'hour' column inserted in the {key} dataframe")

In [85]:
# for key, day in days.items():
#     if 'hour' not in day.columns:
#         hours = []
#         for time in day.loc[:,"created_at"]:
#             hour = parser.parse(time).hour
#             hours.append(hour)
#         day.insert(1, "hour", hours, True)
#         print(key + " - added 'hour' column")


The final distribution is made up of the sum of all individual days' distributions. I save a figure in the graphs/ folder for each day, as well as an overall distribution.

In [86]:
# final_distribution = pd.Series(0, index=days['1-3-2021'].loc[:,'hour'].sort_values(ascending=True).unique())
# for key, day in days.items():
#     hour_column_ascending = day.loc[:,"hour"].sort_values(ascending=True)
#     distribution = hour_column_ascending.value_counts()[hour_column_ascending.unique()]
#     final_distribution = final_distribution.add(distribution)
#     axes = distribution.plot(kind='bar')
#     figure_path = f"{covaxxy_longitudinal_analysis_graphs}/{key}_distribution.png"
#     axes.figure.savefig(figure_path)
#     plt.close()
# axes = final_distribution.plot(kind='bar')
# figure_path = f"{covaxxy_longitudinal_analysis_graphs}/overall_distribution.png"
# axes.figure.savefig(figure_path)
# plt.close()
