In [3]:
# Libraries
import pandas as pd
import os

In [10]:
# NOTE: the raw data is not shipped with the repository.
# For more details, see the paper
# "ITA-ELECTION-2022: A multi-platform dataset of social media conversations around the 2022 Italian general election", Pierri et al. 2023

# Root folders: raw daily files are read from `path`,
# filtered per-period CSVs are written under `preprocessed_path`.
path = '../../data/raw_data/'
preprocessed_path = '../../data/preprocessed_data/'

# One sub-folder per period; each file inside holds the interactions of a single day
before_path, campaign_path, after_path = (
    os.path.join(path, period)
    for period in ('before_campaign', 'electoral_campaign', 'after_elections')
)

# Filenames of each period in lexicographic order
# (names end with the date, so this is presumably chronological as well)
before_filenames, campaign_filenames, after_filenames = (
    sorted(os.listdir(folder))
    for folder in (before_path, campaign_path, after_path)
)

# Some info: how many entries (days) each period contains
for label, filenames in (
    ('Length of the period before the beginning of the electoral campaign (# of days): ', before_filenames),
    ('Length of the electoral campaign (# of days): ', campaign_filenames),
    ('Length of the period after the elections (# of days): ', after_filenames),
):
    print(label, len(filenames))

Length of the period before the beginning of the electoral campaign (# of days):  56
Length of the electoral campaign (# of days):  31
Length of the period after the elections (# of days):  26


In [8]:
# Parsing data
# Load every daily file of the pre-campaign period, keep only the rows whose
# type is 'retweet', and stack them into a single DataFrame (`before_data`).
base_columns = ['source', 'dest', 'text_tweet_id', 'created_at', 'type']
print("Created empty DataFrame...")

# Daily frames are collected here and concatenated ONCE after the loop:
# calling pd.concat inside the loop re-copies everything accumulated so far
# on every iteration, and seeding it with an empty frame triggers a pandas
# FutureWarning about empty entries in concat.
daily_retweets = []
total_retweets = 0  # running count, used only for the progress messages

for name in before_filenames:
    # Loading df
    df = pd.read_csv(os.path.join(before_path, name), sep=',')
    # Explicit 64-bit width: plain `int` is platform-dependent and the
    # account IDs in these files need more than 32 bits.
    df = df.astype({'source': 'int64', 'dest': 'int64'})

    _items = df.shape[0]

    # Removing reply tweets
    df = df[df.type == 'retweet']
    _retweets = df.shape[0]

    # Some info (an empty file would otherwise raise ZeroDivisionError)
    _share = round(_retweets/_items*100, 2) if _items else 0.0
    print(f"{name} - {_retweets} retweets out of {_items} items ({_share}%)")

    # Queueing the day's retweets for the final concatenation
    daily_retweets.append(df)
    total_retweets += _retweets

    # Some more info
    current_date = name.split('_')[-1]
    print(f"{current_date} - {total_retweets} total retweets")
    print()

if daily_retweets:
    before_data = pd.concat(daily_retweets)
    # Keep the expected columns first and any extra column found in the
    # files after them, i.e. the layout the seeded concat used to produce.
    extra_columns = [col for col in before_data.columns if col not in base_columns]
    before_data = before_data.reindex(columns=base_columns + extra_columns)
else:
    # No input files: fall back to an empty frame with the expected columns
    before_data = pd.DataFrame(columns=base_columns)

Created empty DataFrame...
file_edges_date_2022-07-01 - 8353 retweets out of 36015 items (23.19%)
2022-07-01 - 8353 total retweets

file_edges_date_2022-07-02 - 11406 retweets out of 33181 items (34.38%)
2022-07-02 - 19759 total retweets

file_edges_date_2022-07-03 - 10987 retweets out of 36299 items (30.27%)
2022-07-03 - 30746 total retweets

file_edges_date_2022-07-04 - 9472 retweets out of 29158 items (32.49%)
2022-07-04 - 40218 total retweets

file_edges_date_2022-07-05 - 9097 retweets out of 26164 items (34.77%)
2022-07-05 - 49315 total retweets

file_edges_date_2022-07-06 - 12577 retweets out of 34584 items (36.37%)
2022-07-06 - 61892 total retweets

file_edges_date_2022-07-07 - 11908 retweets out of 35238 items (33.79%)
2022-07-07 - 73800 total retweets

file_edges_date_2022-07-08 - 9003 retweets out of 33122 items (27.18%)
2022-07-08 - 82803 total retweets

file_edges_date_2022-07-09 - 10037 retweets out of 32464 items (30.92%)
2022-07-09 - 92840 total retweets

file_edges_date

In [9]:
# Preview: first five rows of the pre-campaign retweets
before_data.head(n=5)

Unnamed: 0,source,dest,text_tweet_id,created_at,type
2,1082281620546179072,2983707267,1542932295828381696,2022-07-01 23:59:34+00:00,retweet
3,1063507457589370880,980562312984780800,1542930963071746048,2022-07-01 23:59:13+00:00,retweet
4,337372191,977383349235961728,1542847308135632896,2022-07-01 23:59:03+00:00,retweet
5,1240214959776239616,432516884,1542935003264614400,2022-07-01 23:59:01+00:00,retweet
7,390427246,1300916262369984512,1542948182598062080,2022-07-01 23:58:40+00:00,retweet


In [11]:
# Save the pre-campaign retweets as CSV in the preprocessed-data folder
# (the row index is not written)
before_csv = os.path.join(preprocessed_path, 'before_campaign.csv')
before_data.to_csv(before_csv, index=False)
print("Done!")

Done!


In [14]:
# Parsing data
# Load the daily files of the electoral-campaign period, keep only the rows
# whose type is 'retweet', and stack them into a single DataFrame
# (`campaign_data`).
base_columns = ['source', 'dest', 'text_tweet_id', 'created_at', 'type']
print("Created empty DataFrame...")

# Daily frames are collected here and concatenated ONCE after the loop:
# calling pd.concat inside the loop re-copies everything accumulated so far
# on every iteration, and seeding it with an empty frame triggers a pandas
# FutureWarning about empty entries in concat.
daily_retweets = []
total_retweets = 0  # running count, used only for the progress messages

# NOTE(review): the first entry of the sorted listing is skipped - presumably
# a non-data file or a deliberately excluded day; confirm before changing.
for name in campaign_filenames[1:]:
    # Loading df
    df = pd.read_csv(os.path.join(campaign_path, name), sep=',')

    # Explicit 64-bit width: plain `int` is platform-dependent and the
    # account IDs in these files need more than 32 bits.
    df = df.astype({'source': 'int64', 'dest': 'int64'})

    _items = df.shape[0]

    # Removing reply tweets
    df = df[df.type == 'retweet']
    _retweets = df.shape[0]

    # Some info (an empty file would otherwise raise ZeroDivisionError)
    _share = round(_retweets/_items*100, 2) if _items else 0.0
    print(f"{name} - {_retweets} retweets out of {_items} items ({_share}%)")

    # Queueing the day's retweets for the final concatenation
    daily_retweets.append(df)
    total_retweets += _retweets

    # Some more info
    current_date = name.split('_')[-1]
    print(f"{current_date} - {total_retweets} total retweets")
    print()

if daily_retweets:
    campaign_data = pd.concat(daily_retweets)
    # Keep the expected columns first and any extra column found in the
    # files (e.g. 'Unnamed: 0') after them, i.e. the layout the seeded
    # concat used to produce.
    extra_columns = [col for col in campaign_data.columns if col not in base_columns]
    campaign_data = campaign_data.reindex(columns=base_columns + extra_columns)
else:
    # No input files: fall back to an empty frame with the expected columns
    campaign_data = pd.DataFrame(columns=base_columns)

Created empty DataFrame...
file_edges_date_2022-08-26 - 32533 retweets out of 105470 items (30.85%)
2022-08-26 - 2556559 total retweets

file_edges_date_2022-08-27 - 29990 retweets out of 96239 items (31.16%)
2022-08-27 - 2556559 total retweets

file_edges_date_2022-08-28 - 31810 retweets out of 94358 items (33.71%)
2022-08-28 - 2556559 total retweets

file_edges_date_2022-08-29 - 27805 retweets out of 91124 items (30.51%)
2022-08-29 - 2556559 total retweets

file_edges_date_2022-08-30 - 24201 retweets out of 89639 items (27.0%)
2022-08-30 - 2556559 total retweets

file_edges_date_2022-08-31 - 25680 retweets out of 82285 items (31.21%)
2022-08-31 - 2556559 total retweets

file_edges_date_2022-09-01 - 23994 retweets out of 80066 items (29.97%)
2022-09-01 - 2556559 total retweets

file_edges_date_2022-09-02 - 88495 retweets out of 430691 items (20.55%)
2022-09-02 - 2556559 total retweets

file_edges_date_2022-09-03 - 100309 retweets out of 491989 items (20.39%)
2022-09-03 - 2556559 total

In [15]:
# Preview: first ten rows of the campaign-period retweets
campaign_data.head(n=10)

Unnamed: 0.1,source,dest,text_tweet_id,created_at,type,Unnamed: 0
0,59101703,1557514573,1563252095368605696,2022-08-26 23:59:59+00:00,retweet,
1,1480682060767506432,1096360092809810048,1563231317185085440,2022-08-26 23:59:56+00:00,retweet,
2,1363801884658696192,912281497314447232,1562718175103700992,2022-08-26 23:59:51+00:00,retweet,
5,1554231181687808000,171981193,1563129867235119104,2022-08-26 23:59:37+00:00,retweet,
6,1056590361722867712,1296055260394532864,1562466497582497792,2022-08-26 23:59:34+00:00,retweet,
19,426112917,1016249336903159808,1563300827200045056,2022-08-26 23:58:59+00:00,retweet,
20,304312411,497995758,1563078628208148480,2022-08-26 23:58:59+00:00,retweet,
24,810145644706926592,532238684,1563242165261996032,2022-08-26 23:58:53+00:00,retweet,
28,392987772,13294452,1563219874972725248,2022-08-26 23:58:37+00:00,retweet,
32,1470866013457686528,1220673282191523840,1563087881325137920,2022-08-26 23:58:21+00:00,retweet,


In [16]:
# Drop the stray 'Unnamed: 0' column picked up from the input files.
# errors='ignore' keeps this cell re-runnable: without it a second execution
# raises KeyError because the column has already been removed.
campaign_data = campaign_data.drop(columns=['Unnamed: 0'], errors='ignore')

In [18]:
# Save the campaign-period retweets as CSV in the preprocessed-data folder
# (the row index is not written)
campaign_csv = os.path.join(preprocessed_path, 'electoral_campaign.csv')
campaign_data.to_csv(campaign_csv, index=False)

In [19]:
# Parsing data
# Load every daily file of the post-election period, keep only the rows whose
# type is 'retweet', and stack them into a single DataFrame (`after_data`).
base_columns = ['source', 'dest', 'text_tweet_id', 'created_at', 'type']
print("Created empty DataFrame...")

# Daily frames are collected here and concatenated ONCE after the loop:
# calling pd.concat inside the loop re-copies everything accumulated so far
# on every iteration, and seeding it with an empty frame triggers a pandas
# FutureWarning about empty entries in concat.
daily_retweets = []
total_retweets = 0  # running count, used only for the progress messages

for name in after_filenames:
    # Loading df
    df = pd.read_csv(os.path.join(after_path, name), sep=',')
    _items = df.shape[0]

    # Removing reply tweets
    df = df[df.type == 'retweet']
    _retweets = df.shape[0]

    # Cast the IDs on the filtered rows. astype() returns a new frame, which
    # avoids assigning columns on a slice (SettingWithCopyWarning). The width
    # is explicit because plain `int` is platform-dependent and these IDs
    # need more than 32 bits.
    df = df.astype({'source': 'int64', 'dest': 'int64'})

    # Some info (an empty file would otherwise raise ZeroDivisionError)
    _share = round(_retweets/_items*100, 2) if _items else 0.0
    print(f"{name} - {_retweets} retweets out of {_items} items ({_share}%)")

    # Queueing the day's retweets for the final concatenation
    daily_retweets.append(df)
    total_retweets += _retweets

    # Some more info
    current_date = name.split('_')[-1]
    print(f"{current_date} - {total_retweets} total retweets")
    print()

if daily_retweets:
    after_data = pd.concat(daily_retweets)
    # Keep the expected columns first and any extra column found in the
    # files (e.g. 'Unnamed: 0') after them, i.e. the layout the seeded
    # concat used to produce.
    extra_columns = [col for col in after_data.columns if col not in base_columns]
    after_data = after_data.reindex(columns=base_columns + extra_columns)
else:
    # No input files: fall back to an empty frame with the expected columns
    after_data = pd.DataFrame(columns=base_columns)

Created empty DataFrame...
file_edges_date_2022-09-25 - 202922 retweets out of 659046 items (30.79%)
2022-09-25 - 202922 total retweets

file_edges_date_2022-09-26 - 415623 retweets out of 1390815 items (29.88%)
2022-09-26 - 618545 total retweets

file_edges_date_2022-09-27 - 145686 retweets out of 627834 items (23.2%)
2022-09-27 - 764231 total retweets

file_edges_date_2022-09-28 - 105431 retweets out of 480108 items (21.96%)
2022-09-28 - 869662 total retweets

file_edges_date_2022-09-29 - 85880 retweets out of 401740 items (21.38%)
2022-09-29 - 955542 total retweets

file_edges_date_2022-09-30 - 72529 retweets out of 357819 items (20.27%)
2022-09-30 - 1028071 total retweets

file_edges_date_2022-10-01 - 53271 retweets out of 264755 items (20.12%)
2022-10-01 - 1081342 total retweets

file_edges_date_2022-10-02 - 47609 retweets out of 222799 items (21.37%)
2022-10-02 - 1128951 total retweets

file_edges_date_2022-10-03 - 48408 retweets out of 214751 items (22.54%)
2022-10-03 - 1177359 

In [20]:
# Preview: first five rows of the post-election retweets
after_data.head(n=5)

Unnamed: 0.1,source,dest,text_tweet_id,created_at,type,Unnamed: 0
2,1547691038034120705,1460611925906181888,1573654988177432576,2022-09-24 21:59:55+00:00,retweet,2.0
4,908147456,828717014,1573434419658248199,2022-09-24 21:59:56+00:00,retweet,4.0
6,379926998,2326660325,1573758541109907456,2022-09-24 21:59:57+00:00,retweet,6.0
8,1498444048079675398,873255372,1573593048495407104,2022-09-24 21:59:58+00:00,retweet,8.0
10,191522922,1244928423224586240,1573697306930696195,2022-09-24 21:59:58+00:00,retweet,10.0


In [21]:
# Drop the stray 'Unnamed: 0' column picked up from the input files.
# errors='ignore' keeps this cell re-runnable: without it a second execution
# raises KeyError because the column has already been removed.
after_data = after_data.drop(columns=['Unnamed: 0'], errors='ignore')

In [22]:
# Preview again: first five rows of the post-election retweets
after_data.head(n=5)

Unnamed: 0,source,dest,text_tweet_id,created_at,type
2,1547691038034120705,1460611925906181888,1573654988177432576,2022-09-24 21:59:55+00:00,retweet
4,908147456,828717014,1573434419658248199,2022-09-24 21:59:56+00:00,retweet
6,379926998,2326660325,1573758541109907456,2022-09-24 21:59:57+00:00,retweet
8,1498444048079675398,873255372,1573593048495407104,2022-09-24 21:59:58+00:00,retweet
10,191522922,1244928423224586240,1573697306930696195,2022-09-24 21:59:58+00:00,retweet


In [23]:
# Save the post-election retweets as CSV in the preprocessed-data folder
# (the row index is not written)
after_csv = os.path.join(preprocessed_path, 'after_elections.csv')
after_data.to_csv(after_csv, index=False)